Patchwork [RFC] spapr: support time base offset migration

login
register
mail settings
Submitter Alexey Kardashevskiy
Date Sept. 3, 2013, 7:31 a.m.
Message ID <1378193502-4968-1-git-send-email-aik@ozlabs.ru>
Download mbox | patch
Permalink /patch/272131/
State New
Headers show

Comments

Alexey Kardashevskiy - Sept. 3, 2013, 7:31 a.m.
This allows guests to have a different timebase origin from the host.

This is needed for migration, where a guest can migrate from one host
to another and the two hosts might have a different timebase origin.
However, the timebase seen by the guest must not go backwards, and
should go forwards only by a small amount corresponding to the time
taken for the migration.

This is only supported for recent POWER hardware which has the TBU40
(timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
970.

This adds kvm_access_one_reg() to access a special register which is not
in env->spr.

The feature must be present in the host kernel.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

This is an RFC but not a final patch. Can break something but I just do not see what.

---
 hw/ppc/ppc.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/ppc/ppc.h |  4 ++++
 target-ppc/kvm.c     | 23 +++++++++++++++++++++++
 target-ppc/machine.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 trace-events         |  3 +++
 5 files changed, 123 insertions(+)
Andreas Färber - Sept. 3, 2013, 8:42 a.m.
Am 03.09.2013 09:31, schrieb Alexey Kardashevskiy:
> This allows guests to have a different timebase origin from the host.
> 
> This is needed for migration, where a guest can migrate from one host
> to another and the two hosts might have a different timebase origin.
> However, the timebase seen by the guest must not go backwards, and
> should go forwards only by a small amount corresponding to the time
> taken for the migration.
> 
> This is only supported for recent POWER hardware which has the TBU40
> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
> 970.
> 
> This adds kvm_access_one_reg() to access a special register which is not
> in env->spr.
> 
> The feature must be present in the host kernel.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> 
> This is an RFC but not a final patch. Can break something but I just do not see what.
> 
> ---
>  hw/ppc/ppc.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/ppc.h |  4 ++++
>  target-ppc/kvm.c     | 23 +++++++++++++++++++++++
>  target-ppc/machine.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  trace-events         |  3 +++
>  5 files changed, 123 insertions(+)
> 
> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> index 1e3cab3..7d08c9a 100644
> --- a/hw/ppc/ppc.c
> +++ b/hw/ppc/ppc.c
> @@ -31,6 +31,7 @@
>  #include "hw/loader.h"
>  #include "sysemu/kvm.h"
>  #include "kvm_ppc.h"
> +#include "trace.h"
>  
>  //#define PPC_DEBUG_IRQ
>  #define PPC_DEBUG_TB
> @@ -796,6 +797,54 @@ static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
>      cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>  }
>  
> +/*
> + * Calculate timebase on the destination side of migration
> + *
> + * We calculate new timebase offset as shown below:
> + * 1) Gtb2 = Gtb1 + max(tod2 - tod1, 0)
> + *    Gtb2 = tb2 + off2
> + *    Gtb1 = tb1 + off1
> + * 2) tb2 + off2 = tb1 + off1 + max(tod2 - tod1, 0)
> + * 3) off2 = tb1 - tb2 + off1 + max(tod2 - tod1, 0)
> + *
> + * where:
> + * Gtb2 - destination guest timebase
> + * tb2 - destination host timebase
> + * off2 - destination timebase offset
> + * tod2 - destination time of the day
> + * Gtb1 - source guest timebase
> + * tb1 - source host timebase
> + * off1 - source timebase offset
> + * tod1 - source time of the day
> + *
> + * The result we want is in @off2
> + *
> + * Two conditions must be met for @off2:
> + * 1) off2 must be multiple of 2^24 ticks as it will be set via TBU40 SPR
> + * 2) Gtb2 >= Gtb1
> + */
> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env)
> +{
> +    uint64_t tb2, tod2, off2;
> +    int ratio = tb_env->tb_freq / 1000000;
> +    struct timeval tv;
> +
> +    tb2 = cpu_get_real_ticks();
> +    gettimeofday(&tv, NULL);
> +    tod2 = tv.tv_sec * 1000000 + tv.tv_usec;
> +
> +    off2 = tb_env->timebase - tb2 + tb_env->tb_offset;
> +    if (tod2 > tb_env->time_of_the_day) {
> +        off2 += (tod2 - tb_env->time_of_the_day) * ratio;
> +    }
> +    off2 = ROUND_UP(off2, 1 << 24);
> +
> +    trace_ppc_tb_adjust(tb_env->tb_offset, off2,
> +                        (int64_t)off2 - tb_env->tb_offset);
> +
> +    tb_env->tb_offset = off2;
> +}
> +
>  /* Set up (once) timebase frequency (in Hz) */
>  clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>  {
> diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
> index 132ab97..235871c 100644
> --- a/include/hw/ppc/ppc.h
> +++ b/include/hw/ppc/ppc.h
> @@ -32,6 +32,9 @@ struct ppc_tb_t {
>      uint64_t purr_start;
>      void *opaque;
>      uint32_t flags;
> +    /* Cached values for live migration purposes */
> +    uint64_t timebase;
> +    uint64_t time_of_the_day;
>  };
>  
>  /* PPC Timers flags */
> @@ -46,6 +49,7 @@ struct ppc_tb_t {
>                                                 */
>  
>  uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t tb_offset);
> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env);
>  clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq);
>  /* Embedded PowerPC DCR management */
>  typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn);
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 7af9e3d..93df955 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -35,6 +35,7 @@
>  #include "hw/sysbus.h"
>  #include "hw/ppc/spapr.h"
>  #include "hw/ppc/spapr_vio.h"
> +#include "hw/ppc/ppc.h"
>  #include "sysemu/watchdog.h"
>  
>  //#define DEBUG_KVM
> @@ -761,6 +762,22 @@ static int kvm_put_vpa(CPUState *cs)
>  }
>  #endif /* TARGET_PPC64 */
>  
> +static int kvm_access_one_reg(CPUState *cs, bool set, __u64 id, void *addr)
> +{
> +    struct kvm_one_reg reg = {
> +        .id = id,
> +        .addr = (uintptr_t)addr,
> +    };
> +    int ret = kvm_vcpu_ioctl(cs, set ? KVM_SET_ONE_REG : KVM_GET_ONE_REG, &reg);
> +
> +    if (ret) {
> +        DPRINTF("Unable to %s time base offset to KVM: %s\n",
> +                set ? "set" : "get", strerror(errno));
> +    }
> +
> +    return ret;
> +}
> +
>  int kvm_arch_put_registers(CPUState *cs, int level)
>  {
>      PowerPCCPU *cpu = POWERPC_CPU(cs);
> @@ -873,6 +890,9 @@ int kvm_arch_put_registers(CPUState *cs, int level)
>                  DPRINTF("Warning: Unable to set VPA information to KVM\n");
>              }
>          }
> +
> +        kvm_access_one_reg(cs, 1, KVM_REG_PPC_TB_OFFSET,
> +                           &env->tb_env->tb_offset);
>  #endif /* TARGET_PPC64 */
>      }
>  
> @@ -1082,6 +1102,9 @@ int kvm_arch_get_registers(CPUState *cs)
>                  DPRINTF("Warning: Unable to get VPA information from KVM\n");
>              }
>          }
> +
> +        kvm_access_one_reg(cs, 0, KVM_REG_PPC_TB_OFFSET,
> +                           &env->tb_env->tb_offset);
>  #endif
>      }
>  
> diff --git a/target-ppc/machine.c b/target-ppc/machine.c
> index 12e1512..d1ffc7f 100644
> --- a/target-ppc/machine.c
> +++ b/target-ppc/machine.c
> @@ -1,5 +1,6 @@
>  #include "hw/hw.h"
>  #include "hw/boards.h"
> +#include "hw/ppc/ppc.h"
>  #include "sysemu/kvm.h"
>  #include "helper_regs.h"
>  
> @@ -459,6 +460,45 @@ static const VMStateDescription vmstate_tlbmas = {
>      }
>  };
>  
> +static void timebase_pre_save(void *opaque)
> +{
> +    ppc_tb_t *tb_env = opaque;
> +    struct timeval tv;
> +
> +    gettimeofday(&tv, NULL);
> +    tb_env->time_of_the_day = tv.tv_sec * 1000000 + tv.tv_usec;
> +    tb_env->timebase = cpu_get_real_ticks();
> +}
> +
> +static int timebase_post_load(void *opaque, int version_id)
> +{
> +    ppc_tb_t *tb_env = opaque;
> +
> +    if (!tb_env) {
> +        printf("NO TB!\n");
> +        return -1;
> +    }
> +    cpu_ppc_adjust_tb_offset(tb_env);
> +
> +    return 0;
> +}
> +
> +static const VMStateDescription vmstate_timebase = {
> +    .name = "cpu/timebase",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .pre_save = timebase_pre_save,
> +    .post_load = timebase_post_load,
> +    .fields      = (VMStateField []) {
> +        VMSTATE_UINT64(timebase, ppc_tb_t),
> +        VMSTATE_INT64(tb_offset, ppc_tb_t),
> +        VMSTATE_UINT64(time_of_the_day, ppc_tb_t),
> +        VMSTATE_UINT32_EQUAL(tb_freq, ppc_tb_t),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  const VMStateDescription vmstate_ppc_cpu = {
>      .name = "cpu",
>      .version_id = 5,
> @@ -498,6 +538,10 @@ const VMStateDescription vmstate_ppc_cpu = {
>          VMSTATE_UINT64_EQUAL(env.insns_flags, PowerPCCPU),
>          VMSTATE_UINT64_EQUAL(env.insns_flags2, PowerPCCPU),
>          VMSTATE_UINT32_EQUAL(env.nb_BATs, PowerPCCPU),
> +
> +        /* Time offset */
> +        VMSTATE_STRUCT_POINTER(env.tb_env, PowerPCCPU,
> +                               vmstate_timebase, ppc_tb_t *),
>          VMSTATE_END_OF_LIST()
>      },
>      .subsections = (VMStateSubsection []) {

Breaks the migration format. ;) You need to bump version_id and use a
macro that accepts the version the field was added in as argument.

Andreas

> diff --git a/trace-events b/trace-events
> index 935b953..24cf4d2 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1141,6 +1141,9 @@ spapr_iommu_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "li
>  spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
>  spapr_iommu_new_table(uint64_t liobn, void *tcet, void *table, int fd) "liobn=%"PRIx64" tcet=%p table=%p fd=%d"
>  
> +# hw/ppc/ppc.c
> +ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff) "adjusted from %"PRIx64" to %"PRIx64", diff %"PRId64
> +
>  # util/hbitmap.c
>  hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
>  hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
>
David Gibson - Sept. 5, 2013, 4:30 a.m.
On Tue, Sep 03, 2013 at 05:31:42PM +1000, Alexey Kardashevskiy wrote:
> This allows guests to have a different timebase origin from the host.
> 
> This is needed for migration, where a guest can migrate from one host
> to another and the two hosts might have a different timebase origin.
> However, the timebase seen by the guest must not go backwards, and
> should go forwards only by a small amount corresponding to the time
> taken for the migration.
> 
> This is only supported for recent POWER hardware which has the TBU40
> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
> 970.
> 
> This adds kvm_access_one_reg() to access a special register which is not
> in env->spr.
> 
> The feature must be present in the host kernel.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> 
> This is an RFC but not a final patch. Can break something but I just do not see what.
> 
> ---
>  hw/ppc/ppc.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/ppc.h |  4 ++++
>  target-ppc/kvm.c     | 23 +++++++++++++++++++++++
>  target-ppc/machine.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  trace-events         |  3 +++
>  5 files changed, 123 insertions(+)
> 
> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> index 1e3cab3..7d08c9a 100644
> --- a/hw/ppc/ppc.c
> +++ b/hw/ppc/ppc.c
> @@ -31,6 +31,7 @@
>  #include "hw/loader.h"
>  #include "sysemu/kvm.h"
>  #include "kvm_ppc.h"
> +#include "trace.h"
>  
>  //#define PPC_DEBUG_IRQ
>  #define PPC_DEBUG_TB
> @@ -796,6 +797,54 @@ static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
>      cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>  }
>  
> +/*
> + * Calculate timebase on the destination side of migration

> + * We calculate new timebase offset as shown below:
> + * 1) Gtb2 = Gtb1 + max(tod2 - tod1, 0)
> + *    Gtb2 = tb2 + off2
> + *    Gtb1 = tb1 + off1
> + * 2) tb2 + off2 = tb1 + off1 + max(tod2 - tod1, 0)
> + * 3) off2 = tb1 - tb2 + off1 + max(tod2 - tod1, 0)
> + *
> + * where:
> + * Gtb2 - destination guest timebase
> + * tb2 - destination host timebase
> + * off2 - destination timebase offset
> + * tod2 - destination time of the day
> + * Gtb1 - source guest timebase
> + * tb1 - source host timebase
> + * off1 - source timebase offset
> + * tod1 - source time of the day
> + *
> + * The result we want is in @off2
> + *
> + * Two conditions must be met for @off2:
> + * 1) off2 must be multiple of 2^24 ticks as it will be set via TBU40 SPR
> + * 2) Gtb2 >= Gtb1

What about the TCG case, where there is not host timebase, only a a
host system clock?

> + */
> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env)
> +{
> +    uint64_t tb2, tod2, off2;
> +    int ratio = tb_env->tb_freq / 1000000;
> +    struct timeval tv;
> +
> +    tb2 = cpu_get_real_ticks();
> +    gettimeofday(&tv, NULL);
> +    tod2 = tv.tv_sec * 1000000 + tv.tv_usec;
> +
> +    off2 = tb_env->timebase - tb2 + tb_env->tb_offset;
> +    if (tod2 > tb_env->time_of_the_day) {
> +        off2 += (tod2 - tb_env->time_of_the_day) * ratio;
> +    }
> +    off2 = ROUND_UP(off2, 1 << 24);
> +
> +    trace_ppc_tb_adjust(tb_env->tb_offset, off2,
> +                        (int64_t)off2 - tb_env->tb_offset);
> +
> +    tb_env->tb_offset = off2;
> +}
> +
>  /* Set up (once) timebase frequency (in Hz) */
>  clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>  {
> diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
> index 132ab97..235871c 100644
> --- a/include/hw/ppc/ppc.h
> +++ b/include/hw/ppc/ppc.h
> @@ -32,6 +32,9 @@ struct ppc_tb_t {
>      uint64_t purr_start;
>      void *opaque;
>      uint32_t flags;
> +    /* Cached values for live migration purposes */
> +    uint64_t timebase;
> +    uint64_t time_of_the_day;

How is the time of day encoded here?

>  };
>  
>  /* PPC Timers flags */
> @@ -46,6 +49,7 @@ struct ppc_tb_t {
>                                                 */
>  
>  uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t tb_offset);
> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env);
>  clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq);
>  /* Embedded PowerPC DCR management */
>  typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn);
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 7af9e3d..93df955 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -35,6 +35,7 @@
>  #include "hw/sysbus.h"
>  #include "hw/ppc/spapr.h"
>  #include "hw/ppc/spapr_vio.h"
> +#include "hw/ppc/ppc.h"
>  #include "sysemu/watchdog.h"
>  
>  //#define DEBUG_KVM
> @@ -761,6 +762,22 @@ static int kvm_put_vpa(CPUState *cs)
>  }
>  #endif /* TARGET_PPC64 */
>  
> +static int kvm_access_one_reg(CPUState *cs, bool set, __u64 id,
> void *addr)

I think it would be nicer to have seperate set_one_reg and get_one_reg
functions, rather than using a direction parameter.

> +{
> +    struct kvm_one_reg reg = {
> +        .id = id,
> +        .addr = (uintptr_t)addr,
> +    };
> +    int ret = kvm_vcpu_ioctl(cs, set ? KVM_SET_ONE_REG : KVM_GET_ONE_REG, &reg);
> +
> +    if (ret) {
> +        DPRINTF("Unable to %s time base offset to KVM: %s\n",
> +                set ? "set" : "get", strerror(errno));
> +    }
> +
> +    return ret;
> +}
> +
>  int kvm_arch_put_registers(CPUState *cs, int level)
>  {
>      PowerPCCPU *cpu = POWERPC_CPU(cs);
> @@ -873,6 +890,9 @@ int kvm_arch_put_registers(CPUState *cs, int level)
>                  DPRINTF("Warning: Unable to set VPA information to KVM\n");
>              }
>          }
> +
> +        kvm_access_one_reg(cs, 1, KVM_REG_PPC_TB_OFFSET,
> +                           &env->tb_env->tb_offset);

Hrm.  It may be too late, but would it make more sense for qemu to
just calculate the "ideal" offset - not 24-bit aligned, and have the
kernel round that up to a value suitable for TBU40.  I'm thinking that
might be more robust if someone decides that POWER10 should have a
TBU50 or a TBU30 instead of TBU40.

>  #endif /* TARGET_PPC64 */
>      }
>  
> @@ -1082,6 +1102,9 @@ int kvm_arch_get_registers(CPUState *cs)
>                  DPRINTF("Warning: Unable to get VPA information from KVM\n");
>              }
>          }
> +
> +        kvm_access_one_reg(cs, 0, KVM_REG_PPC_TB_OFFSET,
> +                           &env->tb_env->tb_offset);
>  #endif
>      }
>  
> diff --git a/target-ppc/machine.c b/target-ppc/machine.c
> index 12e1512..d1ffc7f 100644
> --- a/target-ppc/machine.c
> +++ b/target-ppc/machine.c
> @@ -1,5 +1,6 @@
>  #include "hw/hw.h"
>  #include "hw/boards.h"
> +#include "hw/ppc/ppc.h"
>  #include "sysemu/kvm.h"
>  #include "helper_regs.h"
>  
> @@ -459,6 +460,45 @@ static const VMStateDescription vmstate_tlbmas = {
>      }
>  };
>  
> +static void timebase_pre_save(void *opaque)
> +{
> +    ppc_tb_t *tb_env = opaque;
> +    struct timeval tv;
> +
> +    gettimeofday(&tv, NULL);
> +    tb_env->time_of_the_day = tv.tv_sec * 1000000 + tv.tv_usec;
> +    tb_env->timebase = cpu_get_real_ticks();
> +}
> +
> +static int timebase_post_load(void *opaque, int version_id)
> +{
> +    ppc_tb_t *tb_env = opaque;
> +
> +    if (!tb_env) {
> +        printf("NO TB!\n");
> +        return -1;
> +    }
> +    cpu_ppc_adjust_tb_offset(tb_env);
> +
> +    return 0;
> +}
> +
> +static const VMStateDescription vmstate_timebase = {
> +    .name = "cpu/timebase",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .pre_save = timebase_pre_save,
> +    .post_load = timebase_post_load,
> +    .fields      = (VMStateField []) {
> +        VMSTATE_UINT64(timebase, ppc_tb_t),
> +        VMSTATE_INT64(tb_offset, ppc_tb_t),

tb_offset is inherently a local concept, since it depends on the host
timebase.  So how can it belong in the migration stream?

> +        VMSTATE_UINT64(time_of_the_day, ppc_tb_t),
> +        VMSTATE_UINT32_EQUAL(tb_freq, ppc_tb_t),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  const VMStateDescription vmstate_ppc_cpu = {
>      .name = "cpu",
>      .version_id = 5,
> @@ -498,6 +538,10 @@ const VMStateDescription vmstate_ppc_cpu = {
>          VMSTATE_UINT64_EQUAL(env.insns_flags, PowerPCCPU),
>          VMSTATE_UINT64_EQUAL(env.insns_flags2, PowerPCCPU),
>          VMSTATE_UINT32_EQUAL(env.nb_BATs, PowerPCCPU),
> +
> +        /* Time offset */
> +        VMSTATE_STRUCT_POINTER(env.tb_env, PowerPCCPU,
> +                               vmstate_timebase, ppc_tb_t *),
>          VMSTATE_END_OF_LIST()
>      },
>      .subsections = (VMStateSubsection []) {
> diff --git a/trace-events b/trace-events
> index 935b953..24cf4d2 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1141,6 +1141,9 @@ spapr_iommu_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "li
>  spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
>  spapr_iommu_new_table(uint64_t liobn, void *tcet, void *table, int fd) "liobn=%"PRIx64" tcet=%p table=%p fd=%d"
>  
> +# hw/ppc/ppc.c
> +ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff) "adjusted from %"PRIx64" to %"PRIx64", diff %"PRId64
> +
>  # util/hbitmap.c
>  hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
>  hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
Alexey Kardashevskiy - Sept. 5, 2013, 4:54 a.m.
On 09/05/2013 02:30 PM, David Gibson wrote:
> On Tue, Sep 03, 2013 at 05:31:42PM +1000, Alexey Kardashevskiy wrote:
>> This allows guests to have a different timebase origin from the host.
>>
>> This is needed for migration, where a guest can migrate from one host
>> to another and the two hosts might have a different timebase origin.
>> However, the timebase seen by the guest must not go backwards, and
>> should go forwards only by a small amount corresponding to the time
>> taken for the migration.
>>
>> This is only supported for recent POWER hardware which has the TBU40
>> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
>> 970.
>>
>> This adds kvm_access_one_reg() to access a special register which is not
>> in env->spr.
>>
>> The feature must be present in the host kernel.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>
>> This is an RFC but not a final patch. Can break something but I just do not see what.
>>
>> ---
>>  hw/ppc/ppc.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  include/hw/ppc/ppc.h |  4 ++++
>>  target-ppc/kvm.c     | 23 +++++++++++++++++++++++
>>  target-ppc/machine.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
>>  trace-events         |  3 +++
>>  5 files changed, 123 insertions(+)
>>
>> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
>> index 1e3cab3..7d08c9a 100644
>> --- a/hw/ppc/ppc.c
>> +++ b/hw/ppc/ppc.c
>> @@ -31,6 +31,7 @@
>>  #include "hw/loader.h"
>>  #include "sysemu/kvm.h"
>>  #include "kvm_ppc.h"
>> +#include "trace.h"
>>  
>>  //#define PPC_DEBUG_IRQ
>>  #define PPC_DEBUG_TB
>> @@ -796,6 +797,54 @@ static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
>>      cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>>  }
>>  
>> +/*
>> + * Calculate timebase on the destination side of migration
> 
>> + * We calculate new timebase offset as shown below:
>> + * 1) Gtb2 = Gtb1 + max(tod2 - tod1, 0)
>> + *    Gtb2 = tb2 + off2
>> + *    Gtb1 = tb1 + off1
>> + * 2) tb2 + off2 = tb1 + off1 + max(tod2 - tod1, 0)
>> + * 3) off2 = tb1 - tb2 + off1 + max(tod2 - tod1, 0)
>> + *
>> + * where:
>> + * Gtb2 - destination guest timebase
>> + * tb2 - destination host timebase
>> + * off2 - destination timebase offset
>> + * tod2 - destination time of the day
>> + * Gtb1 - source guest timebase
>> + * tb1 - source host timebase
>> + * off1 - source timebase offset
>> + * tod1 - source time of the day
>> + *
>> + * The result we want is in @off2
>> + *
>> + * Two conditions must be met for @off2:
>> + * 1) off2 must be multiple of 2^24 ticks as it will be set via TBU40 SPR
>> + * 2) Gtb2 >= Gtb1
> 
> What about the TCG case, where there is not host timebase, only a a
> host system clock?


cpu_get_real_ticks() returns ticks, this is what the patch cares about.
What is the difference between KVM and TCG here?


>> + */
>> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env)
>> +{
>> +    uint64_t tb2, tod2, off2;
>> +    int ratio = tb_env->tb_freq / 1000000;
>> +    struct timeval tv;
>> +
>> +    tb2 = cpu_get_real_ticks();
>> +    gettimeofday(&tv, NULL);
>> +    tod2 = tv.tv_sec * 1000000 + tv.tv_usec;
>> +
>> +    off2 = tb_env->timebase - tb2 + tb_env->tb_offset;
>> +    if (tod2 > tb_env->time_of_the_day) {
>> +        off2 += (tod2 - tb_env->time_of_the_day) * ratio;
>> +    }
>> +    off2 = ROUND_UP(off2, 1 << 24);
>> +
>> +    trace_ppc_tb_adjust(tb_env->tb_offset, off2,
>> +                        (int64_t)off2 - tb_env->tb_offset);
>> +
>> +    tb_env->tb_offset = off2;
>> +}
>> +
>>  /* Set up (once) timebase frequency (in Hz) */
>>  clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>>  {
>> diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
>> index 132ab97..235871c 100644
>> --- a/include/hw/ppc/ppc.h
>> +++ b/include/hw/ppc/ppc.h
>> @@ -32,6 +32,9 @@ struct ppc_tb_t {
>>      uint64_t purr_start;
>>      void *opaque;
>>      uint32_t flags;
>> +    /* Cached values for live migration purposes */
>> +    uint64_t timebase;
>> +    uint64_t time_of_the_day;
> 
> How is the time of day encoded here?


Microseconds. I'll put a comment here, I just thought it is quite obvious
as gettimeofday() returns microseconds.


>>  };
>>  
>>  /* PPC Timers flags */
>> @@ -46,6 +49,7 @@ struct ppc_tb_t {
>>                                                 */
>>  
>>  uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t tb_offset);
>> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env);
>>  clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq);
>>  /* Embedded PowerPC DCR management */
>>  typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn);
>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
>> index 7af9e3d..93df955 100644
>> --- a/target-ppc/kvm.c
>> +++ b/target-ppc/kvm.c
>> @@ -35,6 +35,7 @@
>>  #include "hw/sysbus.h"
>>  #include "hw/ppc/spapr.h"
>>  #include "hw/ppc/spapr_vio.h"
>> +#include "hw/ppc/ppc.h"
>>  #include "sysemu/watchdog.h"
>>  
>>  //#define DEBUG_KVM
>> @@ -761,6 +762,22 @@ static int kvm_put_vpa(CPUState *cs)
>>  }
>>  #endif /* TARGET_PPC64 */
>>  
>> +static int kvm_access_one_reg(CPUState *cs, bool set, __u64 id,
>> void *addr)
> 
> I think it would be nicer to have seperate set_one_reg and get_one_reg
> functions, rather than using a direction parameter.


I am regularly getting requests from Alex to merge functions which look
almost the same. Please do not make it worse :)


>> +{
>> +    struct kvm_one_reg reg = {
>> +        .id = id,
>> +        .addr = (uintptr_t)addr,
>> +    };
>> +    int ret = kvm_vcpu_ioctl(cs, set ? KVM_SET_ONE_REG : KVM_GET_ONE_REG, &reg);
>> +
>> +    if (ret) {
>> +        DPRINTF("Unable to %s time base offset to KVM: %s\n",
>> +                set ? "set" : "get", strerror(errno));
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>>  int kvm_arch_put_registers(CPUState *cs, int level)
>>  {
>>      PowerPCCPU *cpu = POWERPC_CPU(cs);
>> @@ -873,6 +890,9 @@ int kvm_arch_put_registers(CPUState *cs, int level)
>>                  DPRINTF("Warning: Unable to set VPA information to KVM\n");
>>              }
>>          }
>> +
>> +        kvm_access_one_reg(cs, 1, KVM_REG_PPC_TB_OFFSET,
>> +                           &env->tb_env->tb_offset);
> 
> Hrm.  It may be too late, but would it make more sense for qemu to
> just calculate the "ideal" offset - not 24-bit aligned, and have the
> kernel round that up to a value suitable for TBU40.  I'm thinking that
> might be more robust if someone decides that POWER10 should have a
> TBU50 or a TBU30 instead of TBU40.


No, not late, the kernel patchset has not been noticed yet by anyone :)
Just a little remark here - QEMU sets one value to the kernel as an offset
but when it will be about to migrate again and read this offset from the
kernel, it will be different (rounded up) from what QEMU set. Not a problem
though.


>>  #endif /* TARGET_PPC64 */
>>      }
>>  
>> @@ -1082,6 +1102,9 @@ int kvm_arch_get_registers(CPUState *cs)
>>                  DPRINTF("Warning: Unable to get VPA information from KVM\n");
>>              }
>>          }
>> +
>> +        kvm_access_one_reg(cs, 0, KVM_REG_PPC_TB_OFFSET,
>> +                           &env->tb_env->tb_offset);
>>  #endif
>>      }
>>  
>> diff --git a/target-ppc/machine.c b/target-ppc/machine.c
>> index 12e1512..d1ffc7f 100644
>> --- a/target-ppc/machine.c
>> +++ b/target-ppc/machine.c
>> @@ -1,5 +1,6 @@
>>  #include "hw/hw.h"
>>  #include "hw/boards.h"
>> +#include "hw/ppc/ppc.h"
>>  #include "sysemu/kvm.h"
>>  #include "helper_regs.h"
>>  
>> @@ -459,6 +460,45 @@ static const VMStateDescription vmstate_tlbmas = {
>>      }
>>  };
>>  
>> +static void timebase_pre_save(void *opaque)
>> +{
>> +    ppc_tb_t *tb_env = opaque;
>> +    struct timeval tv;
>> +
>> +    gettimeofday(&tv, NULL);
>> +    tb_env->time_of_the_day = tv.tv_sec * 1000000 + tv.tv_usec;
>> +    tb_env->timebase = cpu_get_real_ticks();
>> +}
>> +
>> +static int timebase_post_load(void *opaque, int version_id)
>> +{
>> +    ppc_tb_t *tb_env = opaque;
>> +
>> +    if (!tb_env) {
>> +        printf("NO TB!\n");
>> +        return -1;
>> +    }
>> +    cpu_ppc_adjust_tb_offset(tb_env);
>> +
>> +    return 0;
>> +}
>> +
>> +static const VMStateDescription vmstate_timebase = {
>> +    .name = "cpu/timebase",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .minimum_version_id_old = 1,
>> +    .pre_save = timebase_pre_save,
>> +    .post_load = timebase_post_load,
>> +    .fields      = (VMStateField []) {
>> +        VMSTATE_UINT64(timebase, ppc_tb_t),
>> +        VMSTATE_INT64(tb_offset, ppc_tb_t),
> 
> tb_offset is inherently a local concept, since it depends on the host
> timebase.  So how can it belong in the migration stream?


I do not have pure guest timebase in QEMU and I need it on the destination.
But I have host timebase + offset to calculate it. And tb_offset is already
in ppc_tb_t. It looked logical to me to send the existing field and add
only the missing part.



>> +        VMSTATE_UINT64(time_of_the_day, ppc_tb_t),
>> +        VMSTATE_UINT32_EQUAL(tb_freq, ppc_tb_t),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>>  const VMStateDescription vmstate_ppc_cpu = {
>>      .name = "cpu",
>>      .version_id = 5,
>> @@ -498,6 +538,10 @@ const VMStateDescription vmstate_ppc_cpu = {
>>          VMSTATE_UINT64_EQUAL(env.insns_flags, PowerPCCPU),
>>          VMSTATE_UINT64_EQUAL(env.insns_flags2, PowerPCCPU),
>>          VMSTATE_UINT32_EQUAL(env.nb_BATs, PowerPCCPU),
>> +
>> +        /* Time offset */
>> +        VMSTATE_STRUCT_POINTER(env.tb_env, PowerPCCPU,
>> +                               vmstate_timebase, ppc_tb_t *),
>>          VMSTATE_END_OF_LIST()
>>      },
>>      .subsections = (VMStateSubsection []) {
>> diff --git a/trace-events b/trace-events
>> index 935b953..24cf4d2 100644
>> --- a/trace-events
>> +++ b/trace-events
>> @@ -1141,6 +1141,9 @@ spapr_iommu_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "li
>>  spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
>>  spapr_iommu_new_table(uint64_t liobn, void *tcet, void *table, int fd) "liobn=%"PRIx64" tcet=%p table=%p fd=%d"
>>  
>> +# hw/ppc/ppc.c
>> +ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff) "adjusted from %"PRIx64" to %"PRIx64", diff %"PRId64
>> +
>>  # util/hbitmap.c
>>  hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
>>  hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
>
Alexander Graf - Sept. 5, 2013, 9:16 a.m.
On 05.09.2013, at 06:54, Alexey Kardashevskiy wrote:

> On 09/05/2013 02:30 PM, David Gibson wrote:
>> On Tue, Sep 03, 2013 at 05:31:42PM +1000, Alexey Kardashevskiy wrote:
>>> This allows guests to have a different timebase origin from the host.
>>> 
>>> This is needed for migration, where a guest can migrate from one host
>>> to another and the two hosts might have a different timebase origin.
>>> However, the timebase seen by the guest must not go backwards, and
>>> should go forwards only by a small amount corresponding to the time
>>> taken for the migration.
>>> 
>>> This is only supported for recent POWER hardware which has the TBU40
>>> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
>>> 970.
>>> 
>>> This adds kvm_access_one_reg() to access a special register which is not
>>> in env->spr.
>>> 
>>> The feature must be present in the host kernel.
>>> 
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> ---
>>> 
>>> This is an RFC but not a final patch. Can break something but I just do not see what.
>>> 
>>> ---
>>> hw/ppc/ppc.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
>>> include/hw/ppc/ppc.h |  4 ++++
>>> target-ppc/kvm.c     | 23 +++++++++++++++++++++++
>>> target-ppc/machine.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
>>> trace-events         |  3 +++
>>> 5 files changed, 123 insertions(+)
>>> 
>>> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
>>> index 1e3cab3..7d08c9a 100644
>>> --- a/hw/ppc/ppc.c
>>> +++ b/hw/ppc/ppc.c
>>> @@ -31,6 +31,7 @@
>>> #include "hw/loader.h"
>>> #include "sysemu/kvm.h"
>>> #include "kvm_ppc.h"
>>> +#include "trace.h"
>>> 
>>> //#define PPC_DEBUG_IRQ
>>> #define PPC_DEBUG_TB
>>> @@ -796,6 +797,54 @@ static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
>>>     cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>>> }
>>> 
>>> +/*
>>> + * Calculate timebase on the destination side of migration
>> 
>>> + * We calculate new timebase offset as shown below:
>>> + * 1) Gtb2 = Gtb1 + max(tod2 - tod1, 0)
>>> + *    Gtb2 = tb2 + off2
>>> + *    Gtb1 = tb1 + off1
>>> + * 2) tb2 + off2 = tb1 + off1 + max(tod2 - tod1, 0)
>>> + * 3) off2 = tb1 - tb2 + off1 + max(tod2 - tod1, 0)
>>> + *
>>> + * where:
>>> + * Gtb2 - destination guest timebase
>>> + * tb2 - destination host timebase
>>> + * off2 - destination timebase offset
>>> + * tod2 - destination time of the day
>>> + * Gtb1 - source guest timebase
>>> + * tb1 - source host timebase
>>> + * off1 - source timebase offset
>>> + * tod1 - source time of the day
>>> + *
>>> + * The result we want is in @off2
>>> + *
>>> + * Two conditions must be met for @off2:
>>> + * 1) off2 must be multiple of 2^24 ticks as it will be set via TBU40 SPR
>>> + * 2) Gtb2 >= Gtb1
>> 
>> What about the TCG case, where there is not host timebase, only a a
>> host system clock?
> 
> 
> cpu_get_real_ticks() returns ticks, this is what the patch cares about.
> What is the difference between KVM and TCG here?
> 
> 
>>> + */
>>> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env)
>>> +{
>>> +    uint64_t tb2, tod2, off2;
>>> +    int ratio = tb_env->tb_freq / 1000000;
>>> +    struct timeval tv;
>>> +
>>> +    tb2 = cpu_get_real_ticks();
>>> +    gettimeofday(&tv, NULL);
>>> +    tod2 = tv.tv_sec * 1000000 + tv.tv_usec;
>>> +
>>> +    off2 = tb_env->timebase - tb2 + tb_env->tb_offset;
>>> +    if (tod2 > tb_env->time_of_the_day) {
>>> +        off2 += (tod2 - tb_env->time_of_the_day) * ratio;
>>> +    }
>>> +    off2 = ROUND_UP(off2, 1 << 24);
>>> +
>>> +    trace_ppc_tb_adjust(tb_env->tb_offset, off2,
>>> +                        (int64_t)off2 - tb_env->tb_offset);
>>> +
>>> +    tb_env->tb_offset = off2;
>>> +}
>>> +
>>> /* Set up (once) timebase frequency (in Hz) */
>>> clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>>> {
>>> diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
>>> index 132ab97..235871c 100644
>>> --- a/include/hw/ppc/ppc.h
>>> +++ b/include/hw/ppc/ppc.h
>>> @@ -32,6 +32,9 @@ struct ppc_tb_t {
>>>     uint64_t purr_start;
>>>     void *opaque;
>>>     uint32_t flags;
>>> +    /* Cached values for live migration purposes */
>>> +    uint64_t timebase;
>>> +    uint64_t time_of_the_day;
>> 
>> How is the time of day encoded here?
> 
> 
> Microseconds. I'll put a comment here, I just thought it is quite obvious
> as gettimeofday() returns microseconds.
> 
> 
>>> };
>>> 
>>> /* PPC Timers flags */
>>> @@ -46,6 +49,7 @@ struct ppc_tb_t {
>>>                                                */
>>> 
>>> uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t tb_offset);
>>> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env);
>>> clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq);
>>> /* Embedded PowerPC DCR management */
>>> typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn);
>>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
>>> index 7af9e3d..93df955 100644
>>> --- a/target-ppc/kvm.c
>>> +++ b/target-ppc/kvm.c
>>> @@ -35,6 +35,7 @@
>>> #include "hw/sysbus.h"
>>> #include "hw/ppc/spapr.h"
>>> #include "hw/ppc/spapr_vio.h"
>>> +#include "hw/ppc/ppc.h"
>>> #include "sysemu/watchdog.h"
>>> 
>>> //#define DEBUG_KVM
>>> @@ -761,6 +762,22 @@ static int kvm_put_vpa(CPUState *cs)
>>> }
>>> #endif /* TARGET_PPC64 */
>>> 
>>> +static int kvm_access_one_reg(CPUState *cs, bool set, __u64 id,
>>> void *addr)
>> 
>> I think it would be nicer to have seperate set_one_reg and get_one_reg
>> functions, rather than using a direction parameter.
> 
> 
> I am regularly getting requests from Alex to merge functions which look
> almost the same. Please do not make it worse :)

The reason I ask you to merge function is to reduce code duplication. The API should still look sane and I think what David suggests makes a lot of sense. In normal QEMU fashion, that translates to:

static int kvm_access_one_reg(CPUState *cs, uint64_t id, void *addr, bool set)
{
    ....
}

int kvm_set_one_reg(CPUState *cs, uint64_t id, void *addr)
{
    return kvm_access_one_reg(cs, id, addr, true);
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *addr)
{
    return kvm_access_one_reg(cs, id, addr, false);
}

Also, this code should live in generic KVM code that can be used by more than just the PPC target.


> 
> 
>>> +{
>>> +    struct kvm_one_reg reg = {
>>> +        .id = id,
>>> +        .addr = (uintptr_t)addr,
>>> +    };
>>> +    int ret = kvm_vcpu_ioctl(cs, set ? KVM_SET_ONE_REG : KVM_GET_ONE_REG, &reg);
>>> +
>>> +    if (ret) {
>>> +        DPRINTF("Unable to %s time base offset to KVM: %s\n",
>>> +                set ? "set" : "get", strerror(errno));
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> int kvm_arch_put_registers(CPUState *cs, int level)
>>> {
>>>     PowerPCCPU *cpu = POWERPC_CPU(cs);
>>> @@ -873,6 +890,9 @@ int kvm_arch_put_registers(CPUState *cs, int level)
>>>                 DPRINTF("Warning: Unable to set VPA information to KVM\n");
>>>             }
>>>         }
>>> +
>>> +        kvm_access_one_reg(cs, 1, KVM_REG_PPC_TB_OFFSET,
>>> +                           &env->tb_env->tb_offset);
>> 
>> Hrm.  It may be too late, but would it make more sense for qemu to
>> just calculate the "ideal" offset - not 24-bit aligned, and have the
>> kernel round that up to a value suitable for TBU40.  I'm thinking that
>> might be more robust if someone decides that POWER10 should have a
>> TBU50 or a TBU30 instead of TBU40.
> 
> 
> No, not late, the kernel patchset has not been noticed yet by anyone :)
> Just a little remark here - QEMU sets one value to the kernel as an offset
> but when it will be about to migrate again and read this offset from the
> kernel, it will be different (rounded up) from what QEMU set. Not a problem
> though.
> 
> 
>>> #endif /* TARGET_PPC64 */
>>>     }
>>> 
>>> @@ -1082,6 +1102,9 @@ int kvm_arch_get_registers(CPUState *cs)
>>>                 DPRINTF("Warning: Unable to get VPA information from KVM\n");
>>>             }
>>>         }
>>> +
>>> +        kvm_access_one_reg(cs, 0, KVM_REG_PPC_TB_OFFSET,
>>> +                           &env->tb_env->tb_offset);
>>> #endif
>>>     }
>>> 
>>> diff --git a/target-ppc/machine.c b/target-ppc/machine.c
>>> index 12e1512..d1ffc7f 100644
>>> --- a/target-ppc/machine.c
>>> +++ b/target-ppc/machine.c
>>> @@ -1,5 +1,6 @@
>>> #include "hw/hw.h"
>>> #include "hw/boards.h"
>>> +#include "hw/ppc/ppc.h"
>>> #include "sysemu/kvm.h"
>>> #include "helper_regs.h"
>>> 
>>> @@ -459,6 +460,45 @@ static const VMStateDescription vmstate_tlbmas = {
>>>     }
>>> };
>>> 
>>> +static void timebase_pre_save(void *opaque)
>>> +{
>>> +    ppc_tb_t *tb_env = opaque;
>>> +    struct timeval tv;
>>> +
>>> +    gettimeofday(&tv, NULL);
>>> +    tb_env->time_of_the_day = tv.tv_sec * 1000000 + tv.tv_usec;
>>> +    tb_env->timebase = cpu_get_real_ticks();
>>> +}
>>> +
>>> +static int timebase_post_load(void *opaque, int version_id)
>>> +{
>>> +    ppc_tb_t *tb_env = opaque;
>>> +
>>> +    if (!tb_env) {
>>> +        printf("NO TB!\n");
>>> +        return -1;
>>> +    }
>>> +    cpu_ppc_adjust_tb_offset(tb_env);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static const VMStateDescription vmstate_timebase = {
>>> +    .name = "cpu/timebase",
>>> +    .version_id = 1,
>>> +    .minimum_version_id = 1,
>>> +    .minimum_version_id_old = 1,
>>> +    .pre_save = timebase_pre_save,
>>> +    .post_load = timebase_post_load,
>>> +    .fields      = (VMStateField []) {
>>> +        VMSTATE_UINT64(timebase, ppc_tb_t),
>>> +        VMSTATE_INT64(tb_offset, ppc_tb_t),
>> 
>> tb_offset is inherently a local concept, since it depends on the host
>> timebase.  So how can it belong in the migration stream?
> 
> 
> I do not have pure guest timebase in QEMU and I need it on the destination.
> But I have host timebase + offset to calculate it. And tb_offset is already
> in ppc_tb_t. It looked logical to me to send the existing field and add
> only the missing part.

I still don't understand. You want the guest visible timebase in the migration stream, no?


Alex
Alexey Kardashevskiy - Sept. 5, 2013, 9:48 a.m.
On 09/05/2013 07:16 PM, Alexander Graf wrote:
> 
> On 05.09.2013, at 06:54, Alexey Kardashevskiy wrote:
> 
>> On 09/05/2013 02:30 PM, David Gibson wrote:
>>> On Tue, Sep 03, 2013 at 05:31:42PM +1000, Alexey Kardashevskiy wrote:
>>>> This allows guests to have a different timebase origin from the host.
>>>>
>>>> This is needed for migration, where a guest can migrate from one host
>>>> to another and the two hosts might have a different timebase origin.
>>>> However, the timebase seen by the guest must not go backwards, and
>>>> should go forwards only by a small amount corresponding to the time
>>>> taken for the migration.
>>>>
>>>> This is only supported for recent POWER hardware which has the TBU40
>>>> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
>>>> 970.
>>>>
>>>> This adds kvm_access_one_reg() to access a special register which is not
>>>> in env->spr.
>>>>
>>>> The feature must be present in the host kernel.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>
>>>> This is an RFC but not a final patch. Can break something but I just do not see what.
>>>>
>>>> ---
>>>> hw/ppc/ppc.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>> include/hw/ppc/ppc.h |  4 ++++
>>>> target-ppc/kvm.c     | 23 +++++++++++++++++++++++
>>>> target-ppc/machine.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
>>>> trace-events         |  3 +++
>>>> 5 files changed, 123 insertions(+)
>>>>
>>>> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
>>>> index 1e3cab3..7d08c9a 100644
>>>> --- a/hw/ppc/ppc.c
>>>> +++ b/hw/ppc/ppc.c
>>>> @@ -31,6 +31,7 @@
>>>> #include "hw/loader.h"
>>>> #include "sysemu/kvm.h"
>>>> #include "kvm_ppc.h"
>>>> +#include "trace.h"
>>>>
>>>> //#define PPC_DEBUG_IRQ
>>>> #define PPC_DEBUG_TB
>>>> @@ -796,6 +797,54 @@ static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
>>>>     cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>>>> }
>>>>
>>>> +/*
>>>> + * Calculate timebase on the destination side of migration
>>>
>>>> + * We calculate new timebase offset as shown below:
>>>> + * 1) Gtb2 = Gtb1 + max(tod2 - tod1, 0)
>>>> + *    Gtb2 = tb2 + off2
>>>> + *    Gtb1 = tb1 + off1
>>>> + * 2) tb2 + off2 = tb1 + off1 + max(tod2 - tod1, 0)
>>>> + * 3) off2 = tb1 - tb2 + off1 + max(tod2 - tod1, 0)
>>>> + *
>>>> + * where:
>>>> + * Gtb2 - destination guest timebase
>>>> + * tb2 - destination host timebase
>>>> + * off2 - destination timebase offset
>>>> + * tod2 - destination time of the day
>>>> + * Gtb1 - source guest timebase
>>>> + * tb1 - source host timebase
>>>> + * off1 - source timebase offset
>>>> + * tod1 - source time of the day
>>>> + *
>>>> + * The result we want is in @off2
>>>> + *
>>>> + * Two conditions must be met for @off2:
>>>> + * 1) off2 must be multiple of 2^24 ticks as it will be set via TBU40 SPR
>>>> + * 2) Gtb2 >= Gtb1
>>>
>>> What about the TCG case, where there is not host timebase, only a a
>>> host system clock?
>>
>>
>> cpu_get_real_ticks() returns ticks, this is what the patch cares about.
>> What is the difference between KVM and TCG here?
>>
>>
>>>> + */
>>>> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env)
>>>> +{
>>>> +    uint64_t tb2, tod2, off2;
>>>> +    int ratio = tb_env->tb_freq / 1000000;
>>>> +    struct timeval tv;
>>>> +
>>>> +    tb2 = cpu_get_real_ticks();
>>>> +    gettimeofday(&tv, NULL);
>>>> +    tod2 = tv.tv_sec * 1000000 + tv.tv_usec;
>>>> +
>>>> +    off2 = tb_env->timebase - tb2 + tb_env->tb_offset;
>>>> +    if (tod2 > tb_env->time_of_the_day) {
>>>> +        off2 += (tod2 - tb_env->time_of_the_day) * ratio;
>>>> +    }
>>>> +    off2 = ROUND_UP(off2, 1 << 24);
>>>> +
>>>> +    trace_ppc_tb_adjust(tb_env->tb_offset, off2,
>>>> +                        (int64_t)off2 - tb_env->tb_offset);
>>>> +
>>>> +    tb_env->tb_offset = off2;
>>>> +}
>>>> +
>>>> /* Set up (once) timebase frequency (in Hz) */
>>>> clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>>>> {
>>>> diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
>>>> index 132ab97..235871c 100644
>>>> --- a/include/hw/ppc/ppc.h
>>>> +++ b/include/hw/ppc/ppc.h
>>>> @@ -32,6 +32,9 @@ struct ppc_tb_t {
>>>>     uint64_t purr_start;
>>>>     void *opaque;
>>>>     uint32_t flags;
>>>> +    /* Cached values for live migration purposes */
>>>> +    uint64_t timebase;
>>>> +    uint64_t time_of_the_day;
>>>
>>> How is the time of day encoded here?
>>
>>
>> Microseconds. I'll put a comment here, I just thought it is quite obvious
>> as gettimeofday() returns microseconds.
>>
>>
>>>> };
>>>>
>>>> /* PPC Timers flags */
>>>> @@ -46,6 +49,7 @@ struct ppc_tb_t {
>>>>                                                */
>>>>
>>>> uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t tb_offset);
>>>> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env);
>>>> clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq);
>>>> /* Embedded PowerPC DCR management */
>>>> typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn);
>>>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
>>>> index 7af9e3d..93df955 100644
>>>> --- a/target-ppc/kvm.c
>>>> +++ b/target-ppc/kvm.c
>>>> @@ -35,6 +35,7 @@
>>>> #include "hw/sysbus.h"
>>>> #include "hw/ppc/spapr.h"
>>>> #include "hw/ppc/spapr_vio.h"
>>>> +#include "hw/ppc/ppc.h"
>>>> #include "sysemu/watchdog.h"
>>>>
>>>> //#define DEBUG_KVM
>>>> @@ -761,6 +762,22 @@ static int kvm_put_vpa(CPUState *cs)
>>>> }
>>>> #endif /* TARGET_PPC64 */
>>>>
>>>> +static int kvm_access_one_reg(CPUState *cs, bool set, __u64 id,
>>>> void *addr)
>>>
>>> I think it would be nicer to have seperate set_one_reg and get_one_reg
>>> functions, rather than using a direction parameter.
>>
>>
>> I am regularly getting requests from Alex to merge functions which look
>> almost the same. Please do not make it worse :)
> 
> The reason I ask you to merge function is to reduce code duplication. The API should still look sane and I think what David suggests makes a lot of sense. In normal QEMU fashion, that translates to:
> 
> static int kvm_access_one_reg(CPUState *cs, uint64_t id, void *addr, bool set)
> {
>     ....
> }
> 
> int kvm_set_one_reg(CPUState *cs, uint64_t id, void *addr)
> {
>     return kvm_access_one_reg(cs, id, addr, true);
> }
> 
> int kvm_get_one_reg(CPUState *cs, uint64_t id, void *addr)
> {
>     return kvm_access_one_reg(cs, id, addr, false);
> }
> 
> Also, this code should live in generic KVM code that can be used by more than just the PPC target.
> 
> 
>>
>>
>>>> +{
>>>> +    struct kvm_one_reg reg = {
>>>> +        .id = id,
>>>> +        .addr = (uintptr_t)addr,
>>>> +    };
>>>> +    int ret = kvm_vcpu_ioctl(cs, set ? KVM_SET_ONE_REG : KVM_GET_ONE_REG, &reg);
>>>> +
>>>> +    if (ret) {
>>>> +        DPRINTF("Unable to %s time base offset to KVM: %s\n",
>>>> +                set ? "set" : "get", strerror(errno));
>>>> +    }
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> int kvm_arch_put_registers(CPUState *cs, int level)
>>>> {
>>>>     PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>> @@ -873,6 +890,9 @@ int kvm_arch_put_registers(CPUState *cs, int level)
>>>>                 DPRINTF("Warning: Unable to set VPA information to KVM\n");
>>>>             }
>>>>         }
>>>> +
>>>> +        kvm_access_one_reg(cs, 1, KVM_REG_PPC_TB_OFFSET,
>>>> +                           &env->tb_env->tb_offset);
>>>
>>> Hrm.  It may be too late, but would it make more sense for qemu to
>>> just calculate the "ideal" offset - not 24-bit aligned, and have the
>>> kernel round that up to a value suitable for TBU40.  I'm thinking that
>>> might be more robust if someone decides that POWER10 should have a
>>> TBU50 or a TBU30 instead of TBU40.
>>
>>
>> No, not late, the kernel patchset has not been noticed yet by anyone :)
>> Just a little remark here - QEMU sets one value to the kernel as an offset
>> but when it will be about to migrate again and read this offset from the
>> kernel, it will be different (rounded up) from what QEMU set. Not a problem
>> though.
>>
>>
>>>> #endif /* TARGET_PPC64 */
>>>>     }
>>>>
>>>> @@ -1082,6 +1102,9 @@ int kvm_arch_get_registers(CPUState *cs)
>>>>                 DPRINTF("Warning: Unable to get VPA information from KVM\n");
>>>>             }
>>>>         }
>>>> +
>>>> +        kvm_access_one_reg(cs, 0, KVM_REG_PPC_TB_OFFSET,
>>>> +                           &env->tb_env->tb_offset);
>>>> #endif
>>>>     }
>>>>
>>>> diff --git a/target-ppc/machine.c b/target-ppc/machine.c
>>>> index 12e1512..d1ffc7f 100644
>>>> --- a/target-ppc/machine.c
>>>> +++ b/target-ppc/machine.c
>>>> @@ -1,5 +1,6 @@
>>>> #include "hw/hw.h"
>>>> #include "hw/boards.h"
>>>> +#include "hw/ppc/ppc.h"
>>>> #include "sysemu/kvm.h"
>>>> #include "helper_regs.h"
>>>>
>>>> @@ -459,6 +460,45 @@ static const VMStateDescription vmstate_tlbmas = {
>>>>     }
>>>> };
>>>>
>>>> +static void timebase_pre_save(void *opaque)
>>>> +{
>>>> +    ppc_tb_t *tb_env = opaque;
>>>> +    struct timeval tv;
>>>> +
>>>> +    gettimeofday(&tv, NULL);
>>>> +    tb_env->time_of_the_day = tv.tv_sec * 1000000 + tv.tv_usec;
>>>> +    tb_env->timebase = cpu_get_real_ticks();
>>>> +}
>>>> +
>>>> +static int timebase_post_load(void *opaque, int version_id)
>>>> +{
>>>> +    ppc_tb_t *tb_env = opaque;
>>>> +
>>>> +    if (!tb_env) {
>>>> +        printf("NO TB!\n");
>>>> +        return -1;
>>>> +    }
>>>> +    cpu_ppc_adjust_tb_offset(tb_env);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static const VMStateDescription vmstate_timebase = {
>>>> +    .name = "cpu/timebase",
>>>> +    .version_id = 1,
>>>> +    .minimum_version_id = 1,
>>>> +    .minimum_version_id_old = 1,
>>>> +    .pre_save = timebase_pre_save,
>>>> +    .post_load = timebase_post_load,
>>>> +    .fields      = (VMStateField []) {
>>>> +        VMSTATE_UINT64(timebase, ppc_tb_t),
>>>> +        VMSTATE_INT64(tb_offset, ppc_tb_t),
>>>
>>> tb_offset is inherently a local concept, since it depends on the host
>>> timebase.  So how can it belong in the migration stream?
>>
>>
>> I do not have pure guest timebase in QEMU and I need it on the destination.
>> But I have host timebase + offset to calculate it. And tb_offset is already
>> in ppc_tb_t. It looked logical to me to send the existing field and add
>> only the missing part.
> 
> I still don't understand. You want the guest visible timebase in the migration stream, no?


Yes. I do not really understand the problem here (and I am not playing
dump). Do you suggest sending just the guest timebase and do not send the
host timebase and the offset (one number instead of two)? I can do that,
makes sense, no problem, thanks for the idea.
Alexander Graf - Sept. 5, 2013, 9:58 a.m.
On 05.09.2013, at 11:48, Alexey Kardashevskiy wrote:

> On 09/05/2013 07:16 PM, Alexander Graf wrote:
>> 
>> On 05.09.2013, at 06:54, Alexey Kardashevskiy wrote:
>> 
>>> On 09/05/2013 02:30 PM, David Gibson wrote:

[...]

>>> 
>>>>> #endif /* TARGET_PPC64 */
>>>>>    }
>>>>> 
>>>>> @@ -1082,6 +1102,9 @@ int kvm_arch_get_registers(CPUState *cs)
>>>>>                DPRINTF("Warning: Unable to get VPA information from KVM\n");
>>>>>            }
>>>>>        }
>>>>> +
>>>>> +        kvm_access_one_reg(cs, 0, KVM_REG_PPC_TB_OFFSET,
>>>>> +                           &env->tb_env->tb_offset);
>>>>> #endif
>>>>>    }
>>>>> 
>>>>> diff --git a/target-ppc/machine.c b/target-ppc/machine.c
>>>>> index 12e1512..d1ffc7f 100644
>>>>> --- a/target-ppc/machine.c
>>>>> +++ b/target-ppc/machine.c
>>>>> @@ -1,5 +1,6 @@
>>>>> #include "hw/hw.h"
>>>>> #include "hw/boards.h"
>>>>> +#include "hw/ppc/ppc.h"
>>>>> #include "sysemu/kvm.h"
>>>>> #include "helper_regs.h"
>>>>> 
>>>>> @@ -459,6 +460,45 @@ static const VMStateDescription vmstate_tlbmas = {
>>>>>    }
>>>>> };
>>>>> 
>>>>> +static void timebase_pre_save(void *opaque)
>>>>> +{
>>>>> +    ppc_tb_t *tb_env = opaque;
>>>>> +    struct timeval tv;
>>>>> +
>>>>> +    gettimeofday(&tv, NULL);
>>>>> +    tb_env->time_of_the_day = tv.tv_sec * 1000000 + tv.tv_usec;
>>>>> +    tb_env->timebase = cpu_get_real_ticks();
>>>>> +}
>>>>> +
>>>>> +static int timebase_post_load(void *opaque, int version_id)
>>>>> +{
>>>>> +    ppc_tb_t *tb_env = opaque;
>>>>> +
>>>>> +    if (!tb_env) {
>>>>> +        printf("NO TB!\n");
>>>>> +        return -1;
>>>>> +    }
>>>>> +    cpu_ppc_adjust_tb_offset(tb_env);
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static const VMStateDescription vmstate_timebase = {
>>>>> +    .name = "cpu/timebase",
>>>>> +    .version_id = 1,
>>>>> +    .minimum_version_id = 1,
>>>>> +    .minimum_version_id_old = 1,
>>>>> +    .pre_save = timebase_pre_save,
>>>>> +    .post_load = timebase_post_load,
>>>>> +    .fields      = (VMStateField []) {
>>>>> +        VMSTATE_UINT64(timebase, ppc_tb_t),
>>>>> +        VMSTATE_INT64(tb_offset, ppc_tb_t),
>>>> 
>>>> tb_offset is inherently a local concept, since it depends on the host
>>>> timebase.  So how can it belong in the migration stream?
>>> 
>>> 
>>> I do not have pure guest timebase in QEMU and I need it on the destination.
>>> But I have host timebase + offset to calculate it. And tb_offset is already
>>> in ppc_tb_t. It looked logical to me to send the existing field and add
>>> only the missing part.
>> 
>> I still don't understand. You want the guest visible timebase in the migration stream, no?
> 
> 
> Yes. I do not really understand the problem here (and I am not playing
> dump). Do you suggest sending just the guest timebase and do not send the
> host timebase and the offset (one number instead of two)? I can do that,
> makes sense, no problem, thanks for the idea.

Yup, pretty much :). The receiving end should have no business in knowing how far off the guest and the host timebase were skewed on the sending end :).


Alex
Benjamin Herrenschmidt - Sept. 5, 2013, 11:42 a.m.
On Thu, 2013-09-05 at 19:48 +1000, Alexey Kardashevskiy wrote:
> >> I do not have pure guest timebase in QEMU and I need it on the destination.
> >> But I have host timebase + offset to calculate it. And tb_offset is already
> >> in ppc_tb_t. It looked logical to me to send the existing field and add
> >> only the missing part.
> > 
> > I still don't understand. You want the guest visible timebase in the migration stream, no?
> 
> 
> Yes. I do not really understand the problem here (and I am not playing
> dump). Do you suggest sending just the guest timebase and do not send the
> host timebase and the offset (one number instead of two)? I can do that,
> makes sense, no problem, thanks for the idea.

No, we want to send the guest TB and the corresponding "time of day" so that
the target can adjust the TB based on how long the migration took.

Cheers,
Ben.
Benjamin Herrenschmidt - Sept. 5, 2013, 11:44 a.m.
On Thu, 2013-09-05 at 11:58 +0200, Alexander Graf wrote:

> > Yes. I do not really understand the problem here (and I am not
> playing
> > dump). Do you suggest sending just the guest timebase and do not
> send the
> > host timebase and the offset (one number instead of two)? I can do
> that,
> > makes sense, no problem, thanks for the idea.
> 
> Yup, pretty much :). The receiving end should have no business in
> knowing how far off the guest and the host timebase were skewed on the
> sending end :).

Well, yes and no ... we'd like to account for the migration latency on
the timebase or the guest view of real time will get skewed since it
uses the TB to maintain it's clock.

So assuming both hosts are reasonably synchronized with NTP, we want a
correlation guest TB / TOD in order to properly make the adjustment on
the target.

This may not be what Alexey implemented but I think that's what Paulus
and I asked for :-)

Cheers,
Ben.
Alexey Kardashevskiy - Sept. 5, 2013, 12:09 p.m.
On 09/05/2013 09:42 PM, Benjamin Herrenschmidt wrote:
> On Thu, 2013-09-05 at 19:48 +1000, Alexey Kardashevskiy wrote:
>>>> I do not have pure guest timebase in QEMU and I need it on the destination.
>>>> But I have host timebase + offset to calculate it. And tb_offset is already
>>>> in ppc_tb_t. It looked logical to me to send the existing field and add
>>>> only the missing part.
>>>
>>> I still don't understand. You want the guest visible timebase in the migration stream, no?
>>
>>
>> Yes. I do not really understand the problem here (and I am not playing
>> dump). Do you suggest sending just the guest timebase and do not send the
>> host timebase and the offset (one number instead of two)? I can do that,
>> makes sense, no problem, thanks for the idea.
> 
> No, we want to send the guest TB and the corresponding "time of day" so that
> the target can adjust the TB based on how long the migration took.

Nobody is discussing time_of_the_day, it is there and transmitted. The only
question is whether to send host_timebase and tb_offset (2 values) or just
guest timebase (1 value).
Alexander Graf - Sept. 5, 2013, 12:37 p.m.
On 05.09.2013, at 13:44, Benjamin Herrenschmidt wrote:

> On Thu, 2013-09-05 at 11:58 +0200, Alexander Graf wrote:
> 
>>> Yes. I do not really understand the problem here (and I am not
>> playing
>>> dump). Do you suggest sending just the guest timebase and do not
>> send the
>>> host timebase and the offset (one number instead of two)? I can do
>> that,
>>> makes sense, no problem, thanks for the idea.
>> 
>> Yup, pretty much :). The receiving end should have no business in
>> knowing how far off the guest and the host timebase were skewed on the
>> sending end :).
> 
> Well, yes and no ... we'd like to account for the migration latency on
> the timebase or the guest view of real time will get skewed since it
> uses the TB to maintain it's clock.
> 
> So assuming both hosts are reasonably synchronized with NTP, we want a
> correlation guest TB / TOD in order to properly make the adjustment on
> the target.

Hrm, I think I'm starting to understand what this is about. So what we want is

  - timebase in guest
  - timebase frequency in guest
  - wall clock time in host

That way the receiving end can then take the timebase and add (new_timebase - old_timebase) * tb_freq to the guest's time base.

Which gets me to the next question. Can we modify the tb frequency in guests?


Alex
Benjamin Herrenschmidt - Sept. 5, 2013, 1:36 p.m.
On Thu, 2013-09-05 at 14:37 +0200, Alexander Graf wrote:

> Hrm, I think I'm starting to understand what this is about. So what we want is
> 
>   - timebase in guest
>   - timebase frequency in guest
>   - wall clock time in host
> 
> That way the receiving end can then take the timebase and add (new_timebase - old_timebase) * tb_freq to the guest's time base.
> 
> Which gets me to the next question. Can we modify the tb frequency in guests?

No. It's architected at 512Mhz however since P7 I think. Not sure how we
did before, it's possible that P6 was the same (at least it's sourced
from more/less the same chip TOD facility).

Cheers,
Ben.
Alexander Graf - Sept. 5, 2013, 1:39 p.m.
On 05.09.2013, at 15:36, Benjamin Herrenschmidt wrote:

> On Thu, 2013-09-05 at 14:37 +0200, Alexander Graf wrote:
> 
>> Hrm, I think I'm starting to understand what this is about. So what we want is
>> 
>>  - timebase in guest
>>  - timebase frequency in guest
>>  - wall clock time in host
>> 
>> That way the receiving end can then take the timebase and add (new_timebase - old_timebase) * tb_freq to the guest's time base.
>> 
>> Which gets me to the next question. Can we modify the tb frequency in guests?
> 
> No. It's architected at 512Mhz however since P7 I think. Not sure how we
> did before, it's possible that P6 was the same (at least it's sourced
> from more/less the same chip TOD facility).

I think we should transmit the tb frequency as well to at least allow us to adjust if a later chip derives here.

But yes, without frequency adjustment I see where you're coming from. We still only need the timebase the guest sees, not the offset. But we also need the host wall clock to allow for adjustments :).


Alex
Andreas Färber - Sept. 5, 2013, 2:14 p.m.
Am 05.09.2013 15:39, schrieb Alexander Graf:
> 
> On 05.09.2013, at 15:36, Benjamin Herrenschmidt wrote:
> 
>> On Thu, 2013-09-05 at 14:37 +0200, Alexander Graf wrote:
>>
>>> Hrm, I think I'm starting to understand what this is about. So what we want is
>>>
>>>  - timebase in guest
>>>  - timebase frequency in guest
>>>  - wall clock time in host
>>>
>>> That way the receiving end can then take the timebase and add (new_timebase - old_timebase) * tb_freq to the guest's time base.
>>>
>>> Which gets me to the next question. Can we modify the tb frequency in guests?
>>
>> No. It's architected at 512Mhz however since P7 I think. Not sure how we
>> did before, it's possible that P6 was the same (at least it's sourced
>> from more/less the same chip TOD facility).
> 
> I think we should transmit the tb frequency as well to at least allow us to adjust if a later chip derives here.

Are you thinking of POWER8 having a different frequency than POWER8 in
compat mode? Because migration from one -cpu to another is not supported
elsewhere.

Even if we want to migrate from one POWER7 revision to another, we
should let the destination use the revision of the source (guest ABI!),
via property if need be. Anything else will lead to confusion as to what
is supported and what is not. That -cpu host is the default for
convenience shouldn't relieve admins/libvirt to think about sensible
setups like they have to on x86.

Andreas

> 
> But yes, without frequency adjustment I see where you're coming from. We still only need the timebase the guest sees, not the offset. But we also need the host wall clock to allow for adjustments :).
> 
> 
> Alex
>
Benjamin Herrenschmidt - Sept. 5, 2013, 2:26 p.m.
On Thu, 2013-09-05 at 16:14 +0200, Andreas Färber wrote:

> Are you thinking of POWER8 having a different frequency than POWER8 in
> compat mode? Because migration from one -cpu to another is not supported
> elsewhere.
> 
> Even if we want to migrate from one POWER7 revision to another, we
> should let the destination use the revision of the source (guest ABI!),
> via property if need be. Anything else will lead to confusion as to what
> is supported and what is not. That -cpu host is the default for
> convenience shouldn't relieve admins/libvirt to think about sensible
> setups like they have to on x86.

Besides POWER8 uses 512Mhz too :-) It's been architected so it's
unlikely to change from now on.

Cheers,
Ben.

> Andreas
> 
> > 
> > But yes, without frequency adjustment I see where you're coming from. We still only need the timebase the guest sees, not the offset. But we also need the host wall clock to allow for adjustments :).
> > 
> > 
> > Alex
> > 
> 
>
Alexander Graf - Sept. 5, 2013, 3:11 p.m.
On 05.09.2013, at 16:26, Benjamin Herrenschmidt wrote:

> On Thu, 2013-09-05 at 16:14 +0200, Andreas Färber wrote:
> 
>> Are you thinking of POWER8 having a different frequency than POWER8 in
>> compat mode? Because migration from one -cpu to another is not supported
>> elsewhere.
>> 
>> Even if we want to migrate from one POWER7 revision to another, we
>> should let the destination use the revision of the source (guest ABI!),
>> via property if need be. Anything else will lead to confusion as to what
>> is supported and what is not. That -cpu host is the default for
>> convenience shouldn't relieve admins/libvirt to think about sensible
>> setups like they have to on x86.
> 
> Besides POWER8 uses 512Mhz too :-) It's been architected so it's
> unlikely to change from now on.

Ok, ditch the tb frequency then. We can always default to 512Mhz when it's absent.


Alex
David Gibson - Sept. 6, 2013, 3 a.m.
On Thu, Sep 05, 2013 at 02:54:46PM +1000, Alexey Kardashevskiy wrote:
> On 09/05/2013 02:30 PM, David Gibson wrote:
> > On Tue, Sep 03, 2013 at 05:31:42PM +1000, Alexey Kardashevskiy wrote:
> >> This allows guests to have a different timebase origin from the host.
> >>
> >> This is needed for migration, where a guest can migrate from one host
> >> to another and the two hosts might have a different timebase origin.
> >> However, the timebase seen by the guest must not go backwards, and
> >> should go forwards only by a small amount corresponding to the time
> >> taken for the migration.
> >>
> >> This is only supported for recent POWER hardware which has the TBU40
> >> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
> >> 970.
> >>
> >> This adds kvm_access_one_reg() to access a special register which is not
> >> in env->spr.
> >>
> >> The feature must be present in the host kernel.
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>
> >> This is an RFC but not a final patch. Can break something but I just do not see what.
> >>
> >> ---
> >>  hw/ppc/ppc.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>  include/hw/ppc/ppc.h |  4 ++++
> >>  target-ppc/kvm.c     | 23 +++++++++++++++++++++++
> >>  target-ppc/machine.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
> >>  trace-events         |  3 +++
> >>  5 files changed, 123 insertions(+)
> >>
> >> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> >> index 1e3cab3..7d08c9a 100644
> >> --- a/hw/ppc/ppc.c
> >> +++ b/hw/ppc/ppc.c
> >> @@ -31,6 +31,7 @@
> >>  #include "hw/loader.h"
> >>  #include "sysemu/kvm.h"
> >>  #include "kvm_ppc.h"
> >> +#include "trace.h"
> >>  
> >>  //#define PPC_DEBUG_IRQ
> >>  #define PPC_DEBUG_TB
> >> @@ -796,6 +797,54 @@ static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
> >>      cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
> >>  }
> >>  
> >> +/*
> >> + * Calculate timebase on the destination side of migration
> > 
> >> + * We calculate new timebase offset as shown below:
> >> + * 1) Gtb2 = Gtb1 + max(tod2 - tod1, 0)
> >> + *    Gtb2 = tb2 + off2
> >> + *    Gtb1 = tb1 + off1
> >> + * 2) tb2 + off2 = tb1 + off1 + max(tod2 - tod1, 0)
> >> + * 3) off2 = tb1 - tb2 + off1 + max(tod2 - tod1, 0)
> >> + *
> >> + * where:
> >> + * Gtb2 - destination guest timebase
> >> + * tb2 - destination host timebase
> >> + * off2 - destination timebase offset
> >> + * tod2 - destination time of the day
> >> + * Gtb1 - source guest timebase
> >> + * tb1 - source host timebase
> >> + * off1 - source timebase offset
> >> + * tod1 - source time of the day
> >> + *
> >> + * The result we want is in @off2
> >> + *
> >> + * Two conditions must be met for @off2:
> >> + * 1) off2 must be multiple of 2^24 ticks as it will be set via TBU40 SPR
> >> + * 2) Gtb2 >= Gtb1
> > 
> > What about the TCG case, where there is not host timebase, only a a
> > host system clock?
> 
> 
> cpu_get_real_ticks() returns ticks, this is what the patch cares about.
> What is the difference between KVM and TCG here?

Hrm.  It may just be a question of variable naming and comments.  But
my point is that "host tb" is not a concept that always exists, so it
shouldn't appear in anything external like a migration stream.

> >> + */
> >> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env)
> >> +{
> >> +    uint64_t tb2, tod2, off2;
> >> +    int ratio = tb_env->tb_freq / 1000000;
> >> +    struct timeval tv;
> >> +
> >> +    tb2 = cpu_get_real_ticks();
> >> +    gettimeofday(&tv, NULL);
> >> +    tod2 = tv.tv_sec * 1000000 + tv.tv_usec;
> >> +
> >> +    off2 = tb_env->timebase - tb2 + tb_env->tb_offset;
> >> +    if (tod2 > tb_env->time_of_the_day) {
> >> +        off2 += (tod2 - tb_env->time_of_the_day) * ratio;
> >> +    }
> >> +    off2 = ROUND_UP(off2, 1 << 24);
> >> +
> >> +    trace_ppc_tb_adjust(tb_env->tb_offset, off2,
> >> +                        (int64_t)off2 - tb_env->tb_offset);
> >> +
> >> +    tb_env->tb_offset = off2;
> >> +}
> >> +
> >>  /* Set up (once) timebase frequency (in Hz) */
> >>  clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
> >>  {
> >> diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
> >> index 132ab97..235871c 100644
> >> --- a/include/hw/ppc/ppc.h
> >> +++ b/include/hw/ppc/ppc.h
> >> @@ -32,6 +32,9 @@ struct ppc_tb_t {
> >>      uint64_t purr_start;
> >>      void *opaque;
> >>      uint32_t flags;
> >> +    /* Cached values for live migration purposes */
> >> +    uint64_t timebase;
> >> +    uint64_t time_of_the_day;
> > 
> > How is the time of day encoded here?
> 
> 
> Microseconds. I'll put a comment here, I just thought it is quite obvious
> as gettimeofday() returns microseconds.
> 
> 
> >>  };
> >>  
> >>  /* PPC Timers flags */
> >> @@ -46,6 +49,7 @@ struct ppc_tb_t {
> >>                                                 */
> >>  
> >>  uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t tb_offset);
> >> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env);
> >>  clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq);
> >>  /* Embedded PowerPC DCR management */
> >>  typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn);
> >> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> >> index 7af9e3d..93df955 100644
> >> --- a/target-ppc/kvm.c
> >> +++ b/target-ppc/kvm.c
> >> @@ -35,6 +35,7 @@
> >>  #include "hw/sysbus.h"
> >>  #include "hw/ppc/spapr.h"
> >>  #include "hw/ppc/spapr_vio.h"
> >> +#include "hw/ppc/ppc.h"
> >>  #include "sysemu/watchdog.h"
> >>  
> >>  //#define DEBUG_KVM
> >> @@ -761,6 +762,22 @@ static int kvm_put_vpa(CPUState *cs)
> >>  }
> >>  #endif /* TARGET_PPC64 */
> >>  
> >> +static int kvm_access_one_reg(CPUState *cs, bool set, __u64 id,
> >> void *addr)
> > 
> > I think it would be nicer to have seperate set_one_reg and get_one_reg
> > functions, rather than using a direction parameter.
> 
> 
> I am regularly getting requests from Alex to merge functions which look
> almost the same. Please do not make it worse :)
> 
> 
> >> +{
> >> +    struct kvm_one_reg reg = {
> >> +        .id = id,
> >> +        .addr = (uintptr_t)addr,
> >> +    };
> >> +    int ret = kvm_vcpu_ioctl(cs, set ? KVM_SET_ONE_REG : KVM_GET_ONE_REG, &reg);
> >> +
> >> +    if (ret) {
> >> +        DPRINTF("Unable to %s time base offset to KVM: %s\n",
> >> +                set ? "set" : "get", strerror(errno));
> >> +    }
> >> +
> >> +    return ret;
> >> +}
> >> +
> >>  int kvm_arch_put_registers(CPUState *cs, int level)
> >>  {
> >>      PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> @@ -873,6 +890,9 @@ int kvm_arch_put_registers(CPUState *cs, int level)
> >>                  DPRINTF("Warning: Unable to set VPA information to KVM\n");
> >>              }
> >>          }
> >> +
> >> +        kvm_access_one_reg(cs, 1, KVM_REG_PPC_TB_OFFSET,
> >> +                           &env->tb_env->tb_offset);
> > 
> > Hrm.  It may be too late, but would it make more sense for qemu to
> > just calculate the "ideal" offset - not 24-bit aligned, and have the
> > kernel round that up to a value suitable for TBU40.  I'm thinking that
> > might be more robust if someone decides that POWER10 should have a
> > TBU50 or a TBU30 instead of TBU40.
> 
> 
> No, not late, the kernel patchset has not been noticed yet by anyone :)
> Just a little remark here - QEMU sets one value to the kernel as an offset
> but when it will be about to migrate again and read this offset from the
> kernel, it will be different (rounded up) from what QEMU set. Not a problem
> though.
> 
> 
> >>  #endif /* TARGET_PPC64 */
> >>      }
> >>  
> >> @@ -1082,6 +1102,9 @@ int kvm_arch_get_registers(CPUState *cs)
> >>                  DPRINTF("Warning: Unable to get VPA information from KVM\n");
> >>              }
> >>          }
> >> +
> >> +        kvm_access_one_reg(cs, 0, KVM_REG_PPC_TB_OFFSET,
> >> +                           &env->tb_env->tb_offset);
> >>  #endif
> >>      }
> >>  
> >> diff --git a/target-ppc/machine.c b/target-ppc/machine.c
> >> index 12e1512..d1ffc7f 100644
> >> --- a/target-ppc/machine.c
> >> +++ b/target-ppc/machine.c
> >> @@ -1,5 +1,6 @@
> >>  #include "hw/hw.h"
> >>  #include "hw/boards.h"
> >> +#include "hw/ppc/ppc.h"
> >>  #include "sysemu/kvm.h"
> >>  #include "helper_regs.h"
> >>  
> >> @@ -459,6 +460,45 @@ static const VMStateDescription vmstate_tlbmas = {
> >>      }
> >>  };
> >>  
> >> +static void timebase_pre_save(void *opaque)
> >> +{
> >> +    ppc_tb_t *tb_env = opaque;
> >> +    struct timeval tv;
> >> +
> >> +    gettimeofday(&tv, NULL);
> >> +    tb_env->time_of_the_day = tv.tv_sec * 1000000 + tv.tv_usec;
> >> +    tb_env->timebase = cpu_get_real_ticks();
> >> +}
> >> +
> >> +static int timebase_post_load(void *opaque, int version_id)
> >> +{
> >> +    ppc_tb_t *tb_env = opaque;
> >> +
> >> +    if (!tb_env) {
> >> +        printf("NO TB!\n");
> >> +        return -1;
> >> +    }
> >> +    cpu_ppc_adjust_tb_offset(tb_env);
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static const VMStateDescription vmstate_timebase = {
> >> +    .name = "cpu/timebase",
> >> +    .version_id = 1,
> >> +    .minimum_version_id = 1,
> >> +    .minimum_version_id_old = 1,
> >> +    .pre_save = timebase_pre_save,
> >> +    .post_load = timebase_post_load,
> >> +    .fields      = (VMStateField []) {
> >> +        VMSTATE_UINT64(timebase, ppc_tb_t),
> >> +        VMSTATE_INT64(tb_offset, ppc_tb_t),
> > 
> > tb_offset is inherently a local concept, since it depends on the host
> > timebase.  So how can it belong in the migration stream?
> 
> 
> I do not have pure guest timebase in QEMU and I need it on the destination.
> But I have host timebase + offset to calculate it. And tb_offset is already
> in ppc_tb_t. It looked logical to me to send the existing field and add
> only the missing part.

Yeah, as others have said though, guest timebase is the right thing to
send, though, since the host timebase is not relevant to the other end
of the migration  and won't even exist in the case of TCG.

> >> +        VMSTATE_UINT64(time_of_the_day, ppc_tb_t),
> >> +        VMSTATE_UINT32_EQUAL(tb_freq, ppc_tb_t),
> >> +        VMSTATE_END_OF_LIST()
> >> +    },
> >> +};
> >> +
> >>  const VMStateDescription vmstate_ppc_cpu = {
> >>      .name = "cpu",
> >>      .version_id = 5,
> >> @@ -498,6 +538,10 @@ const VMStateDescription vmstate_ppc_cpu = {
> >>          VMSTATE_UINT64_EQUAL(env.insns_flags, PowerPCCPU),
> >>          VMSTATE_UINT64_EQUAL(env.insns_flags2, PowerPCCPU),
> >>          VMSTATE_UINT32_EQUAL(env.nb_BATs, PowerPCCPU),
> >> +
> >> +        /* Time offset */
> >> +        VMSTATE_STRUCT_POINTER(env.tb_env, PowerPCCPU,
> >> +                               vmstate_timebase, ppc_tb_t *),
> >>          VMSTATE_END_OF_LIST()
> >>      },
> >>      .subsections = (VMStateSubsection []) {
> >> diff --git a/trace-events b/trace-events
> >> index 935b953..24cf4d2 100644
> >> --- a/trace-events
> >> +++ b/trace-events
> >> @@ -1141,6 +1141,9 @@ spapr_iommu_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "li
> >>  spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
> >>  spapr_iommu_new_table(uint64_t liobn, void *tcet, void *table, int fd) "liobn=%"PRIx64" tcet=%p table=%p fd=%d"
> >>  
> >> +# hw/ppc/ppc.c
> >> +ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff) "adjusted from %"PRIx64" to %"PRIx64", diff %"PRId64
> >> +
> >>  # util/hbitmap.c
> >>  hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
> >>  hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
> > 
> 
>
Alexey Kardashevskiy - Sept. 9, 2013, 2:40 a.m.
On 09/06/2013 01:11 AM, Alexander Graf wrote:
> 
> On 05.09.2013, at 16:26, Benjamin Herrenschmidt wrote:
> 
>> On Thu, 2013-09-05 at 16:14 +0200, Andreas Färber wrote:
>>
>>> Are you thinking of POWER8 having a different frequency than POWER8 in
>>> compat mode? Because migration from one -cpu to another is not supported
>>> elsewhere.
>>>
>>> Even if we want to migrate from one POWER7 revision to another, we
>>> should let the destination use the revision of the source (guest ABI!),
>>> via property if need be. Anything else will lead to confusion as to what
>>> is supported and what is not. That -cpu host is the default for
>>> convenience shouldn't relieve admins/libvirt to think about sensible
>>> setups like they have to on x86.
>>
>> Besides POWER8 uses 512Mhz too :-) It's been architected so it's
>> unlikely to change from now on.
> 
> Ok, ditch the tb frequency then. We can always default to 512Mhz when it's absent.


QEMU uses what kvmppc_get_tbfreq() returns. And ppc_tb_t carries this
value. It does not use 512MHz automatically but migration should always
assume 512MHz :-/

If we remove it, I would like to add assert(tb_freq!=512MHz) into
timebase_pre_save() and timebase_post_load(), is that ok?
Alexander Graf - Sept. 9, 2013, 5:50 a.m.
Am 09.09.2013 um 04:40 schrieb Alexey Kardashevskiy <aik@ozlabs.ru>:

> On 09/06/2013 01:11 AM, Alexander Graf wrote:
>> 
>> On 05.09.2013, at 16:26, Benjamin Herrenschmidt wrote:
>> 
>>> On Thu, 2013-09-05 at 16:14 +0200, Andreas Färber wrote:
>>> 
>>>> Are you thinking of POWER8 having a different frequency than POWER8 in
>>>> compat mode? Because migration from one -cpu to another is not supported
>>>> elsewhere.
>>>> 
>>>> Even if we want to migrate from one POWER7 revision to another, we
>>>> should let the destination use the revision of the source (guest ABI!),
>>>> via property if need be. Anything else will lead to confusion as to what
>>>> is supported and what is not. That -cpu host is the default for
>>>> convenience shouldn't relieve admins/libvirt to think about sensible
>>>> setups like they have to on x86.
>>> 
>>> Besides POWER8 uses 512Mhz too :-) It's been architected so it's
>>> unlikely to change from now on.
>> 
>> Ok, ditch the tb frequency then. We can always default to 512Mhz when it's absent.
> 
> 
> QEMU uses what kvmppc_get_tbfreq() returns. And ppc_tb_t carries this
> value. It does not use 512MHz automatically but migration should always
> assume 512MHz :-/
> 
> If we remove it, I would like to add assert(tb_freq!=512MHz) into
> timebase_pre_save() and timebase_post_load(), is that ok?

Good point. This would break TCG migration, right?

Alex

> 
> 
> -- 
> Alexey
Alexey Kardashevskiy - Sept. 9, 2013, 5:58 a.m.
On 09/09/2013 03:50 PM, Alexander Graf wrote:
> 
> 
> Am 09.09.2013 um 04:40 schrieb Alexey Kardashevskiy <aik@ozlabs.ru>:
> 
>> On 09/06/2013 01:11 AM, Alexander Graf wrote:
>>>
>>> On 05.09.2013, at 16:26, Benjamin Herrenschmidt wrote:
>>>
>>>> On Thu, 2013-09-05 at 16:14 +0200, Andreas Färber wrote:
>>>>
>>>>> Are you thinking of POWER8 having a different frequency than POWER8 in
>>>>> compat mode? Because migration from one -cpu to another is not supported
>>>>> elsewhere.
>>>>>
>>>>> Even if we want to migrate from one POWER7 revision to another, we
>>>>> should let the destination use the revision of the source (guest ABI!),
>>>>> via property if need be. Anything else will lead to confusion as to what
>>>>> is supported and what is not. That -cpu host is the default for
>>>>> convenience shouldn't relieve admins/libvirt to think about sensible
>>>>> setups like they have to on x86.
>>>>
>>>> Besides POWER8 uses 512Mhz too :-) It's been architected so it's
>>>> unlikely to change from now on.
>>>
>>> Ok, ditch the tb frequency then. We can always default to 512Mhz when it's absent.
>>
>>
>> QEMU uses what kvmppc_get_tbfreq() returns. And ppc_tb_t carries this
>> value. It does not use 512MHz automatically but migration should always
>> assume 512MHz :-/
>>
>> If we remove it, I would like to add assert(tb_freq!=512MHz) into
>> timebase_pre_save() and timebase_post_load(), is that ok?
> 
> Good point. This would break TCG migration, right?


In TCG we do not need any timebase offset at all, the time should migrate
as is, no? :)

It could break something like power5 migration under PR KVM, do not know...
Alexander Graf - Sept. 9, 2013, 6:06 a.m.
Am 09.09.2013 um 07:58 schrieb Alexey Kardashevskiy <aik@ozlabs.ru>:

> On 09/09/2013 03:50 PM, Alexander Graf wrote:
>> 
>> 
>> Am 09.09.2013 um 04:40 schrieb Alexey Kardashevskiy <aik@ozlabs.ru>:
>> 
>>> On 09/06/2013 01:11 AM, Alexander Graf wrote:
>>>> 
>>>> On 05.09.2013, at 16:26, Benjamin Herrenschmidt wrote:
>>>> 
>>>>> On Thu, 2013-09-05 at 16:14 +0200, Andreas Färber wrote:
>>>>> 
>>>>>> Are you thinking of POWER8 having a different frequency than POWER8 in
>>>>>> compat mode? Because migration from one -cpu to another is not supported
>>>>>> elsewhere.
>>>>>> 
>>>>>> Even if we want to migrate from one POWER7 revision to another, we
>>>>>> should let the destination use the revision of the source (guest ABI!),
>>>>>> via property if need be. Anything else will lead to confusion as to what
>>>>>> is supported and what is not. That -cpu host is the default for
>>>>>> convenience shouldn't relieve admins/libvirt to think about sensible
>>>>>> setups like they have to on x86.
>>>>> 
>>>>> Besides POWER8 uses 512Mhz too :-) It's been architected so it's
>>>>> unlikely to change from now on.
>>>> 
>>>> Ok, ditch the tb frequency then. We can always default to 512Mhz when it's absent.
>>> 
>>> 
>>> QEMU uses what kvmppc_get_tbfreq() returns. And ppc_tb_t carries this
>>> value. It does not use 512MHz automatically but migration should always
>>> assume 512MHz :-/
>>> 
>>> If we remove it, I would like to add assert(tb_freq!=512MHz) into
>>> timebase_pre_save() and timebase_post_load(), is that ok?
>> 
>> Good point. This would break TCG migration, right?
> 
> 
> In TCG we do not need any timebase offset at all, the time should migrate
> as is, no? :)

I hope so, but we need to make sure we don't assert() in that case :).

> 
> It could break something like power5 migration under PR KVM, do not know...

Well, today we don't support full migration with PR KVM yet - IIRC we don't have access to all state. But that may change in the future.

I think it's ok to restrict live migration to machines with the same tb frequency when kvm is enabled. Whether you implement it through a hardcoded 512Mhz or through a timebase value that gets live migrated and then compared is up to you - both ways work for me :).

Alex

> 
> 
> -- 
> Alexey
Benjamin Herrenschmidt - Sept. 9, 2013, 9:29 a.m.
On Mon, 2013-09-09 at 08:06 +0200, Alexander Graf wrote:
> I think it's ok to restrict live migration to machines with the same
> tb frequency when kvm is enabled. Whether you implement it through a
> hardcoded 512Mhz or through a timebase value that gets live migrated
> and then compared is up to you - both ways work for me :).

The latter might be handy if we want to support migration on 970, though
we don't have the TBU40 stuff there so adjusting the TB in the host
kernel would be ... problematic.

Cheers,
Ben.
Alexander Graf - Sept. 9, 2013, 9:32 a.m.
On 09.09.2013, at 11:29, Benjamin Herrenschmidt wrote:

> On Mon, 2013-09-09 at 08:06 +0200, Alexander Graf wrote:
>> I think it's ok to restrict live migration to machines with the same
>> tb frequency when kvm is enabled. Whether you implement it through a
>> hardcoded 512Mhz or through a timebase value that gets live migrated
>> and then compared is up to you - both ways work for me :).
> 
> The latter might be handy if we want to support migration on 970, though
> we don't have the TBU40 stuff there so adjusting the TB in the host
> kernel would be ... problematic.

Well, we could save/restore TB when we enter/exit the guest, no? But I really only want to have something that doesn't block the door for someone who wants to enable things.


Alex
Benjamin Herrenschmidt - Sept. 9, 2013, 9:38 a.m.
On Mon, 2013-09-09 at 11:32 +0200, Alexander Graf wrote:
> On 09.09.2013, at 11:29, Benjamin Herrenschmidt wrote:
> 
> > On Mon, 2013-09-09 at 08:06 +0200, Alexander Graf wrote:
> >> I think it's ok to restrict live migration to machines with the same
> >> tb frequency when kvm is enabled. Whether you implement it through a
> >> hardcoded 512Mhz or through a timebase value that gets live migrated
> >> and then compared is up to you - both ways work for me :).
> > 
> > The latter might be handy if we want to support migration on 970, though
> > we don't have the TBU40 stuff there so adjusting the TB in the host
> > kernel would be ... problematic.
> 
> Well, we could save/restore TB when we enter/exit the guest, no? 

Hard to do without introducing drift...

> But I really only want to have something that doesn't block the door for someone who wants to enable things.
> 
> 
> Alex
Alexander Graf - Sept. 9, 2013, 9:41 a.m.
On 09.09.2013, at 11:38, Benjamin Herrenschmidt wrote:

> On Mon, 2013-09-09 at 11:32 +0200, Alexander Graf wrote:
>> On 09.09.2013, at 11:29, Benjamin Herrenschmidt wrote:
>> 
>>> On Mon, 2013-09-09 at 08:06 +0200, Alexander Graf wrote:
>>>> I think it's ok to restrict live migration to machines with the same
>>>> tb frequency when kvm is enabled. Whether you implement it through a
>>>> hardcoded 512Mhz or through a timebase value that gets live migrated
>>>> and then compared is up to you - both ways work for me :).
>>> 
>>> The latter might be handy if we want to support migration on 970, though
>>> we don't have the TBU40 stuff there so adjusting the TB in the host
>>> kernel would be ... problematic.
>> 
>> Well, we could save/restore TB when we enter/exit the guest, no? 
> 
> Hard to do without introducing drift...

We could probably quantify the drift through DEC. But I'm personally not too eager to worry about live migration on 970 :).


Alex
David Gibson - Sept. 13, 2013, 5:20 a.m.
On Mon, Sep 09, 2013 at 08:06:53AM +0200, Alexander Graf wrote:
> 
> 
> Am 09.09.2013 um 07:58 schrieb Alexey Kardashevskiy <aik@ozlabs.ru>:
> 
> > On 09/09/2013 03:50 PM, Alexander Graf wrote:
> >> 
> >> 
> >> Am 09.09.2013 um 04:40 schrieb Alexey Kardashevskiy <aik@ozlabs.ru>:
> >> 
> >>> On 09/06/2013 01:11 AM, Alexander Graf wrote:
> >>>> 
> >>>> On 05.09.2013, at 16:26, Benjamin Herrenschmidt wrote:
> >>>> 
> >>>>> On Thu, 2013-09-05 at 16:14 +0200, Andreas Färber wrote:
> >>>>> 
> >>>>>> Are you thinking of POWER8 having a different frequency than POWER8 in
> >>>>>> compat mode? Because migration from one -cpu to another is not supported
> >>>>>> elsewhere.
> >>>>>> 
> >>>>>> Even if we want to migrate from one POWER7 revision to another, we
> >>>>>> should let the destination use the revision of the source (guest ABI!),
> >>>>>> via property if need be. Anything else will lead to confusion as to what
> >>>>>> is supported and what is not. That -cpu host is the default for
> >>>>>> convenience shouldn't relieve admins/libvirt to think about sensible
> >>>>>> setups like they have to on x86.
> >>>>> 
> >>>>> Besides POWER8 uses 512Mhz too :-) It's been architected so it's
> >>>>> unlikely to change from now on.
> >>>> 
> >>>> Ok, ditch the tb frequency then. We can always default to 512Mhz when it's absent.
> >>> 
> >>> 
> >>> QEMU uses what kvmppc_get_tbfreq() returns. And ppc_tb_t carries this
> >>> value. It does not use 512MHz automatically but migration should always
> >>> assume 512MHz :-/
> >>> 
> >>> If we remove it, I would like to add assert(tb_freq!=512MHz) into
> >>> timebase_pre_save() and timebase_post_load(), is that ok?
> >> 
> >> Good point. This would break TCG migration, right?
> > 
> > 
> > In TCG we do not need any timebase offset at all, the time should migrate
> > as is, no? :)
> 
> I hope so, but we need to make sure we don't assert() in that case :).

Hrm.. that doesn't sound right.  Whether you have KVM or TCG, what you
need in the migration stream is enough data points that you can deduce
the guest timebase from the host real time.  On KVM that translates
into a diff between guest and host timebase, but on TCG you'll need to
implement that differently

> > 
> > It could break something like power5 migration under PR KVM, do not know...
> 
> Well, today we don't support full migration with PR KVM yet - IIRC we don't have access to all state. But that may change in the future.
> 
> I think it's ok to restrict live migration to machines with the same
> tb frequency when kvm is enabled. Whether you implement it through a
> hardcoded 512Mhz or through a timebase value that gets live migrated
> and then compared is up to you - both ways work for me :).

I don't think it's possible to allow KVM migration between hosts of
different tb frequency.  But I think encoding the frequency in the
stream and verifying it is a better option.  It might be useful for
migration on 970, and it might be useful for TCG migration and it
might be useful for TCG<->KVM migration. (For a TCG migration
destination you *can* potentially set the tb frequency according to
what you're told).
Alexander Graf - Sept. 13, 2013, 6:06 p.m.
On 13.09.2013, at 00:20, David Gibson wrote:

> On Mon, Sep 09, 2013 at 08:06:53AM +0200, Alexander Graf wrote:
>> 
>> 
>> Am 09.09.2013 um 07:58 schrieb Alexey Kardashevskiy <aik@ozlabs.ru>:
>> 
>>> On 09/09/2013 03:50 PM, Alexander Graf wrote:
>>>> 
>>>> 
>>>> Am 09.09.2013 um 04:40 schrieb Alexey Kardashevskiy <aik@ozlabs.ru>:
>>>> 
>>>>> On 09/06/2013 01:11 AM, Alexander Graf wrote:
>>>>>> 
>>>>>> On 05.09.2013, at 16:26, Benjamin Herrenschmidt wrote:
>>>>>> 
>>>>>>> On Thu, 2013-09-05 at 16:14 +0200, Andreas Färber wrote:
>>>>>>> 
>>>>>>>> Are you thinking of POWER8 having a different frequency than POWER8 in
>>>>>>>> compat mode? Because migration from one -cpu to another is not supported
>>>>>>>> elsewhere.
>>>>>>>> 
>>>>>>>> Even if we want to migrate from one POWER7 revision to another, we
>>>>>>>> should let the destination use the revision of the source (guest ABI!),
>>>>>>>> via property if need be. Anything else will lead to confusion as to what
>>>>>>>> is supported and what is not. That -cpu host is the default for
>>>>>>>> convenience shouldn't relieve admins/libvirt to think about sensible
>>>>>>>> setups like they have to on x86.
>>>>>>> 
>>>>>>> Besides POWER8 uses 512Mhz too :-) It's been architected so it's
>>>>>>> unlikely to change from now on.
>>>>>> 
>>>>>> Ok, ditch the tb frequency then. We can always default to 512Mhz when it's absent.
>>>>> 
>>>>> 
>>>>> QEMU uses what kvmppc_get_tbfreq() returns. And ppc_tb_t carries this
>>>>> value. It does not use 512MHz automatically but migration should always
>>>>> assume 512MHz :-/
>>>>> 
>>>>> If we remove it, I would like to add assert(tb_freq!=512MHz) into
>>>>> timebase_pre_save() and timebase_post_load(), is that ok?
>>>> 
>>>> Good point. This would break TCG migration, right?
>>> 
>>> 
>>> In TCG we do not need any timebase offset at all, the time should migrate
>>> as is, no? :)
>> 
>> I hope so, but we need to make sure we don't assert() in that case :).
> 
> Hrm.. that doesn't sound right.  Whether you have KVM or TCG, what you
> need in the migration stream is enough data points that you can deduce
> the guest timebase from the host real time.  On KVM that translates
> into a diff between guest and host timebase, but on TCG you'll need to
> implement that differently

I'd claim that for TCG we don't care about accuracy of the time base across live migration, so I'd be happy to simply ignore the adjustments we'd do with KVM there. Otherwise things would get heavily complicated.

> 
>>> 
>>> It could break something like power5 migration under PR KVM, do not know...
>> 
>> Well, today we don't support full migration with PR KVM yet - IIRC we don't have access to all state. But that may change in the future.
>> 
>> I think it's ok to restrict live migration to machines with the same
>> tb frequency when kvm is enabled. Whether you implement it through a
>> hardcoded 512Mhz or through a timebase value that gets live migrated
>> and then compared is up to you - both ways work for me :).
> 
> I don't think it's possible to allow KVM migration between hosts of
> different tb frequency.  But I think encoding the frequency in the
> stream and verifying it is a better option.  It might be useful for
> migration on 970, and it might be useful for TCG migration and it
> might be useful for TCG<->KVM migration. (For a TCG migration
> destination you *can* potentially set the tb frequency according to
> what you're told).

Yup, IIUC that's what Alexey's gut feeling told him too (and I concur) and is what he's doing ;).


Alex

Patch

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index 1e3cab3..7d08c9a 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -31,6 +31,7 @@ 
 #include "hw/loader.h"
 #include "sysemu/kvm.h"
 #include "kvm_ppc.h"
+#include "trace.h"
 
 //#define PPC_DEBUG_IRQ
 #define PPC_DEBUG_TB
@@ -796,6 +797,54 @@  static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
     cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
 }
 
+/*
+ * Calculate timebase on the destination side of migration
+ *
+ * We calculate new timebase offset as shown below:
+ * 1) Gtb2 = Gtb1 + max(tod2 - tod1, 0)
+ *    Gtb2 = tb2 + off2
+ *    Gtb1 = tb1 + off1
+ * 2) tb2 + off2 = tb1 + off1 + max(tod2 - tod1, 0)
+ * 3) off2 = tb1 - tb2 + off1 + max(tod2 - tod1, 0)
+ *
+ * where:
+ * Gtb2 - destination guest timebase
+ * tb2 - destination host timebase
+ * off2 - destination timebase offset
+ * tod2 - destination time of the day
+ * Gtb1 - source guest timebase
+ * tb1 - source host timebase
+ * off1 - source timebase offset
+ * tod1 - source time of the day
+ *
+ * The result we want is in @off2
+ *
+ * Two conditions must be met for @off2:
+ * 1) off2 must be multiple of 2^24 ticks as it will be set via TBU40 SPR
+ * 2) Gtb2 >= Gtb1
+ */
+void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env)
+{
+    uint64_t tb2, tod2, off2;
+    int ratio = tb_env->tb_freq / 1000000;
+    struct timeval tv;
+
+    tb2 = cpu_get_real_ticks();
+    gettimeofday(&tv, NULL);
+    tod2 = tv.tv_sec * 1000000 + tv.tv_usec;
+
+    off2 = tb_env->timebase - tb2 + tb_env->tb_offset;
+    if (tod2 > tb_env->time_of_the_day) {
+        off2 += (tod2 - tb_env->time_of_the_day) * ratio;
+    }
+    off2 = ROUND_UP(off2, 1 << 24);
+
+    trace_ppc_tb_adjust(tb_env->tb_offset, off2,
+                        (int64_t)off2 - tb_env->tb_offset);
+
+    tb_env->tb_offset = off2;
+}
+
 /* Set up (once) timebase frequency (in Hz) */
 clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
 {
diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
index 132ab97..235871c 100644
--- a/include/hw/ppc/ppc.h
+++ b/include/hw/ppc/ppc.h
@@ -32,6 +32,9 @@  struct ppc_tb_t {
     uint64_t purr_start;
     void *opaque;
     uint32_t flags;
+    /* Cached values for live migration purposes */
+    uint64_t timebase;
+    uint64_t time_of_the_day;
 };
 
 /* PPC Timers flags */
@@ -46,6 +49,7 @@  struct ppc_tb_t {
                                                */
 
 uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t tb_offset);
+void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env);
 clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq);
 /* Embedded PowerPC DCR management */
 typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn);
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 7af9e3d..93df955 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -35,6 +35,7 @@ 
 #include "hw/sysbus.h"
 #include "hw/ppc/spapr.h"
 #include "hw/ppc/spapr_vio.h"
+#include "hw/ppc/ppc.h"
 #include "sysemu/watchdog.h"
 
 //#define DEBUG_KVM
@@ -761,6 +762,22 @@  static int kvm_put_vpa(CPUState *cs)
 }
 #endif /* TARGET_PPC64 */
 
+static int kvm_access_one_reg(CPUState *cs, bool set, __u64 id, void *addr)
+{
+    struct kvm_one_reg reg = {
+        .id = id,
+        .addr = (uintptr_t)addr,
+    };
+    int ret = kvm_vcpu_ioctl(cs, set ? KVM_SET_ONE_REG : KVM_GET_ONE_REG, &reg);
+
+    if (ret) {
+        DPRINTF("Unable to %s time base offset to KVM: %s\n",
+                set ? "set" : "get", strerror(errno));
+    }
+
+    return ret;
+}
+
 int kvm_arch_put_registers(CPUState *cs, int level)
 {
     PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -873,6 +890,9 @@  int kvm_arch_put_registers(CPUState *cs, int level)
                 DPRINTF("Warning: Unable to set VPA information to KVM\n");
             }
         }
+
+        kvm_access_one_reg(cs, 1, KVM_REG_PPC_TB_OFFSET,
+                           &env->tb_env->tb_offset);
 #endif /* TARGET_PPC64 */
     }
 
@@ -1082,6 +1102,9 @@  int kvm_arch_get_registers(CPUState *cs)
                 DPRINTF("Warning: Unable to get VPA information from KVM\n");
             }
         }
+
+        kvm_access_one_reg(cs, 0, KVM_REG_PPC_TB_OFFSET,
+                           &env->tb_env->tb_offset);
 #endif
     }
 
diff --git a/target-ppc/machine.c b/target-ppc/machine.c
index 12e1512..d1ffc7f 100644
--- a/target-ppc/machine.c
+++ b/target-ppc/machine.c
@@ -1,5 +1,6 @@ 
 #include "hw/hw.h"
 #include "hw/boards.h"
+#include "hw/ppc/ppc.h"
 #include "sysemu/kvm.h"
 #include "helper_regs.h"
 
@@ -459,6 +460,45 @@  static const VMStateDescription vmstate_tlbmas = {
     }
 };
 
+static void timebase_pre_save(void *opaque)
+{
+    ppc_tb_t *tb_env = opaque;
+    struct timeval tv;
+
+    gettimeofday(&tv, NULL);
+    tb_env->time_of_the_day = tv.tv_sec * 1000000 + tv.tv_usec;
+    tb_env->timebase = cpu_get_real_ticks();
+}
+
+static int timebase_post_load(void *opaque, int version_id)
+{
+    ppc_tb_t *tb_env = opaque;
+
+    if (!tb_env) {
+        printf("NO TB!\n");
+        return -1;
+    }
+    cpu_ppc_adjust_tb_offset(tb_env);
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_timebase = {
+    .name = "cpu/timebase",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .pre_save = timebase_pre_save,
+    .post_load = timebase_post_load,
+    .fields      = (VMStateField []) {
+        VMSTATE_UINT64(timebase, ppc_tb_t),
+        VMSTATE_INT64(tb_offset, ppc_tb_t),
+        VMSTATE_UINT64(time_of_the_day, ppc_tb_t),
+        VMSTATE_UINT32_EQUAL(tb_freq, ppc_tb_t),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 const VMStateDescription vmstate_ppc_cpu = {
     .name = "cpu",
     .version_id = 5,
@@ -498,6 +538,10 @@  const VMStateDescription vmstate_ppc_cpu = {
         VMSTATE_UINT64_EQUAL(env.insns_flags, PowerPCCPU),
         VMSTATE_UINT64_EQUAL(env.insns_flags2, PowerPCCPU),
         VMSTATE_UINT32_EQUAL(env.nb_BATs, PowerPCCPU),
+
+        /* Time offset */
+        VMSTATE_STRUCT_POINTER(env.tb_env, PowerPCCPU,
+                               vmstate_timebase, ppc_tb_t *),
         VMSTATE_END_OF_LIST()
     },
     .subsections = (VMStateSubsection []) {
diff --git a/trace-events b/trace-events
index 935b953..24cf4d2 100644
--- a/trace-events
+++ b/trace-events
@@ -1141,6 +1141,9 @@  spapr_iommu_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "li
 spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
 spapr_iommu_new_table(uint64_t liobn, void *tcet, void *table, int fd) "liobn=%"PRIx64" tcet=%p table=%p fd=%d"
 
+# hw/ppc/ppc.c
+ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff) "adjusted from %"PRIx64" to %"PRIx64", diff %"PRId64
+
 # util/hbitmap.c
 hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
 hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64