[v5,23/36] spapr/xive: add migration support for KVM

Message ID 20181116105729.23240-24-clg@kaod.org
State New
Series ppc: support for the XIVE interrupt controller (POWER9)

Commit Message

Cédric Le Goater Nov. 16, 2018, 10:57 a.m. UTC
This extends the KVM XIVE models to handle state synchronization
with KVM, for monitor usage and for migration.

The migration priority of the XIVE interrupt controller sPAPRXive is
raised for KVM. It operates first and orchestrates the capture
sequence of the states of all the XIVE models. The XIVE sources are
masked to quiesce the interrupt flow and a XIVE sync is performed to
stabilize the OS Event Queues. The state of the ENDs is then captured
by the XIVE interrupt controller model, sPAPRXive, and the state of
the thread contexts by the thread interrupt presenter model,
XiveTCTX. When done, a rollback is performed to restore the sources to
their initial state.

The sPAPRXive 'post_load' method is called from the sPAPR machine,
after all XIVE device states have been transferred and loaded. First,
sPAPRXive restores the XIVE routing tables: ENDT and EAT. Next, the
thread interrupt context registers and the source PQ bits are
restored.

The get/set operations rely on their KVM counterpart in the host
kernel which acts as a proxy for OPAL, the host firmware.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---

 WIP:
 
    If migration occurs when a VCPU is 'ceded', some of the OS event
    notification queues are mapped to the ZERO_PAGE on the receiving
    side, as if the HW had triggered a page fault before the dirty
    page was transferred from the source, or as if we were not using
    the correct page table.
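
 As background, here is a minimal sketch of the KVM device attribute
 round trip that the pre_save/post_load code below builds on. It is
 illustrative only: kvm_device_access() is the existing QEMU helper,
 and KVM_DEV_XIVE_GRP_EQ with its index encoding comes from this
 series, but the wrapper function itself is hypothetical.

    /* Hypothetical sketch, not part of the patch */
    static void example_eq_round_trip(sPAPRXive *xive, uint32_t server,
                                      uint8_t prio, Error **errp)
    {
        struct kvm_ppc_xive_eq kvm_eq = { 0 };
        uint64_t kvm_eq_idx;

        /* Encode the (server, priority) tuple as a KVM EQ index */
        kvm_eq_idx = (uint64_t)prio << KVM_XIVE_EQ_PRIORITY_SHIFT &
            KVM_XIVE_EQ_PRIORITY_MASK;
        kvm_eq_idx |= (uint64_t)server << KVM_XIVE_EQ_SERVER_SHIFT &
            KVM_XIVE_EQ_SERVER_MASK;

        /* write=false: capture the EQ state from the host (pre_save) */
        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
                          &kvm_eq, false, errp);

        /* write=true: push the same state back (post_load) */
        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
                          &kvm_eq, true, errp);
    }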

 include/hw/ppc/spapr_xive.h     |   5 +
 include/hw/ppc/xive.h           |   3 +
 include/migration/vmstate.h     |   1 +
 linux-headers/asm-powerpc/kvm.h |  33 +++
 hw/intc/spapr_xive.c            |  32 +++
 hw/intc/spapr_xive_kvm.c        | 494 ++++++++++++++++++++++++++++++++
 hw/intc/xive.c                  |  46 +++
 hw/ppc/spapr_irq.c              |   2 +-
 8 files changed, 615 insertions(+), 1 deletion(-)

Comments

David Gibson Nov. 29, 2018, 3:43 a.m. UTC | #1
On Fri, Nov 16, 2018 at 11:57:16AM +0100, Cédric Le Goater wrote:
> This extends the KVM XIVE models to handle state synchronization
> with KVM, for monitor usage and for migration.
> 
> The migration priority of the XIVE interrupt controller sPAPRXive is
> raised for KVM. It operates first and orchestrates the capture
> sequence of the states of all the XIVE models. The XIVE sources are
> masked to quiesce the interrupt flow and a XIVE sync is performed to
> stabilize the OS Event Queues. The state of the ENDs is then captured
> by the XIVE interrupt controller model, sPAPRXive, and the state of
> the thread contexts by the thread interrupt presenter model,
> XiveTCTX. When done, a rollback is performed to restore the sources to
> their initial state.
> 
> The sPAPRXive 'post_load' method is called from the sPAPR machine,
> after all XIVE device states have been transferred and loaded. First,
> sPAPRXive restores the XIVE routing tables: ENDT and EAT. Next, the
> thread interrupt context registers and the source PQ bits are
> restored.
> 
> The get/set operations rely on their KVM counterpart in the host
> kernel which acts as a proxy for OPAL, the host firmware.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
> 
>  WIP:
>  
>     If migration occurs when a VCPU is 'ceded', some of the OS event
>     notification queues are mapped to the ZERO_PAGE on the receiving
>     side, as if the HW had triggered a page fault before the dirty
>     page was transferred from the source, or as if we were not using
>     the correct page table.
> 
>  include/hw/ppc/spapr_xive.h     |   5 +
>  include/hw/ppc/xive.h           |   3 +
>  include/migration/vmstate.h     |   1 +
>  linux-headers/asm-powerpc/kvm.h |  33 +++
>  hw/intc/spapr_xive.c            |  32 +++
>  hw/intc/spapr_xive_kvm.c        | 494 ++++++++++++++++++++++++++++++++
>  hw/intc/xive.c                  |  46 +++
>  hw/ppc/spapr_irq.c              |   2 +-
>  8 files changed, 615 insertions(+), 1 deletion(-)
> 
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 9c817bb7ae74..d2517c040958 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -55,12 +55,17 @@ typedef struct sPAPRXiveClass {
>      XiveRouterClass parent_class;
>  
>      DeviceRealize   parent_realize;
> +
> +    void (*synchronize_state)(sPAPRXive *xive);
> +    int  (*pre_save)(sPAPRXive *xsrc);
> +    int  (*post_load)(sPAPRXive *xsrc, int version_id);

This should go away if the KVM and non-KVM versions are in the same
object.
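
As a minimal sketch of that suggestion, a merged object could dispatch
at run time on the accelerator; kvm_irqchip_in_kernel() is the usual
QEMU check, while the dispatch below is illustrative rather than the
series' code:

    static int vmstate_spapr_xive_pre_save(void *opaque)
    {
        sPAPRXive *xive = opaque;

        /* One QOM type for both backends: pick the path at run time */
        if (kvm_irqchip_in_kernel()) {
            return spapr_xive_kvm_pre_save(xive);
        }
        return 0; /* emulated state already lives in the QEMU model */
    }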

>  } sPAPRXiveClass;
>  
>  bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
>  bool spapr_xive_irq_disable(sPAPRXive *xive, uint32_t lisn);
>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
>  qemu_irq spapr_xive_qirq(sPAPRXive *xive, uint32_t lisn);
> +int spapr_xive_post_load(sPAPRXive *xive, int version_id);
>  
>  /*
>   * sPAPR NVT and END indexing helpers
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 7aaf5a182cb3..c8201462d698 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -309,6 +309,9 @@ typedef struct XiveTCTXClass {
>      DeviceClass       parent_class;
>  
>      DeviceRealize     parent_realize;
> +
> +    void (*synchronize_state)(XiveTCTX *tctx);
> +    int  (*post_load)(XiveTCTX *tctx, int version_id);

.. and this too.

>  } XiveTCTXClass;
>  
>  /*
> diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
> index 2b501d04669a..ee2e836cc1c1 100644
> --- a/include/migration/vmstate.h
> +++ b/include/migration/vmstate.h
> @@ -154,6 +154,7 @@ typedef enum {
>      MIG_PRI_PCI_BUS,            /* Must happen before IOMMU */
>      MIG_PRI_GICV3_ITS,          /* Must happen before PCI devices */
>      MIG_PRI_GICV3,              /* Must happen before the ITS */
> +    MIG_PRI_XIVE_IC,            /* Must happen before all XIVE models */

Ugh.. explicit priority / order levels are a pretty bad code smell.
Usually migration ordering can be handled by getting the object
hierarchy right.  What exactly is the problem you're addressing with
this?


>      MIG_PRI_MAX,
>  } MigrationPriority;
>  
> diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h
> index f34c971491dd..9d55ade23634 100644
> --- a/linux-headers/asm-powerpc/kvm.h
> +++ b/linux-headers/asm-powerpc/kvm.h

Again, linux-headers need to be split out.

> @@ -480,6 +480,8 @@ struct kvm_ppc_cpu_char {
>  #define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
>  #define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
>  
> +#define KVM_REG_PPC_NVT_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d)
> +
>  /* Device control API: PPC-specific devices */
>  #define KVM_DEV_MPIC_GRP_MISC		1
>  #define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
> @@ -681,10 +683,41 @@ struct kvm_ppc_cpu_char {
>  #define   KVM_DEV_XIVE_GET_TIMA_FD	2
>  #define   KVM_DEV_XIVE_VC_BASE		3
>  #define KVM_DEV_XIVE_GRP_SOURCES	2	/* 64-bit source attributes */
> +#define KVM_DEV_XIVE_GRP_SYNC		3	/* 64-bit source attributes */
> +#define KVM_DEV_XIVE_GRP_EAS		4	/* 64-bit eas attributes */
> +#define KVM_DEV_XIVE_GRP_EQ		5	/* 64-bit eq attributes */
>  
>  /* Layout of 64-bit XIVE source attribute values */
>  #define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
>  #define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
>  
> +/* Layout of 64-bit eas attribute values */
> +#define KVM_XIVE_EAS_PRIORITY_SHIFT	0
> +#define KVM_XIVE_EAS_PRIORITY_MASK	0x7
> +#define KVM_XIVE_EAS_SERVER_SHIFT	3
> +#define KVM_XIVE_EAS_SERVER_MASK	0xfffffff8ULL
> +#define KVM_XIVE_EAS_MASK_SHIFT		32
> +#define KVM_XIVE_EAS_MASK_MASK		0x100000000ULL
> +#define KVM_XIVE_EAS_EISN_SHIFT		33
> +#define KVM_XIVE_EAS_EISN_MASK		0xfffffffe00000000ULL
> +
> +/* Layout of 64-bit eq attribute */
> +#define KVM_XIVE_EQ_PRIORITY_SHIFT	0
> +#define KVM_XIVE_EQ_PRIORITY_MASK	0x7
> +#define KVM_XIVE_EQ_SERVER_SHIFT	3
> +#define KVM_XIVE_EQ_SERVER_MASK		0xfffffff8ULL
> +
> +/* Layout of 64-bit eq attribute values */
> +struct kvm_ppc_xive_eq {
> +	__u32 flags;
> +	__u32 qsize;
> +	__u64 qpage;
> +	__u32 qtoggle;
> +	__u32 qindex;
> +};
> +
> +#define KVM_XIVE_EQ_FLAG_ENABLED	0x00000001
> +#define KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY	0x00000002
> +#define KVM_XIVE_EQ_FLAG_ESCALATE	0x00000004
>  
>  #endif /* __LINUX_KVM_POWERPC_H */
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index ec85f7e4f88d..c5c0e063dc33 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -27,9 +27,14 @@
>  
>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
>  {
> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
>      int i;
>      uint32_t offset = 0;
>  
> +    if (sxc->synchronize_state) {
> +        sxc->synchronize_state(xive);
> +    }
> +
>      monitor_printf(mon, "XIVE Source %08x .. %08x\n", offset,
>                     offset + xive->source.nr_irqs - 1);
>      xive_source_pic_print_info(&xive->source, offset, mon);
> @@ -354,10 +359,37 @@ static const VMStateDescription vmstate_spapr_xive_eas = {
>      },
>  };
>  
> +static int vmstate_spapr_xive_pre_save(void *opaque)
> +{
> +    sPAPRXive *xive = SPAPR_XIVE_BASE(opaque);
> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
> +
> +    if (sxc->pre_save) {
> +        return sxc->pre_save(xive);
> +    }
> +
> +    return 0;
> +}
> +
> +/* handled at the machine level */
> +int spapr_xive_post_load(sPAPRXive *xive, int version_id)
> +{
> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
> +
> +    if (sxc->post_load) {
> +        return sxc->post_load(xive, version_id);
> +    }
> +
> +    return 0;
> +}
> +
>  static const VMStateDescription vmstate_spapr_xive_base = {
>      .name = TYPE_SPAPR_XIVE,
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = vmstate_spapr_xive_pre_save,
> +    .post_load = NULL, /* handled at the machine level */
> +    .priority = MIG_PRI_XIVE_IC,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT32_EQUAL(nr_irqs, sPAPRXive, NULL),
>          VMSTATE_STRUCT_VARRAY_POINTER_UINT32(eat, sPAPRXive, nr_irqs,
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index 767f90826e43..176083c37d61 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -58,6 +58,58 @@ static void kvm_cpu_enable(CPUState *cs)
>  /*
>   * XIVE Thread Interrupt Management context (KVM)
>   */
> +static void xive_tctx_kvm_set_state(XiveTCTX *tctx, Error **errp)
> +{
> +    uint64_t state[4];
> +    int ret;
> +
> +    /* word0 and word1 of the OS ring. */
> +    state[0] = *((uint64_t *) &tctx->regs[TM_QW1_OS]);
> +
> +    /* VP identifier. Only for KVM pr_debug() */
> +    state[1] = *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]);
> +
> +    ret = kvm_set_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
> +    if (ret != 0) {
> +        error_setg_errno(errp, errno, "Could not restore KVM XIVE CPU %ld state",
> +                         kvm_arch_vcpu_id(tctx->cs));
> +    }
> +}
> +
> +static void xive_tctx_kvm_get_state(XiveTCTX *tctx, Error **errp)
> +{
> +    uint64_t state[4] = { 0 };
> +    int ret;
> +
> +    ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
> +    if (ret != 0) {
> +        error_setg_errno(errp, errno, "Could not capture KVM XIVE CPU %ld state",
> +                         kvm_arch_vcpu_id(tctx->cs));
> +        return;
> +    }
> +
> +    /* word0 and word1 of the OS ring. */
> +    *((uint64_t *) &tctx->regs[TM_QW1_OS]) = state[0];
> +
> +    /*
> +     * KVM also returns word2 containing the VP CAM line value, which
> +     * is useful for printing out the VP identifier in the QEMU
> +     * monitor. No need to restore it.
> +     */
> +    *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]) = state[1];
> +}
> +
> +static void xive_tctx_kvm_do_synchronize_state(CPUState *cpu,
> +                                              run_on_cpu_data arg)
> +{
> +    xive_tctx_kvm_get_state(arg.host_ptr, &error_fatal);
> +}
> +
> +static void xive_tctx_kvm_synchronize_state(XiveTCTX *tctx)
> +{
> +    run_on_cpu(tctx->cs, xive_tctx_kvm_do_synchronize_state,
> +               RUN_ON_CPU_HOST_PTR(tctx));
> +}
>  
>  static void xive_tctx_kvm_init(XiveTCTX *tctx, Error **errp)
>  {
> @@ -112,6 +164,8 @@ static void xive_tctx_kvm_class_init(ObjectClass *klass, void *data)
>  
>      device_class_set_parent_realize(dc, xive_tctx_kvm_realize,
>                                      &xtc->parent_realize);
> +
> +    xtc->synchronize_state = xive_tctx_kvm_synchronize_state;
>  }
>  
>  static const TypeInfo xive_tctx_kvm_info = {
> @@ -166,6 +220,34 @@ static void xive_source_kvm_reset(DeviceState *dev)
>      xive_source_kvm_init(xsrc, &error_fatal);
>  }
>  
> +/*
> + * This is used to perform the magic loads on the ESB pages, described
> + * in xive.h.
> + */
> +static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
> +{
> +    unsigned long addr = (unsigned long) xsrc->esb_mmap +
> +        xive_source_esb_mgmt(xsrc, srcno) + offset;
> +
> +    /* Prevent the compiler from optimizing away the load */
> +    volatile uint64_t value = *((uint64_t *) addr);
> +
> +    return be64_to_cpu(value) & 0x3;
> +}
> +
> +static void xive_source_kvm_get_state(XiveSource *xsrc)
> +{
> +    int i;
> +
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        /* Perform a load without side effect to retrieve the PQ bits */
> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
> +
> +        /* and save PQ locally */
> +        xive_source_esb_set(xsrc, i, pq);
> +    }
> +}
> +
>  static void xive_source_kvm_set_irq(void *opaque, int srcno, int val)
>  {
>      XiveSource *xsrc = opaque;
> @@ -295,6 +377,414 @@ static const TypeInfo xive_source_kvm_info = {
>  /*
>   * sPAPR XIVE Router (KVM)
>   */
> +static int spapr_xive_kvm_set_eq_state(sPAPRXive *xive, CPUState *cs,
> +                                       Error **errp)
> +{
> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> +    int ret;
> +    int i;
> +
> +    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
> +        Error *local_err = NULL;
> +        XiveEND end;
> +        uint8_t end_blk;
> +        uint32_t end_idx;
> +        struct kvm_ppc_xive_eq kvm_eq = { 0 };
> +        uint64_t kvm_eq_idx;
> +
> +        if (!spapr_xive_priority_is_valid(i)) {
> +            continue;
> +        }
> +
> +        spapr_xive_cpu_to_end(xive, POWERPC_CPU(cs), i, &end_blk, &end_idx);
> +
> +        ret = xive_router_get_end(xrtr, end_blk, end_idx, &end);
> +        if (ret) {
> +            error_setg(errp, "XIVE: No END for CPU %ld priority %d",
> +                       vcpu_id, i);
> +            return ret;
> +        }
> +
> +        if (!(end.w0 & END_W0_VALID)) {
> +            continue;
> +        }
> +
> +        /* Build the KVM state from the local END structure */
> +        kvm_eq.flags   = KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY;
> +        kvm_eq.qsize   = GETFIELD(END_W0_QSIZE, end.w0) + 12;
> +        kvm_eq.qpage   = (((uint64_t)(end.w2 & 0x0fffffff)) << 32) | end.w3;
> +        kvm_eq.qtoggle = GETFIELD(END_W1_GENERATION, end.w1);
> +        kvm_eq.qindex  = GETFIELD(END_W1_PAGE_OFF, end.w1);
> +
> +        /* Encode the tuple (server, prio) as a KVM EQ index */
> +        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
> +            KVM_XIVE_EQ_PRIORITY_MASK;
> +        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
> +            KVM_XIVE_EQ_SERVER_MASK;
> +
> +        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
> +                                &kvm_eq, true, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return ret;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int spapr_xive_kvm_get_eq_state(sPAPRXive *xive, CPUState *cs,
> +                                       Error **errp)
> +{
> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> +    int ret;
> +    int i;
> +
> +    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
> +        Error *local_err = NULL;
> +        struct kvm_ppc_xive_eq kvm_eq = { 0 };
> +        uint64_t kvm_eq_idx;
> +        XiveEND end = { 0 };
> +        uint8_t end_blk, nvt_blk;
> +        uint32_t end_idx, nvt_idx;
> +
> +        /* Skip priorities reserved for the hypervisor */
> +        if (!spapr_xive_priority_is_valid(i)) {
> +            continue;
> +        }
> +
> +        /* Encode the tuple (server, prio) as a KVM EQ index */
> +        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
> +            KVM_XIVE_EQ_PRIORITY_MASK;
> +        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
> +            KVM_XIVE_EQ_SERVER_MASK;
> +
> +        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
> +                                &kvm_eq, false, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return ret;
> +        }
> +
> +        if (!(kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED)) {
> +            continue;
> +        }
> +
> +        /* Update the local END structure with the KVM input */
> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED) {
> +                end.w0 |= END_W0_VALID | END_W0_ENQUEUE;
> +        }
> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY) {
> +                end.w0 |= END_W0_UCOND_NOTIFY;
> +        }
> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ESCALATE) {
> +                end.w0 |= END_W0_ESCALATE_CTL;
> +        }
> +        end.w0 |= SETFIELD(END_W0_QSIZE, 0ul, kvm_eq.qsize - 12);
> +
> +        end.w1 = SETFIELD(END_W1_GENERATION, 0ul, kvm_eq.qtoggle) |
> +            SETFIELD(END_W1_PAGE_OFF, 0ul, kvm_eq.qindex);
> +        end.w2 = (kvm_eq.qpage >> 32) & 0x0fffffff;
> +        end.w3 = kvm_eq.qpage & 0xffffffff;
> +        end.w4 = 0;
> +        end.w5 = 0;
> +
> +        ret = spapr_xive_cpu_to_nvt(xive, POWERPC_CPU(cs), &nvt_blk, &nvt_idx);
> +        if (ret) {
> +            error_setg(errp, "XIVE: No NVT for CPU %ld", vcpu_id);
> +            return ret;
> +        }
> +
> +        end.w6 = SETFIELD(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
> +            SETFIELD(END_W6_NVT_INDEX, 0ul, nvt_idx);
> +        end.w7 = SETFIELD(END_W7_F0_PRIORITY, 0ul, i);
> +
> +        spapr_xive_cpu_to_end(xive, POWERPC_CPU(cs), i, &end_blk, &end_idx);
> +
> +        ret = xive_router_set_end(xrtr, end_blk, end_idx, &end);
> +        if (ret) {
> +            error_setg(errp, "XIVE: No END for CPU %ld priority %d",
> +                       vcpu_id, i);
> +            return ret;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static void spapr_xive_kvm_set_eas_state(sPAPRXive *xive, Error **errp)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    int i;
> +
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        XiveEAS *eas = &xive->eat[i];
> +        uint32_t end_idx;
> +        uint32_t end_blk;
> +        uint32_t eisn;
> +        uint8_t priority;
> +        uint32_t server;
> +        uint64_t kvm_eas;
> +        Error *local_err = NULL;
> +
> +        /* No need to set MASKED EAS, this is the default state after reset */
> +        if (!(eas->w & EAS_VALID) || eas->w & EAS_MASKED) {
> +            continue;
> +        }
> +
> +        end_idx = GETFIELD(EAS_END_INDEX, eas->w);
> +        end_blk = GETFIELD(EAS_END_BLOCK, eas->w);
> +        eisn = GETFIELD(EAS_END_DATA, eas->w);
> +
> +        spapr_xive_end_to_target(xive, end_blk, end_idx, &server, &priority);
> +
> +        kvm_eas = priority << KVM_XIVE_EAS_PRIORITY_SHIFT &
> +            KVM_XIVE_EAS_PRIORITY_MASK;
> +        kvm_eas |= server << KVM_XIVE_EAS_SERVER_SHIFT &
> +            KVM_XIVE_EAS_SERVER_MASK;
> +        kvm_eas |= ((uint64_t)eisn << KVM_XIVE_EAS_EISN_SHIFT) &
> +            KVM_XIVE_EAS_EISN_MASK;
> +
> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, true,
> +                          &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +}
> +
> +static void spapr_xive_kvm_get_eas_state(sPAPRXive *xive, Error **errp)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    int i;
> +
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        XiveEAS *eas = &xive->eat[i];
> +        XiveEAS new_eas;
> +        uint64_t kvm_eas;
> +        uint8_t priority;
> +        uint32_t server;
> +        uint32_t end_idx;
> +        uint8_t end_blk;
> +        uint32_t eisn;
> +        Error *local_err = NULL;
> +
> +        if (!(eas->w & EAS_VALID)) {
> +            continue;
> +        }
> +
> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, false,
> +                          &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +
> +        priority = (kvm_eas & KVM_XIVE_EAS_PRIORITY_MASK) >>
> +            KVM_XIVE_EAS_PRIORITY_SHIFT;
> +        server = (kvm_eas & KVM_XIVE_EAS_SERVER_MASK) >>
> +            KVM_XIVE_EAS_SERVER_SHIFT;
> +        eisn = (kvm_eas & KVM_XIVE_EAS_EISN_MASK) >> KVM_XIVE_EAS_EISN_SHIFT;
> +
> +        if (spapr_xive_target_to_end(xive, server, priority, &end_blk,
> +                                     &end_idx)) {
> +            error_setg(errp, "XIVE: invalid tuple CPU %d priority %d", server,
> +                       priority);
> +            return;
> +        }
> +
> +        new_eas.w = EAS_VALID;
> +        if (kvm_eas & KVM_XIVE_EAS_MASK_MASK) {
> +            new_eas.w |= EAS_MASKED;
> +        }
> +
> +        new_eas.w = SETFIELD(EAS_END_INDEX, new_eas.w, end_idx);
> +        new_eas.w = SETFIELD(EAS_END_BLOCK, new_eas.w, end_blk);
> +        new_eas.w = SETFIELD(EAS_END_DATA, new_eas.w, eisn);
> +
> +        *eas = new_eas;
> +    }
> +}
> +
> +static void spapr_xive_kvm_sync_all(sPAPRXive *xive, Error **errp)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    Error *local_err = NULL;
> +    int i;
> +
> +    /* Sync the KVM source. This reaches the XIVE HW through OPAL */
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        XiveEAS *eas = &xive->eat[i];
> +
> +        if (!(eas->w & EAS_VALID)) {
> +            continue;
> +        }
> +
> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SYNC, i, NULL, true,
> +                          &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +}
> +
> +/*
> + * The sPAPRXive KVM model migration priority is higher to make sure

Higher than what?

> + * its 'pre_save' method runs before all the other XIVE models. It

If the other XIVE components are children of sPAPRXive (which I think
they are or could be), then I believe the parent object's pre_save
will automatically be called first.
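
As a hedged sketch of that alternative: if the source state were
embedded in the parent's VMStateDescription, the parent's pre_save
would run before the embedded fields are saved (vmstate_xive_source is
an assumed name for the child description):

    static const VMStateDescription vmstate_spapr_xive = {
        .name = TYPE_SPAPR_XIVE,
        .version_id = 1,
        .minimum_version_id = 1,
        .pre_save = vmstate_spapr_xive_pre_save, /* runs first */
        .fields = (VMStateField[]) {
            /* the child's state is saved within the parent's section */
            VMSTATE_STRUCT(source, sPAPRXive, 1, vmstate_xive_source,
                           XiveSource),
            VMSTATE_END_OF_LIST()
        },
    };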

> + * orchestrates the capture sequence of the XIVE states in the
> + * following order:
> + *
> + *   1. mask all the sources by setting PQ=01, which returns the
> + *      previous value, and save it.
> + *   2. sync the sources in KVM to stabilize all the queues
> + *      sync the ENDs to make sure END -> VP is fully completed
> + *   3. dump the EAS table
> + *   4. dump the END table
> + *   5. dump the thread context (IPB)
> + *
> + *  Rollback to restore the current configuration of the sources



> + */
> +static int spapr_xive_kvm_pre_save(sPAPRXive *xive)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    Error *local_err = NULL;
> +    CPUState *cs;
> +    int i;
> +    int ret = 0;
> +
> +    /* Quiesce the sources, to stop the flow of event notifications */
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        /*
> +         * Mask and save the ESB PQs locally in the XiveSource object.
> +         */
> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_01);
> +        xive_source_esb_set(xsrc, i, pq);
> +    }
> +
> +    /* Sync the sources in KVM */
> +    spapr_xive_kvm_sync_all(xive, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        goto out;
> +    }
> +
> +    /* Grab the EAT (could be done earlier ?) */
> +    spapr_xive_kvm_get_eas_state(xive, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        goto out;
> +    }
> +
> +    /*
> +     * Grab the ENDs. The EQ index and the toggle bit are what we want
> +     * to capture
> +     */
> +    CPU_FOREACH(cs) {
> +        spapr_xive_kvm_get_eq_state(xive, cs, &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            goto out;
> +        }
> +    }
> +
> +    /* Capture the thread interrupt contexts */
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        /* TODO: check whether this needs to run under run_on_cpu() */
> +        xive_tctx_kvm_get_state(XIVE_TCTX_KVM(cpu->intc), &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            goto out;
> +        }
> +    }
> +
> +    /* All done. */
> +
> +out:
> +    /* Restore the sources to their initial state */
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        uint8_t pq = xive_source_esb_get(xsrc, i);
> +        if (xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8)) != 0x1) {
> +            error_report("XIVE: IRQ %d has an invalid state", i);
> +        }
> +    }
> +
> +    /*
> +     * The XiveSource and the XiveTCTX states will be collected by
> +     * their respective vmstate handlers afterwards.
> +     */
> +    return ret;
> +}
> +
> +/*
> + * The sPAPRXive 'post_load' method is called by the sPAPR machine,
> + * after all XIVE device states have been transferred and loaded.
> + *
> + * All should be in place when the VCPUs resume execution.
> + */
> +static int spapr_xive_kvm_post_load(sPAPRXive *xive, int version_id)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    Error *local_err = NULL;
> +    CPUState *cs;
> +    int i;
> +
> +    /* Set the ENDs first. The targeting depends on it. */
> +    CPU_FOREACH(cs) {
> +        spapr_xive_kvm_set_eq_state(xive, cs, &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return -1;
> +        }
> +    }
> +
> +    /* Restore the targeting, if any */
> +    spapr_xive_kvm_set_eas_state(xive, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return -1;
> +    }
> +
> +    /* Restore the thread interrupt contexts */
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        xive_tctx_kvm_set_state(XIVE_TCTX_KVM(cpu->intc), &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return -1;
> +        }
> +    }
> +
> +    /*
> +     * Get the saved state from the XiveSource model and restore the
> +     * PQ bits
> +     */
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        uint8_t pq = xive_source_esb_get(xsrc, i);
> +        xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
> +    }
> +    return 0;
> +}
> +
> +static void spapr_xive_kvm_synchronize_state(sPAPRXive *xive)
> +{
> +    XiveSource *xsrc = &xive->source;
> +    CPUState *cs;
> +
> +    xive_source_kvm_get_state(xsrc);
> +
> +    spapr_xive_kvm_get_eas_state(xive, &error_fatal);
> +
> +    CPU_FOREACH(cs) {
> +        spapr_xive_kvm_get_eq_state(xive, cs, &error_fatal);
> +    }
> +}
>  
>  static void spapr_xive_kvm_instance_init(Object *obj)
>  {
> @@ -409,6 +899,10 @@ static void spapr_xive_kvm_class_init(ObjectClass *klass, void *data)
>  
>      dc->desc = "sPAPR XIVE KVM Interrupt Controller";
>      dc->unrealize = spapr_xive_kvm_unrealize;
> +
> +    sxc->synchronize_state = spapr_xive_kvm_synchronize_state;
> +    sxc->pre_save = spapr_xive_kvm_pre_save;
> +    sxc->post_load = spapr_xive_kvm_post_load;
>  }
>  
>  static const TypeInfo spapr_xive_kvm_info = {
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index 9bb37553c9ec..c9aedecc8216 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -438,9 +438,14 @@ static const struct {
>  
>  void xive_tctx_pic_print_info(XiveTCTX *tctx, Monitor *mon)
>  {
> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(tctx);
>      int cpu_index = tctx->cs ? tctx->cs->cpu_index : -1;
>      int i;
>  
> +    if (xtc->synchronize_state) {
> +        xtc->synchronize_state(tctx);
> +    }
> +
>      monitor_printf(mon, "CPU[%04x]:   QW   NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
>                     "  W2\n", cpu_index);
>  
> @@ -552,10 +557,23 @@ static void xive_tctx_base_unrealize(DeviceState *dev, Error **errp)
>      qemu_unregister_reset(xive_tctx_base_reset, dev);
>  }
>  
> +static int vmstate_xive_tctx_post_load(void *opaque, int version_id)
> +{
> +    XiveTCTX *tctx = XIVE_TCTX_BASE(opaque);
> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(tctx);
> +
> +    if (xtc->post_load) {
> +        return xtc->post_load(tctx, version_id);
> +    }
> +
> +    return 0;
> +}
> +
>  static const VMStateDescription vmstate_xive_tctx_base = {
>      .name = TYPE_XIVE_TCTX,
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .post_load = vmstate_xive_tctx_post_load,
>      .fields = (VMStateField[]) {
>          VMSTATE_BUFFER(regs, XiveTCTX),
>          VMSTATE_END_OF_LIST()
> @@ -581,9 +599,37 @@ static const TypeInfo xive_tctx_base_info = {
>      .class_size    = sizeof(XiveTCTXClass),
>  };
>  
> +static int xive_tctx_post_load(XiveTCTX *tctx, int version_id)
> +{
> +    XiveRouterClass *xrc = XIVE_ROUTER_GET_CLASS(tctx->xrtr);
> +
> +    /*
> +     * When we collect the states from the KVM XIVE irqchip, we set word2
> +     * of the thread context to print out the OS CAM line under the
> +     * QEMU monitor.
> +     *
> +     * This breaks migration on a guest using TCG or not using a KVM
> +     * irqchip. Fix with an extra reset of the thread contexts.
> +     */
> +    if (xrc->reset_tctx) {
> +        xrc->reset_tctx(tctx->xrtr, tctx);
> +    }
> +    return 0;
> +}
> +
> +static void xive_tctx_class_init(ObjectClass *klass, void *data)
> +{
> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_CLASS(klass);
> +
> +    xtc->post_load = xive_tctx_post_load;
> +}
> +
>  static const TypeInfo xive_tctx_info = {
>      .name          = TYPE_XIVE_TCTX,
>      .parent        = TYPE_XIVE_TCTX_BASE,
> +    .instance_size = sizeof(XiveTCTX),
> +    .class_init    = xive_tctx_class_init,
> +    .class_size    = sizeof(XiveTCTXClass),
>  };
>  
>  Object *xive_tctx_create(Object *cpu, const char *type, XiveRouter *xrtr,
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index 92ef53743b64..6fac6ca70595 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -359,7 +359,7 @@ static Object *spapr_irq_cpu_intc_create_xive(sPAPRMachineState *spapr,
>  
>  static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>  {
> -    return 0;
> +    return spapr_xive_post_load(spapr->xive, version_id);
>  }
>  
>  /*
Cédric Le Goater Nov. 29, 2018, 4:19 p.m. UTC | #2
David,

Could you tell me what you think about the KVM interfaces for migration,
the ones capturing and restoring the states?

On 11/29/18 4:43 AM, David Gibson wrote:
> On Fri, Nov 16, 2018 at 11:57:16AM +0100, Cédric Le Goater wrote:
>> This extends the KVM XIVE models to handle state synchronization
>> with KVM, for monitor usage and for migration.
>>
>> The migration priority of the XIVE interrupt controller sPAPRXive is
>> raised for KVM. It operates first and orchestrates the capture
>> sequence of the states of all the XIVE models. The XIVE sources are
>> masked to quiesce the interrupt flow and a XIVE sync is performed to
>> stabilize the OS Event Queues. The state of the ENDs is then captured
>> by the XIVE interrupt controller model, sPAPRXive, and the state of
>> the thread contexts by the thread interrupt presenter model,
>> XiveTCTX. When done, a rollback is performed to restore the sources to
>> their initial state.
>>
>> The sPAPRXive 'post_load' method is called from the sPAPR machine,
>> after all XIVE device states have been transferred and loaded. First,
>> sPAPRXive restores the XIVE routing tables: ENDT and EAT. Next, the
>> thread interrupt context registers and the source PQ bits are
>> restored.
>>
>> The get/set operations rely on their KVM counterpart in the host
>> kernel which acts as a proxy for OPAL, the host firmware.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>
>>  WIP:
>>  
>>     If migration occurs when a VCPU is 'ceded', some of the OS event
>>     notification queues are mapped to the ZERO_PAGE on the receiving
>>     side, as if the HW had triggered a page fault before the dirty
>>     page was transferred from the source, or as if we were not using
>>     the correct page table.


v6 adds a VM change state handler to make XIVE reach a quiescent state. 
The sequence is a little more sophisticated and an extra KVM call 
marks the EQ page dirty.
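
Something along these lines, as a rough sketch (the handler body and
the realize hook are hypothetical; qemu_add_vm_change_state_handler()
is the standard QEMU API for running code when the VM stops or
resumes):

    static void spapr_xive_vm_change_state_handler(void *opaque, int running,
                                                   RunState state)
    {
        sPAPRXive *xive = opaque;

        if (!running) {
            /* VM is stopping: quiesce XIVE and capture the KVM state */
            spapr_xive_kvm_pre_save(xive);
        }
    }

    static void spapr_xive_kvm_realize_hook(sPAPRXive *xive)
    {
        /* registered once at realize time */
        qemu_add_vm_change_state_handler(spapr_xive_vm_change_state_handler,
                                         xive);
    }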

>>
>>  include/hw/ppc/spapr_xive.h     |   5 +
>>  include/hw/ppc/xive.h           |   3 +
>>  include/migration/vmstate.h     |   1 +
>>  linux-headers/asm-powerpc/kvm.h |  33 +++
>>  hw/intc/spapr_xive.c            |  32 +++
>>  hw/intc/spapr_xive_kvm.c        | 494 ++++++++++++++++++++++++++++++++
>>  hw/intc/xive.c                  |  46 +++
>>  hw/ppc/spapr_irq.c              |   2 +-
>>  8 files changed, 615 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>> index 9c817bb7ae74..d2517c040958 100644
>> --- a/include/hw/ppc/spapr_xive.h
>> +++ b/include/hw/ppc/spapr_xive.h
>> @@ -55,12 +55,17 @@ typedef struct sPAPRXiveClass {
>>      XiveRouterClass parent_class;
>>  
>>      DeviceRealize   parent_realize;
>> +
>> +    void (*synchronize_state)(sPAPRXive *xive);
>> +    int  (*pre_save)(sPAPRXive *xsrc);
>> +    int  (*post_load)(sPAPRXive *xsrc, int version_id);
> 
> This should go away if the KVM and non-KVM versions are in the same
> object.

yes.

>>  } sPAPRXiveClass;
>>  
>>  bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
>>  bool spapr_xive_irq_disable(sPAPRXive *xive, uint32_t lisn);
>>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
>>  qemu_irq spapr_xive_qirq(sPAPRXive *xive, uint32_t lisn);
>> +int spapr_xive_post_load(sPAPRXive *xive, int version_id);
>>  
>>  /*
>>   * sPAPR NVT and END indexing helpers
>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>> index 7aaf5a182cb3..c8201462d698 100644
>> --- a/include/hw/ppc/xive.h
>> +++ b/include/hw/ppc/xive.h
>> @@ -309,6 +309,9 @@ typedef struct XiveTCTXClass {
>>      DeviceClass       parent_class;
>>  
>>      DeviceRealize     parent_realize;
>> +
>> +    void (*synchronize_state)(XiveTCTX *tctx);
>> +    int  (*post_load)(XiveTCTX *tctx, int version_id);
> 
> .. and this too.
> 
>>  } XiveTCTXClass;
>>  
>>  /*
>> diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
>> index 2b501d04669a..ee2e836cc1c1 100644
>> --- a/include/migration/vmstate.h
>> +++ b/include/migration/vmstate.h
>> @@ -154,6 +154,7 @@ typedef enum {
>>      MIG_PRI_PCI_BUS,            /* Must happen before IOMMU */
>>      MIG_PRI_GICV3_ITS,          /* Must happen before PCI devices */
>>      MIG_PRI_GICV3,              /* Must happen before the ITS */
>> +    MIG_PRI_XIVE_IC,            /* Must happen before all XIVE models */
> 
> Ugh.. explicit priority / order levels are a pretty bad code smell.
> Usually migration ordering can be handled by getting the object
> hierarchy right.  What exactly is the problem you're addressing with
> this?

I wanted sPAPRXive to capture the state on behalf of all XIVE models. 
But with the addition of the VM change state handler I think I can 
remove this priority. I will check. 

> 
>>      MIG_PRI_MAX,
>>  } MigrationPriority;
>>  
>> diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h
>> index f34c971491dd..9d55ade23634 100644
>> --- a/linux-headers/asm-powerpc/kvm.h
>> +++ b/linux-headers/asm-powerpc/kvm.h
> 
> Again, linux-headers need to be split out.
> 
>> @@ -480,6 +480,8 @@ struct kvm_ppc_cpu_char {
>>  #define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
>>  #define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
>>  
>> +#define KVM_REG_PPC_NVT_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d)
>> +
>>  /* Device control API: PPC-specific devices */
>>  #define KVM_DEV_MPIC_GRP_MISC		1
>>  #define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
>> @@ -681,10 +683,41 @@ struct kvm_ppc_cpu_char {
>>  #define   KVM_DEV_XIVE_GET_TIMA_FD	2
>>  #define   KVM_DEV_XIVE_VC_BASE		3
>>  #define KVM_DEV_XIVE_GRP_SOURCES	2	/* 64-bit source attributes */
>> +#define KVM_DEV_XIVE_GRP_SYNC		3	/* 64-bit source attributes */
>> +#define KVM_DEV_XIVE_GRP_EAS		4	/* 64-bit eas attributes */
>> +#define KVM_DEV_XIVE_GRP_EQ		5	/* 64-bit eq attributes */
>>  
>>  /* Layout of 64-bit XIVE source attribute values */
>>  #define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
>>  #define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
>>  
>> +/* Layout of 64-bit eas attribute values */
>> +#define KVM_XIVE_EAS_PRIORITY_SHIFT	0
>> +#define KVM_XIVE_EAS_PRIORITY_MASK	0x7
>> +#define KVM_XIVE_EAS_SERVER_SHIFT	3
>> +#define KVM_XIVE_EAS_SERVER_MASK	0xfffffff8ULL
>> +#define KVM_XIVE_EAS_MASK_SHIFT		32
>> +#define KVM_XIVE_EAS_MASK_MASK		0x100000000ULL
>> +#define KVM_XIVE_EAS_EISN_SHIFT		33
>> +#define KVM_XIVE_EAS_EISN_MASK		0xfffffffe00000000ULL
>> +
>> +/* Layout of 64-bit eq attribute */
>> +#define KVM_XIVE_EQ_PRIORITY_SHIFT	0
>> +#define KVM_XIVE_EQ_PRIORITY_MASK	0x7
>> +#define KVM_XIVE_EQ_SERVER_SHIFT	3
>> +#define KVM_XIVE_EQ_SERVER_MASK		0xfffffff8ULL
>> +
>> +/* Layout of 64-bit eq attribute values */
>> +struct kvm_ppc_xive_eq {
>> +	__u32 flags;
>> +	__u32 qsize;
>> +	__u64 qpage;
>> +	__u32 qtoggle;
>> +	__u32 qindex;
>> +};
>> +
>> +#define KVM_XIVE_EQ_FLAG_ENABLED	0x00000001
>> +#define KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY	0x00000002
>> +#define KVM_XIVE_EQ_FLAG_ESCALATE	0x00000004
>>  
>>  #endif /* __LINUX_KVM_POWERPC_H */
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index ec85f7e4f88d..c5c0e063dc33 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -27,9 +27,14 @@
>>  
>>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
>>  {
>> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
>>      int i;
>>      uint32_t offset = 0;
>>  
>> +    if (sxc->synchronize_state) {
>> +        sxc->synchronize_state(xive);
>> +    }
>> +
>>      monitor_printf(mon, "XIVE Source %08x .. %08x\n", offset,
>>                     offset + xive->source.nr_irqs - 1);
>>      xive_source_pic_print_info(&xive->source, offset, mon);
>> @@ -354,10 +359,37 @@ static const VMStateDescription vmstate_spapr_xive_eas = {
>>      },
>>  };
>>  
>> +static int vmstate_spapr_xive_pre_save(void *opaque)
>> +{
>> +    sPAPRXive *xive = SPAPR_XIVE_BASE(opaque);
>> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
>> +
>> +    if (sxc->pre_save) {
>> +        return sxc->pre_save(xive);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/* handled at the machine level */
>> +int spapr_xive_post_load(sPAPRXive *xive, int version_id)
>> +{
>> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
>> +
>> +    if (sxc->post_load) {
>> +        return sxc->post_load(xive, version_id);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>  static const VMStateDescription vmstate_spapr_xive_base = {
>>      .name = TYPE_SPAPR_XIVE,
>>      .version_id = 1,
>>      .minimum_version_id = 1,
>> +    .pre_save = vmstate_spapr_xive_pre_save,
>> +    .post_load = NULL, /* handled at the machine level */
>> +    .priority = MIG_PRI_XIVE_IC,
>>      .fields = (VMStateField[]) {
>>          VMSTATE_UINT32_EQUAL(nr_irqs, sPAPRXive, NULL),
>>          VMSTATE_STRUCT_VARRAY_POINTER_UINT32(eat, sPAPRXive, nr_irqs,
>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>> index 767f90826e43..176083c37d61 100644
>> --- a/hw/intc/spapr_xive_kvm.c
>> +++ b/hw/intc/spapr_xive_kvm.c
>> @@ -58,6 +58,58 @@ static void kvm_cpu_enable(CPUState *cs)
>>  /*
>>   * XIVE Thread Interrupt Management context (KVM)
>>   */
>> +static void xive_tctx_kvm_set_state(XiveTCTX *tctx, Error **errp)
>> +{
>> +    uint64_t state[4];
>> +    int ret;
>> +
>> +    /* word0 and word1 of the OS ring. */
>> +    state[0] = *((uint64_t *) &tctx->regs[TM_QW1_OS]);
>> +
>> +    /* VP identifier. Only for KVM pr_debug() */
>> +    state[1] = *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]);
>> +
>> +    ret = kvm_set_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
>> +    if (ret != 0) {
>> +        error_setg_errno(errp, errno, "Could not restore KVM XIVE CPU %ld state",
>> +                         kvm_arch_vcpu_id(tctx->cs));
>> +    }
>> +}
>> +
>> +static void xive_tctx_kvm_get_state(XiveTCTX *tctx, Error **errp)
>> +{
>> +    uint64_t state[4] = { 0 };
>> +    int ret;
>> +
>> +    ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
>> +    if (ret != 0) {
>> +        error_setg_errno(errp, errno, "Could not capture KVM XIVE CPU %ld state",
>> +                         kvm_arch_vcpu_id(tctx->cs));
>> +        return;
>> +    }
>> +
>> +    /* word0 and word1 of the OS ring. */
>> +    *((uint64_t *) &tctx->regs[TM_QW1_OS]) = state[0];
>> +
>> +    /*
>> +     * KVM also returns word2 containing the VP CAM line value, which
>> +     * is useful for printing out the VP identifier in the QEMU
>> +     * monitor. No need to restore it.
>> +     */
>> +    *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]) = state[1];
>> +}
>> +
>> +static void xive_tctx_kvm_do_synchronize_state(CPUState *cpu,
>> +                                              run_on_cpu_data arg)
>> +{
>> +    xive_tctx_kvm_get_state(arg.host_ptr, &error_fatal);
>> +}
>> +
>> +static void xive_tctx_kvm_synchronize_state(XiveTCTX *tctx)
>> +{
>> +    run_on_cpu(tctx->cs, xive_tctx_kvm_do_synchronize_state,
>> +               RUN_ON_CPU_HOST_PTR(tctx));
>> +}
>>  
>>  static void xive_tctx_kvm_init(XiveTCTX *tctx, Error **errp)
>>  {
>> @@ -112,6 +164,8 @@ static void xive_tctx_kvm_class_init(ObjectClass *klass, void *data)
>>  
>>      device_class_set_parent_realize(dc, xive_tctx_kvm_realize,
>>                                      &xtc->parent_realize);
>> +
>> +    xtc->synchronize_state = xive_tctx_kvm_synchronize_state;
>>  }
>>  
>>  static const TypeInfo xive_tctx_kvm_info = {
>> @@ -166,6 +220,34 @@ static void xive_source_kvm_reset(DeviceState *dev)
>>      xive_source_kvm_init(xsrc, &error_fatal);
>>  }
>>  
>> +/*
>> + * This is used to perform the magic loads on the ESB pages, described
>> + * in xive.h.
>> + */
>> +static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
>> +{
>> +    unsigned long addr = (unsigned long) xsrc->esb_mmap +
>> +        xive_source_esb_mgmt(xsrc, srcno) + offset;
>> +
>> +    /* Prevent the compiler from optimizing away the load */
>> +    volatile uint64_t value = *((uint64_t *) addr);
>> +
>> +    return be64_to_cpu(value) & 0x3;
>> +}
>> +
>> +static void xive_source_kvm_get_state(XiveSource *xsrc)
>> +{
>> +    int i;
>> +
>> +    for (i = 0; i < xsrc->nr_irqs; i++) {
>> +        /* Perform a load without side effect to retrieve the PQ bits */
>> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
>> +
>> +        /* and save PQ locally */
>> +        xive_source_esb_set(xsrc, i, pq);
>> +    }
>> +}
>> +
>>  static void xive_source_kvm_set_irq(void *opaque, int srcno, int val)
>>  {
>>      XiveSource *xsrc = opaque;
>> @@ -295,6 +377,414 @@ static const TypeInfo xive_source_kvm_info = {
>>  /*
>>   * sPAPR XIVE Router (KVM)
>>   */
>> +static int spapr_xive_kvm_set_eq_state(sPAPRXive *xive, CPUState *cs,
>> +                                       Error **errp)
>> +{
>> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
>> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
>> +    int ret;
>> +    int i;
>> +
>> +    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
>> +        Error *local_err = NULL;
>> +        XiveEND end;
>> +        uint8_t end_blk;
>> +        uint32_t end_idx;
>> +        struct kvm_ppc_xive_eq kvm_eq = { 0 };
>> +        uint64_t kvm_eq_idx;
>> +
>> +        if (!spapr_xive_priority_is_valid(i)) {
>> +            continue;
>> +        }
>> +
>> +        spapr_xive_cpu_to_end(xive, POWERPC_CPU(cs), i, &end_blk, &end_idx);
>> +
>> +        ret = xive_router_get_end(xrtr, end_blk, end_idx, &end);
>> +        if (ret) {
>> +            error_setg(errp, "XIVE: No END for CPU %ld priority %d",
>> +                       vcpu_id, i);
>> +            return ret;
>> +        }
>> +
>> +        if (!(end.w0 & END_W0_VALID)) {
>> +            continue;
>> +        }
>> +
>> +        /* Build the KVM state from the local END structure */
>> +        kvm_eq.flags   = KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY;
>> +        kvm_eq.qsize   = GETFIELD(END_W0_QSIZE, end.w0) + 12;
>> +        kvm_eq.qpage   = (((uint64_t)(end.w2 & 0x0fffffff)) << 32) | end.w3;
>> +        kvm_eq.qtoggle = GETFIELD(END_W1_GENERATION, end.w1);
>> +        kvm_eq.qindex  = GETFIELD(END_W1_PAGE_OFF, end.w1);
>> +
>> +        /* Encode the tuple (server, prio) as a KVM EQ index */
>> +        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
>> +            KVM_XIVE_EQ_PRIORITY_MASK;
>> +        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
>> +            KVM_XIVE_EQ_SERVER_MASK;
>> +
>> +        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
>> +                                &kvm_eq, true, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int spapr_xive_kvm_get_eq_state(sPAPRXive *xive, CPUState *cs,
>> +                                       Error **errp)
>> +{
>> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
>> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
>> +    int ret;
>> +    int i;
>> +
>> +    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
>> +        Error *local_err = NULL;
>> +        struct kvm_ppc_xive_eq kvm_eq = { 0 };
>> +        uint64_t kvm_eq_idx;
>> +        XiveEND end = { 0 };
>> +        uint8_t end_blk, nvt_blk;
>> +        uint32_t end_idx, nvt_idx;
>> +
>> +        /* Skip priorities reserved for the hypervisor */
>> +        if (!spapr_xive_priority_is_valid(i)) {
>> +            continue;
>> +        }
>> +
>> +        /* Encode the tuple (server, prio) as a KVM EQ index */
>> +        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
>> +            KVM_XIVE_EQ_PRIORITY_MASK;
>> +        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
>> +            KVM_XIVE_EQ_SERVER_MASK;
>> +
>> +        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
>> +                                &kvm_eq, false, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return ret;
>> +        }
>> +
>> +        if (!(kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED)) {
>> +            continue;
>> +        }
>> +
>> +        /* Update the local END structure with the KVM input */
>> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED) {
>> +                end.w0 |= END_W0_VALID | END_W0_ENQUEUE;
>> +        }
>> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY) {
>> +                end.w0 |= END_W0_UCOND_NOTIFY;
>> +        }
>> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ESCALATE) {
>> +                end.w0 |= END_W0_ESCALATE_CTL;
>> +        }
>> +        end.w0 |= SETFIELD(END_W0_QSIZE, 0ul, kvm_eq.qsize - 12);
>> +
>> +        end.w1 = SETFIELD(END_W1_GENERATION, 0ul, kvm_eq.qtoggle) |
>> +            SETFIELD(END_W1_PAGE_OFF, 0ul, kvm_eq.qindex);
>> +        end.w2 = (kvm_eq.qpage >> 32) & 0x0fffffff;
>> +        end.w3 = kvm_eq.qpage & 0xffffffff;
>> +        end.w4 = 0;
>> +        end.w5 = 0;
>> +
>> +        ret = spapr_xive_cpu_to_nvt(xive, POWERPC_CPU(cs), &nvt_blk, &nvt_idx);
>> +        if (ret) {
>> +            error_setg(errp, "XIVE: No NVT for CPU %ld", vcpu_id);
>> +            return ret;
>> +        }
>> +
>> +        end.w6 = SETFIELD(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
>> +            SETFIELD(END_W6_NVT_INDEX, 0ul, nvt_idx);
>> +        end.w7 = SETFIELD(END_W7_F0_PRIORITY, 0ul, i);
>> +
>> +        spapr_xive_cpu_to_end(xive, POWERPC_CPU(cs), i, &end_blk, &end_idx);
>> +
>> +        ret = xive_router_set_end(xrtr, end_blk, end_idx, &end);
>> +        if (ret) {
>> +            error_setg(errp, "XIVE: No END for CPU %ld priority %d",
>> +                       vcpu_id, i);
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static void spapr_xive_kvm_set_eas_state(sPAPRXive *xive, Error **errp)
>> +{
>> +    XiveSource *xsrc = &xive->source;
>> +    int i;
>> +
>> +    for (i = 0; i < xsrc->nr_irqs; i++) {
>> +        XiveEAS *eas = &xive->eat[i];
>> +        uint32_t end_idx;
>> +        uint32_t end_blk;
>> +        uint32_t eisn;
>> +        uint8_t priority;
>> +        uint32_t server;
>> +        uint64_t kvm_eas;
>> +        Error *local_err = NULL;
>> +
>> +        /* No need to set MASKED EAS, this is the default state after reset */
>> +        if (!(eas->w & EAS_VALID) || eas->w & EAS_MASKED) {
>> +            continue;
>> +        }
>> +
>> +        end_idx = GETFIELD(EAS_END_INDEX, eas->w);
>> +        end_blk = GETFIELD(EAS_END_BLOCK, eas->w);
>> +        eisn = GETFIELD(EAS_END_DATA, eas->w);
>> +
>> +        spapr_xive_end_to_target(xive, end_blk, end_idx, &server, &priority);
>> +
>> +        kvm_eas = priority << KVM_XIVE_EAS_PRIORITY_SHIFT &
>> +            KVM_XIVE_EAS_PRIORITY_MASK;
>> +        kvm_eas |= server << KVM_XIVE_EAS_SERVER_SHIFT &
>> +            KVM_XIVE_EAS_SERVER_MASK;
>> +        kvm_eas |= ((uint64_t)eisn << KVM_XIVE_EAS_EISN_SHIFT) &
>> +            KVM_XIVE_EAS_EISN_MASK;
>> +
>> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, true,
>> +                          &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>> +    }
>> +}
>> +
>> +static void spapr_xive_kvm_get_eas_state(sPAPRXive *xive, Error **errp)
>> +{
>> +    XiveSource *xsrc = &xive->source;
>> +    int i;
>> +
>> +    for (i = 0; i < xsrc->nr_irqs; i++) {
>> +        XiveEAS *eas = &xive->eat[i];
>> +        XiveEAS new_eas;
>> +        uint64_t kvm_eas;
>> +        uint8_t priority;
>> +        uint32_t server;
>> +        uint32_t end_idx;
>> +        uint8_t end_blk;
>> +        uint32_t eisn;
>> +        Error *local_err = NULL;
>> +
>> +        if (!(eas->w & EAS_VALID)) {
>> +            continue;
>> +        }
>> +
>> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, false,
>> +                          &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>> +
>> +        priority = (kvm_eas & KVM_XIVE_EAS_PRIORITY_MASK) >>
>> +            KVM_XIVE_EAS_PRIORITY_SHIFT;
>> +        server = (kvm_eas & KVM_XIVE_EAS_SERVER_MASK) >>
>> +            KVM_XIVE_EAS_SERVER_SHIFT;
>> +        eisn = (kvm_eas & KVM_XIVE_EAS_EISN_MASK) >> KVM_XIVE_EAS_EISN_SHIFT;
>> +
>> +        if (spapr_xive_target_to_end(xive, server, priority, &end_blk,
>> +                                     &end_idx)) {
>> +            error_setg(errp, "XIVE: invalid tuple CPU %d priority %d", server,
>> +                       priority);
>> +            return;
>> +        }
>> +
>> +        new_eas.w = EAS_VALID;
>> +        if (kvm_eas & KVM_XIVE_EAS_MASK_MASK) {
>> +            new_eas.w |= EAS_MASKED;
>> +        }
>> +
>> +        new_eas.w = SETFIELD(EAS_END_INDEX, new_eas.w, end_idx);
>> +        new_eas.w = SETFIELD(EAS_END_BLOCK, new_eas.w, end_blk);
>> +        new_eas.w = SETFIELD(EAS_END_DATA, new_eas.w, eisn);
>> +
>> +        *eas = new_eas;
>> +    }
>> +}
>> +
>> +static void spapr_xive_kvm_sync_all(sPAPRXive *xive, Error **errp)
>> +{
>> +    XiveSource *xsrc = &xive->source;
>> +    Error *local_err = NULL;
>> +    int i;
>> +
>> +    /* Sync the KVM source. This reaches the XIVE HW through OPAL */
>> +    for (i = 0; i < xsrc->nr_irqs; i++) {
>> +        XiveEAS *eas = &xive->eat[i];
>> +
>> +        if (!(eas->w & EAS_VALID)) {
>> +            continue;
>> +        }
>> +
>> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SYNC, i, NULL, true,
>> +                          &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>> +    }
>> +}
>> +
>> +/*
>> + * The sPAPRXive KVM model migration priority is higher to make sure
> 
> Higher than what?

Than the XiveTCTX and XiveSource models.

>> + * its 'pre_save' method runs before all the other XIVE models. It
> 
> If the other XIVE components are children of sPAPRXive (which I think
> they are or could be), then I believe the parent object's pre_save
> will automatically be called first.

OK. The XiveTCTX objects are not children of sPAPRXive, but that might
not be a problem anymore with the VM change state handler.

Thanks

C.

>> + * orchestrates the capture sequence of the XIVE states in the
>> + * following order:
>> + *
>> + *   1. mask all the sources by setting PQ=01, which returns the
>> + *      previous value, and save it.
>> + *   2. sync the sources in KVM to stabilize all the queues
>> + *      sync the ENDs to make sure END -> VP is fully completed
>> + *   3. dump the EAS table
>> + *   4. dump the END table
>> + *   5. dump the thread context (IPB)
>> + *
>> + *  Rollback to restore the current configuration of the sources
> 
> 
> 
>> + */
>> +static int spapr_xive_kvm_pre_save(sPAPRXive *xive)
>> +{
>> +    XiveSource *xsrc = &xive->source;
>> +    Error *local_err = NULL;
>> +    CPUState *cs;
>> +    int i;
>> +    int ret = 0;
>> +
>> +    /* Quiesce the sources, to stop the flow of event notifications */
>> +    for (i = 0; i < xsrc->nr_irqs; i++) {
>> +        /*
>> +         * Mask and save the ESB PQs locally in the XiveSource object.
>> +         */
>> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_01);
>> +        xive_source_esb_set(xsrc, i, pq);
>> +    }
>> +
>> +    /* Sync the sources in KVM */
>> +    spapr_xive_kvm_sync_all(xive, &local_err);
>> +    if (local_err) {
>> +        error_report_err(local_err);
>> +        goto out;
>> +    }
>> +
>> +    /* Grab the EAT (could be done earlier ?) */
>> +    spapr_xive_kvm_get_eas_state(xive, &local_err);
>> +    if (local_err) {
>> +        error_report_err(local_err);
>> +        goto out;
>> +    }
>> +
>> +    /*
>> +     * Grab the ENDs. The EQ index and the toggle bit are what we want
>> +     * to capture
>> +     */
>> +    CPU_FOREACH(cs) {
>> +        spapr_xive_kvm_get_eq_state(xive, cs, &local_err);
>> +        if (local_err) {
>> +            error_report_err(local_err);
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    /* Capture the thread interrupt contexts */
>> +    CPU_FOREACH(cs) {
>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +
>> +        /* TODO: check whether this needs to run under run_on_cpu() */
>> +        xive_tctx_kvm_get_state(XIVE_TCTX_KVM(cpu->intc), &local_err);
>> +        if (local_err) {
>> +            error_report_err(local_err);
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    /* All done. */
>> +
>> +out:
>> +    /* Restore the sources to their initial state */
>> +    for (i = 0; i < xsrc->nr_irqs; i++) {
>> +        uint8_t pq = xive_source_esb_get(xsrc, i);
>> +        if (xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8)) != 0x1) {
>> +            error_report("XIVE: IRQ %d has an invalid state", i);
>> +        }
>> +    }
>> +
>> +    /*
>> +     * The XiveSource and the XiveTCTX states will be collected by
>> +     * their respective vmstate handlers afterwards.
>> +     */
>> +    return ret;
>> +}
>> +
>> +/*
>> + * The sPAPRXive 'post_load' method is called by the sPAPR machine,
>> + * after all XIVE device states have been transferred and loaded.
>> + *
>> + * All should be in place when the VCPUs resume execution.
>> + */
>> +static int spapr_xive_kvm_post_load(sPAPRXive *xive, int version_id)
>> +{
>> +    XiveSource *xsrc = &xive->source;
>> +    Error *local_err = NULL;
>> +    CPUState *cs;
>> +    int i;
>> +
>> +    /* Set the ENDs first. The targeting depends on it. */
>> +    CPU_FOREACH(cs) {
>> +        spapr_xive_kvm_set_eq_state(xive, cs, &local_err);
>> +        if (local_err) {
>> +            error_report_err(local_err);
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    /* Restore the targeting, if any */
>> +    spapr_xive_kvm_set_eas_state(xive, &local_err);
>> +    if (local_err) {
>> +        error_report_err(local_err);
>> +        return -1;
>> +    }
>> +
>> +    /* Restore the thread interrupt contexts */
>> +    CPU_FOREACH(cs) {
>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +
>> +        xive_tctx_kvm_set_state(XIVE_TCTX_KVM(cpu->intc), &local_err);
>> +        if (local_err) {
>> +            error_report_err(local_err);
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    /*
>> +     * Get the saved state from the XiveSource model and restore the
>> +     * PQ bits
>> +     */
>> +    for (i = 0; i < xsrc->nr_irqs; i++) {
>> +        uint8_t pq = xive_source_esb_get(xsrc, i);
>> +        xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
>> +    }
>> +    return 0;
>> +}
>> +
>> +static void spapr_xive_kvm_synchronize_state(sPAPRXive *xive)
>> +{
>> +    XiveSource *xsrc = &xive->source;
>> +    CPUState *cs;
>> +
>> +    xive_source_kvm_get_state(xsrc);
>> +
>> +    spapr_xive_kvm_get_eas_state(xive, &error_fatal);
>> +
>> +    CPU_FOREACH(cs) {
>> +        spapr_xive_kvm_get_eq_state(xive, cs, &error_fatal);
>> +    }
>> +}
>>  
>>  static void spapr_xive_kvm_instance_init(Object *obj)
>>  {
>> @@ -409,6 +899,10 @@ static void spapr_xive_kvm_class_init(ObjectClass *klass, void *data)
>>  
>>      dc->desc = "sPAPR XIVE KVM Interrupt Controller";
>>      dc->unrealize = spapr_xive_kvm_unrealize;
>> +
>> +    sxc->synchronize_state = spapr_xive_kvm_synchronize_state;
>> +    sxc->pre_save = spapr_xive_kvm_pre_save;
>> +    sxc->post_load = spapr_xive_kvm_post_load;
>>  }
>>  
>>  static const TypeInfo spapr_xive_kvm_info = {
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index 9bb37553c9ec..c9aedecc8216 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -438,9 +438,14 @@ static const struct {
>>  
>>  void xive_tctx_pic_print_info(XiveTCTX *tctx, Monitor *mon)
>>  {
>> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(tctx);
>>      int cpu_index = tctx->cs ? tctx->cs->cpu_index : -1;
>>      int i;
>>  
>> +    if (xtc->synchronize_state) {
>> +        xtc->synchronize_state(tctx);
>> +    }
>> +
>>      monitor_printf(mon, "CPU[%04x]:   QW   NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
>>                     "  W2\n", cpu_index);
>>  
>> @@ -552,10 +557,23 @@ static void xive_tctx_base_unrealize(DeviceState *dev, Error **errp)
>>      qemu_unregister_reset(xive_tctx_base_reset, dev);
>>  }
>>  
>> +static int vmstate_xive_tctx_post_load(void *opaque, int version_id)
>> +{
>> +    XiveTCTX *tctx = XIVE_TCTX_BASE(opaque);
>> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(tctx);
>> +
>> +    if (xtc->post_load) {
>> +        return xtc->post_load(tctx, version_id);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>  static const VMStateDescription vmstate_xive_tctx_base = {
>>      .name = TYPE_XIVE_TCTX,
>>      .version_id = 1,
>>      .minimum_version_id = 1,
>> +    .post_load = vmstate_xive_tctx_post_load,
>>      .fields = (VMStateField[]) {
>>          VMSTATE_BUFFER(regs, XiveTCTX),
>>          VMSTATE_END_OF_LIST()
>> @@ -581,9 +599,37 @@ static const TypeInfo xive_tctx_base_info = {
>>      .class_size    = sizeof(XiveTCTXClass),
>>  };
>>  
>> +static int xive_tctx_post_load(XiveTCTX *tctx, int version_id)
>> +{
>> +    XiveRouterClass *xrc = XIVE_ROUTER_GET_CLASS(tctx->xrtr);
>> +
>> +    /*
>> +     * When we collect the states from KVM XIVE irqchip, we set word2
>> +     * of the thread context to print out the OS CAM line under the
>> +     * QEMU monitor.
>> +     *
>> +     * This breaks migration on a guest using TCG or not using a KVM
>> +     * irqchip. Fix with an extra reset of the thread contexts.
>> +     */
>> +    if (xrc->reset_tctx) {
>> +        xrc->reset_tctx(tctx->xrtr, tctx);
>> +    }
>> +    return 0;
>> +}
>> +
>> +static void xive_tctx_class_init(ObjectClass *klass, void *data)
>> +{
>> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_CLASS(klass);
>> +
>> +    xtc->post_load = xive_tctx_post_load;
>> +}
>> +
>>  static const TypeInfo xive_tctx_info = {
>>      .name          = TYPE_XIVE_TCTX,
>>      .parent        = TYPE_XIVE_TCTX_BASE,
>> +    .instance_size = sizeof(XiveTCTX),
>> +    .class_init    = xive_tctx_class_init,
>> +    .class_size    = sizeof(XiveTCTXClass),
>>  };
>>  
>>  Object *xive_tctx_create(Object *cpu, const char *type, XiveRouter *xrtr,
>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>> index 92ef53743b64..6fac6ca70595 100644
>> --- a/hw/ppc/spapr_irq.c
>> +++ b/hw/ppc/spapr_irq.c
>> @@ -359,7 +359,7 @@ static Object *spapr_irq_cpu_intc_create_xive(sPAPRMachineState *spapr,
>>  
>>  static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>>  {
>> -    return 0;
>> +    return spapr_xive_post_load(spapr->xive, version_id);
>>  }
>>  
>>  /*
>
David Gibson Nov. 30, 2018, 1:24 a.m. UTC | #3
On Thu, Nov 29, 2018 at 05:19:51PM +0100, Cédric Le Goater wrote:
> David,
> 
> Could you tell me what you think about the KVM interfaces for migration,
> the ones capturing and restoring the states?
> 
> On 11/29/18 4:43 AM, David Gibson wrote:
> > On Fri, Nov 16, 2018 at 11:57:16AM +0100, Cédric Le Goater wrote:
> >> This extends the KVM XIVE models to handle the state synchronization
> >> with KVM, for the monitor usage and for the migration.
> >>
> >> The migration priority of the XIVE interrupt controller sPAPRXive is
> >> raised for KVM. It operates first and orchestrates the capture
> >> sequence of the states of all the XIVE models. The XIVE sources are
> >> masked to quiesce the interrupt flow and a XIVE sync is performed to
> >> stabilize the OS Event Queues. The state of the ENDs is then captured
> >> by the XIVE interrupt controller model, sPAPRXive, and the state of
> >> the thread contexts by the thread interrupt presenter model,
> >> XiveTCTX. When done, a rollback is performed to restore the sources to
> >> their initial state.
> >>
> >> The sPAPRXive 'post_load' method is called from the sPAPR machine,
> >> after all XIVE device states have been transfered and loaded. First,
> >> sPAPRXive restores the XIVE routing tables: ENDT and EAT. Next, are
> >> restored the thread interrupt context registers and the source PQ
> >> bits.
> >>
> >> The get/set operations rely on their KVM counterpart in the host
> >> kernel which acts as a proxy for OPAL, the host firmware.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>
> >>  WIP:
> >>  
> >>     If migration occurs when a VCPU is 'ceded', some of the OS event
> >>     notification queues are mapped to the ZERO_PAGE on the receiving
> >>     side. As if the HW had triggered a page fault before the dirty
> >>     page was transferred from the source or as if we were not using
> >>     the correct page table.
> 
> 
> v6 adds a VM change state handler to make XIVE reach a quiescent state. 
> The sequence is a little more sophisticated and an extra KVM call 
> marks the EQ page dirty.

Ok.

> 
> >>
> >>  include/hw/ppc/spapr_xive.h     |   5 +
> >>  include/hw/ppc/xive.h           |   3 +
> >>  include/migration/vmstate.h     |   1 +
> >>  linux-headers/asm-powerpc/kvm.h |  33 +++
> >>  hw/intc/spapr_xive.c            |  32 +++
> >>  hw/intc/spapr_xive_kvm.c        | 494 ++++++++++++++++++++++++++++++++
> >>  hw/intc/xive.c                  |  46 +++
> >>  hw/ppc/spapr_irq.c              |   2 +-
> >>  8 files changed, 615 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> >> index 9c817bb7ae74..d2517c040958 100644
> >> --- a/include/hw/ppc/spapr_xive.h
> >> +++ b/include/hw/ppc/spapr_xive.h
> >> @@ -55,12 +55,17 @@ typedef struct sPAPRXiveClass {
> >>      XiveRouterClass parent_class;
> >>  
> >>      DeviceRealize   parent_realize;
> >> +
> >> +    void (*synchronize_state)(sPAPRXive *xive);
> >> +    int  (*pre_save)(sPAPRXive *xsrc);
> >> +    int  (*post_load)(sPAPRXive *xsrc, int version_id);
> > 
> > This should go away if the KVM and non-KVM versions are in the same
> > object.
> 
> yes.
> 
> >>  } sPAPRXiveClass;
> >>  
> >>  bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
> >>  bool spapr_xive_irq_disable(sPAPRXive *xive, uint32_t lisn);
> >>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
> >>  qemu_irq spapr_xive_qirq(sPAPRXive *xive, uint32_t lisn);
> >> +int spapr_xive_post_load(sPAPRXive *xive, int version_id);
> >>  
> >>  /*
> >>   * sPAPR NVT and END indexing helpers
> >> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> >> index 7aaf5a182cb3..c8201462d698 100644
> >> --- a/include/hw/ppc/xive.h
> >> +++ b/include/hw/ppc/xive.h
> >> @@ -309,6 +309,9 @@ typedef struct XiveTCTXClass {
> >>      DeviceClass       parent_class;
> >>  
> >>      DeviceRealize     parent_realize;
> >> +
> >> +    void (*synchronize_state)(XiveTCTX *tctx);
> >> +    int  (*post_load)(XiveTCTX *tctx, int version_id);
> > 
> > .. and this too.
> > 
> >>  } XiveTCTXClass;
> >>  
> >>  /*
> >> diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
> >> index 2b501d04669a..ee2e836cc1c1 100644
> >> --- a/include/migration/vmstate.h
> >> +++ b/include/migration/vmstate.h
> >> @@ -154,6 +154,7 @@ typedef enum {
> >>      MIG_PRI_PCI_BUS,            /* Must happen before IOMMU */
> >>      MIG_PRI_GICV3_ITS,          /* Must happen before PCI devices */
> >>      MIG_PRI_GICV3,              /* Must happen before the ITS */
> >> +    MIG_PRI_XIVE_IC,            /* Must happen before all XIVE models */
> > 
> > Ugh.. explicit priority / order levels are a pretty bad code smell.
> > Usually migration ordering can be handled by getting the object
> > heirarchy right.  What exactly is the problem you're addessing with
> > this?
> 
> I wanted sPAPRXive to capture the state on behalf of all XIVE models. 
> But with the addition of the VMState change handler I think I can 
> remove this priority. I will check. 
> 
> > 
> >>      MIG_PRI_MAX,
> >>  } MigrationPriority;
> >>  
> >> diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h
> >> index f34c971491dd..9d55ade23634 100644
> >> --- a/linux-headers/asm-powerpc/kvm.h
> >> +++ b/linux-headers/asm-powerpc/kvm.h
> > 
> > Again, linux-headers need to be split out.
> > 
> >> @@ -480,6 +480,8 @@ struct kvm_ppc_cpu_char {
> >>  #define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
> >>  #define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
> >>  
> >> +#define KVM_REG_PPC_NVT_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d)
> >> +
> >>  /* Device control API: PPC-specific devices */
> >>  #define KVM_DEV_MPIC_GRP_MISC		1
> >>  #define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
> >> @@ -681,10 +683,41 @@ struct kvm_ppc_cpu_char {
> >>  #define   KVM_DEV_XIVE_GET_TIMA_FD	2
> >>  #define   KVM_DEV_XIVE_VC_BASE		3
> >>  #define KVM_DEV_XIVE_GRP_SOURCES	2	/* 64-bit source attributes */
> >> +#define KVM_DEV_XIVE_GRP_SYNC		3	/* 64-bit source attributes */
> >> +#define KVM_DEV_XIVE_GRP_EAS		4	/* 64-bit eas attributes */
> >> +#define KVM_DEV_XIVE_GRP_EQ		5	/* 64-bit eq attributes */
> >>  
> >>  /* Layout of 64-bit XIVE source attribute values */
> >>  #define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
> >>  #define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
> >>  
> >> +/* Layout of 64-bit eas attribute values */
> >> +#define KVM_XIVE_EAS_PRIORITY_SHIFT	0
> >> +#define KVM_XIVE_EAS_PRIORITY_MASK	0x7
> >> +#define KVM_XIVE_EAS_SERVER_SHIFT	3
> >> +#define KVM_XIVE_EAS_SERVER_MASK	0xfffffff8ULL
> >> +#define KVM_XIVE_EAS_MASK_SHIFT		32
> >> +#define KVM_XIVE_EAS_MASK_MASK		0x100000000ULL
> >> +#define KVM_XIVE_EAS_EISN_SHIFT		33
> >> +#define KVM_XIVE_EAS_EISN_MASK		0xfffffffe00000000ULL
> >> +
> >> +/* Layout of 64-bit eq attribute */
> >> +#define KVM_XIVE_EQ_PRIORITY_SHIFT	0
> >> +#define KVM_XIVE_EQ_PRIORITY_MASK	0x7
> >> +#define KVM_XIVE_EQ_SERVER_SHIFT	3
> >> +#define KVM_XIVE_EQ_SERVER_MASK		0xfffffff8ULL
> >> +
> >> +/* Layout of 64-bit eq attribute values */
> >> +struct kvm_ppc_xive_eq {
> >> +	__u32 flags;
> >> +	__u32 qsize;
> >> +	__u64 qpage;
> >> +	__u32 qtoggle;
> >> +	__u32 qindex;
> >> +};
> >> +
> >> +#define KVM_XIVE_EQ_FLAG_ENABLED	0x00000001
> >> +#define KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY	0x00000002
> >> +#define KVM_XIVE_EQ_FLAG_ESCALATE	0x00000004
> >>  
> >>  #endif /* __LINUX_KVM_POWERPC_H */
> >> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >> index ec85f7e4f88d..c5c0e063dc33 100644
> >> --- a/hw/intc/spapr_xive.c
> >> +++ b/hw/intc/spapr_xive.c
> >> @@ -27,9 +27,14 @@
> >>  
> >>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
> >>  {
> >> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
> >>      int i;
> >>      uint32_t offset = 0;
> >>  
> >> +    if (sxc->synchronize_state) {
> >> +        sxc->synchronize_state(xive);
> >> +    }
> >> +
> >>      monitor_printf(mon, "XIVE Source %08x .. %08x\n", offset,
> >>                     offset + xive->source.nr_irqs - 1);
> >>      xive_source_pic_print_info(&xive->source, offset, mon);
> >> @@ -354,10 +359,37 @@ static const VMStateDescription vmstate_spapr_xive_eas = {
> >>      },
> >>  };
> >>  
> >> +static int vmstate_spapr_xive_pre_save(void *opaque)
> >> +{
> >> +    sPAPRXive *xive = SPAPR_XIVE_BASE(opaque);
> >> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
> >> +
> >> +    if (sxc->pre_save) {
> >> +        return sxc->pre_save(xive);
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +/* handled at the machine level */
> >> +int spapr_xive_post_load(sPAPRXive *xive, int version_id)
> >> +{
> >> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
> >> +
> >> +    if (sxc->post_load) {
> >> +        return sxc->post_load(xive, version_id);
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >>  static const VMStateDescription vmstate_spapr_xive_base = {
> >>      .name = TYPE_SPAPR_XIVE,
> >>      .version_id = 1,
> >>      .minimum_version_id = 1,
> >> +    .pre_save = vmstate_spapr_xive_pre_save,
> >> +    .post_load = NULL, /* handled at the machine level */
> >> +    .priority = MIG_PRI_XIVE_IC,
> >>      .fields = (VMStateField[]) {
> >>          VMSTATE_UINT32_EQUAL(nr_irqs, sPAPRXive, NULL),
> >>          VMSTATE_STRUCT_VARRAY_POINTER_UINT32(eat, sPAPRXive, nr_irqs,
> >> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> >> index 767f90826e43..176083c37d61 100644
> >> --- a/hw/intc/spapr_xive_kvm.c
> >> +++ b/hw/intc/spapr_xive_kvm.c
> >> @@ -58,6 +58,58 @@ static void kvm_cpu_enable(CPUState *cs)
> >>  /*
> >>   * XIVE Thread Interrupt Management context (KVM)
> >>   */
> >> +static void xive_tctx_kvm_set_state(XiveTCTX *tctx, Error **errp)
> >> +{
> >> +    uint64_t state[4];
> >> +    int ret;
> >> +
> >> +    /* word0 and word1 of the OS ring. */
> >> +    state[0] = *((uint64_t *) &tctx->regs[TM_QW1_OS]);
> >> +
> >> +    /* VP identifier. Only for KVM pr_debug() */
> >> +    state[1] = *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]);
> >> +
> >> +    ret = kvm_set_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
> >> +    if (ret != 0) {
> >> +        error_setg_errno(errp, errno, "Could restore KVM XIVE CPU %ld state",
> >> +                         kvm_arch_vcpu_id(tctx->cs));
> >> +    }
> >> +}
> >> +
> >> +static void xive_tctx_kvm_get_state(XiveTCTX *tctx, Error **errp)
> >> +{
> >> +    uint64_t state[4] = { 0 };
> >> +    int ret;
> >> +
> >> +    ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
> >> +    if (ret != 0) {
> >> +        error_setg_errno(errp, errno, "Could capture KVM XIVE CPU %ld state",
> >> +                         kvm_arch_vcpu_id(tctx->cs));
> >> +        return;
> >> +    }
> >> +
> >> +    /* word0 and word1 of the OS ring. */
> >> +    *((uint64_t *) &tctx->regs[TM_QW1_OS]) = state[0];
> >> +
> >> +    /*
> >> +     * KVM also returns word2 containing the VP CAM line value, which
> >> +     * is useful for printing out the VP identifier in the QEMU
> >> +     * monitor. No need to restore it.
> >> +     */
> >> +    *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]) = state[1];
> >> +}
> >> +
> >> +static void xive_tctx_kvm_do_synchronize_state(CPUState *cpu,
> >> +                                              run_on_cpu_data arg)
> >> +{
> >> +    xive_tctx_kvm_get_state(arg.host_ptr, &error_fatal);
> >> +}
> >> +
> >> +static void xive_tctx_kvm_synchronize_state(XiveTCTX *tctx)
> >> +{
> >> +    run_on_cpu(tctx->cs, xive_tctx_kvm_do_synchronize_state,
> >> +               RUN_ON_CPU_HOST_PTR(tctx));
> >> +}
> >>  
> >>  static void xive_tctx_kvm_init(XiveTCTX *tctx, Error **errp)
> >>  {
> >> @@ -112,6 +164,8 @@ static void xive_tctx_kvm_class_init(ObjectClass *klass, void *data)
> >>  
> >>      device_class_set_parent_realize(dc, xive_tctx_kvm_realize,
> >>                                      &xtc->parent_realize);
> >> +
> >> +    xtc->synchronize_state = xive_tctx_kvm_synchronize_state;
> >>  }
> >>  
> >>  static const TypeInfo xive_tctx_kvm_info = {
> >> @@ -166,6 +220,34 @@ static void xive_source_kvm_reset(DeviceState *dev)
> >>      xive_source_kvm_init(xsrc, &error_fatal);
> >>  }
> >>  
> >> +/*
> >> + * This is used to perform the magic loads on the ESB pages, described
> >> + * in xive.h.
> >> + */
> >> +static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
> >> +{
> >> +    unsigned long addr = (unsigned long) xsrc->esb_mmap +
> >> +        xive_source_esb_mgmt(xsrc, srcno) + offset;
> >> +
> >> +    /* Prevent the compiler from optimizing away the load */
> >> +    volatile uint64_t value = *((uint64_t *) addr);
> >> +
> >> +    return be64_to_cpu(value) & 0x3;
> >> +}
> >> +
> >> +static void xive_source_kvm_get_state(XiveSource *xsrc)
> >> +{
> >> +    int i;
> >> +
> >> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> >> +        /* Perform a load without side effect to retrieve the PQ bits */
> >> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
> >> +
> >> +        /* and save PQ locally */
> >> +        xive_source_esb_set(xsrc, i, pq);
> >> +    }
> >> +}
> >> +
> >>  static void xive_source_kvm_set_irq(void *opaque, int srcno, int val)
> >>  {
> >>      XiveSource *xsrc = opaque;
> >> @@ -295,6 +377,414 @@ static const TypeInfo xive_source_kvm_info = {
> >>  /*
> >>   * sPAPR XIVE Router (KVM)
> >>   */
> >> +static int spapr_xive_kvm_set_eq_state(sPAPRXive *xive, CPUState *cs,
> >> +                                       Error **errp)
> >> +{
> >> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
> >> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> >> +    int ret;
> >> +    int i;
> >> +
> >> +    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
> >> +        Error *local_err = NULL;
> >> +        XiveEND end;
> >> +        uint8_t end_blk;
> >> +        uint32_t end_idx;
> >> +        struct kvm_ppc_xive_eq kvm_eq = { 0 };
> >> +        uint64_t kvm_eq_idx;
> >> +
> >> +        if (!spapr_xive_priority_is_valid(i)) {
> >> +            continue;
> >> +        }
> >> +
> >> +        spapr_xive_cpu_to_end(xive, POWERPC_CPU(cs), i, &end_blk, &end_idx);
> >> +
> >> +        ret = xive_router_get_end(xrtr, end_blk, end_idx, &end);
> >> +        if (ret) {
> >> +            error_setg(errp, "XIVE: No END for CPU %ld priority %d",
> >> +                       vcpu_id, i);
> >> +            return ret;
> >> +        }
> >> +
> >> +        if (!(end.w0 & END_W0_VALID)) {
> >> +            continue;
> >> +        }
> >> +
> >> +        /* Build the KVM state from the local END structure */
> >> +        kvm_eq.flags   = KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY;
> >> +        kvm_eq.qsize   = GETFIELD(END_W0_QSIZE, end.w0) + 12;
> >> +        kvm_eq.qpage   = (((uint64_t)(end.w2 & 0x0fffffff)) << 32) | end.w3;
> >> +        kvm_eq.qtoggle = GETFIELD(END_W1_GENERATION, end.w1);
> >> +        kvm_eq.qindex  = GETFIELD(END_W1_PAGE_OFF, end.w1);
> >> +
> >> +        /* Encode the tuple (server, prio) as a KVM EQ index */
> >> +        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
> >> +            KVM_XIVE_EQ_PRIORITY_MASK;
> >> +        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
> >> +            KVM_XIVE_EQ_SERVER_MASK;
> >> +
> >> +        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
> >> +                                &kvm_eq, true, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return ret;
> >> +        }
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static int spapr_xive_kvm_get_eq_state(sPAPRXive *xive, CPUState *cs,
> >> +                                       Error **errp)
> >> +{
> >> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
> >> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> >> +    int ret;
> >> +    int i;
> >> +
> >> +    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
> >> +        Error *local_err = NULL;
> >> +        struct kvm_ppc_xive_eq kvm_eq = { 0 };
> >> +        uint64_t kvm_eq_idx;
> >> +        XiveEND end = { 0 };
> >> +        uint8_t end_blk, nvt_blk;
> >> +        uint32_t end_idx, nvt_idx;
> >> +
> >> +        /* Skip priorities reserved for the hypervisor */
> >> +        if (!spapr_xive_priority_is_valid(i)) {
> >> +            continue;
> >> +        }
> >> +
> >> +        /* Encode the tuple (server, prio) as a KVM EQ index */
> >> +        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
> >> +            KVM_XIVE_EQ_PRIORITY_MASK;
> >> +        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
> >> +            KVM_XIVE_EQ_SERVER_MASK;
> >> +
> >> +        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
> >> +                                &kvm_eq, false, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return ret;
> >> +        }
> >> +
> >> +        if (!(kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED)) {
> >> +            continue;
> >> +        }
> >> +
> >> +        /* Update the local END structure with the KVM input */
> >> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED) {
> >> +                end.w0 |= END_W0_VALID | END_W0_ENQUEUE;
> >> +        }
> >> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY) {
> >> +                end.w0 |= END_W0_UCOND_NOTIFY;
> >> +        }
> >> +        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ESCALATE) {
> >> +                end.w0 |= END_W0_ESCALATE_CTL;
> >> +        }
> >> +        end.w0 |= SETFIELD(END_W0_QSIZE, 0ul, kvm_eq.qsize - 12);
> >> +
> >> +        end.w1 = SETFIELD(END_W1_GENERATION, 0ul, kvm_eq.qtoggle) |
> >> +            SETFIELD(END_W1_PAGE_OFF, 0ul, kvm_eq.qindex);
> >> +        end.w2 = (kvm_eq.qpage >> 32) & 0x0fffffff;
> >> +        end.w3 = kvm_eq.qpage & 0xffffffff;
> >> +        end.w4 = 0;
> >> +        end.w5 = 0;
> >> +
> >> +        ret = spapr_xive_cpu_to_nvt(xive, POWERPC_CPU(cs), &nvt_blk, &nvt_idx);
> >> +        if (ret) {
> >> +            error_setg(errp, "XIVE: No NVT for CPU %ld", vcpu_id);
> >> +            return ret;
> >> +        }
> >> +
> >> +        end.w6 = SETFIELD(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
> >> +            SETFIELD(END_W6_NVT_INDEX, 0ul, nvt_idx);
> >> +        end.w7 = SETFIELD(END_W7_F0_PRIORITY, 0ul, i);
> >> +
> >> +        spapr_xive_cpu_to_end(xive, POWERPC_CPU(cs), i, &end_blk, &end_idx);
> >> +
> >> +        ret = xive_router_set_end(xrtr, end_blk, end_idx, &end);
> >> +        if (ret) {
> >> +            error_setg(errp, "XIVE: No END for CPU %ld priority %d",
> >> +                       vcpu_id, i);
> >> +            return ret;
> >> +        }
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static void spapr_xive_kvm_set_eas_state(sPAPRXive *xive, Error **errp)
> >> +{
> >> +    XiveSource *xsrc = &xive->source;
> >> +    int i;
> >> +
> >> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> >> +        XiveEAS *eas = &xive->eat[i];
> >> +        uint32_t end_idx;
> >> +        uint32_t end_blk;
> >> +        uint32_t eisn;
> >> +        uint8_t priority;
> >> +        uint32_t server;
> >> +        uint64_t kvm_eas;
> >> +        Error *local_err = NULL;
> >> +
> >> +        /* No need to set MASKED EAS, this is the default state after reset */
> >> +        if (!(eas->w & EAS_VALID) || eas->w & EAS_MASKED) {
> >> +            continue;
> >> +        }
> >> +
> >> +        end_idx = GETFIELD(EAS_END_INDEX, eas->w);
> >> +        end_blk = GETFIELD(EAS_END_BLOCK, eas->w);
> >> +        eisn = GETFIELD(EAS_END_DATA, eas->w);
> >> +
> >> +        spapr_xive_end_to_target(xive, end_blk, end_idx, &server, &priority);
> >> +
> >> +        kvm_eas = priority << KVM_XIVE_EAS_PRIORITY_SHIFT &
> >> +            KVM_XIVE_EAS_PRIORITY_MASK;
> >> +        kvm_eas |= server << KVM_XIVE_EAS_SERVER_SHIFT &
> >> +            KVM_XIVE_EAS_SERVER_MASK;
> >> +        kvm_eas |= ((uint64_t)eisn << KVM_XIVE_EAS_EISN_SHIFT) &
> >> +            KVM_XIVE_EAS_EISN_MASK;
> >> +
> >> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, true,
> >> +                          &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }
> >> +    }
> >> +}
> >> +
> >> +static void spapr_xive_kvm_get_eas_state(sPAPRXive *xive, Error **errp)
> >> +{
> >> +    XiveSource *xsrc = &xive->source;
> >> +    int i;
> >> +
> >> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> >> +        XiveEAS *eas = &xive->eat[i];
> >> +        XiveEAS new_eas;
> >> +        uint64_t kvm_eas;
> >> +        uint8_t priority;
> >> +        uint32_t server;
> >> +        uint32_t end_idx;
> >> +        uint8_t end_blk;
> >> +        uint32_t eisn;
> >> +        Error *local_err = NULL;
> >> +
> >> +        if (!(eas->w & EAS_VALID)) {
> >> +            continue;
> >> +        }
> >> +
> >> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, false,
> >> +                          &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }
> >> +
> >> +        priority = (kvm_eas & KVM_XIVE_EAS_PRIORITY_MASK) >>
> >> +            KVM_XIVE_EAS_PRIORITY_SHIFT;
> >> +        server = (kvm_eas & KVM_XIVE_EAS_SERVER_MASK) >>
> >> +            KVM_XIVE_EAS_SERVER_SHIFT;
> >> +        eisn = (kvm_eas & KVM_XIVE_EAS_EISN_MASK) >> KVM_XIVE_EAS_EISN_SHIFT;
> >> +
> >> +        if (spapr_xive_target_to_end(xive, server, priority, &end_blk,
> >> +                                     &end_idx)) {
> >> +            error_setg(errp, "XIVE: invalid tuple CPU %d priority %d", server,
> >> +                       priority);
> >> +            return;
> >> +        }
> >> +
> >> +        new_eas.w = EAS_VALID;
> >> +        if (kvm_eas & KVM_XIVE_EAS_MASK_MASK) {
> >> +            new_eas.w |= EAS_MASKED;
> >> +        }
> >> +
> >> +        new_eas.w = SETFIELD(EAS_END_INDEX, new_eas.w, end_idx);
> >> +        new_eas.w = SETFIELD(EAS_END_BLOCK, new_eas.w, end_blk);
> >> +        new_eas.w = SETFIELD(EAS_END_DATA, new_eas.w, eisn);
> >> +
> >> +        *eas = new_eas;
> >> +    }
> >> +}
> >> +
> >> +static void spapr_xive_kvm_sync_all(sPAPRXive *xive, Error **errp)
> >> +{
> >> +    XiveSource *xsrc = &xive->source;
> >> +    Error *local_err = NULL;
> >> +    int i;
> >> +
> >> +    /* Sync the KVM source. This reaches the XIVE HW through OPAL */
> >> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> >> +        XiveEAS *eas = &xive->eat[i];
> >> +
> >> +        if (!(eas->w & EAS_VALID)) {
> >> +            continue;
> >> +        }
> >> +
> >> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SYNC, i, NULL, true,
> >> +                          &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }
> >> +    }
> >> +}
> >> +
> >> +/*
> >> + * The sPAPRXive KVM model migration priority is higher to make sure
> > 
> > Higher than what?
> 
> Than the XiveTCTX and XiveSource models.
> 
> >> + * its 'pre_save' method runs before all the other XIVE models. It
> > 
> > If the other XIVE components are children of sPAPRXive (which I think
> > they are or could be), then I believe the parent object's pre_save
> > will automatically be called first.
> 
> ok. XiveTCTX are not children of sPAPRXive but that might not be 
> a problem anymore with the VMState change handler.

Ah, right.  You might need the handler in the machine itself then - we
already have something like that for XICS, IIRC.

> 
> Thanks
> 
> C.
> 
> >> + * orchestrates the capture sequence of the XIVE states in the
> >> + * following order:
> >> + *
> >> + *   1. mask all the sources by setting PQ=01, which returns the
> >> + *      previous value and save it.
> >> + *   2. sync the sources in KVM to stabilize all the queues
> >> + *      sync the ENDs to make sure END -> VP is fully completed
> >> + *   3. dump the EAS table
> >> + *   4. dump the END table
> >> + *   5. dump the thread context (IPB)
> >> + *
> >> + *  Roll back to restore the current configuration of the sources
> > 
> > 
> > 
> >> + */
> >> +static int spapr_xive_kvm_pre_save(sPAPRXive *xive)
> >> +{
> >> +    XiveSource *xsrc = &xive->source;
> >> +    Error *local_err = NULL;
> >> +    CPUState *cs;
> >> +    int i;
> >> +    int ret = 0;
> >> +
> >> +    /* Quiesce the sources, to stop the flow of event notifications */
> >> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> >> +        /*
> >> +         * Mask and save the ESB PQs locally in the XiveSource object.
> >> +         */
> >> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_01);
> >> +        xive_source_esb_set(xsrc, i, pq);
> >> +    }
> >> +
> >> +    /* Sync the sources in KVM */
> >> +    spapr_xive_kvm_sync_all(xive, &local_err);
> >> +    if (local_err) {
> >> +        error_report_err(local_err);
> >> +        goto out;
> >> +    }
> >> +
> >> +    /* Grab the EAT (could be done earlier ?) */
> >> +    spapr_xive_kvm_get_eas_state(xive, &local_err);
> >> +    if (local_err) {
> >> +        error_report_err(local_err);
> >> +        goto out;
> >> +    }
> >> +
> >> +    /*
> >> +     * Grab the ENDs. The EQ index and the toggle bit are what we want
> >> +     * to capture
> >> +     */
> >> +    CPU_FOREACH(cs) {
> >> +        spapr_xive_kvm_get_eq_state(xive, cs, &local_err);
> >> +        if (local_err) {
> >> +            error_report_err(local_err);
> >> +            goto out;
> >> +        }
> >> +    }
> >> +
> >> +    /* Capture the thread interrupt contexts */
> >> +    CPU_FOREACH(cs) {
> >> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> +
> >> +        /* TODO: Check if we need to run this under run_on_cpu()? */
> >> +        xive_tctx_kvm_get_state(XIVE_TCTX_KVM(cpu->intc), &local_err);
> >> +        if (local_err) {
> >> +            error_report_err(local_err);
> >> +            goto out;
> >> +        }
> >> +    }
> >> +
> >> +    /* All done. */
> >> +
> >> +out:
> >> +    /* Restore the sources to their initial state */
> >> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> >> +        uint8_t pq = xive_source_esb_get(xsrc, i);
> >> +        if (xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8)) != 0x1) {
> >> +            error_report("XIVE: IRQ %d has an invalid state", i);
> >> +        }
> >> +    }
> >> +
> >> +    /*
> >> +     * The XiveSource and the XiveTCTX states will be collected by
> >> +     * their respective vmstate handlers afterwards.
> >> +     */
> >> +    return ret;
> >> +}
> >> +
> >> +/*
> >> + * The sPAPRXive 'post_load' method is called by the sPAPR machine,
> >> + * after all XIVE device states have been transferred and loaded.
> >> + *
> >> + * All should be in place when the VCPUs resume execution.
> >> + */
> >> +static int spapr_xive_kvm_post_load(sPAPRXive *xive, int version_id)
> >> +{
> >> +    XiveSource *xsrc = &xive->source;
> >> +    Error *local_err = NULL;
> >> +    CPUState *cs;
> >> +    int i;
> >> +
> >> +    /* Set the ENDs first. The targeting depends on it. */
> >> +    CPU_FOREACH(cs) {
> >> +        spapr_xive_kvm_set_eq_state(xive, cs, &local_err);
> >> +        if (local_err) {
> >> +            error_report_err(local_err);
> >> +            return -1;
> >> +        }
> >> +    }
> >> +
> >> +    /* Restore the targeting, if any */
> >> +    spapr_xive_kvm_set_eas_state(xive, &local_err);
> >> +    if (local_err) {
> >> +        error_report_err(local_err);
> >> +        return -1;
> >> +    }
> >> +
> >> +    /* Restore the thread interrupt contexts */
> >> +    CPU_FOREACH(cs) {
> >> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> +
> >> +        xive_tctx_kvm_set_state(XIVE_TCTX_KVM(cpu->intc), &local_err);
> >> +        if (local_err) {
> >> +            error_report_err(local_err);
> >> +            return -1;
> >> +        }
> >> +    }
> >> +
> >> +    /*
> >> +     * Get the saved state from the XiveSource model and restore the
> >> +     * PQ bits
> >> +     */
> >> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> >> +        uint8_t pq = xive_source_esb_get(xsrc, i);
> >> +        xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
> >> +    }
> >> +    return 0;
> >> +}
> >> +
> >> +static void spapr_xive_kvm_synchronize_state(sPAPRXive *xive)
> >> +{
> >> +    XiveSource *xsrc = &xive->source;
> >> +    CPUState *cs;
> >> +
> >> +    xive_source_kvm_get_state(xsrc);
> >> +
> >> +    spapr_xive_kvm_get_eas_state(xive, &error_fatal);
> >> +
> >> +    CPU_FOREACH(cs) {
> >> +        spapr_xive_kvm_get_eq_state(xive, cs, &error_fatal);
> >> +    }
> >> +}
> >>  
> >>  static void spapr_xive_kvm_instance_init(Object *obj)
> >>  {
> >> @@ -409,6 +899,10 @@ static void spapr_xive_kvm_class_init(ObjectClass *klass, void *data)
> >>  
> >>      dc->desc = "sPAPR XIVE KVM Interrupt Controller";
> >>      dc->unrealize = spapr_xive_kvm_unrealize;
> >> +
> >> +    sxc->synchronize_state = spapr_xive_kvm_synchronize_state;
> >> +    sxc->pre_save = spapr_xive_kvm_pre_save;
> >> +    sxc->post_load = spapr_xive_kvm_post_load;
> >>  }
> >>  
> >>  static const TypeInfo spapr_xive_kvm_info = {
> >> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >> index 9bb37553c9ec..c9aedecc8216 100644
> >> --- a/hw/intc/xive.c
> >> +++ b/hw/intc/xive.c
> >> @@ -438,9 +438,14 @@ static const struct {
> >>  
> >>  void xive_tctx_pic_print_info(XiveTCTX *tctx, Monitor *mon)
> >>  {
> >> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(tctx);
> >>      int cpu_index = tctx->cs ? tctx->cs->cpu_index : -1;
> >>      int i;
> >>  
> >> +    if (xtc->synchronize_state) {
> >> +        xtc->synchronize_state(tctx);
> >> +    }
> >> +
> >>      monitor_printf(mon, "CPU[%04x]:   QW   NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
> >>                     "  W2\n", cpu_index);
> >>  
> >> @@ -552,10 +557,23 @@ static void xive_tctx_base_unrealize(DeviceState *dev, Error **errp)
> >>      qemu_unregister_reset(xive_tctx_base_reset, dev);
> >>  }
> >>  
> >> +static int vmstate_xive_tctx_post_load(void *opaque, int version_id)
> >> +{
> >> +    XiveTCTX *tctx = XIVE_TCTX_BASE(opaque);
> >> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(tctx);
> >> +
> >> +    if (xtc->post_load) {
> >> +        return xtc->post_load(tctx, version_id);
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >>  static const VMStateDescription vmstate_xive_tctx_base = {
> >>      .name = TYPE_XIVE_TCTX,
> >>      .version_id = 1,
> >>      .minimum_version_id = 1,
> >> +    .post_load = vmstate_xive_tctx_post_load,
> >>      .fields = (VMStateField[]) {
> >>          VMSTATE_BUFFER(regs, XiveTCTX),
> >>          VMSTATE_END_OF_LIST()
> >> @@ -581,9 +599,37 @@ static const TypeInfo xive_tctx_base_info = {
> >>      .class_size    = sizeof(XiveTCTXClass),
> >>  };
> >>  
> >> +static int xive_tctx_post_load(XiveTCTX *tctx, int version_id)
> >> +{
> >> +    XiveRouterClass *xrc = XIVE_ROUTER_GET_CLASS(tctx->xrtr);
> >> +
> >> +    /*
> >> +     * When we collect the states from KVM XIVE irqchip, we set word2
> >> +     * of the thread context to print out the OS CAM line under the
> >> +     * QEMU monitor.
> >> +     *
> >> +     * This breaks migration on a guest using TCG or not using a KVM
> >> +     * irqchip. Fix with an extra reset of the thread contexts.
> >> +     */
> >> +    if (xrc->reset_tctx) {
> >> +        xrc->reset_tctx(tctx->xrtr, tctx);
> >> +    }
> >> +    return 0;
> >> +}
> >> +
> >> +static void xive_tctx_class_init(ObjectClass *klass, void *data)
> >> +{
> >> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_CLASS(klass);
> >> +
> >> +    xtc->post_load = xive_tctx_post_load;
> >> +}
> >> +
> >>  static const TypeInfo xive_tctx_info = {
> >>      .name          = TYPE_XIVE_TCTX,
> >>      .parent        = TYPE_XIVE_TCTX_BASE,
> >> +    .instance_size = sizeof(XiveTCTX),
> >> +    .class_init    = xive_tctx_class_init,
> >> +    .class_size    = sizeof(XiveTCTXClass),
> >>  };
> >>  
> >>  Object *xive_tctx_create(Object *cpu, const char *type, XiveRouter *xrtr,
> >> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> >> index 92ef53743b64..6fac6ca70595 100644
> >> --- a/hw/ppc/spapr_irq.c
> >> +++ b/hw/ppc/spapr_irq.c
> >> @@ -359,7 +359,7 @@ static Object *spapr_irq_cpu_intc_create_xive(sPAPRMachineState *spapr,
> >>  
> >>  static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
> >>  {
> >> -    return 0;
> >> +    return spapr_xive_post_load(spapr->xive, version_id);
> >>  }
> >>  
> >>  /*
> > 
>
Cédric Le Goater Nov. 30, 2018, 7:04 a.m. UTC | #4
[ ... ]

>>>> +/*
>>>> + * The sPAPRXive KVM model migration priority is higher to make sure
>>>
>>> Higher than what?
>>
>> Than the XiveTCTX and XiveSource models.
>>
>>>> + * its 'pre_save' method runs before all the other XIVE models. It
>>>
>>> If the other XIVE components are children of sPAPRXive (which I think
>>> they are or could be), then I believe the parent object's pre_save
>>> will automatically be called first.
>>
>> ok. XiveTCTX are not children of sPAPRXive but that might not be 
>> a problem anymore with the VMState change handler.
> 
> Ah, right.  You might need the handler in the machine itself then - we
> already have something like that for XICS, IIRC.

exactly. For XIVE, I am using the post_load method at the machine level, 
which should be last. The XIVE source PQs are restored when the 
machine starts running again in the VM state change handler. So I don't
need the priority at all on the destination. I will try to remove the
prio, I agree it's a bit ugly.  
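
Roughly, something like this (a hypothetical sketch reusing the helpers
from this patch; the actual v6 code may differ):

static void spapr_xive_vm_change_state_handler(void *opaque, int running,
                                               RunState state)
{
    sPAPRXive *xive = opaque;
    XiveSource *xsrc = &xive->source;
    int i;

    if (running) {
        /* resuming: restore the PQ bits saved in the XiveSource model */
        for (i = 0; i < xsrc->nr_irqs; i++) {
            uint8_t pq = xive_source_esb_get(xsrc, i);
            xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
        }
    } else {
        /* stopping: mask the sources and save the current PQ bits */
        for (i = 0; i < xsrc->nr_irqs; i++) {
            uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_01);
            xive_source_esb_set(xsrc, i, pq);
        }
    }
}

registered once, e.g. at realize time:

    qemu_add_vm_change_state_handler(spapr_xive_vm_change_state_handler,
                                     xive);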

C.

Patch

diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 9c817bb7ae74..d2517c040958 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -55,12 +55,17 @@  typedef struct sPAPRXiveClass {
     XiveRouterClass parent_class;
 
     DeviceRealize   parent_realize;
+
+    void (*synchronize_state)(sPAPRXive *xive);
+    int  (*pre_save)(sPAPRXive *xsrc);
+    int  (*post_load)(sPAPRXive *xsrc, int version_id);
 } sPAPRXiveClass;
 
 bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
 bool spapr_xive_irq_disable(sPAPRXive *xive, uint32_t lisn);
 void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
 qemu_irq spapr_xive_qirq(sPAPRXive *xive, uint32_t lisn);
+int spapr_xive_post_load(sPAPRXive *xive, int version_id);
 
 /*
  * sPAPR NVT and END indexing helpers
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
index 7aaf5a182cb3..c8201462d698 100644
--- a/include/hw/ppc/xive.h
+++ b/include/hw/ppc/xive.h
@@ -309,6 +309,9 @@  typedef struct XiveTCTXClass {
     DeviceClass       parent_class;
 
     DeviceRealize     parent_realize;
+
+    void (*synchronize_state)(XiveTCTX *tctx);
+    int  (*post_load)(XiveTCTX *tctx, int version_id);
 } XiveTCTXClass;
 
 /*
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 2b501d04669a..ee2e836cc1c1 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -154,6 +154,7 @@  typedef enum {
     MIG_PRI_PCI_BUS,            /* Must happen before IOMMU */
     MIG_PRI_GICV3_ITS,          /* Must happen before PCI devices */
     MIG_PRI_GICV3,              /* Must happen before the ITS */
+    MIG_PRI_XIVE_IC,            /* Must happen before all XIVE models */
     MIG_PRI_MAX,
 } MigrationPriority;
 
diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h
index f34c971491dd..9d55ade23634 100644
--- a/linux-headers/asm-powerpc/kvm.h
+++ b/linux-headers/asm-powerpc/kvm.h
@@ -480,6 +480,8 @@  struct kvm_ppc_cpu_char {
 #define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
 #define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
 
+#define KVM_REG_PPC_NVT_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d)
+
 /* Device control API: PPC-specific devices */
 #define KVM_DEV_MPIC_GRP_MISC		1
 #define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
@@ -681,10 +683,41 @@  struct kvm_ppc_cpu_char {
 #define   KVM_DEV_XIVE_GET_TIMA_FD	2
 #define   KVM_DEV_XIVE_VC_BASE		3
 #define KVM_DEV_XIVE_GRP_SOURCES	2	/* 64-bit source attributes */
+#define KVM_DEV_XIVE_GRP_SYNC		3	/* 64-bit source attributes */
+#define KVM_DEV_XIVE_GRP_EAS		4	/* 64-bit eas attributes */
+#define KVM_DEV_XIVE_GRP_EQ		5	/* 64-bit eq attributes */
 
 /* Layout of 64-bit XIVE source attribute values */
 #define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
 #define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
 
+/* Layout of 64-bit eas attribute values */
+#define KVM_XIVE_EAS_PRIORITY_SHIFT	0
+#define KVM_XIVE_EAS_PRIORITY_MASK	0x7
+#define KVM_XIVE_EAS_SERVER_SHIFT	3
+#define KVM_XIVE_EAS_SERVER_MASK	0xfffffff8ULL
+#define KVM_XIVE_EAS_MASK_SHIFT		32
+#define KVM_XIVE_EAS_MASK_MASK		0x100000000ULL
+#define KVM_XIVE_EAS_EISN_SHIFT		33
+#define KVM_XIVE_EAS_EISN_MASK		0xfffffffe00000000ULL
+
+/* Layout of 64-bit eq attribute */
+#define KVM_XIVE_EQ_PRIORITY_SHIFT	0
+#define KVM_XIVE_EQ_PRIORITY_MASK	0x7
+#define KVM_XIVE_EQ_SERVER_SHIFT	3
+#define KVM_XIVE_EQ_SERVER_MASK		0xfffffff8ULL
+
+/* Layout of 64-bit eq attribute values */
+struct kvm_ppc_xive_eq {
+	__u32 flags;
+	__u32 qsize;
+	__u64 qpage;
+	__u32 qtoggle;
+	__u32 qindex;
+};
+
+#define KVM_XIVE_EQ_FLAG_ENABLED	0x00000001
+#define KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY	0x00000002
+#define KVM_XIVE_EQ_FLAG_ESCALATE	0x00000004
 
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index ec85f7e4f88d..c5c0e063dc33 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -27,9 +27,14 @@ 
 
 void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
 {
+    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
     int i;
     uint32_t offset = 0;
 
+    if (sxc->synchronize_state) {
+        sxc->synchronize_state(xive);
+    }
+
     monitor_printf(mon, "XIVE Source %08x .. %08x\n", offset,
                    offset + xive->source.nr_irqs - 1);
     xive_source_pic_print_info(&xive->source, offset, mon);
@@ -354,10 +359,37 @@  static const VMStateDescription vmstate_spapr_xive_eas = {
     },
 };
 
+static int vmstate_spapr_xive_pre_save(void *opaque)
+{
+    sPAPRXive *xive = SPAPR_XIVE_BASE(opaque);
+    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
+
+    if (sxc->pre_save) {
+        return sxc->pre_save(xive);
+    }
+
+    return 0;
+}
+
+/* handled at the machine level */
+int spapr_xive_post_load(sPAPRXive *xive, int version_id)
+{
+    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(xive);
+
+    if (sxc->post_load) {
+        return sxc->post_load(xive, version_id);
+    }
+
+    return 0;
+}
+
 static const VMStateDescription vmstate_spapr_xive_base = {
     .name = TYPE_SPAPR_XIVE,
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = vmstate_spapr_xive_pre_save,
+    .post_load = NULL, /* handled at the machine level */
+    .priority = MIG_PRI_XIVE_IC,
     .fields = (VMStateField[]) {
         VMSTATE_UINT32_EQUAL(nr_irqs, sPAPRXive, NULL),
         VMSTATE_STRUCT_VARRAY_POINTER_UINT32(eat, sPAPRXive, nr_irqs,
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index 767f90826e43..176083c37d61 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -58,6 +58,58 @@  static void kvm_cpu_enable(CPUState *cs)
 /*
  * XIVE Thread Interrupt Management context (KVM)
  */
+static void xive_tctx_kvm_set_state(XiveTCTX *tctx, Error **errp)
+{
+    uint64_t state[4];
+    int ret;
+
+    /* word0 and word1 of the OS ring. */
+    state[0] = *((uint64_t *) &tctx->regs[TM_QW1_OS]);
+
+    /* VP identifier. Only for KVM pr_debug() */
+    state[1] = *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]);
+
+    ret = kvm_set_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
+    if (ret != 0) {
+        error_setg_errno(errp, errno, "Could restore KVM XIVE CPU %ld state",
+                         kvm_arch_vcpu_id(tctx->cs));
+    }
+}
+
+static void xive_tctx_kvm_get_state(XiveTCTX *tctx, Error **errp)
+{
+    uint64_t state[4] = { 0 };
+    int ret;
+
+    ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_NVT_STATE, state);
+    if (ret != 0) {
+        error_setg_errno(errp, errno, "Could capture KVM XIVE CPU %ld state",
+                         kvm_arch_vcpu_id(tctx->cs));
+        return;
+    }
+
+    /* word0 and word1 of the OS ring. */
+    *((uint64_t *) &tctx->regs[TM_QW1_OS]) = state[0];
+
+    /*
+     * KVM also returns word2 containing the VP CAM line value, which
+     * is useful for printing out the VP identifier in the QEMU
+     * monitor. No need to restore it.
+     */
+    *((uint64_t *) &tctx->regs[TM_QW1_OS + TM_WORD2]) = state[1];
+}
+
+static void xive_tctx_kvm_do_synchronize_state(CPUState *cpu,
+                                              run_on_cpu_data arg)
+{
+    xive_tctx_kvm_get_state(arg.host_ptr, &error_fatal);
+}
+
+static void xive_tctx_kvm_synchronize_state(XiveTCTX *tctx)
+{
+    run_on_cpu(tctx->cs, xive_tctx_kvm_do_synchronize_state,
+               RUN_ON_CPU_HOST_PTR(tctx));
+}
 
 static void xive_tctx_kvm_init(XiveTCTX *tctx, Error **errp)
 {
@@ -112,6 +164,8 @@  static void xive_tctx_kvm_class_init(ObjectClass *klass, void *data)
 
     device_class_set_parent_realize(dc, xive_tctx_kvm_realize,
                                     &xtc->parent_realize);
+
+    xtc->synchronize_state = xive_tctx_kvm_synchronize_state;
 }
 
 static const TypeInfo xive_tctx_kvm_info = {
@@ -166,6 +220,34 @@  static void xive_source_kvm_reset(DeviceState *dev)
     xive_source_kvm_init(xsrc, &error_fatal);
 }
 
+/*
+ * This is used to perform the magic loads on the ESB pages, described
+ * in xive.h.
+ */
+static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
+{
+    unsigned long addr = (unsigned long) xsrc->esb_mmap +
+        xive_source_esb_mgmt(xsrc, srcno) + offset;
+
+    /* Prevent the compiler from optimizing away the load */
+    volatile uint64_t value = *((uint64_t *) addr);
+
+    return be64_to_cpu(value) & 0x3;
+}
+
+static void xive_source_kvm_get_state(XiveSource *xsrc)
+{
+    int i;
+
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        /* Perform a load without side effect to retrieve the PQ bits */
+        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
+
+        /* and save PQ locally */
+        xive_source_esb_set(xsrc, i, pq);
+    }
+}
+
 static void xive_source_kvm_set_irq(void *opaque, int srcno, int val)
 {
     XiveSource *xsrc = opaque;
@@ -295,6 +377,414 @@  static const TypeInfo xive_source_kvm_info = {
 /*
  * sPAPR XIVE Router (KVM)
  */
+static int spapr_xive_kvm_set_eq_state(sPAPRXive *xive, CPUState *cs,
+                                       Error **errp)
+{
+    XiveRouter *xrtr = XIVE_ROUTER(xive);
+    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
+    int ret;
+    int i;
+
+    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
+        Error *local_err = NULL;
+        XiveEND end;
+        uint8_t end_blk;
+        uint32_t end_idx;
+        struct kvm_ppc_xive_eq kvm_eq = { 0 };
+        uint64_t kvm_eq_idx;
+
+        if (!spapr_xive_priority_is_valid(i)) {
+            continue;
+        }
+
+        spapr_xive_cpu_to_end(xive, POWERPC_CPU(cs), i, &end_blk, &end_idx);
+
+        ret = xive_router_get_end(xrtr, end_blk, end_idx, &end);
+        if (ret) {
+            error_setg(errp, "XIVE: No END for CPU %ld priority %d",
+                       vcpu_id, i);
+            return ret;
+        }
+
+        if (!(end.w0 & END_W0_VALID)) {
+            continue;
+        }
+
+        /* Build the KVM state from the local END structure */
+        kvm_eq.flags   = KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY;
+        kvm_eq.qsize   = GETFIELD(END_W0_QSIZE, end.w0) + 12;
+        kvm_eq.qpage   = (((uint64_t)(end.w2 & 0x0fffffff)) << 32) | end.w3;
+        kvm_eq.qtoggle = GETFIELD(END_W1_GENERATION, end.w1);
+        kvm_eq.qindex  = GETFIELD(END_W1_PAGE_OFF, end.w1);
+
+        /* Encode the tuple (server, prio) as a KVM EQ index */
+        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
+            KVM_XIVE_EQ_PRIORITY_MASK;
+        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
+            KVM_XIVE_EQ_SERVER_MASK;
+
+        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
+                                &kvm_eq, true, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+static int spapr_xive_kvm_get_eq_state(sPAPRXive *xive, CPUState *cs,
+                                       Error **errp)
+{
+    XiveRouter *xrtr = XIVE_ROUTER(xive);
+    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
+    int ret;
+    int i;
+
+    for (i = 0; i < XIVE_PRIORITY_MAX + 1; i++) {
+        Error *local_err = NULL;
+        struct kvm_ppc_xive_eq kvm_eq = { 0 };
+        uint64_t kvm_eq_idx;
+        XiveEND end = { 0 };
+        uint8_t end_blk, nvt_blk;
+        uint32_t end_idx, nvt_idx;
+
+        /* Skip priorities reserved for the hypervisor */
+        if (!spapr_xive_priority_is_valid(i)) {
+            continue;
+        }
+
+        /* Encode the tuple (server, prio) as a KVM EQ index */
+        kvm_eq_idx = i << KVM_XIVE_EQ_PRIORITY_SHIFT &
+            KVM_XIVE_EQ_PRIORITY_MASK;
+        kvm_eq_idx |= vcpu_id << KVM_XIVE_EQ_SERVER_SHIFT &
+            KVM_XIVE_EQ_SERVER_MASK;
+
+        ret = kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ, kvm_eq_idx,
+                                &kvm_eq, false, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return ret;
+        }
+
+        if (!(kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED)) {
+            continue;
+        }
+
+        /* Update the local END structure with the KVM input */
+        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ENABLED) {
+                end.w0 |= END_W0_VALID | END_W0_ENQUEUE;
+        }
+        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY) {
+                end.w0 |= END_W0_UCOND_NOTIFY;
+        }
+        if (kvm_eq.flags & KVM_XIVE_EQ_FLAG_ESCALATE) {
+                end.w0 |= END_W0_ESCALATE_CTL;
+        }
+        end.w0 |= SETFIELD(END_W0_QSIZE, 0ul, kvm_eq.qsize - 12);
+
+        end.w1 = SETFIELD(END_W1_GENERATION, 0ul, kvm_eq.qtoggle) |
+            SETFIELD(END_W1_PAGE_OFF, 0ul, kvm_eq.qindex);
+        end.w2 = (kvm_eq.qpage >> 32) & 0x0fffffff;
+        end.w3 = kvm_eq.qpage & 0xffffffff;
+        end.w4 = 0;
+        end.w5 = 0;
+
+        ret = spapr_xive_cpu_to_nvt(xive, POWERPC_CPU(cs), &nvt_blk, &nvt_idx);
+        if (ret) {
+            error_setg(errp, "XIVE: No NVT for CPU %ld", vcpu_id);
+            return ret;
+        }
+
+        end.w6 = SETFIELD(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
+            SETFIELD(END_W6_NVT_INDEX, 0ul, nvt_idx);
+        end.w7 = SETFIELD(END_W7_F0_PRIORITY, 0ul, i);
+
+        spapr_xive_cpu_to_end(xive, POWERPC_CPU(cs), i, &end_blk, &end_idx);
+
+        ret = xive_router_set_end(xrtr, end_blk, end_idx, &end);
+        if (ret) {
+            error_setg(errp, "XIVE: No END for CPU %ld priority %d",
+                       vcpu_id, i);
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+static void spapr_xive_kvm_set_eas_state(sPAPRXive *xive, Error **errp)
+{
+    XiveSource *xsrc = &xive->source;
+    int i;
+
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        XiveEAS *eas = &xive->eat[i];
+        uint32_t end_idx;
+        uint32_t end_blk;
+        uint32_t eisn;
+        uint8_t priority;
+        uint32_t server;
+        uint64_t kvm_eas;
+        Error *local_err = NULL;
+
+        /* No need to set MASKED EAS, this is the default state after reset */
+        if (!(eas->w & EAS_VALID) || eas->w & EAS_MASKED) {
+            continue;
+        }
+
+        end_idx = GETFIELD(EAS_END_INDEX, eas->w);
+        end_blk = GETFIELD(EAS_END_BLOCK, eas->w);
+        eisn = GETFIELD(EAS_END_DATA, eas->w);
+
+        spapr_xive_end_to_target(xive, end_blk, end_idx, &server, &priority);
+
+        kvm_eas = priority << KVM_XIVE_EAS_PRIORITY_SHIFT &
+            KVM_XIVE_EAS_PRIORITY_MASK;
+        kvm_eas |= server << KVM_XIVE_EAS_SERVER_SHIFT &
+            KVM_XIVE_EAS_SERVER_MASK;
+        kvm_eas |= ((uint64_t)eisn << KVM_XIVE_EAS_EISN_SHIFT) &
+            KVM_XIVE_EAS_EISN_MASK;
+
+        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, true,
+                          &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
+
+static void spapr_xive_kvm_get_eas_state(sPAPRXive *xive, Error **errp)
+{
+    XiveSource *xsrc = &xive->source;
+    int i;
+
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        XiveEAS *eas = &xive->eat[i];
+        XiveEAS new_eas;
+        uint64_t kvm_eas;
+        uint8_t priority;
+        uint32_t server;
+        uint32_t end_idx;
+        uint8_t end_blk;
+        uint32_t eisn;
+        Error *local_err = NULL;
+
+        if (!(eas->w & EAS_VALID)) {
+            continue;
+        }
+
+        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EAS, i, &kvm_eas, false,
+                          &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+
+        priority = (kvm_eas & KVM_XIVE_EAS_PRIORITY_MASK) >>
+            KVM_XIVE_EAS_PRIORITY_SHIFT;
+        server = (kvm_eas & KVM_XIVE_EAS_SERVER_MASK) >>
+            KVM_XIVE_EAS_SERVER_SHIFT;
+        eisn = (kvm_eas & KVM_XIVE_EAS_EISN_MASK) >> KVM_XIVE_EAS_EISN_SHIFT;
+
+        if (spapr_xive_target_to_end(xive, server, priority, &end_blk,
+                                     &end_idx)) {
+            error_setg(errp, "XIVE: invalid tuple CPU %d priority %d", server,
+                       priority);
+            return;
+        }
+
+        new_eas.w = EAS_VALID;
+        if (kvm_eas & KVM_XIVE_EAS_MASK_MASK) {
+            new_eas.w |= EAS_MASKED;
+        }
+
+        new_eas.w = SETFIELD(EAS_END_INDEX, new_eas.w, end_idx);
+        new_eas.w = SETFIELD(EAS_END_BLOCK, new_eas.w, end_blk);
+        new_eas.w = SETFIELD(EAS_END_DATA, new_eas.w, eisn);
+
+        *eas = new_eas;
+    }
+}
+
+static void spapr_xive_kvm_sync_all(sPAPRXive *xive, Error **errp)
+{
+    XiveSource *xsrc = &xive->source;
+    Error *local_err = NULL;
+    int i;
+
+    /* Sync the KVM source. This reaches the XIVE HW through OPAL */
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        XiveEAS *eas = &xive->eat[i];
+
+        if (!(eas->w & EAS_VALID)) {
+            continue;
+        }
+
+        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SYNC, i, NULL, true,
+                          &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
+
+/*
+ * The sPAPRXive KVM model migration priority is higher to make sure
+ * its 'pre_save' method runs before all the other XIVE models. It
+ * orchestrates the capture sequence of the XIVE states in the
+ * following order:
+ *
+ *   1. mask all the sources by setting PQ=01, which returns the
+ *      previous value and save it.
+ *   2. sync the sources in KVM to stabilize all the queues
+ *      sync the ENDs to make sure END -> VP is fully completed
+ *   3. dump the EAS table
+ *   4. dump the END table
+ *   5. dump the thread context (IPB)
+ *
+ *  Roll back to restore the current configuration of the sources
+ */
+static int spapr_xive_kvm_pre_save(sPAPRXive *xive)
+{
+    XiveSource *xsrc = &xive->source;
+    Error *local_err = NULL;
+    CPUState *cs;
+    int i;
+    int ret = 0;
+
+    /* Quiesce the sources, to stop the flow of event notifications */
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        /*
+         * Mask and save the ESB PQs locally in the XiveSource object.
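+         * A load on the XIVE_ESB_SET_PQ_01 page sets the source to
+         * the masked state and returns its previous PQ value, which
+         * is kept for the rollback under 'out:' below.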
+         */
+        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_01);
+        xive_source_esb_set(xsrc, i, pq);
+    }
+
+    /* Sync the sources in KVM */
+    spapr_xive_kvm_sync_all(xive, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        ret = -1;
+        goto out;
+    }
+
+    /* Grab the EAT (could this be done earlier?) */
+    spapr_xive_kvm_get_eas_state(xive, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        ret = -1;
+        goto out;
+    }
+
+    /*
+     * Grab the ENDs. The EQ index and the toggle bit are what we
+     * want to capture.
+     */
+    CPU_FOREACH(cs) {
+        spapr_xive_kvm_get_eq_state(xive, cs, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* Capture the thread interrupt contexts */
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        /* TODO: check whether this needs to run under run_on_cpu() */
+        xive_tctx_kvm_get_state(XIVE_TCTX_KVM(cpu->intc), &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* All done. */
+
+out:
+    /* Restore the sources to their initial state */
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        uint8_t pq = xive_source_esb_get(xsrc, i);
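+        /*
+         * The ESB special load pages for PQ=00/01/10/11 are 0x100
+         * apart, so 'XIVE_ESB_SET_PQ_00 + (pq << 8)' selects the page
+         * restoring the saved PQ value. The load should return 0x1,
+         * the PQ=01 masked state installed in the loop above.
+         */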
+        if (xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8)) != 0x1) {
+            error_report("XIVE: IRQ %d has an invalid state", i);
+        }
+    }
+
+    /*
+     * The XiveSource and the XiveTCTX states will be collected by
+     * their respective vmstate handlers afterwards.
+     */
+    return ret;
+}
+
+/*
+ * The sPAPRXive 'post_load' method is called by the sPAPR machine
+ * after all the XIVE device states have been transferred and loaded.
+ *
+ * Everything should be in place when the VCPUs resume execution.
+ */
+static int spapr_xive_kvm_post_load(sPAPRXive *xive, int version_id)
+{
+    XiveSource *xsrc = &xive->source;
+    Error *local_err = NULL;
+    CPUState *cs;
+    int i;
+
+    /* Set the ENDs first. The EAS targeting depends on them. */
+    CPU_FOREACH(cs) {
+        spapr_xive_kvm_set_eq_state(xive, cs, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return -1;
+        }
+    }
+
+    /* Restore the EAS targeting, if any */
+    spapr_xive_kvm_set_eas_state(xive, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return -1;
+    }
+
+    /* Restore the thread interrupt contexts */
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        xive_tctx_kvm_set_state(XIVE_TCTX_KVM(cpu->intc), &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return -1;
+        }
+    }
+
+    /*
+     * Get the saved state from the XiveSource model and restore the
+     * PQ bits.
+     */
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        uint8_t pq = xive_source_esb_get(xsrc, i);
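+        /*
+         * Load the ESB page matching the PQ value received from
+         * migration (XIVE_ESB_SET_PQ_00 + pq * 0x100). The value
+         * returned by the load is not needed here.
+         */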
+        xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
+    }
+    return 0;
+}
+
+static void spapr_xive_kvm_synchronize_state(sPAPRXive *xive)
+{
+    XiveSource *xsrc = &xive->source;
+    CPUState *cs;
+
+    xive_source_kvm_get_state(xsrc);
+
+    spapr_xive_kvm_get_eas_state(xive, &error_fatal);
+
+    CPU_FOREACH(cs) {
+        spapr_xive_kvm_get_eq_state(xive, cs, &error_fatal);
+    }
+}
 
 static void spapr_xive_kvm_instance_init(Object *obj)
 {
@@ -409,6 +899,10 @@  static void spapr_xive_kvm_class_init(ObjectClass *klass, void *data)
 
     dc->desc = "sPAPR XIVE KVM Interrupt Controller";
     dc->unrealize = spapr_xive_kvm_unrealize;
+
+    sxc->synchronize_state = spapr_xive_kvm_synchronize_state;
+    sxc->pre_save = spapr_xive_kvm_pre_save;
+    sxc->post_load = spapr_xive_kvm_post_load;
 }
 
 static const TypeInfo spapr_xive_kvm_info = {
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index 9bb37553c9ec..c9aedecc8216 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -438,9 +438,14 @@  static const struct {
 
 void xive_tctx_pic_print_info(XiveTCTX *tctx, Monitor *mon)
 {
+    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(tctx);
     int cpu_index = tctx->cs ? tctx->cs->cpu_index : -1;
     int i;
 
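+    /*
+     * If the presenter is backed by KVM, refresh the cached thread
+     * context registers from the host before printing.
+     */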
+    if (xtc->synchronize_state) {
+        xtc->synchronize_state(tctx);
+    }
+
     monitor_printf(mon, "CPU[%04x]:   QW   NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
                    "  W2\n", cpu_index);
 
@@ -552,10 +557,23 @@  static void xive_tctx_base_unrealize(DeviceState *dev, Error **errp)
     qemu_unregister_reset(xive_tctx_base_reset, dev);
 }
 
+static int vmstate_xive_tctx_post_load(void *opaque, int version_id)
+{
+    XiveTCTX *tctx = XIVE_TCTX_BASE(opaque);
+    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(tctx);
+
+    if (xtc->post_load) {
+        return xtc->post_load(tctx, version_id);
+    }
+
+    return 0;
+}
+
 static const VMStateDescription vmstate_xive_tctx_base = {
     .name = TYPE_XIVE_TCTX,
     .version_id = 1,
     .minimum_version_id = 1,
+    .post_load = vmstate_xive_tctx_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_BUFFER(regs, XiveTCTX),
         VMSTATE_END_OF_LIST()
@@ -581,9 +599,37 @@  static const TypeInfo xive_tctx_base_info = {
     .class_size    = sizeof(XiveTCTXClass),
 };
 
+static int xive_tctx_post_load(XiveTCTX *tctx, int version_id)
+{
+    XiveRouterClass *xrc = XIVE_ROUTER_GET_CLASS(tctx->xrtr);
+
+    /*
+     * When collecting the state from the KVM XIVE irqchip, word2 of
+     * the thread context is set so that the OS CAM line can be
+     * printed out under the QEMU monitor.
+     *
+     * This breaks migration for guests running TCG or without a KVM
+     * irqchip. Fix it with an extra reset of the thread contexts.
+     */
+    if (xrc->reset_tctx) {
+        xrc->reset_tctx(tctx->xrtr, tctx);
+    }
+    return 0;
+}
+
+static void xive_tctx_class_init(ObjectClass *klass, void *data)
+{
+    XiveTCTXClass *xtc = XIVE_TCTX_BASE_CLASS(klass);
+
+    xtc->post_load = xive_tctx_post_load;
+}
+
 static const TypeInfo xive_tctx_info = {
     .name          = TYPE_XIVE_TCTX,
     .parent        = TYPE_XIVE_TCTX_BASE,
+    .instance_size = sizeof(XiveTCTX),
+    .class_init    = xive_tctx_class_init,
+    .class_size    = sizeof(XiveTCTXClass),
 };
 
 Object *xive_tctx_create(Object *cpu, const char *type, XiveRouter *xrtr,
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index 92ef53743b64..6fac6ca70595 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -359,7 +359,7 @@  static Object *spapr_irq_cpu_intc_create_xive(sPAPRMachineState *spapr,
 
 static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
 {
-    return 0;
+    return spapr_xive_post_load(spapr->xive, version_id);
 }
 
 /*