diff mbox series

[PULL,29/44] spapr/xive: introduce a VM state change handler

Message ID 20190529065017.15149-30-david@gibson.dropbear.id.au
State New
Headers show
Series [PULL,01/44] tests: Fix up docker cross builds for ppc64 (BE) targets | expand

Commit Message

David Gibson May 29, 2019, 6:50 a.m. UTC
From: Cédric Le Goater <clg@kaod.org>

This handler is in charge of stabilizing the flow of event notifications
in the XIVE controller before migrating a guest. This is a requirement
before transferring the guest EQ pages to a destination.

When the VM is stopped, the handler sets the source PQs to PENDING to
stop the flow of events and to possibly catch a triggered interrupt
occuring while the VM is stopped. Their previous state is saved. The
XIVE controller is then synced through KVM to flush any in-flight
event notification and to stabilize the EQs. At this stage, the EQ
pages are marked dirty to make sure the EQ pages are transferred if a
migration sequence is in progress.

The previous configuration of the sources is restored when the VM
resumes, after a migration or a stop. If an interrupt was queued while
the VM was stopped, the handler simply generates the missing trigger.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Message-Id: <20190513084245.25755-6-clg@kaod.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 hw/intc/spapr_xive_kvm.c    | 96 ++++++++++++++++++++++++++++++++++++-
 include/hw/ppc/spapr_xive.h |  1 +
 2 files changed, 96 insertions(+), 1 deletion(-)

Comments

Alexey Kardashevskiy June 4, 2019, 7:49 a.m. UTC | #1
On 29/05/2019 16:50, David Gibson wrote:
> From: Cédric Le Goater <clg@kaod.org>
> 
> This handler is in charge of stabilizing the flow of event notifications
> in the XIVE controller before migrating a guest. This is a requirement
> before transferring the guest EQ pages to a destination.
> 
> When the VM is stopped, the handler sets the source PQs to PENDING to
> stop the flow of events and to possibly catch a triggered interrupt
> occuring while the VM is stopped. Their previous state is saved. The
> XIVE controller is then synced through KVM to flush any in-flight
> event notification and to stabilize the EQs. At this stage, the EQ
> pages are marked dirty to make sure the EQ pages are transferred if a
> migration sequence is in progress.
> 
> The previous configuration of the sources is restored when the VM
> resumes, after a migration or a stop. If an interrupt was queued while
> the VM was stopped, the handler simply generates the missing trigger.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> Message-Id: <20190513084245.25755-6-clg@kaod.org>
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>

This one breaks my nvlink2 passthru setup. The host is v5.2-rc2.
v5.2-rc3 fixes it though so it is backward compatibility issue which we
care about to what degree here? I am forcing ic-mode=xive which is not
the default so I am not so sure.



aik@u1804kvm:~$ cat /proc/interrupts
           CPU0
 16:          0  XIVE-IPI   0 Edge      IPI
 21:          0  XIVE-IRQ 4096 Edge      RAS_EPOW
 22:          0  XIVE-IRQ 4097 Edge      RAS_HOTPLUG
257:      12372  XIVE-IRQ 4353 Edge      ibmvscsi
258:          0  XIVE-IRQ 4864 Edge      virtio0-config
259:       2157  XIVE-IRQ 4865 Edge      virtio0-input.0
260:          1  XIVE-IRQ 4866 Edge      virtio0-output.0
261:          0  XIVE-IRQ 4868 Edge      xhci_hcd
262:          0  XIVE-IRQ 4869 Edge      xhci_hcd
272:          1  XIVE-IRQ 4368 Edge      hvc_console
LOC:      10508   Local timer interrupts for timer event device
BCT:          0   Broadcast timer interrupts for timer event device
LOC:          0   Local timer interrupts for others
SPU:          5   Spurious interrupts
PMI:          0   Performance monitoring interrupts
MCE:          0   Machine check exceptions
NMI:          0   System Reset interrupts
DBL:          0   Doorbell interrupts


and 7bfc759c02b8 "spapr/xive: add state synchronization with KVM" works:

           CPU0
 16:          0  XIVE-IPI   0 Edge      IPI
 19:          0  XIVE-IRQ 4610 Level     NPU Device
 20:          0  XIVE-IRQ 4611 Level     NPU Device
 21:          0  XIVE-IRQ 4096 Edge      RAS_EPOW
 22:          0  XIVE-IRQ 4097 Edge      RAS_HOTPLUG
257:      11833  XIVE-IRQ 4353 Edge      ibmvscsi
258:          0  XIVE-IRQ 4864 Edge      virtio0-config
259:       1632  XIVE-IRQ 4865 Edge      virtio0-input.0
260:          1  XIVE-IRQ 4866 Edge      virtio0-output.0
261:          0  XIVE-IRQ 4868 Edge      xhci_hcd
262:          0  XIVE-IRQ 4869 Edge      xhci_hcd
263:         60  XIVE-IRQ 4867 Edge      nvidia
272:          0  XIVE-IRQ 4368 Edge      hvc_console
LOC:       2236   Local timer interrupts for timer event device
BCT:          0   Broadcast timer interrupts for timer event device
LOC:          0   Local timer interrupts for others
SPU:          2   Spurious interrupts
PMI:          0   Performance monitoring interrupts
MCE:          0   Machine check exceptions
NMI:          0   System Reset interrupts
DBL:          0   Doorbell interrupts



Here is the command line:

/home/aik/pbuild/qemu-aikrhel74alt-ppc64/ppc64-softmmu/qemu-system-ppc64 \
-nodefaults \
-chardev stdio,id=STDIO0,signal=off,mux=on \
-device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
-mon id=MON0,chardev=STDIO0,mode=readline -nographic -vga none \
-enable-kvm \
-device nec-usb-xhci,id=nec-usb-xhci0 -m 16G \
-netdev "user,id=USER0,hostfwd=tcp::2223-:22" \
-device "virtio-net-pci,id=vnet0,mac=C0:41:49:4b:00:00,netdev=USER0" \
img/u1804-64G-cuda10.1-418.67-swiotlb.qcow2 \
-machine pseries,cap-cfpc=broken,cap-htm=off,ic-mode=xive \
-device "vfio-pci,id=vfio0006_00_00_1,host=0006:00:00.1" \
-device "vfio-pci,id=vfio0006_00_00_0,host=0006:00:00.0" \
-device "vfio-pci,id=vfio0004_04_00_0,host=0004:04:00.0" \
-kernel ./vmldbg -append "root=/dev/sda2 console=hvc0 debug loglevel=8" \
-snapshot \
-smp 1,threads=1 -bios ./slof.bin \
-L /home/aik/t/qemu-ppc64-bios/ \
-trace events=qemu_trace_events -d guest_errors \
-chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.user2223.6_0_0_1 \
-mon chardev=SOCKET0,mode=control




> ---
>  hw/intc/spapr_xive_kvm.c    | 96 ++++++++++++++++++++++++++++++++++++-
>  include/hw/ppc/spapr_xive.h |  1 +
>  2 files changed, 96 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index 8dd4f96e0b..735577a6f8 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -433,9 +433,100 @@ static void kvmppc_xive_get_queues(SpaprXive *xive, Error **errp)
>      }
>  }
>  
> +/*
> + * The primary goal of the XIVE VM change handler is to mark the EQ
> + * pages dirty when all XIVE event notifications have stopped.
> + *
> + * Whenever the VM is stopped, the VM change handler sets the source
> + * PQs to PENDING to stop the flow of events and to possibly catch a
> + * triggered interrupt occuring while the VM is stopped. The previous
> + * state is saved in anticipation of a migration. The XIVE controller
> + * is then synced through KVM to flush any in-flight event
> + * notification and stabilize the EQs.
> + *
> + * At this stage, we can mark the EQ page dirty and let a migration
> + * sequence transfer the EQ pages to the destination, which is done
> + * just after the stop state.
> + *
> + * The previous configuration of the sources is restored when the VM
> + * runs again. If an interrupt was queued while the VM was stopped,
> + * simply generate a trigger.
> + */
> +static void kvmppc_xive_change_state_handler(void *opaque, int running,
> +                                             RunState state)
> +{
> +    SpaprXive *xive = opaque;
> +    XiveSource *xsrc = &xive->source;
> +    Error *local_err = NULL;
> +    int i;
> +
> +    /*
> +     * Restore the sources to their initial state. This is called when
> +     * the VM resumes after a stop or a migration.
> +     */
> +    if (running) {
> +        for (i = 0; i < xsrc->nr_irqs; i++) {
> +            uint8_t pq = xive_source_esb_get(xsrc, i);
> +            uint8_t old_pq;
> +
> +            old_pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
> +
> +            /*
> +             * An interrupt was queued while the VM was stopped,
> +             * generate a trigger.
> +             */
> +            if (pq == XIVE_ESB_RESET && old_pq == XIVE_ESB_QUEUED) {
> +                xive_esb_trigger(xsrc, i);
> +            }
> +        }
> +
> +        return;
> +    }
> +
> +    /*
> +     * Mask the sources, to stop the flow of event notifications, and
> +     * save the PQs locally in the XiveSource object. The XiveSource
> +     * state will be collected later on by its vmstate handler if a
> +     * migration is in progress.
> +     */
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
> +
> +        /*
> +         * PQ is set to PENDING to possibly catch a triggered
> +         * interrupt occuring while the VM is stopped (hotplug event
> +         * for instance) .
> +         */
> +        if (pq != XIVE_ESB_OFF) {
> +            pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_10);
> +        }
> +        xive_source_esb_set(xsrc, i, pq);
> +    }
> +
> +    /*
> +     * Sync the XIVE controller in KVM, to flush in-flight event
> +     * notification that should be enqueued in the EQs and mark the
> +     * XIVE EQ pages dirty to collect all updates.
> +     */
> +    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL,
> +                      KVM_DEV_XIVE_EQ_SYNC, NULL, true, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return;
> +    }
> +}
> +
>  void kvmppc_xive_synchronize_state(SpaprXive *xive, Error **errp)
>  {
> -    kvmppc_xive_source_get_state(&xive->source);
> +    /*
> +     * When the VM is stopped, the sources are masked and the previous
> +     * state is saved in anticipation of a migration. We should not
> +     * synchronize the source state in that case else we will override
> +     * the saved state.
> +     */
> +    if (runstate_is_running()) {
> +        kvmppc_xive_source_get_state(&xive->source);
> +    }
>  
>      /* EAT: there is no extra state to query from KVM */
>  
> @@ -515,6 +606,9 @@ void kvmppc_xive_connect(SpaprXive *xive, Error **errp)
>                                        "xive.tima", tima_len, xive->tm_mmap);
>      sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
>  
> +    xive->change = qemu_add_vm_change_state_handler(
> +        kvmppc_xive_change_state_handler, xive);
> +
>      kvm_kernel_irqchip = true;
>      kvm_msi_via_irqfd_allowed = true;
>      kvm_gsi_direct_mapping = true;
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 7e49badd8c..734662c12a 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -42,6 +42,7 @@ typedef struct SpaprXive {
>      /* KVM support */
>      int           fd;
>      void          *tm_mmap;
> +    VMChangeStateEntry *change;
>  } SpaprXive;
>  
>  /*
>
Cédric Le Goater June 4, 2019, 8:10 a.m. UTC | #2
On 04/06/2019 09:49, Alexey Kardashevskiy wrote:
> 
> 
> On 29/05/2019 16:50, David Gibson wrote:
>> From: Cédric Le Goater <clg@kaod.org>
>>
>> This handler is in charge of stabilizing the flow of event notifications
>> in the XIVE controller before migrating a guest. This is a requirement
>> before transferring the guest EQ pages to a destination.
>>
>> When the VM is stopped, the handler sets the source PQs to PENDING to
>> stop the flow of events and to possibly catch a triggered interrupt
>> occuring while the VM is stopped. Their previous state is saved. The
>> XIVE controller is then synced through KVM to flush any in-flight
>> event notification and to stabilize the EQs. At this stage, the EQ
>> pages are marked dirty to make sure the EQ pages are transferred if a
>> migration sequence is in progress.
>>
>> The previous configuration of the sources is restored when the VM
>> resumes, after a migration or a stop. If an interrupt was queued while
>> the VM was stopped, the handler simply generates the missing trigger.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>> Message-Id: <20190513084245.25755-6-clg@kaod.org>
>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> 
> This one breaks my nvlink2 passthru setup. The host is v5.2-rc2.
> v5.2-rc3 fixes it though so it is backward compatibility issue which we
> care about to what degree here? 

v5.2-rc2 had an ugly bug impacting passthru under some VM configuration,
XIVE + single CPU. See :

bcaa3110d584 ("KVM: PPC: Book3S HV: XIVE: Fix page offset when clearing 
ESB pages")

passthru also had a serious issue impacting the XICS-over-XIVE and the 
XIVE KVM devices :   

ef9740204051 ("KVM: PPC: Book3S HV: XIVE: Do not clear IRQ data of 
passthrough interrupts")

You need an v5.2-rc3 ! 

> I am forcing ic-mode=xive which is not the default so I am not so sure.

It should be OK.

C. 

> 
> 
> 
> aik@u1804kvm:~$ cat /proc/interrupts
>            CPU0
>  16:          0  XIVE-IPI   0 Edge      IPI
>  21:          0  XIVE-IRQ 4096 Edge      RAS_EPOW
>  22:          0  XIVE-IRQ 4097 Edge      RAS_HOTPLUG
> 257:      12372  XIVE-IRQ 4353 Edge      ibmvscsi
> 258:          0  XIVE-IRQ 4864 Edge      virtio0-config
> 259:       2157  XIVE-IRQ 4865 Edge      virtio0-input.0
> 260:          1  XIVE-IRQ 4866 Edge      virtio0-output.0
> 261:          0  XIVE-IRQ 4868 Edge      xhci_hcd
> 262:          0  XIVE-IRQ 4869 Edge      xhci_hcd
> 272:          1  XIVE-IRQ 4368 Edge      hvc_console
> LOC:      10508   Local timer interrupts for timer event device
> BCT:          0   Broadcast timer interrupts for timer event device
> LOC:          0   Local timer interrupts for others
> SPU:          5   Spurious interrupts
> PMI:          0   Performance monitoring interrupts
> MCE:          0   Machine check exceptions
> NMI:          0   System Reset interrupts
> DBL:          0   Doorbell interrupts
> 
> 
> and 7bfc759c02b8 "spapr/xive: add state synchronization with KVM" works:
> 
>            CPU0
>  16:          0  XIVE-IPI   0 Edge      IPI
>  19:          0  XIVE-IRQ 4610 Level     NPU Device
>  20:          0  XIVE-IRQ 4611 Level     NPU Device
>  21:          0  XIVE-IRQ 4096 Edge      RAS_EPOW
>  22:          0  XIVE-IRQ 4097 Edge      RAS_HOTPLUG
> 257:      11833  XIVE-IRQ 4353 Edge      ibmvscsi
> 258:          0  XIVE-IRQ 4864 Edge      virtio0-config
> 259:       1632  XIVE-IRQ 4865 Edge      virtio0-input.0
> 260:          1  XIVE-IRQ 4866 Edge      virtio0-output.0
> 261:          0  XIVE-IRQ 4868 Edge      xhci_hcd
> 262:          0  XIVE-IRQ 4869 Edge      xhci_hcd
> 263:         60  XIVE-IRQ 4867 Edge      nvidia
> 272:          0  XIVE-IRQ 4368 Edge      hvc_console
> LOC:       2236   Local timer interrupts for timer event device
> BCT:          0   Broadcast timer interrupts for timer event device
> LOC:          0   Local timer interrupts for others
> SPU:          2   Spurious interrupts
> PMI:          0   Performance monitoring interrupts
> MCE:          0   Machine check exceptions
> NMI:          0   System Reset interrupts
> DBL:          0   Doorbell interrupts
> 
> 
> 
> Here is the command line:
> 
> /home/aik/pbuild/qemu-aikrhel74alt-ppc64/ppc64-softmmu/qemu-system-ppc64 \
> -nodefaults \
> -chardev stdio,id=STDIO0,signal=off,mux=on \
> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
> -mon id=MON0,chardev=STDIO0,mode=readline -nographic -vga none \
> -enable-kvm \
> -device nec-usb-xhci,id=nec-usb-xhci0 -m 16G \
> -netdev "user,id=USER0,hostfwd=tcp::2223-:22" \
> -device "virtio-net-pci,id=vnet0,mac=C0:41:49:4b:00:00,netdev=USER0" \
> img/u1804-64G-cuda10.1-418.67-swiotlb.qcow2 \
> -machine pseries,cap-cfpc=broken,cap-htm=off,ic-mode=xive \
> -device "vfio-pci,id=vfio0006_00_00_1,host=0006:00:00.1" \
> -device "vfio-pci,id=vfio0006_00_00_0,host=0006:00:00.0" \
> -device "vfio-pci,id=vfio0004_04_00_0,host=0004:04:00.0" \
> -kernel ./vmldbg -append "root=/dev/sda2 console=hvc0 debug loglevel=8" \
> -snapshot \
> -smp 1,threads=1 -bios ./slof.bin \
> -L /home/aik/t/qemu-ppc64-bios/ \
> -trace events=qemu_trace_events -d guest_errors \
> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.user2223.6_0_0_1 \
> -mon chardev=SOCKET0,mode=control
> 
> 
> 
> 
>> ---
>>  hw/intc/spapr_xive_kvm.c    | 96 ++++++++++++++++++++++++++++++++++++-
>>  include/hw/ppc/spapr_xive.h |  1 +
>>  2 files changed, 96 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>> index 8dd4f96e0b..735577a6f8 100644
>> --- a/hw/intc/spapr_xive_kvm.c
>> +++ b/hw/intc/spapr_xive_kvm.c
>> @@ -433,9 +433,100 @@ static void kvmppc_xive_get_queues(SpaprXive *xive, Error **errp)
>>      }
>>  }
>>  
>> +/*
>> + * The primary goal of the XIVE VM change handler is to mark the EQ
>> + * pages dirty when all XIVE event notifications have stopped.
>> + *
>> + * Whenever the VM is stopped, the VM change handler sets the source
>> + * PQs to PENDING to stop the flow of events and to possibly catch a
>> + * triggered interrupt occuring while the VM is stopped. The previous
>> + * state is saved in anticipation of a migration. The XIVE controller
>> + * is then synced through KVM to flush any in-flight event
>> + * notification and stabilize the EQs.
>> + *
>> + * At this stage, we can mark the EQ page dirty and let a migration
>> + * sequence transfer the EQ pages to the destination, which is done
>> + * just after the stop state.
>> + *
>> + * The previous configuration of the sources is restored when the VM
>> + * runs again. If an interrupt was queued while the VM was stopped,
>> + * simply generate a trigger.
>> + */
>> +static void kvmppc_xive_change_state_handler(void *opaque, int running,
>> +                                             RunState state)
>> +{
>> +    SpaprXive *xive = opaque;
>> +    XiveSource *xsrc = &xive->source;
>> +    Error *local_err = NULL;
>> +    int i;
>> +
>> +    /*
>> +     * Restore the sources to their initial state. This is called when
>> +     * the VM resumes after a stop or a migration.
>> +     */
>> +    if (running) {
>> +        for (i = 0; i < xsrc->nr_irqs; i++) {
>> +            uint8_t pq = xive_source_esb_get(xsrc, i);
>> +            uint8_t old_pq;
>> +
>> +            old_pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
>> +
>> +            /*
>> +             * An interrupt was queued while the VM was stopped,
>> +             * generate a trigger.
>> +             */
>> +            if (pq == XIVE_ESB_RESET && old_pq == XIVE_ESB_QUEUED) {
>> +                xive_esb_trigger(xsrc, i);
>> +            }
>> +        }
>> +
>> +        return;
>> +    }
>> +
>> +    /*
>> +     * Mask the sources, to stop the flow of event notifications, and
>> +     * save the PQs locally in the XiveSource object. The XiveSource
>> +     * state will be collected later on by its vmstate handler if a
>> +     * migration is in progress.
>> +     */
>> +    for (i = 0; i < xsrc->nr_irqs; i++) {
>> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
>> +
>> +        /*
>> +         * PQ is set to PENDING to possibly catch a triggered
>> +         * interrupt occuring while the VM is stopped (hotplug event
>> +         * for instance) .
>> +         */
>> +        if (pq != XIVE_ESB_OFF) {
>> +            pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_10);
>> +        }
>> +        xive_source_esb_set(xsrc, i, pq);
>> +    }
>> +
>> +    /*
>> +     * Sync the XIVE controller in KVM, to flush in-flight event
>> +     * notification that should be enqueued in the EQs and mark the
>> +     * XIVE EQ pages dirty to collect all updates.
>> +     */
>> +    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL,
>> +                      KVM_DEV_XIVE_EQ_SYNC, NULL, true, &local_err);
>> +    if (local_err) {
>> +        error_report_err(local_err);
>> +        return;
>> +    }
>> +}
>> +
>>  void kvmppc_xive_synchronize_state(SpaprXive *xive, Error **errp)
>>  {
>> -    kvmppc_xive_source_get_state(&xive->source);
>> +    /*
>> +     * When the VM is stopped, the sources are masked and the previous
>> +     * state is saved in anticipation of a migration. We should not
>> +     * synchronize the source state in that case else we will override
>> +     * the saved state.
>> +     */
>> +    if (runstate_is_running()) {
>> +        kvmppc_xive_source_get_state(&xive->source);
>> +    }
>>  
>>      /* EAT: there is no extra state to query from KVM */
>>  
>> @@ -515,6 +606,9 @@ void kvmppc_xive_connect(SpaprXive *xive, Error **errp)
>>                                        "xive.tima", tima_len, xive->tm_mmap);
>>      sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
>>  
>> +    xive->change = qemu_add_vm_change_state_handler(
>> +        kvmppc_xive_change_state_handler, xive);
>> +
>>      kvm_kernel_irqchip = true;
>>      kvm_msi_via_irqfd_allowed = true;
>>      kvm_gsi_direct_mapping = true;
>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>> index 7e49badd8c..734662c12a 100644
>> --- a/include/hw/ppc/spapr_xive.h
>> +++ b/include/hw/ppc/spapr_xive.h
>> @@ -42,6 +42,7 @@ typedef struct SpaprXive {
>>      /* KVM support */
>>      int           fd;
>>      void          *tm_mmap;
>> +    VMChangeStateEntry *change;
>>  } SpaprXive;
>>  
>>  /*
>>
>
Alexey Kardashevskiy June 5, 2019, 7:20 a.m. UTC | #3
On 04/06/2019 18:10, Cédric Le Goater wrote:
> On 04/06/2019 09:49, Alexey Kardashevskiy wrote:
>>
>>
>> On 29/05/2019 16:50, David Gibson wrote:
>>> From: Cédric Le Goater <clg@kaod.org>
>>>
>>> This handler is in charge of stabilizing the flow of event notifications
>>> in the XIVE controller before migrating a guest. This is a requirement
>>> before transferring the guest EQ pages to a destination.
>>>
>>> When the VM is stopped, the handler sets the source PQs to PENDING to
>>> stop the flow of events and to possibly catch a triggered interrupt
>>> occuring while the VM is stopped. Their previous state is saved. The
>>> XIVE controller is then synced through KVM to flush any in-flight
>>> event notification and to stabilize the EQs. At this stage, the EQ
>>> pages are marked dirty to make sure the EQ pages are transferred if a
>>> migration sequence is in progress.
>>>
>>> The previous configuration of the sources is restored when the VM
>>> resumes, after a migration or a stop. If an interrupt was queued while
>>> the VM was stopped, the handler simply generates the missing trigger.
>>>
>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>>> Message-Id: <20190513084245.25755-6-clg@kaod.org>
>>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>>
>> This one breaks my nvlink2 passthru setup. The host is v5.2-rc2.
>> v5.2-rc3 fixes it though so it is backward compatibility issue which we
>> care about to what degree here? 
> 
> v5.2-rc2 had an ugly bug impacting passthru under some VM configuration,
> XIVE + single CPU. See :
> 
> bcaa3110d584 ("KVM: PPC: Book3S HV: XIVE: Fix page offset when clearing 
> ESB pages")
> 
> passthru also had a serious issue impacting the XICS-over-XIVE and the 
> XIVE KVM devices :   
> 
> ef9740204051 ("KVM: PPC: Book3S HV: XIVE: Do not clear IRQ data of 
> passthrough interrupts")
> 
> You need an v5.2-rc3 ! 


Yeah, that works. And released v5.1 works too so we do not need to worry
:) Thanks for explanation.


> 
>> I am forcing ic-mode=xive which is not the default so I am not so sure.
> 
> It should be OK.
> 
> C. 
> 
>>
>>
>>
>> aik@u1804kvm:~$ cat /proc/interrupts
>>            CPU0
>>  16:          0  XIVE-IPI   0 Edge      IPI
>>  21:          0  XIVE-IRQ 4096 Edge      RAS_EPOW
>>  22:          0  XIVE-IRQ 4097 Edge      RAS_HOTPLUG
>> 257:      12372  XIVE-IRQ 4353 Edge      ibmvscsi
>> 258:          0  XIVE-IRQ 4864 Edge      virtio0-config
>> 259:       2157  XIVE-IRQ 4865 Edge      virtio0-input.0
>> 260:          1  XIVE-IRQ 4866 Edge      virtio0-output.0
>> 261:          0  XIVE-IRQ 4868 Edge      xhci_hcd
>> 262:          0  XIVE-IRQ 4869 Edge      xhci_hcd
>> 272:          1  XIVE-IRQ 4368 Edge      hvc_console
>> LOC:      10508   Local timer interrupts for timer event device
>> BCT:          0   Broadcast timer interrupts for timer event device
>> LOC:          0   Local timer interrupts for others
>> SPU:          5   Spurious interrupts
>> PMI:          0   Performance monitoring interrupts
>> MCE:          0   Machine check exceptions
>> NMI:          0   System Reset interrupts
>> DBL:          0   Doorbell interrupts
>>
>>
>> and 7bfc759c02b8 "spapr/xive: add state synchronization with KVM" works:
>>
>>            CPU0
>>  16:          0  XIVE-IPI   0 Edge      IPI
>>  19:          0  XIVE-IRQ 4610 Level     NPU Device
>>  20:          0  XIVE-IRQ 4611 Level     NPU Device
>>  21:          0  XIVE-IRQ 4096 Edge      RAS_EPOW
>>  22:          0  XIVE-IRQ 4097 Edge      RAS_HOTPLUG
>> 257:      11833  XIVE-IRQ 4353 Edge      ibmvscsi
>> 258:          0  XIVE-IRQ 4864 Edge      virtio0-config
>> 259:       1632  XIVE-IRQ 4865 Edge      virtio0-input.0
>> 260:          1  XIVE-IRQ 4866 Edge      virtio0-output.0
>> 261:          0  XIVE-IRQ 4868 Edge      xhci_hcd
>> 262:          0  XIVE-IRQ 4869 Edge      xhci_hcd
>> 263:         60  XIVE-IRQ 4867 Edge      nvidia
>> 272:          0  XIVE-IRQ 4368 Edge      hvc_console
>> LOC:       2236   Local timer interrupts for timer event device
>> BCT:          0   Broadcast timer interrupts for timer event device
>> LOC:          0   Local timer interrupts for others
>> SPU:          2   Spurious interrupts
>> PMI:          0   Performance monitoring interrupts
>> MCE:          0   Machine check exceptions
>> NMI:          0   System Reset interrupts
>> DBL:          0   Doorbell interrupts
>>
>>
>>
>> Here is the command line:
>>
>> /home/aik/pbuild/qemu-aikrhel74alt-ppc64/ppc64-softmmu/qemu-system-ppc64 \
>> -nodefaults \
>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>> -mon id=MON0,chardev=STDIO0,mode=readline -nographic -vga none \
>> -enable-kvm \
>> -device nec-usb-xhci,id=nec-usb-xhci0 -m 16G \
>> -netdev "user,id=USER0,hostfwd=tcp::2223-:22" \
>> -device "virtio-net-pci,id=vnet0,mac=C0:41:49:4b:00:00,netdev=USER0" \
>> img/u1804-64G-cuda10.1-418.67-swiotlb.qcow2 \
>> -machine pseries,cap-cfpc=broken,cap-htm=off,ic-mode=xive \
>> -device "vfio-pci,id=vfio0006_00_00_1,host=0006:00:00.1" \
>> -device "vfio-pci,id=vfio0006_00_00_0,host=0006:00:00.0" \
>> -device "vfio-pci,id=vfio0004_04_00_0,host=0004:04:00.0" \
>> -kernel ./vmldbg -append "root=/dev/sda2 console=hvc0 debug loglevel=8" \
>> -snapshot \
>> -smp 1,threads=1 -bios ./slof.bin \
>> -L /home/aik/t/qemu-ppc64-bios/ \
>> -trace events=qemu_trace_events -d guest_errors \
>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.user2223.6_0_0_1 \
>> -mon chardev=SOCKET0,mode=control
>>
>>
>>
>>
>>> ---
>>>  hw/intc/spapr_xive_kvm.c    | 96 ++++++++++++++++++++++++++++++++++++-
>>>  include/hw/ppc/spapr_xive.h |  1 +
>>>  2 files changed, 96 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>>> index 8dd4f96e0b..735577a6f8 100644
>>> --- a/hw/intc/spapr_xive_kvm.c
>>> +++ b/hw/intc/spapr_xive_kvm.c
>>> @@ -433,9 +433,100 @@ static void kvmppc_xive_get_queues(SpaprXive *xive, Error **errp)
>>>      }
>>>  }
>>>  
>>> +/*
>>> + * The primary goal of the XIVE VM change handler is to mark the EQ
>>> + * pages dirty when all XIVE event notifications have stopped.
>>> + *
>>> + * Whenever the VM is stopped, the VM change handler sets the source
>>> + * PQs to PENDING to stop the flow of events and to possibly catch a
>>> + * triggered interrupt occuring while the VM is stopped. The previous
>>> + * state is saved in anticipation of a migration. The XIVE controller
>>> + * is then synced through KVM to flush any in-flight event
>>> + * notification and stabilize the EQs.
>>> + *
>>> + * At this stage, we can mark the EQ page dirty and let a migration
>>> + * sequence transfer the EQ pages to the destination, which is done
>>> + * just after the stop state.
>>> + *
>>> + * The previous configuration of the sources is restored when the VM
>>> + * runs again. If an interrupt was queued while the VM was stopped,
>>> + * simply generate a trigger.
>>> + */
>>> +static void kvmppc_xive_change_state_handler(void *opaque, int running,
>>> +                                             RunState state)
>>> +{
>>> +    SpaprXive *xive = opaque;
>>> +    XiveSource *xsrc = &xive->source;
>>> +    Error *local_err = NULL;
>>> +    int i;
>>> +
>>> +    /*
>>> +     * Restore the sources to their initial state. This is called when
>>> +     * the VM resumes after a stop or a migration.
>>> +     */
>>> +    if (running) {
>>> +        for (i = 0; i < xsrc->nr_irqs; i++) {
>>> +            uint8_t pq = xive_source_esb_get(xsrc, i);
>>> +            uint8_t old_pq;
>>> +
>>> +            old_pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
>>> +
>>> +            /*
>>> +             * An interrupt was queued while the VM was stopped,
>>> +             * generate a trigger.
>>> +             */
>>> +            if (pq == XIVE_ESB_RESET && old_pq == XIVE_ESB_QUEUED) {
>>> +                xive_esb_trigger(xsrc, i);
>>> +            }
>>> +        }
>>> +
>>> +        return;
>>> +    }
>>> +
>>> +    /*
>>> +     * Mask the sources, to stop the flow of event notifications, and
>>> +     * save the PQs locally in the XiveSource object. The XiveSource
>>> +     * state will be collected later on by its vmstate handler if a
>>> +     * migration is in progress.
>>> +     */
>>> +    for (i = 0; i < xsrc->nr_irqs; i++) {
>>> +        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
>>> +
>>> +        /*
>>> +         * PQ is set to PENDING to possibly catch a triggered
>>> +         * interrupt occuring while the VM is stopped (hotplug event
>>> +         * for instance) .
>>> +         */
>>> +        if (pq != XIVE_ESB_OFF) {
>>> +            pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_10);
>>> +        }
>>> +        xive_source_esb_set(xsrc, i, pq);
>>> +    }
>>> +
>>> +    /*
>>> +     * Sync the XIVE controller in KVM, to flush in-flight event
>>> +     * notification that should be enqueued in the EQs and mark the
>>> +     * XIVE EQ pages dirty to collect all updates.
>>> +     */
>>> +    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL,
>>> +                      KVM_DEV_XIVE_EQ_SYNC, NULL, true, &local_err);
>>> +    if (local_err) {
>>> +        error_report_err(local_err);
>>> +        return;
>>> +    }
>>> +}
>>> +
>>>  void kvmppc_xive_synchronize_state(SpaprXive *xive, Error **errp)
>>>  {
>>> -    kvmppc_xive_source_get_state(&xive->source);
>>> +    /*
>>> +     * When the VM is stopped, the sources are masked and the previous
>>> +     * state is saved in anticipation of a migration. We should not
>>> +     * synchronize the source state in that case else we will override
>>> +     * the saved state.
>>> +     */
>>> +    if (runstate_is_running()) {
>>> +        kvmppc_xive_source_get_state(&xive->source);
>>> +    }
>>>  
>>>      /* EAT: there is no extra state to query from KVM */
>>>  
>>> @@ -515,6 +606,9 @@ void kvmppc_xive_connect(SpaprXive *xive, Error **errp)
>>>                                        "xive.tima", tima_len, xive->tm_mmap);
>>>      sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
>>>  
>>> +    xive->change = qemu_add_vm_change_state_handler(
>>> +        kvmppc_xive_change_state_handler, xive);
>>> +
>>>      kvm_kernel_irqchip = true;
>>>      kvm_msi_via_irqfd_allowed = true;
>>>      kvm_gsi_direct_mapping = true;
>>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>>> index 7e49badd8c..734662c12a 100644
>>> --- a/include/hw/ppc/spapr_xive.h
>>> +++ b/include/hw/ppc/spapr_xive.h
>>> @@ -42,6 +42,7 @@ typedef struct SpaprXive {
>>>      /* KVM support */
>>>      int           fd;
>>>      void          *tm_mmap;
>>> +    VMChangeStateEntry *change;
>>>  } SpaprXive;
>>>  
>>>  /*
>>>
>>
>
diff mbox series

Patch

diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index 8dd4f96e0b..735577a6f8 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -433,9 +433,100 @@  static void kvmppc_xive_get_queues(SpaprXive *xive, Error **errp)
     }
 }
 
+/*
+ * The primary goal of the XIVE VM change handler is to mark the EQ
+ * pages dirty when all XIVE event notifications have stopped.
+ *
+ * Whenever the VM is stopped, the VM change handler sets the source
+ * PQs to PENDING to stop the flow of events and to possibly catch a
+ * triggered interrupt occuring while the VM is stopped. The previous
+ * state is saved in anticipation of a migration. The XIVE controller
+ * is then synced through KVM to flush any in-flight event
+ * notification and stabilize the EQs.
+ *
+ * At this stage, we can mark the EQ page dirty and let a migration
+ * sequence transfer the EQ pages to the destination, which is done
+ * just after the stop state.
+ *
+ * The previous configuration of the sources is restored when the VM
+ * runs again. If an interrupt was queued while the VM was stopped,
+ * simply generate a trigger.
+ */
+static void kvmppc_xive_change_state_handler(void *opaque, int running,
+                                             RunState state)
+{
+    SpaprXive *xive = opaque;
+    XiveSource *xsrc = &xive->source;
+    Error *local_err = NULL;
+    int i;
+
+    /*
+     * Restore the sources to their initial state. This is called when
+     * the VM resumes after a stop or a migration.
+     */
+    if (running) {
+        for (i = 0; i < xsrc->nr_irqs; i++) {
+            uint8_t pq = xive_source_esb_get(xsrc, i);
+            uint8_t old_pq;
+
+            old_pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
+
+            /*
+             * An interrupt was queued while the VM was stopped,
+             * generate a trigger.
+             */
+            if (pq == XIVE_ESB_RESET && old_pq == XIVE_ESB_QUEUED) {
+                xive_esb_trigger(xsrc, i);
+            }
+        }
+
+        return;
+    }
+
+    /*
+     * Mask the sources, to stop the flow of event notifications, and
+     * save the PQs locally in the XiveSource object. The XiveSource
+     * state will be collected later on by its vmstate handler if a
+     * migration is in progress.
+     */
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
+
+        /*
+         * PQ is set to PENDING to possibly catch a triggered
+         * interrupt occuring while the VM is stopped (hotplug event
+         * for instance) .
+         */
+        if (pq != XIVE_ESB_OFF) {
+            pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_10);
+        }
+        xive_source_esb_set(xsrc, i, pq);
+    }
+
+    /*
+     * Sync the XIVE controller in KVM, to flush in-flight event
+     * notification that should be enqueued in the EQs and mark the
+     * XIVE EQ pages dirty to collect all updates.
+     */
+    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL,
+                      KVM_DEV_XIVE_EQ_SYNC, NULL, true, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return;
+    }
+}
+
 void kvmppc_xive_synchronize_state(SpaprXive *xive, Error **errp)
 {
-    kvmppc_xive_source_get_state(&xive->source);
+    /*
+     * When the VM is stopped, the sources are masked and the previous
+     * state is saved in anticipation of a migration. We should not
+     * synchronize the source state in that case else we will override
+     * the saved state.
+     */
+    if (runstate_is_running()) {
+        kvmppc_xive_source_get_state(&xive->source);
+    }
 
     /* EAT: there is no extra state to query from KVM */
 
@@ -515,6 +606,9 @@  void kvmppc_xive_connect(SpaprXive *xive, Error **errp)
                                       "xive.tima", tima_len, xive->tm_mmap);
     sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
 
+    xive->change = qemu_add_vm_change_state_handler(
+        kvmppc_xive_change_state_handler, xive);
+
     kvm_kernel_irqchip = true;
     kvm_msi_via_irqfd_allowed = true;
     kvm_gsi_direct_mapping = true;
diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 7e49badd8c..734662c12a 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -42,6 +42,7 @@  typedef struct SpaprXive {
     /* KVM support */
     int           fd;
     void          *tm_mmap;
+    VMChangeStateEntry *change;
 } SpaprXive;
 
 /*