diff mbox

[6/8] spapr: move interrupt allocator to xics

Message ID 1394770689-29039-7-git-send-email-aik@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy March 14, 2014, 4:18 a.m. UTC
The current allocator returns IRQ numbers from a pool and does not
support IRQs reuse in any form as it did not keep track of what it
previously returned, it only had the last returned IRQ.
However migration may change interrupts for devices depending on
their order in the command line.

This moves an allocator from SPAPR to XICS.

This switches IRQ users to use new API.

This uses LSI/MSI flags to know if interrupt is in use.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 hw/intc/xics.c         | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/ppc/spapr.c         | 67 ------------------------------------------
 hw/ppc/spapr_events.c  |  2 +-
 hw/ppc/spapr_pci.c     |  6 ++--
 hw/ppc/spapr_vio.c     |  2 +-
 include/hw/ppc/spapr.h | 10 -------
 include/hw/ppc/xics.h  |  2 ++
 trace-events           |  3 ++
 8 files changed, 90 insertions(+), 82 deletions(-)

Comments

Alexander Graf April 10, 2014, 12:51 p.m. UTC | #1
On 14.03.14 05:18, Alexey Kardashevskiy wrote:
> The current allocator returns IRQ numbers from a pool and does not
> support IRQs reuse in any form as it did not keep track of what it
> previously returned, it only had the last returned IRQ.
> However migration may change interrupts for devices depending on
> their order in the command line.

Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't 
change anything.


Alex

> This moves an allocator from SPAPR to XICS.
>
> This switches IRQ users to use new API.
>
> This uses LSI/MSI flags to know if interrupt is in use.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>   hw/intc/xics.c         | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++
>   hw/ppc/spapr.c         | 67 ------------------------------------------
>   hw/ppc/spapr_events.c  |  2 +-
>   hw/ppc/spapr_pci.c     |  6 ++--
>   hw/ppc/spapr_vio.c     |  2 +-
>   include/hw/ppc/spapr.h | 10 -------
>   include/hw/ppc/xics.h  |  2 ++
>   trace-events           |  3 ++
>   8 files changed, 90 insertions(+), 82 deletions(-)
>
> diff --git a/hw/intc/xics.c b/hw/intc/xics.c
> index e5195bf..8d101a3 100644
> --- a/hw/intc/xics.c
> +++ b/hw/intc/xics.c
> @@ -690,6 +690,86 @@ void xics_set_irq_type(XICSState *icp, int irq, bool lsi)
>       ics_set_irq_type(&icp->ics[server], irq, lsi);
>   }
>   
> +#define XICS_IRQ_FREE(ics, n)   \
> +    (!((ics)->irqs[(n) - (ics)->offset].flags & \
> +       (XICS_FLAGS_LSI | XICS_FLAGS_MSI)))
> +
> +static int ics_find_free_block(ICSState *ics, int num, int alignnum)
> +{
> +    int first, i;
> +
> +    for (first = 0; first < ics->nr_irqs; first += alignnum) {
> +        if (num > (ics->nr_irqs - first)) {
> +            return -1;
> +        }
> +        for (i = first; i < first + num; ++i) {
> +            if (!XICS_IRQ_FREE(ics, i + ics->offset)) {
> +                break;
> +            }
> +        }
> +        if (i == (first + num)) {
> +            return first + ics->offset;
> +        }
> +    }
> +
> +    return -1;
> +}
> +
> +int xics_alloc(XICSState *icp, int server, int irq, bool lsi)
> +{
> +    ICSState *ics = &icp->ics[server];
> +
> +    if (irq) {
> +        assert(server == xics_find_server(icp, irq));
> +        if (!XICS_IRQ_FREE(ics, irq)) {
> +            trace_xics_alloc_failed(server, irq);
> +            return -1;
> +        }
> +    } else {
> +        irq = ics_find_free_block(ics, 1, 1);
> +    }
> +
> +    ics_set_irq_type(ics, irq, lsi);
> +    trace_xics_alloc(server, irq);
> +
> +    return irq;
> +}
> +
> +/*
> + * Allocate block of consequtive IRQs, returns a number of the first.
> + * If align==true, aligns the first IRQ number to num.
> + */
> +int xics_alloc_block(XICSState *icp, int server, int num, bool lsi, bool align)
> +{
> +    int i, first = -1;
> +    ICSState *ics = &icp->ics[server];
> +
> +    assert(server == 0);
> +    /*
> +     * MSIMesage::data is used for storing VIRQ so
> +     * it has to be aligned to num to support multiple
> +     * MSI vectors. MSI-X is not affected by this.
> +     * The hint is used for the first IRQ, the rest should
> +     * be allocated continuously.
> +     */
> +    if (align) {
> +        assert((num == 1) || (num == 2) || (num == 4) ||
> +               (num == 8) || (num == 16) || (num == 32));
> +        first = ics_find_free_block(ics, num, num);
> +    } else {
> +        first = ics_find_free_block(ics, num, 1);
> +    }
> +
> +    if (first > 0) {
> +        for (i = first; i < first + num; ++i) {
> +            ics_set_irq_type(ics, i, lsi);
> +        }
> +    }
> +    trace_xics_alloc_block(server, first, num, lsi, align);
> +
> +    return first;
> +}
> +
>   /*
>    * Guest interfaces
>    */
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 12adc21..29ca2e0 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -83,73 +83,6 @@
>   
>   sPAPREnvironment *spapr;
>   
> -int spapr_allocate_irq(int hint, bool lsi)
> -{
> -    int irq;
> -
> -    if (hint) {
> -        irq = hint;
> -        if (hint >= spapr->next_irq) {
> -            spapr->next_irq = hint + 1;
> -        }
> -        /* FIXME: we should probably check for collisions somehow */
> -    } else {
> -        irq = spapr->next_irq++;
> -    }
> -
> -    /* Configure irq type */
> -    if (!xics_get_qirq(spapr->icp, irq)) {
> -        return 0;
> -    }
> -
> -    xics_set_irq_type(spapr->icp, irq, lsi);
> -
> -    return irq;
> -}
> -
> -/*
> - * Allocate block of consequtive IRQs, returns a number of the first.
> - * If msi==true, aligns the first IRQ number to num.
> - */
> -int spapr_allocate_irq_block(int num, bool lsi, bool msi)
> -{
> -    int first = -1;
> -    int i, hint = 0;
> -
> -    /*
> -     * MSIMesage::data is used for storing VIRQ so
> -     * it has to be aligned to num to support multiple
> -     * MSI vectors. MSI-X is not affected by this.
> -     * The hint is used for the first IRQ, the rest should
> -     * be allocated continuously.
> -     */
> -    if (msi) {
> -        assert((num == 1) || (num == 2) || (num == 4) ||
> -               (num == 8) || (num == 16) || (num == 32));
> -        hint = (spapr->next_irq + num - 1) & ~(num - 1);
> -    }
> -
> -    for (i = 0; i < num; ++i) {
> -        int irq;
> -
> -        irq = spapr_allocate_irq(hint, lsi);
> -        if (!irq) {
> -            return -1;
> -        }
> -
> -        if (0 == i) {
> -            first = irq;
> -            hint = 0;
> -        }
> -
> -        /* If the above doesn't create a consecutive block then that's
> -         * an internal bug */
> -        assert(irq == (first + i));
> -    }
> -
> -    return first;
> -}
> -
>   static XICSState *try_create_xics(const char *type, int nr_servers,
>                                     int nr_irqs)
>   {
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 16fa49e..e605430 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -314,7 +314,7 @@ static void check_exception(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>   
>   void spapr_events_init(sPAPREnvironment *spapr)
>   {
> -    spapr->epow_irq = spapr_allocate_msi(0);
> +    spapr->epow_irq = xics_alloc_block(spapr->icp, 0, 1, false, false);
>       spapr->epow_notifier.notify = spapr_powerdown_req;
>       qemu_register_powerdown_notifier(&spapr->epow_notifier);
>       spapr_rtas_register("check-exception", check_exception);
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index cbef095..4eaf364 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -343,8 +343,8 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>   
>       /* There is no cached config, allocate MSIs */
>       if (!phb->msi_table[ndev].nvec) {
> -        irq = spapr_allocate_irq_block(req_num, false,
> -                                       ret_intr_type == RTAS_TYPE_MSI);
> +        irq = xics_alloc_block(spapr->icp, 0, req_num, false,
> +                               ret_intr_type == RTAS_TYPE_MSI);
>           if (irq < 0) {
>               error_report("Cannot allocate MSIs for device#%d", ndev);
>               rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
> @@ -623,7 +623,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>       for (i = 0; i < PCI_NUM_PINS; i++) {
>           uint32_t irq;
>   
> -        irq = spapr_allocate_lsi(0);
> +        irq = xics_alloc_block(spapr->icp, 0, 1, true, false);
>           if (!irq) {
>               error_setg(errp, "spapr_allocate_lsi failed");
>               return;
> diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c
> index 4e33f46..8aeb263 100644
> --- a/hw/ppc/spapr_vio.c
> +++ b/hw/ppc/spapr_vio.c
> @@ -448,7 +448,7 @@ static int spapr_vio_busdev_init(DeviceState *qdev)
>           dev->qdev.id = id;
>       }
>   
> -    dev->irq = spapr_allocate_msi(dev->irq);
> +    dev->irq = xics_alloc(spapr->icp, 0, dev->irq, false);
>       if (!dev->irq) {
>           return -1;
>       }
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 18332fd..5d577ff 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -323,16 +323,6 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>   int spapr_allocate_irq(int hint, bool lsi);
>   int spapr_allocate_irq_block(int num, bool lsi, bool msi);
>   
> -static inline int spapr_allocate_msi(int hint)
> -{
> -    return spapr_allocate_irq(hint, false);
> -}
> -
> -static inline int spapr_allocate_lsi(int hint)
> -{
> -    return spapr_allocate_irq(hint, true);
> -}
> -
>   /* RTAS return codes */
>   #define RTAS_OUT_SUCCESS            0
>   #define RTAS_OUT_NO_ERRORS_FOUND    1
> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
> index 4b30e9a..337398d 100644
> --- a/include/hw/ppc/xics.h
> +++ b/include/hw/ppc/xics.h
> @@ -157,6 +157,8 @@ struct ICSIRQState {
>   
>   qemu_irq xics_get_qirq(XICSState *icp, int irq);
>   void xics_set_irq_type(XICSState *icp, int irq, bool lsi);
> +int xics_alloc(XICSState *icp, int server, int irq, bool lsi);
> +int xics_alloc_block(XICSState *icp, int server, int num, bool lsi, bool align);
>   
>   void xics_cpu_setup(XICSState *icp, PowerPCCPU *cpu);
>   
> diff --git a/trace-events b/trace-events
> index 002c260..ad7400e 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1143,6 +1143,9 @@ xics_set_irq_lsi(int srcno, int nr) "set_irq_lsi: srcno %d [irq %#x]"
>   xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority) "ics_write_xive: irq %#x [src %d] server %#x prio %#x"
>   xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]"
>   xics_ics_eoi(int nr) "ics_eoi: irq %#x"
> +xics_alloc(int server, int irq) "server#%d, irq %d"
> +xics_alloc_failed(int server, int irq) "server#%d, irq %d"
> +xics_alloc_block(int server, int first, int num, bool lsi, int align) "server#%d, first irq %d, %d irqs, lsi=%d, alignnum %d"
>   
>   # hw/ppc/spapr_iommu.c
>   spapr_iommu_put(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tce=0x%"PRIx64" ret=%"PRId64
Alexey Kardashevskiy April 10, 2014, 1:24 p.m. UTC | #2
On 04/10/2014 10:51 PM, Alexander Graf wrote:
> 
> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>> The current allocator returns IRQ numbers from a pool and does not
>> support IRQs reuse in any form as it did not keep track of what it
>> previously returned, it only had the last returned IRQ.
>> However migration may change interrupts for devices depending on
>> their order in the command line.
> 
> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't change
> anything.


I put wrong commit message. By change I meant that the default state before
the destination guest started accepting migration is different from what
the destination guest became after migration finished. And migration cannot
avoid changing this default state.




> Alex
> 
>> This moves an allocator from SPAPR to XICS.
>>
>> This switches IRQ users to use new API.
>>
>> This uses LSI/MSI flags to know if interrupt is in use.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   hw/intc/xics.c         | 80
>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   hw/ppc/spapr.c         | 67 ------------------------------------------
>>   hw/ppc/spapr_events.c  |  2 +-
>>   hw/ppc/spapr_pci.c     |  6 ++--
>>   hw/ppc/spapr_vio.c     |  2 +-
>>   include/hw/ppc/spapr.h | 10 -------
>>   include/hw/ppc/xics.h  |  2 ++
>>   trace-events           |  3 ++
>>   8 files changed, 90 insertions(+), 82 deletions(-)
>>
>> diff --git a/hw/intc/xics.c b/hw/intc/xics.c
>> index e5195bf..8d101a3 100644
>> --- a/hw/intc/xics.c
>> +++ b/hw/intc/xics.c
>> @@ -690,6 +690,86 @@ void xics_set_irq_type(XICSState *icp, int irq, bool
>> lsi)
>>       ics_set_irq_type(&icp->ics[server], irq, lsi);
>>   }
>>   +#define XICS_IRQ_FREE(ics, n)   \
>> +    (!((ics)->irqs[(n) - (ics)->offset].flags & \
>> +       (XICS_FLAGS_LSI | XICS_FLAGS_MSI)))
>> +
>> +static int ics_find_free_block(ICSState *ics, int num, int alignnum)
>> +{
>> +    int first, i;
>> +
>> +    for (first = 0; first < ics->nr_irqs; first += alignnum) {
>> +        if (num > (ics->nr_irqs - first)) {
>> +            return -1;
>> +        }
>> +        for (i = first; i < first + num; ++i) {
>> +            if (!XICS_IRQ_FREE(ics, i + ics->offset)) {
>> +                break;
>> +            }
>> +        }
>> +        if (i == (first + num)) {
>> +            return first + ics->offset;
>> +        }
>> +    }
>> +
>> +    return -1;
>> +}
>> +
>> +int xics_alloc(XICSState *icp, int server, int irq, bool lsi)
>> +{
>> +    ICSState *ics = &icp->ics[server];
>> +
>> +    if (irq) {
>> +        assert(server == xics_find_server(icp, irq));
>> +        if (!XICS_IRQ_FREE(ics, irq)) {
>> +            trace_xics_alloc_failed(server, irq);
>> +            return -1;
>> +        }
>> +    } else {
>> +        irq = ics_find_free_block(ics, 1, 1);
>> +    }
>> +
>> +    ics_set_irq_type(ics, irq, lsi);
>> +    trace_xics_alloc(server, irq);
>> +
>> +    return irq;
>> +}
>> +
>> +/*
>> + * Allocate block of consequtive IRQs, returns a number of the first.
>> + * If align==true, aligns the first IRQ number to num.
>> + */
>> +int xics_alloc_block(XICSState *icp, int server, int num, bool lsi, bool
>> align)
>> +{
>> +    int i, first = -1;
>> +    ICSState *ics = &icp->ics[server];
>> +
>> +    assert(server == 0);
>> +    /*
>> +     * MSIMesage::data is used for storing VIRQ so
>> +     * it has to be aligned to num to support multiple
>> +     * MSI vectors. MSI-X is not affected by this.
>> +     * The hint is used for the first IRQ, the rest should
>> +     * be allocated continuously.
>> +     */
>> +    if (align) {
>> +        assert((num == 1) || (num == 2) || (num == 4) ||
>> +               (num == 8) || (num == 16) || (num == 32));
>> +        first = ics_find_free_block(ics, num, num);
>> +    } else {
>> +        first = ics_find_free_block(ics, num, 1);
>> +    }
>> +
>> +    if (first > 0) {
>> +        for (i = first; i < first + num; ++i) {
>> +            ics_set_irq_type(ics, i, lsi);
>> +        }
>> +    }
>> +    trace_xics_alloc_block(server, first, num, lsi, align);
>> +
>> +    return first;
>> +}
>> +
>>   /*
>>    * Guest interfaces
>>    */
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 12adc21..29ca2e0 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -83,73 +83,6 @@
>>     sPAPREnvironment *spapr;
>>   -int spapr_allocate_irq(int hint, bool lsi)
>> -{
>> -    int irq;
>> -
>> -    if (hint) {
>> -        irq = hint;
>> -        if (hint >= spapr->next_irq) {
>> -            spapr->next_irq = hint + 1;
>> -        }
>> -        /* FIXME: we should probably check for collisions somehow */
>> -    } else {
>> -        irq = spapr->next_irq++;
>> -    }
>> -
>> -    /* Configure irq type */
>> -    if (!xics_get_qirq(spapr->icp, irq)) {
>> -        return 0;
>> -    }
>> -
>> -    xics_set_irq_type(spapr->icp, irq, lsi);
>> -
>> -    return irq;
>> -}
>> -
>> -/*
>> - * Allocate block of consequtive IRQs, returns a number of the first.
>> - * If msi==true, aligns the first IRQ number to num.
>> - */
>> -int spapr_allocate_irq_block(int num, bool lsi, bool msi)
>> -{
>> -    int first = -1;
>> -    int i, hint = 0;
>> -
>> -    /*
>> -     * MSIMesage::data is used for storing VIRQ so
>> -     * it has to be aligned to num to support multiple
>> -     * MSI vectors. MSI-X is not affected by this.
>> -     * The hint is used for the first IRQ, the rest should
>> -     * be allocated continuously.
>> -     */
>> -    if (msi) {
>> -        assert((num == 1) || (num == 2) || (num == 4) ||
>> -               (num == 8) || (num == 16) || (num == 32));
>> -        hint = (spapr->next_irq + num - 1) & ~(num - 1);
>> -    }
>> -
>> -    for (i = 0; i < num; ++i) {
>> -        int irq;
>> -
>> -        irq = spapr_allocate_irq(hint, lsi);
>> -        if (!irq) {
>> -            return -1;
>> -        }
>> -
>> -        if (0 == i) {
>> -            first = irq;
>> -            hint = 0;
>> -        }
>> -
>> -        /* If the above doesn't create a consecutive block then that's
>> -         * an internal bug */
>> -        assert(irq == (first + i));
>> -    }
>> -
>> -    return first;
>> -}
>> -
>>   static XICSState *try_create_xics(const char *type, int nr_servers,
>>                                     int nr_irqs)
>>   {
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index 16fa49e..e605430 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -314,7 +314,7 @@ static void check_exception(PowerPCCPU *cpu,
>> sPAPREnvironment *spapr,
>>     void spapr_events_init(sPAPREnvironment *spapr)
>>   {
>> -    spapr->epow_irq = spapr_allocate_msi(0);
>> +    spapr->epow_irq = xics_alloc_block(spapr->icp, 0, 1, false, false);
>>       spapr->epow_notifier.notify = spapr_powerdown_req;
>>       qemu_register_powerdown_notifier(&spapr->epow_notifier);
>>       spapr_rtas_register("check-exception", check_exception);
>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>> index cbef095..4eaf364 100644
>> --- a/hw/ppc/spapr_pci.c
>> +++ b/hw/ppc/spapr_pci.c
>> @@ -343,8 +343,8 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu,
>> sPAPREnvironment *spapr,
>>         /* There is no cached config, allocate MSIs */
>>       if (!phb->msi_table[ndev].nvec) {
>> -        irq = spapr_allocate_irq_block(req_num, false,
>> -                                       ret_intr_type == RTAS_TYPE_MSI);
>> +        irq = xics_alloc_block(spapr->icp, 0, req_num, false,
>> +                               ret_intr_type == RTAS_TYPE_MSI);
>>           if (irq < 0) {
>>               error_report("Cannot allocate MSIs for device#%d", ndev);
>>               rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
>> @@ -623,7 +623,7 @@ static void spapr_phb_realize(DeviceState *dev, Error
>> **errp)
>>       for (i = 0; i < PCI_NUM_PINS; i++) {
>>           uint32_t irq;
>>   -        irq = spapr_allocate_lsi(0);
>> +        irq = xics_alloc_block(spapr->icp, 0, 1, true, false);
>>           if (!irq) {
>>               error_setg(errp, "spapr_allocate_lsi failed");
>>               return;
>> diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c
>> index 4e33f46..8aeb263 100644
>> --- a/hw/ppc/spapr_vio.c
>> +++ b/hw/ppc/spapr_vio.c
>> @@ -448,7 +448,7 @@ static int spapr_vio_busdev_init(DeviceState *qdev)
>>           dev->qdev.id = id;
>>       }
>>   -    dev->irq = spapr_allocate_msi(dev->irq);
>> +    dev->irq = xics_alloc(spapr->icp, 0, dev->irq, false);
>>       if (!dev->irq) {
>>           return -1;
>>       }
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 18332fd..5d577ff 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -323,16 +323,6 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu,
>> target_ulong opcode,
>>   int spapr_allocate_irq(int hint, bool lsi);
>>   int spapr_allocate_irq_block(int num, bool lsi, bool msi);
>>   -static inline int spapr_allocate_msi(int hint)
>> -{
>> -    return spapr_allocate_irq(hint, false);
>> -}
>> -
>> -static inline int spapr_allocate_lsi(int hint)
>> -{
>> -    return spapr_allocate_irq(hint, true);
>> -}
>> -
>>   /* RTAS return codes */
>>   #define RTAS_OUT_SUCCESS            0
>>   #define RTAS_OUT_NO_ERRORS_FOUND    1
>> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
>> index 4b30e9a..337398d 100644
>> --- a/include/hw/ppc/xics.h
>> +++ b/include/hw/ppc/xics.h
>> @@ -157,6 +157,8 @@ struct ICSIRQState {
>>     qemu_irq xics_get_qirq(XICSState *icp, int irq);
>>   void xics_set_irq_type(XICSState *icp, int irq, bool lsi);
>> +int xics_alloc(XICSState *icp, int server, int irq, bool lsi);
>> +int xics_alloc_block(XICSState *icp, int server, int num, bool lsi, bool
>> align);
>>     void xics_cpu_setup(XICSState *icp, PowerPCCPU *cpu);
>>   diff --git a/trace-events b/trace-events
>> index 002c260..ad7400e 100644
>> --- a/trace-events
>> +++ b/trace-events
>> @@ -1143,6 +1143,9 @@ xics_set_irq_lsi(int srcno, int nr) "set_irq_lsi:
>> srcno %d [irq %#x]"
>>   xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority)
>> "ics_write_xive: irq %#x [src %d] server %#x prio %#x"
>>   xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]"
>>   xics_ics_eoi(int nr) "ics_eoi: irq %#x"
>> +xics_alloc(int server, int irq) "server#%d, irq %d"
>> +xics_alloc_failed(int server, int irq) "server#%d, irq %d"
>> +xics_alloc_block(int server, int first, int num, bool lsi, int align)
>> "server#%d, first irq %d, %d irqs, lsi=%d, alignnum %d"
>>     # hw/ppc/spapr_iommu.c
>>   spapr_iommu_put(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t
>> ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tce=0x%"PRIx64" ret=%"PRId64
>
Alexander Graf April 10, 2014, 1:26 p.m. UTC | #3
On 10.04.14 15:24, Alexey Kardashevskiy wrote:
> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>> The current allocator returns IRQ numbers from a pool and does not
>>> support IRQs reuse in any form as it did not keep track of what it
>>> previously returned, it only had the last returned IRQ.
>>> However migration may change interrupts for devices depending on
>>> their order in the command line.
>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't change
>> anything.
>
> I put wrong commit message. By change I meant that the default state before
> the destination guest started accepting migration is different from what
> the destination guest became after migration finished. And migration cannot
> avoid changing this default state.

Ok, why is the IRQ configuration different?


Alex
Alexey Kardashevskiy April 10, 2014, 2:43 p.m. UTC | #4
On 04/10/2014 11:26 PM, Alexander Graf wrote:
> 
> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>> The current allocator returns IRQ numbers from a pool and does not
>>>> support IRQs reuse in any form as it did not keep track of what it
>>>> previously returned, it only had the last returned IRQ.
>>>> However migration may change interrupts for devices depending on
>>>> their order in the command line.
>>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't change
>>> anything.
>>
>> I put wrong commit message. By change I meant that the default state before
>> the destination guest started accepting migration is different from what
>> the destination guest became after migration finished. And migration cannot
>> avoid changing this default state.
> 
> Ok, why is the IRQ configuration different?

Because QEMU creates devices in the order as in the command line, and
libvirt changes this order - the XML used to create the guest and the XML
which is sends during migration are different. libvirt thinks it is ok
while it keeps @reg property for (for example) spapr-vscsi devices but it
is not because since the order is different, devices call IRQ allocator in
different order and get different IRQs.
Alexander Graf April 11, 2014, 9:24 a.m. UTC | #5
On 10.04.14 16:43, Alexey Kardashevskiy wrote:
> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>> The current allocator returns IRQ numbers from a pool and does not
>>>>> support IRQs reuse in any form as it did not keep track of what it
>>>>> previously returned, it only had the last returned IRQ.
>>>>> However migration may change interrupts for devices depending on
>>>>> their order in the command line.
>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't change
>>>> anything.
>>> I put wrong commit message. By change I meant that the default state before
>>> the destination guest started accepting migration is different from what
>>> the destination guest became after migration finished. And migration cannot
>>> avoid changing this default state.
>> Ok, why is the IRQ configuration different?
> Because QEMU creates devices in the order as in the command line, and
> libvirt changes this order - the XML used to create the guest and the XML
> which is sends during migration are different. libvirt thinks it is ok
> while it keeps @reg property for (for example) spapr-vscsi devices but it
> is not because since the order is different, devices call IRQ allocator in
> different order and get different IRQs.

So your patch migrates the current IRQ configuration, but once you 
restart the virtual machine on the destination host it will have 
different IRQ numbering again, right?

I'm not sure that's a good solution to the problem. I guess we should 
rather aim to make sure that we can make IRQ allocation explicit. 
Fundamentally the problem sounds very similar to the PCI slot allocation 
which eventually got solved by libvirt specifying the slots manually.


Alex
Alexey Kardashevskiy April 11, 2014, 12:38 p.m. UTC | #6
On 04/11/2014 07:24 PM, Alexander Graf wrote:
> 
> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>> The current allocator returns IRQ numbers from a pool and does not
>>>>>> support IRQs reuse in any form as it did not keep track of what it
>>>>>> previously returned, it only had the last returned IRQ.
>>>>>> However migration may change interrupts for devices depending on
>>>>>> their order in the command line.
>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't change
>>>>> anything.
>>>> I put wrong commit message. By change I meant that the default state
>>>> before
>>>> the destination guest started accepting migration is different from what
>>>> the destination guest became after migration finished. And migration
>>>> cannot
>>>> avoid changing this default state.
>>> Ok, why is the IRQ configuration different?
>> Because QEMU creates devices in the order as in the command line, and
>> libvirt changes this order - the XML used to create the guest and the XML
>> which is sends during migration are different. libvirt thinks it is ok
>> while it keeps @reg property for (for example) spapr-vscsi devices but it
>> is not because since the order is different, devices call IRQ allocator in
>> different order and get different IRQs.
> 
> So your patch migrates the current IRQ configuration, but once you restart
> the virtual machine on the destination host it will have different IRQ
> numbering again, right?

No, why? IRQs are assigned at init time from realize() callbacks (and
survive reset) or as a part of ibm,change-msi rtas call which happens in
the same order as it only depends on pci addresses and we do not change
this either.


> I'm not sure that's a good solution to the problem. I guess we should
> rather aim to make sure that we can make IRQ allocation explicit.
> Fundamentally the problem sounds very similar to the PCI slot allocation
> which eventually got solved by libvirt specifying the slots manually.

We can do that too. Who decides? :)
Alexander Graf April 11, 2014, 1:58 p.m. UTC | #7
On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>> 
>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>> The current allocator returns IRQ numbers from a pool and does not
>>>>>>> support IRQs reuse in any form as it did not keep track of what it
>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>> However migration may change interrupts for devices depending on
>>>>>>> their order in the command line.
>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't change
>>>>>> anything.
>>>>> I put wrong commit message. By change I meant that the default state
>>>>> before
>>>>> the destination guest started accepting migration is different from what
>>>>> the destination guest became after migration finished. And migration
>>>>> cannot
>>>>> avoid changing this default state.
>>>> Ok, why is the IRQ configuration different?
>>> Because QEMU creates devices in the order as in the command line, and
>>> libvirt changes this order - the XML used to create the guest and the XML
>>> which is sends during migration are different. libvirt thinks it is ok
>>> while it keeps @reg property for (for example) spapr-vscsi devices but it
>>> is not because since the order is different, devices call IRQ allocator in
>>> different order and get different IRQs.
>> 
>> So your patch migrates the current IRQ configuration, but once you restart
>> the virtual machine on the destination host it will have different IRQ
>> numbering again, right?
> 
> No, why? IRQs are assigned at init time from realize() callbacks (and
> survive reset) or as a part of ibm,change-msi rtas call which happens in
> the same order as it only depends on pci addresses and we do not change
> this either.

Ok, let me rephrase. If I shut the machine down because I'm doing on-disk hibernate and then boot it back up, will the guest find the same configuration?

> 
> 
>> I'm not sure that's a good solution to the problem. I guess we should
>> rather aim to make sure that we can make IRQ allocation explicit.
>> Fundamentally the problem sounds very similar to the PCI slot allocation
>> which eventually got solved by libvirt specifying the slots manually.
> 
> We can do that too. Who decides? :)

The better solution wins :)


Alex
Alexey Kardashevskiy April 11, 2014, 2:50 p.m. UTC | #8
On 04/11/2014 11:58 PM, Alexander Graf wrote:
> 
> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>
>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>> The current allocator returns IRQ numbers from a pool and does not
>>>>>>>> support IRQs reuse in any form as it did not keep track of what it
>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>> However migration may change interrupts for devices depending on
>>>>>>>> their order in the command line.
>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't change
>>>>>>> anything.
>>>>>> I put wrong commit message. By change I meant that the default state
>>>>>> before
>>>>>> the destination guest started accepting migration is different from what
>>>>>> the destination guest became after migration finished. And migration
>>>>>> cannot
>>>>>> avoid changing this default state.
>>>>> Ok, why is the IRQ configuration different?
>>>> Because QEMU creates devices in the order as in the command line, and
>>>> libvirt changes this order - the XML used to create the guest and the XML
>>>> which is sends during migration are different. libvirt thinks it is ok
>>>> while it keeps @reg property for (for example) spapr-vscsi devices but it
>>>> is not because since the order is different, devices call IRQ allocator in
>>>> different order and get different IRQs.
>>>
>>> So your patch migrates the current IRQ configuration, but once you restart
>>> the virtual machine on the destination host it will have different IRQ
>>> numbering again, right?
>>
>> No, why? IRQs are assigned at init time from realize() callbacks (and
>> survive reset) or as a part of ibm,change-msi rtas call which happens in
>> the same order as it only depends on pci addresses and we do not change
>> this either.
> 

> Ok, let me rephrase. If I shut the machine down because I'm doing
> on-disk hibernate and then boot it back up, will the guest find the same
> configuration?


I do not understand what you mean by this. Hibernation by the guest OS
itself or by QEMU? If this involves QEMU exit and QEMU start - then yes,
config may be different. If it is "migrate to file" and then "migrate from
file" (do not know what you call it when migration goes to a pipe which is
"tar") - then config will be the same.


>>> I'm not sure that's a good solution to the problem. I guess we should
>>> rather aim to make sure that we can make IRQ allocation explicit.
>>> Fundamentally the problem sounds very similar to the PCI slot allocation
>>> which eventually got solved by libvirt specifying the slots manually.
>>
>> We can do that too. Who decides? :)
> 
> The better solution wins :)

We both know who decides ;) I posted series, I need heads up if it is going
the right way or not.
Alexander Graf April 11, 2014, 2:58 p.m. UTC | #9
On 11.04.14 16:50, Alexey Kardashevskiy wrote:
> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>
>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>> The current allocator returns IRQ numbers from a pool and does not
>>>>>>>>> support IRQs reuse in any form as it did not keep track of what it
>>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>>> However migration may change interrupts for devices depending on
>>>>>>>>> their order in the command line.
>>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't change
>>>>>>>> anything.
>>>>>>> I put wrong commit message. By change I meant that the default state
>>>>>>> before
>>>>>>> the destination guest started accepting migration is different from what
>>>>>>> the destination guest became after migration finished. And migration
>>>>>>> cannot
>>>>>>> avoid changing this default state.
>>>>>> Ok, why is the IRQ configuration different?
>>>>> Because QEMU creates devices in the order as in the command line, and
>>>>> libvirt changes this order - the XML used to create the guest and the XML
>>>>> which is sends during migration are different. libvirt thinks it is ok
>>>>> while it keeps @reg property for (for example) spapr-vscsi devices but it
>>>>> is not because since the order is different, devices call IRQ allocator in
>>>>> different order and get different IRQs.
>>>> So your patch migrates the current IRQ configuration, but once you restart
>>>> the virtual machine on the destination host it will have different IRQ
>>>> numbering again, right?
>>> No, why? IRQs are assigned at init time from realize() callbacks (and
>>> survive reset) or as a part of ibm,change-msi rtas call which happens in
>>> the same order as it only depends on pci addresses and we do not change
>>> this either.
>> Ok, let me rephrase. If I shut the machine down because I'm doing
>> on-disk hibernate and then boot it back up, will the guest find the same
>> configuration?
>
> I do not understand what you mean by this. Hibernation by the guest OS
> itself or by QEMU? If this involves QEMU exit and QEMU start - then yes,

by the guest OS. The host will only see a genuine "shutdown" event. The 
guest OS will expect the machine to look *the exact same* as before the 
shutdown.

> config may be different. If it is "migrate to file" and then "migrate from
> file" (do not know what you call it when migration goes to a pipe which is
> "tar") - then config will be the same.
>
>
>>>> I'm not sure that's a good solution to the problem. I guess we should
>>>> rather aim to make sure that we can make IRQ allocation explicit.
>>>> Fundamentally the problem sounds very similar to the PCI slot allocation
>>>> which eventually got solved by libvirt specifying the slots manually.
>>> We can do that too. Who decides? :)
>> The better solution wins :)
> We both know who decides ;) I posted series, I need heads up if it is going
> the right way or not.

It's not :). If a guest may not have different IRQ allocation after 
migration, it also must not have different IRQ allocation after shutdown 
+ restart.


Alex
Alexey Kardashevskiy April 11, 2014, 3:27 p.m. UTC | #10
On 04/12/2014 12:58 AM, Alexander Graf wrote:
> 
> On 11.04.14 16:50, Alexey Kardashevskiy wrote:
>> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>
>>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>>> The current allocator returns IRQ numbers from a pool and does not
>>>>>>>>>> support IRQs reuse in any form as it did not keep track of what it
>>>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>>>> However migration may change interrupts for devices depending on
>>>>>>>>>> their order in the command line.
>>>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't
>>>>>>>>> change
>>>>>>>>> anything.
>>>>>>>> I put wrong commit message. By change I meant that the default state
>>>>>>>> before
>>>>>>>> the destination guest started accepting migration is different from
>>>>>>>> what
>>>>>>>> the destination guest became after migration finished. And migration
>>>>>>>> cannot
>>>>>>>> avoid changing this default state.
>>>>>>> Ok, why is the IRQ configuration different?
>>>>>> Because QEMU creates devices in the order as in the command line, and
>>>>>> libvirt changes this order - the XML used to create the guest and the
>>>>>> XML
>>>>>> which is sends during migration are different. libvirt thinks it is ok
>>>>>> while it keeps @reg property for (for example) spapr-vscsi devices
>>>>>> but it
>>>>>> is not because since the order is different, devices call IRQ
>>>>>> allocator in
>>>>>> different order and get different IRQs.
>>>>> So your patch migrates the current IRQ configuration, but once you
>>>>> restart
>>>>> the virtual machine on the destination host it will have different IRQ
>>>>> numbering again, right?
>>>> No, why? IRQs are assigned at init time from realize() callbacks (and
>>>> survive reset) or as a part of ibm,change-msi rtas call which happens in
>>>> the same order as it only depends on pci addresses and we do not change
>>>> this either.
>>> Ok, let me rephrase. If I shut the machine down because I'm doing
>>> on-disk hibernate and then boot it back up, will the guest find the same
>>> configuration?
>>
>> I do not understand what you mean by this. Hibernation by the guest OS
>> itself or by QEMU? If this involves QEMU exit and QEMU start - then yes,
> 
> by the guest OS. The host will only see a genuine "shutdown" event. The
> guest OS will expect the machine to look *the exact same* as before the
> shutdown.

Ok. So. I have to implement "irq" property everywhere (PHB is missing
INTA/B/C/D now) and check if they did not change during migration via those
VMSTATE.*EQUAL. Correct?

If so (more or less), I still would like to keep patches 1..7.
In fact, the first one is independent and we need it anyway.
Yes/no?


>> config may be different. If it is "migrate to file" and then "migrate from
>> file" (do not know what you call it when migration goes to a pipe which is
>> "tar") - then config will be the same.
>>
>>
>>>>> I'm not sure that's a good solution to the problem. I guess we should
>>>>> rather aim to make sure that we can make IRQ allocation explicit.
>>>>> Fundamentally the problem sounds very similar to the PCI slot allocation
>>>>> which eventually got solved by libvirt specifying the slots manually.
>>>> We can do that too. Who decides? :)
>>> The better solution wins :)
>> We both know who decides ;) I posted series, I need heads up if it is going
>> the right way or not.
> 
> It's not :). If a guest may not have different IRQ allocation after
> migration, it also must not have different IRQ allocation after shutdown +
> restart.

Ok. That's good answer, thanks. How does x86 work then? IRQs are hardcoded
(some are for sure but I do not know about MSI)? Or in order to support
migration, the user has to specify IRQs for the devices which may get
different IRQs depending on things like command line parameters order?
Alexander Graf April 11, 2014, 3:38 p.m. UTC | #11
On 11.04.14 17:27, Alexey Kardashevskiy wrote:
> On 04/12/2014 12:58 AM, Alexander Graf wrote:
>> On 11.04.14 16:50, Alexey Kardashevskiy wrote:
>>> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>>>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>>
>>>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>> The current allocator returns IRQ numbers from a pool and does not
>>>>>>>>>>> support IRQs reuse in any form as it did not keep track of what it
>>>>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>>>>> However migration may change interrupts for devices depending on
>>>>>>>>>>> their order in the command line.
>>>>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't
>>>>>>>>>> change
>>>>>>>>>> anything.
>>>>>>>>> I put wrong commit message. By change I meant that the default state
>>>>>>>>> before
>>>>>>>>> the destination guest started accepting migration is different from
>>>>>>>>> what
>>>>>>>>> the destination guest became after migration finished. And migration
>>>>>>>>> cannot
>>>>>>>>> avoid changing this default state.
>>>>>>>> Ok, why is the IRQ configuration different?
>>>>>>> Because QEMU creates devices in the order as in the command line, and
>>>>>>> libvirt changes this order - the XML used to create the guest and the
>>>>>>> XML
>>>>>>> which is sends during migration are different. libvirt thinks it is ok
>>>>>>> while it keeps @reg property for (for example) spapr-vscsi devices
>>>>>>> but it
>>>>>>> is not because since the order is different, devices call IRQ
>>>>>>> allocator in
>>>>>>> different order and get different IRQs.
>>>>>> So your patch migrates the current IRQ configuration, but once you
>>>>>> restart
>>>>>> the virtual machine on the destination host it will have different IRQ
>>>>>> numbering again, right?
>>>>> No, why? IRQs are assigned at init time from realize() callbacks (and
>>>>> survive reset) or as a part of ibm,change-msi rtas call which happens in
>>>>> the same order as it only depends on pci addresses and we do not change
>>>>> this either.
>>>> Ok, let me rephrase. If I shut the machine down because I'm doing
>>>> on-disk hibernate and then boot it back up, will the guest find the same
>>>> configuration?
>>> I do not understand what you mean by this. Hibernation by the guest OS
>>> itself or by QEMU? If this involves QEMU exit and QEMU start - then yes,
>> by the guest OS. The host will only see a genuine "shutdown" event. The
>> guest OS will expect the machine to look *the exact same* as before the
>> shutdown.
> Ok. So. I have to implement "irq" property everywhere (PHB is missing
> INTA/B/C/D now) and check if they did not change during migration via those

Hrm. Not sure. Maybe it'd make sense to join next week's call on 
platform device creation. The problem seems pretty closely related.

> VMSTATE.*EQUAL. Correct?

Why would you need this? I think we already said a couple dozen times 
that configuration matching is a bigger problem, no?

> If so (more or less), I still would like to keep patches 1..7.
> In fact, the first one is independent and we need it anyway.
> Yes/no?

Why?

>
>
>>> config may be different. If it is "migrate to file" and then "migrate from
>>> file" (do not know what you call it when migration goes to a pipe which is
>>> "tar") - then config will be the same.
>>>
>>>
>>>>>> I'm not sure that's a good solution to the problem. I guess we should
>>>>>> rather aim to make sure that we can make IRQ allocation explicit.
>>>>>> Fundamentally the problem sounds very similar to the PCI slot allocation
>>>>>> which eventually got solved by libvirt specifying the slots manually.
>>>>> We can do that too. Who decides? :)
>>>> The better solution wins :)
>>> We both know who decides ;) I posted series, I need heads up if it is going
>>> the right way or not.
>> It's not :). If a guest may not have different IRQ allocation after
>> migration, it also must not have different IRQ allocation after shutdown +
>> restart.
> Ok. That's good answer, thanks. How does x86 work then? IRQs are hardcoded
> (some are for sure but I do not know about MSI)? Or in order to support

Non-PCI IRQs are hardcoded, yes. PCI IRQs are mapped to one of the 4 PCI 
interrupts which again are hardcoded to IOAPIC interrupt lines after 
some PCI line swizzling.

MSI gets configured by the guest, so it has to make sure MSIs are set up 
identically again after hibernation.


Alex
> migration, the user has to specify IRQs for the devices which may get
> different IRQs depending on things like command line parameters order?
Alexey Kardashevskiy April 11, 2014, 4:01 p.m. UTC | #12
On 04/12/2014 01:38 AM, Alexander Graf wrote:
> 
> On 11.04.14 17:27, Alexey Kardashevskiy wrote:
>> On 04/12/2014 12:58 AM, Alexander Graf wrote:
>>> On 11.04.14 16:50, Alexey Kardashevskiy wrote:
>>>> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>>>>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>>>
>>>>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>>> The current allocator returns IRQ numbers from a pool and does not
>>>>>>>>>>>> support IRQs reuse in any form as it did not keep track of what it
>>>>>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>>>>>> However migration may change interrupts for devices depending on
>>>>>>>>>>>> their order in the command line.
>>>>>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't
>>>>>>>>>>> change
>>>>>>>>>>> anything.
>>>>>>>>>> I put wrong commit message. By change I meant that the default state
>>>>>>>>>> before
>>>>>>>>>> the destination guest started accepting migration is different from
>>>>>>>>>> what
>>>>>>>>>> the destination guest became after migration finished. And migration
>>>>>>>>>> cannot
>>>>>>>>>> avoid changing this default state.
>>>>>>>>> Ok, why is the IRQ configuration different?
>>>>>>>> Because QEMU creates devices in the order as in the command line, and
>>>>>>>> libvirt changes this order - the XML used to create the guest and the
>>>>>>>> XML
>>>>>>>> which is sends during migration are different. libvirt thinks it is ok
>>>>>>>> while it keeps @reg property for (for example) spapr-vscsi devices
>>>>>>>> but it
>>>>>>>> is not because since the order is different, devices call IRQ
>>>>>>>> allocator in
>>>>>>>> different order and get different IRQs.
>>>>>>> So your patch migrates the current IRQ configuration, but once you
>>>>>>> restart
>>>>>>> the virtual machine on the destination host it will have different IRQ
>>>>>>> numbering again, right?
>>>>>> No, why? IRQs are assigned at init time from realize() callbacks (and
>>>>>> survive reset) or as a part of ibm,change-msi rtas call which happens in
>>>>>> the same order as it only depends on pci addresses and we do not change
>>>>>> this either.
>>>>> Ok, let me rephrase. If I shut the machine down because I'm doing
>>>>> on-disk hibernate and then boot it back up, will the guest find the same
>>>>> configuration?
>>>> I do not understand what you mean by this. Hibernation by the guest OS
>>>> itself or by QEMU? If this involves QEMU exit and QEMU start - then yes,
>>> by the guest OS. The host will only see a genuine "shutdown" event. The
>>> guest OS will expect the machine to look *the exact same* as before the
>>> shutdown.
>> Ok. So. I have to implement "irq" property everywhere (PHB is missing
>> INTA/B/C/D now) and check if they did not change during migration via those
> 
> Hrm. Not sure. Maybe it'd make sense to join next week's call on platform
> device creation. The problem seems pretty closely related.

What are those platform devices and what are you going to discuss exactly?


>> VMSTATE.*EQUAL. Correct?
> 
> Why would you need this? I think we already said a couple dozen times that
> configuration matching is a bigger problem, no?

For debug! It is not needed in general, yes.


>> If so (more or less), I still would like to keep patches 1..7.
>> In fact, the first one is independent and we need it anyway.
>> Yes/no?
> 
> Why?

IOMMUs do not migrate correctly - they only have a class have and
instance_id and this instance_it depends on command line arguments order.
The #1 patch makes it classname + liobn.


> 
>>
>>
>>>> config may be different. If it is "migrate to file" and then "migrate from
>>>> file" (do not know what you call it when migration goes to a pipe which is
>>>> "tar") - then config will be the same.
>>>>
>>>>
>>>>>>> I'm not sure that's a good solution to the problem. I guess we should
>>>>>>> rather aim to make sure that we can make IRQ allocation explicit.
>>>>>>> Fundamentally the problem sounds very similar to the PCI slot
>>>>>>> allocation
>>>>>>> which eventually got solved by libvirt specifying the slots manually.
>>>>>> We can do that too. Who decides? :)
>>>>> The better solution wins :)
>>>> We both know who decides ;) I posted series, I need heads up if it is
>>>> going
>>>> the right way or not.
>>> It's not :). If a guest may not have different IRQ allocation after
>>> migration, it also must not have different IRQ allocation after shutdown +
>>> restart.
>> Ok. That's good answer, thanks. How does x86 work then? IRQs are hardcoded
>> (some are for sure but I do not know about MSI)? Or in order to support
> 
> Non-PCI IRQs are hardcoded, yes. PCI IRQs are mapped to one of the 4 PCI
> interrupts which again are hardcoded to IOAPIC interrupt lines after some
> PCI line swizzling.

This is what I meant - I need to have a way to tell PHB IRQ numbers for
INTA/B/C/D.



> MSI gets configured by the guest, so it has to make sure MSIs are set up
> identically again after hibernation.
> 
> 
> Alex
>> migration, the user has to specify IRQs for the devices which may get
>> different IRQs depending on things like command line parameters order?
>
Alexander Graf April 11, 2014, 4:15 p.m. UTC | #13
On 11.04.14 18:01, Alexey Kardashevskiy wrote:
> On 04/12/2014 01:38 AM, Alexander Graf wrote:
>> On 11.04.14 17:27, Alexey Kardashevskiy wrote:
>>> On 04/12/2014 12:58 AM, Alexander Graf wrote:
>>>> On 11.04.14 16:50, Alexey Kardashevskiy wrote:
>>>>> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>>>>>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>>>>
>>>>>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>>>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>> The current allocator returns IRQ numbers from a pool and does not
>>>>>>>>>>>>> support IRQs reuse in any form as it did not keep track of what it
>>>>>>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>>>>>>> However migration may change interrupts for devices depending on
>>>>>>>>>>>>> their order in the command line.
>>>>>>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration shouldn't
>>>>>>>>>>>> change
>>>>>>>>>>>> anything.
>>>>>>>>>>> I put wrong commit message. By change I meant that the default state
>>>>>>>>>>> before
>>>>>>>>>>> the destination guest started accepting migration is different from
>>>>>>>>>>> what
>>>>>>>>>>> the destination guest became after migration finished. And migration
>>>>>>>>>>> cannot
>>>>>>>>>>> avoid changing this default state.
>>>>>>>>>> Ok, why is the IRQ configuration different?
>>>>>>>>> Because QEMU creates devices in the order as in the command line, and
>>>>>>>>> libvirt changes this order - the XML used to create the guest and the
>>>>>>>>> XML
>>>>>>>>> which is sends during migration are different. libvirt thinks it is ok
>>>>>>>>> while it keeps @reg property for (for example) spapr-vscsi devices
>>>>>>>>> but it
>>>>>>>>> is not because since the order is different, devices call IRQ
>>>>>>>>> allocator in
>>>>>>>>> different order and get different IRQs.
>>>>>>>> So your patch migrates the current IRQ configuration, but once you
>>>>>>>> restart
>>>>>>>> the virtual machine on the destination host it will have different IRQ
>>>>>>>> numbering again, right?
>>>>>>> No, why? IRQs are assigned at init time from realize() callbacks (and
>>>>>>> survive reset) or as a part of ibm,change-msi rtas call which happens in
>>>>>>> the same order as it only depends on pci addresses and we do not change
>>>>>>> this either.
>>>>>> Ok, let me rephrase. If I shut the machine down because I'm doing
>>>>>> on-disk hibernate and then boot it back up, will the guest find the same
>>>>>> configuration?
>>>>> I do not understand what you mean by this. Hibernation by the guest OS
>>>>> itself or by QEMU? If this involves QEMU exit and QEMU start - then yes,
>>>> by the guest OS. The host will only see a genuine "shutdown" event. The
>>>> guest OS will expect the machine to look *the exact same* as before the
>>>> shutdown.
>>> Ok. So. I have to implement "irq" property everywhere (PHB is missing
>>> INTA/B/C/D now) and check if they did not change during migration via those
>> Hrm. Not sure. Maybe it'd make sense to join next week's call on platform
>> device creation. The problem seems pretty closely related.
> What are those platform devices and what are you going to discuss exactly?

Devices that don't have a unified interrupt routing scheme like PCI 
where you just link lines A/B/C/D to your controller and you're good to go.

>
>
>>> VMSTATE.*EQUAL. Correct?
>> Why would you need this? I think we already said a couple dozen times that
>> configuration matching is a bigger problem, no?
> For debug! It is not needed in general, yes.
>
>
>>> If so (more or less), I still would like to keep patches 1..7.
>>> In fact, the first one is independent and we need it anyway.
>>> Yes/no?
>> Why?
> IOMMUs do not migrate correctly - they only have a class have and
> instance_id and this instance_it depends on command line arguments order.
> The #1 patch makes it classname + liobn.

Why do we need a bus for that?

>
>
>>>
>>>>> config may be different. If it is "migrate to file" and then "migrate from
>>>>> file" (do not know what you call it when migration goes to a pipe which is
>>>>> "tar") - then config will be the same.
>>>>>
>>>>>
>>>>>>>> I'm not sure that's a good solution to the problem. I guess we should
>>>>>>>> rather aim to make sure that we can make IRQ allocation explicit.
>>>>>>>> Fundamentally the problem sounds very similar to the PCI slot
>>>>>>>> allocation
>>>>>>>> which eventually got solved by libvirt specifying the slots manually.
>>>>>>> We can do that too. Who decides? :)
>>>>>> The better solution wins :)
>>>>> We both know who decides ;) I posted series, I need heads up if it is
>>>>> going
>>>>> the right way or not.
>>>> It's not :). If a guest may not have different IRQ allocation after
>>>> migration, it also must not have different IRQ allocation after shutdown +
>>>> restart.
>>> Ok. That's good answer, thanks. How does x86 work then? IRQs are hardcoded
>>> (some are for sure but I do not know about MSI)? Or in order to support
>> Non-PCI IRQs are hardcoded, yes. PCI IRQs are mapped to one of the 4 PCI
>> interrupts which again are hardcoded to IOAPIC interrupt lines after some
>> PCI line swizzling.
> This is what I meant - I need to have a way to tell PHB IRQ numbers for
> INTA/B/C/D.

Yes, just like platform devices ;).


Alex
Alexey Kardashevskiy April 11, 2014, 4:30 p.m. UTC | #14
On 04/12/2014 02:15 AM, Alexander Graf wrote:
> 
> On 11.04.14 18:01, Alexey Kardashevskiy wrote:
>> On 04/12/2014 01:38 AM, Alexander Graf wrote:
>>> On 11.04.14 17:27, Alexey Kardashevskiy wrote:
>>>> On 04/12/2014 12:58 AM, Alexander Graf wrote:
>>>>> On 11.04.14 16:50, Alexey Kardashevskiy wrote:
>>>>>> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>>>>>>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>>>>>
>>>>>>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>>>>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>>>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>> The current allocator returns IRQ numbers from a pool and
>>>>>>>>>>>>>> does not
>>>>>>>>>>>>>> support IRQs reuse in any form as it did not keep track of
>>>>>>>>>>>>>> what it
>>>>>>>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>>>>>>>> However migration may change interrupts for devices depending on
>>>>>>>>>>>>>> their order in the command line.
>>>>>>>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration
>>>>>>>>>>>>> shouldn't
>>>>>>>>>>>>> change
>>>>>>>>>>>>> anything.
>>>>>>>>>>>> I put wrong commit message. By change I meant that the default
>>>>>>>>>>>> state
>>>>>>>>>>>> before
>>>>>>>>>>>> the destination guest started accepting migration is different
>>>>>>>>>>>> from
>>>>>>>>>>>> what
>>>>>>>>>>>> the destination guest became after migration finished. And
>>>>>>>>>>>> migration
>>>>>>>>>>>> cannot
>>>>>>>>>>>> avoid changing this default state.
>>>>>>>>>>> Ok, why is the IRQ configuration different?
>>>>>>>>>> Because QEMU creates devices in the order as in the command line,
>>>>>>>>>> and
>>>>>>>>>> libvirt changes this order - the XML used to create the guest and
>>>>>>>>>> the
>>>>>>>>>> XML
>>>>>>>>>> which is sends during migration are different. libvirt thinks it
>>>>>>>>>> is ok
>>>>>>>>>> while it keeps @reg property for (for example) spapr-vscsi devices
>>>>>>>>>> but it
>>>>>>>>>> is not because since the order is different, devices call IRQ
>>>>>>>>>> allocator in
>>>>>>>>>> different order and get different IRQs.
>>>>>>>>> So your patch migrates the current IRQ configuration, but once you
>>>>>>>>> restart
>>>>>>>>> the virtual machine on the destination host it will have different
>>>>>>>>> IRQ
>>>>>>>>> numbering again, right?
>>>>>>>> No, why? IRQs are assigned at init time from realize() callbacks (and
>>>>>>>> survive reset) or as a part of ibm,change-msi rtas call which
>>>>>>>> happens in
>>>>>>>> the same order as it only depends on pci addresses and we do not
>>>>>>>> change
>>>>>>>> this either.
>>>>>>> Ok, let me rephrase. If I shut the machine down because I'm doing
>>>>>>> on-disk hibernate and then boot it back up, will the guest find the
>>>>>>> same
>>>>>>> configuration?
>>>>>> I do not understand what you mean by this. Hibernation by the guest OS
>>>>>> itself or by QEMU? If this involves QEMU exit and QEMU start - then yes,
>>>>> by the guest OS. The host will only see a genuine "shutdown" event. The
>>>>> guest OS will expect the machine to look *the exact same* as before the
>>>>> shutdown.
>>>> Ok. So. I have to implement "irq" property everywhere (PHB is missing
>>>> INTA/B/C/D now) and check if they did not change during migration via
>>>> those
>>> Hrm. Not sure. Maybe it'd make sense to join next week's call on platform
>>> device creation. The problem seems pretty closely related.
>> What are those platform devices and what are you going to discuss exactly?
> 
> Devices that don't have a unified interrupt routing scheme like PCI where
> you just link lines A/B/C/D to your controller and you're good to go.

Ah. VIO in my case.



>>>> VMSTATE.*EQUAL. Correct?
>>> Why would you need this? I think we already said a couple dozen times that
>>> configuration matching is a bigger problem, no?
>> For debug! It is not needed in general, yes.
>>
>>
>>>> If so (more or less), I still would like to keep patches 1..7.
>>>> In fact, the first one is independent and we need it anyway.
>>>> Yes/no?
>>> Why?
>> IOMMUs do not migrate correctly - they only have a class have and
>> instance_id and this instance_it depends on command line arguments order.
>> The #1 patch makes it classname + liobn.
> 
> Why do we need a bus for that?


For BusClass::get_dev_path callback to get an unique name.



>>>>>> config may be different. If it is "migrate to file" and then "migrate
>>>>>> from
>>>>>> file" (do not know what you call it when migration goes to a pipe
>>>>>> which is
>>>>>> "tar") - then config will be the same.
>>>>>>
>>>>>>
>>>>>>>>> I'm not sure that's a good solution to the problem. I guess we should
>>>>>>>>> rather aim to make sure that we can make IRQ allocation explicit.
>>>>>>>>> Fundamentally the problem sounds very similar to the PCI slot
>>>>>>>>> allocation
>>>>>>>>> which eventually got solved by libvirt specifying the slots manually.
>>>>>>>> We can do that too. Who decides? :)
>>>>>>> The better solution wins :)
>>>>>> We both know who decides ;) I posted series, I need heads up if it is
>>>>>> going
>>>>>> the right way or not.
>>>>> It's not :). If a guest may not have different IRQ allocation after
>>>>> migration, it also must not have different IRQ allocation after
>>>>> shutdown +
>>>>> restart.
>>>> Ok. That's good answer, thanks. How does x86 work then? IRQs are hardcoded
>>>> (some are for sure but I do not know about MSI)? Or in order to support
>>> Non-PCI IRQs are hardcoded, yes. PCI IRQs are mapped to one of the 4 PCI
>>> interrupts which again are hardcoded to IOAPIC interrupt lines after some
>>> PCI line swizzling.
>> This is what I meant - I need to have a way to tell PHB IRQ numbers for
>> INTA/B/C/D.
> 
> Yes, just like platform devices ;).
Alexander Graf April 14, 2014, 7:34 a.m. UTC | #15
On 11.04.14 18:30, Alexey Kardashevskiy wrote:
> On 04/12/2014 02:15 AM, Alexander Graf wrote:
>> On 11.04.14 18:01, Alexey Kardashevskiy wrote:
>>> On 04/12/2014 01:38 AM, Alexander Graf wrote:
>>>> On 11.04.14 17:27, Alexey Kardashevskiy wrote:
>>>>> On 04/12/2014 12:58 AM, Alexander Graf wrote:
>>>>>> On 11.04.14 16:50, Alexey Kardashevskiy wrote:
>>>>>>> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>>>>>>>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>>>>>>
>>>>>>>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>>>>>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>>>>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>>>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>>> The current allocator returns IRQ numbers from a pool and
>>>>>>>>>>>>>>> does not
>>>>>>>>>>>>>>> support IRQs reuse in any form as it did not keep track of
>>>>>>>>>>>>>>> what it
>>>>>>>>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>>>>>>>>> However migration may change interrupts for devices depending on
>>>>>>>>>>>>>>> their order in the command line.
>>>>>>>>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration
>>>>>>>>>>>>>> shouldn't
>>>>>>>>>>>>>> change
>>>>>>>>>>>>>> anything.
>>>>>>>>>>>>> I put wrong commit message. By change I meant that the default
>>>>>>>>>>>>> state
>>>>>>>>>>>>> before
>>>>>>>>>>>>> the destination guest started accepting migration is different
>>>>>>>>>>>>> from
>>>>>>>>>>>>> what
>>>>>>>>>>>>> the destination guest became after migration finished. And
>>>>>>>>>>>>> migration
>>>>>>>>>>>>> cannot
>>>>>>>>>>>>> avoid changing this default state.
>>>>>>>>>>>> Ok, why is the IRQ configuration different?
>>>>>>>>>>> Because QEMU creates devices in the order as in the command line,
>>>>>>>>>>> and
>>>>>>>>>>> libvirt changes this order - the XML used to create the guest and
>>>>>>>>>>> the
>>>>>>>>>>> XML
>>>>>>>>>>> which is sends during migration are different. libvirt thinks it
>>>>>>>>>>> is ok
>>>>>>>>>>> while it keeps @reg property for (for example) spapr-vscsi devices
>>>>>>>>>>> but it
>>>>>>>>>>> is not because since the order is different, devices call IRQ
>>>>>>>>>>> allocator in
>>>>>>>>>>> different order and get different IRQs.
>>>>>>>>>> So your patch migrates the current IRQ configuration, but once you
>>>>>>>>>> restart
>>>>>>>>>> the virtual machine on the destination host it will have different
>>>>>>>>>> IRQ
>>>>>>>>>> numbering again, right?
>>>>>>>>> No, why? IRQs are assigned at init time from realize() callbacks (and
>>>>>>>>> survive reset) or as a part of ibm,change-msi rtas call which
>>>>>>>>> happens in
>>>>>>>>> the same order as it only depends on pci addresses and we do not
>>>>>>>>> change
>>>>>>>>> this either.
>>>>>>>> Ok, let me rephrase. If I shut the machine down because I'm doing
>>>>>>>> on-disk hibernate and then boot it back up, will the guest find the
>>>>>>>> same
>>>>>>>> configuration?
>>>>>>> I do not understand what you mean by this. Hibernation by the guest OS
>>>>>>> itself or by QEMU? If this involves QEMU exit and QEMU start - then yes,
>>>>>> by the guest OS. The host will only see a genuine "shutdown" event. The
>>>>>> guest OS will expect the machine to look *the exact same* as before the
>>>>>> shutdown.
>>>>> Ok. So. I have to implement "irq" property everywhere (PHB is missing
>>>>> INTA/B/C/D now) and check if they did not change during migration via
>>>>> those
>>>> Hrm. Not sure. Maybe it'd make sense to join next week's call on platform
>>>> device creation. The problem seems pretty closely related.
>>> What are those platform devices and what are you going to discuss exactly?
>> Devices that don't have a unified interrupt routing scheme like PCI where
>> you just link lines A/B/C/D to your controller and you're good to go.
> Ah. VIO in my case.
>
>
>
>>>>> VMSTATE.*EQUAL. Correct?
>>>> Why would you need this? I think we already said a couple dozen times that
>>>> configuration matching is a bigger problem, no?
>>> For debug! It is not needed in general, yes.
>>>
>>>
>>>>> If so (more or less), I still would like to keep patches 1..7.
>>>>> In fact, the first one is independent and we need it anyway.
>>>>> Yes/no?
>>>> Why?
>>> IOMMUs do not migrate correctly - they only have a class have and
>>> instance_id and this instance_it depends on command line arguments order.
>>> The #1 patch makes it classname + liobn.
>> Why do we need a bus for that?
>
> For BusClass::get_dev_path callback to get an unique name.

Juan, I don't think it makes a lot of sense to require a new fake bus 
just to give us a consistent migration view of things.

Do you have any ideas how to migration busless devices? We could just 
detect that case and give them numbering based on their occurence in the 
global QOM hierarchy, no?

Andreas, maybe you have an idea here as well :).


Alex
Alexey Kardashevskiy April 14, 2014, 7:41 a.m. UTC | #16
On 04/14/2014 05:34 PM, Alexander Graf wrote:
> 
> On 11.04.14 18:30, Alexey Kardashevskiy wrote:
>> On 04/12/2014 02:15 AM, Alexander Graf wrote:
>>> On 11.04.14 18:01, Alexey Kardashevskiy wrote:
>>>> On 04/12/2014 01:38 AM, Alexander Graf wrote:
>>>>> On 11.04.14 17:27, Alexey Kardashevskiy wrote:
>>>>>> On 04/12/2014 12:58 AM, Alexander Graf wrote:
>>>>>>> On 11.04.14 16:50, Alexey Kardashevskiy wrote:
>>>>>>>> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>>>>>>>>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>>>>>>>
>>>>>>>>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>>>>>>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>>>>>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>>>>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>>>> The current allocator returns IRQ numbers from a pool and
>>>>>>>>>>>>>>>> does not
>>>>>>>>>>>>>>>> support IRQs reuse in any form as it did not keep track of
>>>>>>>>>>>>>>>> what it
>>>>>>>>>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>>>>>>>>>> However migration may change interrupts for devices
>>>>>>>>>>>>>>>> depending on
>>>>>>>>>>>>>>>> their order in the command line.
>>>>>>>>>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration
>>>>>>>>>>>>>>> shouldn't
>>>>>>>>>>>>>>> change
>>>>>>>>>>>>>>> anything.
>>>>>>>>>>>>>> I put wrong commit message. By change I meant that the default
>>>>>>>>>>>>>> state
>>>>>>>>>>>>>> before
>>>>>>>>>>>>>> the destination guest started accepting migration is different
>>>>>>>>>>>>>> from
>>>>>>>>>>>>>> what
>>>>>>>>>>>>>> the destination guest became after migration finished. And
>>>>>>>>>>>>>> migration
>>>>>>>>>>>>>> cannot
>>>>>>>>>>>>>> avoid changing this default state.
>>>>>>>>>>>>> Ok, why is the IRQ configuration different?
>>>>>>>>>>>> Because QEMU creates devices in the order as in the command line,
>>>>>>>>>>>> and
>>>>>>>>>>>> libvirt changes this order - the XML used to create the guest and
>>>>>>>>>>>> the
>>>>>>>>>>>> XML
>>>>>>>>>>>> which is sends during migration are different. libvirt thinks it
>>>>>>>>>>>> is ok
>>>>>>>>>>>> while it keeps @reg property for (for example) spapr-vscsi devices
>>>>>>>>>>>> but it
>>>>>>>>>>>> is not because since the order is different, devices call IRQ
>>>>>>>>>>>> allocator in
>>>>>>>>>>>> different order and get different IRQs.
>>>>>>>>>>> So your patch migrates the current IRQ configuration, but once you
>>>>>>>>>>> restart
>>>>>>>>>>> the virtual machine on the destination host it will have different
>>>>>>>>>>> IRQ
>>>>>>>>>>> numbering again, right?
>>>>>>>>>> No, why? IRQs are assigned at init time from realize() callbacks
>>>>>>>>>> (and
>>>>>>>>>> survive reset) or as a part of ibm,change-msi rtas call which
>>>>>>>>>> happens in
>>>>>>>>>> the same order as it only depends on pci addresses and we do not
>>>>>>>>>> change
>>>>>>>>>> this either.
>>>>>>>>> Ok, let me rephrase. If I shut the machine down because I'm doing
>>>>>>>>> on-disk hibernate and then boot it back up, will the guest find the
>>>>>>>>> same
>>>>>>>>> configuration?
>>>>>>>> I do not understand what you mean by this. Hibernation by the guest OS
>>>>>>>> itself or by QEMU? If this involves QEMU exit and QEMU start - then
>>>>>>>> yes,
>>>>>>> by the guest OS. The host will only see a genuine "shutdown" event. The
>>>>>>> guest OS will expect the machine to look *the exact same* as before the
>>>>>>> shutdown.
>>>>>> Ok. So. I have to implement "irq" property everywhere (PHB is missing
>>>>>> INTA/B/C/D now) and check if they did not change during migration via
>>>>>> those
>>>>> Hrm. Not sure. Maybe it'd make sense to join next week's call on platform
>>>>> device creation. The problem seems pretty closely related.
>>>> What are those platform devices and what are you going to discuss exactly?
>>> Devices that don't have a unified interrupt routing scheme like PCI where
>>> you just link lines A/B/C/D to your controller and you're good to go.
>> Ah. VIO in my case.
>>
>>
>>
>>>>>> VMSTATE.*EQUAL. Correct?
>>>>> Why would you need this? I think we already said a couple dozen times
>>>>> that
>>>>> configuration matching is a bigger problem, no?
>>>> For debug! It is not needed in general, yes.
>>>>
>>>>
>>>>>> If so (more or less), I still would like to keep patches 1..7.
>>>>>> In fact, the first one is independent and we need it anyway.
>>>>>> Yes/no?
>>>>> Why?
>>>> IOMMUs do not migrate correctly - they only have a class have and
>>>> instance_id and this instance_it depends on command line arguments order.
>>>> The #1 patch makes it classname + liobn.
>>> Why do we need a bus for that?
>>
>> For BusClass::get_dev_path callback to get an unique name.
> 
> Juan, I don't think it makes a lot of sense to require a new fake bus just
> to give us a consistent migration view of things.
> 
> Do you have any ideas how to migration busless devices? We could just
> detect that case and give them numbering based on their occurence in the
> global QOM hierarchy, no?


The mentioned instance_id is that occurrence number which totally depends
on the device order in the command line. And I have to not to depend on that.


> Andreas, maybe you have an idea here as well :).
> 
> 
> Alex
>
Alexander Graf April 14, 2014, 7:42 a.m. UTC | #17
On 14.04.14 09:41, Alexey Kardashevskiy wrote:
> On 04/14/2014 05:34 PM, Alexander Graf wrote:
>> On 11.04.14 18:30, Alexey Kardashevskiy wrote:
>>> On 04/12/2014 02:15 AM, Alexander Graf wrote:
>>>> On 11.04.14 18:01, Alexey Kardashevskiy wrote:
>>>>> On 04/12/2014 01:38 AM, Alexander Graf wrote:
>>>>>> On 11.04.14 17:27, Alexey Kardashevskiy wrote:
>>>>>>> On 04/12/2014 12:58 AM, Alexander Graf wrote:
>>>>>>>> On 11.04.14 16:50, Alexey Kardashevskiy wrote:
>>>>>>>>> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>>>>>>>>>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>>>>>>>>
>>>>>>>>>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>>>>>>>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>>>>>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>>>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>>>>> The current allocator returns IRQ numbers from a pool and
>>>>>>>>>>>>>>>>> does not
>>>>>>>>>>>>>>>>> support IRQs reuse in any form as it did not keep track of
>>>>>>>>>>>>>>>>> what it
>>>>>>>>>>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>>>>>>>>>>> However migration may change interrupts for devices
>>>>>>>>>>>>>>>>> depending on
>>>>>>>>>>>>>>>>> their order in the command line.
>>>>>>>>>>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration
>>>>>>>>>>>>>>>> shouldn't
>>>>>>>>>>>>>>>> change
>>>>>>>>>>>>>>>> anything.
>>>>>>>>>>>>>>> I put wrong commit message. By change I meant that the default
>>>>>>>>>>>>>>> state
>>>>>>>>>>>>>>> before
>>>>>>>>>>>>>>> the destination guest started accepting migration is different
>>>>>>>>>>>>>>> from
>>>>>>>>>>>>>>> what
>>>>>>>>>>>>>>> the destination guest became after migration finished. And
>>>>>>>>>>>>>>> migration
>>>>>>>>>>>>>>> cannot
>>>>>>>>>>>>>>> avoid changing this default state.
>>>>>>>>>>>>>> Ok, why is the IRQ configuration different?
>>>>>>>>>>>>> Because QEMU creates devices in the order as in the command line,
>>>>>>>>>>>>> and
>>>>>>>>>>>>> libvirt changes this order - the XML used to create the guest and
>>>>>>>>>>>>> the
>>>>>>>>>>>>> XML
>>>>>>>>>>>>> which is sends during migration are different. libvirt thinks it
>>>>>>>>>>>>> is ok
>>>>>>>>>>>>> while it keeps @reg property for (for example) spapr-vscsi devices
>>>>>>>>>>>>> but it
>>>>>>>>>>>>> is not because since the order is different, devices call IRQ
>>>>>>>>>>>>> allocator in
>>>>>>>>>>>>> different order and get different IRQs.
>>>>>>>>>>>> So your patch migrates the current IRQ configuration, but once you
>>>>>>>>>>>> restart
>>>>>>>>>>>> the virtual machine on the destination host it will have different
>>>>>>>>>>>> IRQ
>>>>>>>>>>>> numbering again, right?
>>>>>>>>>>> No, why? IRQs are assigned at init time from realize() callbacks
>>>>>>>>>>> (and
>>>>>>>>>>> survive reset) or as a part of ibm,change-msi rtas call which
>>>>>>>>>>> happens in
>>>>>>>>>>> the same order as it only depends on pci addresses and we do not
>>>>>>>>>>> change
>>>>>>>>>>> this either.
>>>>>>>>>> Ok, let me rephrase. If I shut the machine down because I'm doing
>>>>>>>>>> on-disk hibernate and then boot it back up, will the guest find the
>>>>>>>>>> same
>>>>>>>>>> configuration?
>>>>>>>>> I do not understand what you mean by this. Hibernation by the guest OS
>>>>>>>>> itself or by QEMU? If this involves QEMU exit and QEMU start - then
>>>>>>>>> yes,
>>>>>>>> by the guest OS. The host will only see a genuine "shutdown" event. The
>>>>>>>> guest OS will expect the machine to look *the exact same* as before the
>>>>>>>> shutdown.
>>>>>>> Ok. So. I have to implement "irq" property everywhere (PHB is missing
>>>>>>> INTA/B/C/D now) and check if they did not change during migration via
>>>>>>> those
>>>>>> Hrm. Not sure. Maybe it'd make sense to join next week's call on platform
>>>>>> device creation. The problem seems pretty closely related.
>>>>> What are those platform devices and what are you going to discuss exactly?
>>>> Devices that don't have a unified interrupt routing scheme like PCI where
>>>> you just link lines A/B/C/D to your controller and you're good to go.
>>> Ah. VIO in my case.
>>>
>>>
>>>
>>>>>>> VMSTATE.*EQUAL. Correct?
>>>>>> Why would you need this? I think we already said a couple dozen times
>>>>>> that
>>>>>> configuration matching is a bigger problem, no?
>>>>> For debug! It is not needed in general, yes.
>>>>>
>>>>>
>>>>>>> If so (more or less), I still would like to keep patches 1..7.
>>>>>>> In fact, the first one is independent and we need it anyway.
>>>>>>> Yes/no?
>>>>>> Why?
>>>>> IOMMUs do not migrate correctly - they only have a class have and
>>>>> instance_id and this instance_it depends on command line arguments order.
>>>>> The #1 patch makes it classname + liobn.
>>>> Why do we need a bus for that?
>>> For BusClass::get_dev_path callback to get an unique name.
>> Juan, I don't think it makes a lot of sense to require a new fake bus just
>> to give us a consistent migration view of things.
>>
>> Do you have any ideas how to migration busless devices? We could just
>> detect that case and give them numbering based on their occurence in the
>> global QOM hierarchy, no?
>
> The mentioned instance_id is that occurrence number which totally depends
> on the device order in the command line. And I have to not to depend on that.

So how would a bus fix that? The bus gets populated based on the command 
line order just as well, no?


Alex
Alexey Kardashevskiy April 14, 2014, 7:54 a.m. UTC | #18
On 04/14/2014 05:42 PM, Alexander Graf wrote:
> 
> On 14.04.14 09:41, Alexey Kardashevskiy wrote:
>> On 04/14/2014 05:34 PM, Alexander Graf wrote:
>>> On 11.04.14 18:30, Alexey Kardashevskiy wrote:
>>>> On 04/12/2014 02:15 AM, Alexander Graf wrote:
>>>>> On 11.04.14 18:01, Alexey Kardashevskiy wrote:
>>>>>> On 04/12/2014 01:38 AM, Alexander Graf wrote:
>>>>>>> On 11.04.14 17:27, Alexey Kardashevskiy wrote:
>>>>>>>> On 04/12/2014 12:58 AM, Alexander Graf wrote:
>>>>>>>>> On 11.04.14 16:50, Alexey Kardashevskiy wrote:
>>>>>>>>>> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>>>>>>>>>>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>>>>>> wrote:
>>>>>>>>>>>
>>>>>>>>>>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>>>>>>>>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>>>>>>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>>>>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>>>>>> The current allocator returns IRQ numbers from a pool and
>>>>>>>>>>>>>>>>>> does not
>>>>>>>>>>>>>>>>>> support IRQs reuse in any form as it did not keep track of
>>>>>>>>>>>>>>>>>> what it
>>>>>>>>>>>>>>>>>> previously returned, it only had the last returned IRQ.
>>>>>>>>>>>>>>>>>> However migration may change interrupts for devices
>>>>>>>>>>>>>>>>>> depending on
>>>>>>>>>>>>>>>>>> their order in the command line.
>>>>>>>>>>>>>>>>> Wtf? Nonono, this sounds very bogus and wrong. Migration
>>>>>>>>>>>>>>>>> shouldn't
>>>>>>>>>>>>>>>>> change
>>>>>>>>>>>>>>>>> anything.
>>>>>>>>>>>>>>>> I put wrong commit message. By change I meant that the default
>>>>>>>>>>>>>>>> state
>>>>>>>>>>>>>>>> before
>>>>>>>>>>>>>>>> the destination guest started accepting migration is different
>>>>>>>>>>>>>>>> from
>>>>>>>>>>>>>>>> what
>>>>>>>>>>>>>>>> the destination guest became after migration finished. And
>>>>>>>>>>>>>>>> migration
>>>>>>>>>>>>>>>> cannot
>>>>>>>>>>>>>>>> avoid changing this default state.
>>>>>>>>>>>>>>> Ok, why is the IRQ configuration different?
>>>>>>>>>>>>>> Because QEMU creates devices in the order as in the command
>>>>>>>>>>>>>> line,
>>>>>>>>>>>>>> and
>>>>>>>>>>>>>> libvirt changes this order - the XML used to create the guest
>>>>>>>>>>>>>> and
>>>>>>>>>>>>>> the
>>>>>>>>>>>>>> XML
>>>>>>>>>>>>>> which is sends during migration are different. libvirt thinks it
>>>>>>>>>>>>>> is ok
>>>>>>>>>>>>>> while it keeps @reg property for (for example) spapr-vscsi
>>>>>>>>>>>>>> devices
>>>>>>>>>>>>>> but it
>>>>>>>>>>>>>> is not because since the order is different, devices call IRQ
>>>>>>>>>>>>>> allocator in
>>>>>>>>>>>>>> different order and get different IRQs.
>>>>>>>>>>>>> So your patch migrates the current IRQ configuration, but once
>>>>>>>>>>>>> you
>>>>>>>>>>>>> restart
>>>>>>>>>>>>> the virtual machine on the destination host it will have
>>>>>>>>>>>>> different
>>>>>>>>>>>>> IRQ
>>>>>>>>>>>>> numbering again, right?
>>>>>>>>>>>> No, why? IRQs are assigned at init time from realize() callbacks
>>>>>>>>>>>> (and
>>>>>>>>>>>> survive reset) or as a part of ibm,change-msi rtas call which
>>>>>>>>>>>> happens in
>>>>>>>>>>>> the same order as it only depends on pci addresses and we do not
>>>>>>>>>>>> change
>>>>>>>>>>>> this either.
>>>>>>>>>>> Ok, let me rephrase. If I shut the machine down because I'm doing
>>>>>>>>>>> on-disk hibernate and then boot it back up, will the guest find the
>>>>>>>>>>> same
>>>>>>>>>>> configuration?
>>>>>>>>>> I do not understand what you mean by this. Hibernation by the
>>>>>>>>>> guest OS
>>>>>>>>>> itself or by QEMU? If this involves QEMU exit and QEMU start - then
>>>>>>>>>> yes,
>>>>>>>>> by the guest OS. The host will only see a genuine "shutdown"
>>>>>>>>> event. The
>>>>>>>>> guest OS will expect the machine to look *the exact same* as
>>>>>>>>> before the
>>>>>>>>> shutdown.
>>>>>>>> Ok. So. I have to implement "irq" property everywhere (PHB is missing
>>>>>>>> INTA/B/C/D now) and check if they did not change during migration via
>>>>>>>> those
>>>>>>> Hrm. Not sure. Maybe it'd make sense to join next week's call on
>>>>>>> platform
>>>>>>> device creation. The problem seems pretty closely related.
>>>>>> What are those platform devices and what are you going to discuss
>>>>>> exactly?
>>>>> Devices that don't have a unified interrupt routing scheme like PCI where
>>>>> you just link lines A/B/C/D to your controller and you're good to go.
>>>> Ah. VIO in my case.
>>>>
>>>>
>>>>
>>>>>>>> VMSTATE.*EQUAL. Correct?
>>>>>>> Why would you need this? I think we already said a couple dozen times
>>>>>>> that
>>>>>>> configuration matching is a bigger problem, no?
>>>>>> For debug! It is not needed in general, yes.
>>>>>>
>>>>>>
>>>>>>>> If so (more or less), I still would like to keep patches 1..7.
>>>>>>>> In fact, the first one is independent and we need it anyway.
>>>>>>>> Yes/no?
>>>>>>> Why?
>>>>>> IOMMUs do not migrate correctly - they only have a class have and
>>>>>> instance_id and this instance_it depends on command line arguments
>>>>>> order.
>>>>>> The #1 patch makes it classname + liobn.
>>>>> Why do we need a bus for that?
>>>> For BusClass::get_dev_path callback to get an unique name.
>>> Juan, I don't think it makes a lot of sense to require a new fake bus just
>>> to give us a consistent migration view of things.
>>>
>>> Do you have any ideas how to migration busless devices? We could just
>>> detect that case and give them numbering based on their occurence in the
>>> global QOM hierarchy, no?
>>
>> The mentioned instance_id is that occurrence number which totally depends
>> on the device order in the command line. And I have to not to depend on
>> that.
> 
> So how would a bus fix that? The bus gets populated based on the command
> line order just as well, no?

Bus provides an unique name for every IOMMU, and every single IOMMU always
has instance_id == 0 so migration chunk cannot possibly go to a wrong device.
Eric Blake April 14, 2014, 3:04 p.m. UTC | #19
On 04/14/2014 01:41 AM, Alexey Kardashevskiy wrote:
> On 04/14/2014 05:34 PM, Alexander Graf wrote:
>>
>> On 11.04.14 18:30, Alexey Kardashevskiy wrote:
>>> On 04/12/2014 02:15 AM, Alexander Graf wrote:
>>>> On 11.04.14 18:01, Alexey Kardashevskiy wrote:
>>>>> On 04/12/2014 01:38 AM, Alexander Graf wrote:
>>>>>> On 11.04.14 17:27, Alexey Kardashevskiy wrote:
>>>>>>> On 04/12/2014 12:58 AM, Alexander Graf wrote:
>>>>>>>> On 11.04.14 16:50, Alexey Kardashevskiy wrote:
>>>>>>>>> On 04/11/2014 11:58 PM, Alexander Graf wrote:
>>>>>>>>>> On 11.04.2014, at 14:38, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>>>>>>>>
>>>>>>>>>>> On 04/11/2014 07:24 PM, Alexander Graf wrote:
>>>>>>>>>>>> On 10.04.14 16:43, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>> On 04/10/2014 11:26 PM, Alexander Graf wrote:
>>>>>>>>>>>>>> On 10.04.14 15:24, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>>> On 04/10/2014 10:51 PM, Alexander Graf wrote:
>>>>>>>>>>>>>>>> On 14.03.14 05:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>>>>>> The current allocator returns IRQ numbers from a pool and

17 levels of quoting!   It's okay to trim your messages, so that people
can get to the gist of your reply without first having to wade through
screenfuls of repeated heavily quoted material.
diff mbox

Patch

diff --git a/hw/intc/xics.c b/hw/intc/xics.c
index e5195bf..8d101a3 100644
--- a/hw/intc/xics.c
+++ b/hw/intc/xics.c
@@ -690,6 +690,86 @@  void xics_set_irq_type(XICSState *icp, int irq, bool lsi)
     ics_set_irq_type(&icp->ics[server], irq, lsi);
 }
 
+#define XICS_IRQ_FREE(ics, n)   \
+    (!((ics)->irqs[(n) - (ics)->offset].flags & \
+       (XICS_FLAGS_LSI | XICS_FLAGS_MSI)))
+
+static int ics_find_free_block(ICSState *ics, int num, int alignnum)
+{
+    int first, i;
+
+    for (first = 0; first < ics->nr_irqs; first += alignnum) {
+        if (num > (ics->nr_irqs - first)) {
+            return -1;
+        }
+        for (i = first; i < first + num; ++i) {
+            if (!XICS_IRQ_FREE(ics, i + ics->offset)) {
+                break;
+            }
+        }
+        if (i == (first + num)) {
+            return first + ics->offset;
+        }
+    }
+
+    return -1;
+}
+
+int xics_alloc(XICSState *icp, int server, int irq, bool lsi)
+{
+    ICSState *ics = &icp->ics[server];
+
+    if (irq) {
+        assert(server == xics_find_server(icp, irq));
+        if (!XICS_IRQ_FREE(ics, irq)) {
+            trace_xics_alloc_failed(server, irq);
+            return -1;
+        }
+    } else {
+        irq = ics_find_free_block(ics, 1, 1);
+    }
+
+    ics_set_irq_type(ics, irq, lsi);
+    trace_xics_alloc(server, irq);
+
+    return irq;
+}
+
+/*
+ * Allocate block of consequtive IRQs, returns a number of the first.
+ * If align==true, aligns the first IRQ number to num.
+ */
+int xics_alloc_block(XICSState *icp, int server, int num, bool lsi, bool align)
+{
+    int i, first = -1;
+    ICSState *ics = &icp->ics[server];
+
+    assert(server == 0);
+    /*
+     * MSIMesage::data is used for storing VIRQ so
+     * it has to be aligned to num to support multiple
+     * MSI vectors. MSI-X is not affected by this.
+     * The hint is used for the first IRQ, the rest should
+     * be allocated continuously.
+     */
+    if (align) {
+        assert((num == 1) || (num == 2) || (num == 4) ||
+               (num == 8) || (num == 16) || (num == 32));
+        first = ics_find_free_block(ics, num, num);
+    } else {
+        first = ics_find_free_block(ics, num, 1);
+    }
+
+    if (first > 0) {
+        for (i = first; i < first + num; ++i) {
+            ics_set_irq_type(ics, i, lsi);
+        }
+    }
+    trace_xics_alloc_block(server, first, num, lsi, align);
+
+    return first;
+}
+
 /*
  * Guest interfaces
  */
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 12adc21..29ca2e0 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -83,73 +83,6 @@ 
 
 sPAPREnvironment *spapr;
 
-int spapr_allocate_irq(int hint, bool lsi)
-{
-    int irq;
-
-    if (hint) {
-        irq = hint;
-        if (hint >= spapr->next_irq) {
-            spapr->next_irq = hint + 1;
-        }
-        /* FIXME: we should probably check for collisions somehow */
-    } else {
-        irq = spapr->next_irq++;
-    }
-
-    /* Configure irq type */
-    if (!xics_get_qirq(spapr->icp, irq)) {
-        return 0;
-    }
-
-    xics_set_irq_type(spapr->icp, irq, lsi);
-
-    return irq;
-}
-
-/*
- * Allocate block of consequtive IRQs, returns a number of the first.
- * If msi==true, aligns the first IRQ number to num.
- */
-int spapr_allocate_irq_block(int num, bool lsi, bool msi)
-{
-    int first = -1;
-    int i, hint = 0;
-
-    /*
-     * MSIMesage::data is used for storing VIRQ so
-     * it has to be aligned to num to support multiple
-     * MSI vectors. MSI-X is not affected by this.
-     * The hint is used for the first IRQ, the rest should
-     * be allocated continuously.
-     */
-    if (msi) {
-        assert((num == 1) || (num == 2) || (num == 4) ||
-               (num == 8) || (num == 16) || (num == 32));
-        hint = (spapr->next_irq + num - 1) & ~(num - 1);
-    }
-
-    for (i = 0; i < num; ++i) {
-        int irq;
-
-        irq = spapr_allocate_irq(hint, lsi);
-        if (!irq) {
-            return -1;
-        }
-
-        if (0 == i) {
-            first = irq;
-            hint = 0;
-        }
-
-        /* If the above doesn't create a consecutive block then that's
-         * an internal bug */
-        assert(irq == (first + i));
-    }
-
-    return first;
-}
-
 static XICSState *try_create_xics(const char *type, int nr_servers,
                                   int nr_irqs)
 {
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index 16fa49e..e605430 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -314,7 +314,7 @@  static void check_exception(PowerPCCPU *cpu, sPAPREnvironment *spapr,
 
 void spapr_events_init(sPAPREnvironment *spapr)
 {
-    spapr->epow_irq = spapr_allocate_msi(0);
+    spapr->epow_irq = xics_alloc_block(spapr->icp, 0, 1, false, false);
     spapr->epow_notifier.notify = spapr_powerdown_req;
     qemu_register_powerdown_notifier(&spapr->epow_notifier);
     spapr_rtas_register("check-exception", check_exception);
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index cbef095..4eaf364 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -343,8 +343,8 @@  static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
 
     /* There is no cached config, allocate MSIs */
     if (!phb->msi_table[ndev].nvec) {
-        irq = spapr_allocate_irq_block(req_num, false,
-                                       ret_intr_type == RTAS_TYPE_MSI);
+        irq = xics_alloc_block(spapr->icp, 0, req_num, false,
+                               ret_intr_type == RTAS_TYPE_MSI);
         if (irq < 0) {
             error_report("Cannot allocate MSIs for device#%d", ndev);
             rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
@@ -623,7 +623,7 @@  static void spapr_phb_realize(DeviceState *dev, Error **errp)
     for (i = 0; i < PCI_NUM_PINS; i++) {
         uint32_t irq;
 
-        irq = spapr_allocate_lsi(0);
+        irq = xics_alloc_block(spapr->icp, 0, 1, true, false);
         if (!irq) {
             error_setg(errp, "spapr_allocate_lsi failed");
             return;
diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c
index 4e33f46..8aeb263 100644
--- a/hw/ppc/spapr_vio.c
+++ b/hw/ppc/spapr_vio.c
@@ -448,7 +448,7 @@  static int spapr_vio_busdev_init(DeviceState *qdev)
         dev->qdev.id = id;
     }
 
-    dev->irq = spapr_allocate_msi(dev->irq);
+    dev->irq = xics_alloc(spapr->icp, 0, dev->irq, false);
     if (!dev->irq) {
         return -1;
     }
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 18332fd..5d577ff 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -323,16 +323,6 @@  target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
 int spapr_allocate_irq(int hint, bool lsi);
 int spapr_allocate_irq_block(int num, bool lsi, bool msi);
 
-static inline int spapr_allocate_msi(int hint)
-{
-    return spapr_allocate_irq(hint, false);
-}
-
-static inline int spapr_allocate_lsi(int hint)
-{
-    return spapr_allocate_irq(hint, true);
-}
-
 /* RTAS return codes */
 #define RTAS_OUT_SUCCESS            0
 #define RTAS_OUT_NO_ERRORS_FOUND    1
diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
index 4b30e9a..337398d 100644
--- a/include/hw/ppc/xics.h
+++ b/include/hw/ppc/xics.h
@@ -157,6 +157,8 @@  struct ICSIRQState {
 
 qemu_irq xics_get_qirq(XICSState *icp, int irq);
 void xics_set_irq_type(XICSState *icp, int irq, bool lsi);
+int xics_alloc(XICSState *icp, int server, int irq, bool lsi);
+int xics_alloc_block(XICSState *icp, int server, int num, bool lsi, bool align);
 
 void xics_cpu_setup(XICSState *icp, PowerPCCPU *cpu);
 
diff --git a/trace-events b/trace-events
index 002c260..ad7400e 100644
--- a/trace-events
+++ b/trace-events
@@ -1143,6 +1143,9 @@  xics_set_irq_lsi(int srcno, int nr) "set_irq_lsi: srcno %d [irq %#x]"
 xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority) "ics_write_xive: irq %#x [src %d] server %#x prio %#x"
 xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]"
 xics_ics_eoi(int nr) "ics_eoi: irq %#x"
+xics_alloc(int server, int irq) "server#%d, irq %d"
+xics_alloc_failed(int server, int irq) "server#%d, irq %d"
+xics_alloc_block(int server, int first, int num, bool lsi, int align) "server#%d, first irq %d, %d irqs, lsi=%d, alignnum %d"
 
 # hw/ppc/spapr_iommu.c
 spapr_iommu_put(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tce=0x%"PRIx64" ret=%"PRId64