Patchwork pc: Clean up PIC-to-APIC IRQ path

login
register
mail settings
Submitter Jan Kiszka
Date Aug. 27, 2011, 2:16 p.m.
Message ID <4E58FC3F.6080809@web.de>
Download mbox | patch
Permalink /patch/111878/
State New
Headers show

Comments

Jan Kiszka - Aug. 27, 2011, 2:16 p.m.
From: Jan Kiszka <jan.kiszka@siemens.com>

The master PIC is connected to the LINTIN0 of the APICs. As the APIC
currently does not track the state of that line, we have to ask the PIC
to re-inject its IRQ after the CPU picked up an event from the APIC.

Adds the proper state tracking so that we can already re-assert the CPU
IRQ at APIC level if there is a pending PIC IRQ. This allows to remove
all the old workarounds.

The patch also fixes some failures of the kvm unit tests apic and
eventinj by enabling a proper CPU IRQ deassert when the guest masks some
pending IRQs at PIC level.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---

It turned out that this patch from a larger cleanup series has no
dependencies and can be applied directly to master to fix the observed
bug.

 hw/apic.c  |    4 +++-
 hw/i8259.c |   10 ++--------
 hw/pc.c    |    3 ---
 hw/pc.h    |    1 -
 4 files changed, 5 insertions(+), 13 deletions(-)
Blue Swirl - Aug. 28, 2011, 7:10 a.m.
On Sat, Aug 27, 2011 at 2:16 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
>
> The master PIC is connected to the LINTIN0 of the APICs. As the APIC
> currently does not track the state of that line, we have to ask the PIC
> to re-inject its IRQ after the CPU picked up an event from the APIC.
>
> Adds the proper state tracking so that we can already re-assert the CPU
> IRQ at APIC level if there is a pending PIC IRQ. This allows to remove
> all the old workarounds.
>
> The patch also fixes some failures of the kvm unit tests apic and
> eventinj by enabling a proper CPU IRQ deassert when the guest masks some
> pending IRQs at PIC level.
>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>
> It turned out that this patch from a larger cleanup series has no
> dependencies and can be applied directly to master to fix the observed
> bug.
>
>  hw/apic.c  |    4 +++-
>  hw/i8259.c |   10 ++--------
>  hw/pc.c    |    3 ---
>  hw/pc.h    |    1 -
>  4 files changed, 5 insertions(+), 13 deletions(-)
>
> diff --git a/hw/apic.c b/hw/apic.c
> index d8f56c8..22ad635 100644
> --- a/hw/apic.c
> +++ b/hw/apic.c
> @@ -104,6 +104,7 @@ struct APICState {
>     QEMUTimer *timer;
>     int sipi_vector;
>     int wait_for_sipi;
> +    int pic_level;
>  };
>
>  static APICState *local_apics[MAX_APICS + 1];
> @@ -186,6 +187,7 @@ void apic_deliver_pic_intr(DeviceState *d, int level)
>  {
>     APICState *s = DO_UPCAST(APICState, busdev.qdev, d);
>
> +    s->pic_level = level;
>     if (level) {
>         apic_local_deliver(s, APIC_LVT_LINT0);
>     } else {
> @@ -397,7 +399,7 @@ static void apic_update_irq(APICState *s)
>     if (!(s->spurious_vec & APIC_SV_ENABLE)) {
>         return;
>     }
> -    if (apic_irq_pending(s) > 0) {
> +    if (apic_irq_pending(s) > 0 || s->pic_level) {
>         cpu_interrupt(s->cpu_env, CPU_INTERRUPT_HARD);
>     }
>  }
> diff --git a/hw/i8259.c b/hw/i8259.c
> index c0b96ab..cc6f76b 100644
> --- a/hw/i8259.c
> +++ b/hw/i8259.c
> @@ -144,8 +144,7 @@ static int pic_get_irq(PicState *s)
>
>  /* raise irq to CPU if necessary. must be called every time the active
>    irq may change */
> -/* XXX: should not export it, but it is needed for an APIC kludge */
> -void pic_update_irq(PicState2 *s)
> +static void pic_update_irq(PicState2 *s)
>  {
>     int irq2, irq;
>
> @@ -172,14 +171,9 @@ void pic_update_irq(PicState2 *s)
>         printf("pic: cpu_interrupt\n");
>  #endif
>         qemu_irq_raise(s->parent_irq);
> -    }
> -
> -/* all targets should do this rather than acking the IRQ in the cpu */
> -#if defined(TARGET_MIPS) || defined(TARGET_PPC) || defined(TARGET_ALPHA)
> -    else {
> +    } else {

Nice cleanup, this was pretty ugly.

Isn't it possible to compile the device in hwlib now? That should save
about 11 compiles for the full build since it is used by many targets.

>         qemu_irq_lower(s->parent_irq);
>     }
> -#endif
>  }
>
>  #ifdef DEBUG_IRQ_LATENCY
> diff --git a/hw/pc.c b/hw/pc.c
> index 263fb1a..b7b5d6f 100644
> --- a/hw/pc.c
> +++ b/hw/pc.c
> @@ -156,9 +156,6 @@ int cpu_get_pic_interrupt(CPUState *env)
>
>     intno = apic_get_interrupt(env->apic_state);
>     if (intno >= 0) {
> -        /* set irq request if a PIC irq is still pending */
> -        /* XXX: improve that */
> -        pic_update_irq(isa_pic);
>         return intno;
>     }
>     /* read the irq from the PIC */
> diff --git a/hw/pc.h b/hw/pc.h
> index dae736e..f5ff4c0 100644
> --- a/hw/pc.h
> +++ b/hw/pc.h
> @@ -65,7 +65,6 @@ void pic_set_irq(int irq, int level);
>  void pic_set_irq_new(void *opaque, int irq, int level);
>  qemu_irq *i8259_init(qemu_irq parent_irq);
>  int pic_read_irq(PicState2 *s);
> -void pic_update_irq(PicState2 *s);
>  uint32_t pic_intack_read(PicState2 *s);
>  void pic_info(Monitor *mon);
>  void irq_info(Monitor *mon);
>
>
Jan Kiszka - Aug. 28, 2011, 9:08 a.m.
On 2011-08-28 09:10, Blue Swirl wrote:
> On Sat, Aug 27, 2011 at 2:16 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> The master PIC is connected to the LINTIN0 of the APICs. As the APIC
>> currently does not track the state of that line, we have to ask the PIC
>> to re-inject its IRQ after the CPU picked up an event from the APIC.
>>
>> Adds the proper state tracking so that we can already re-assert the CPU
>> IRQ at APIC level if there is a pending PIC IRQ. This allows to remove
>> all the old workarounds.
>>
>> The patch also fixes some failures of the kvm unit tests apic and
>> eventinj by enabling a proper CPU IRQ deassert when the guest masks some
>> pending IRQs at PIC level.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>
>> It turned out that this patch from a larger cleanup series has no
>> dependencies and can be applied directly to master to fix the observed
>> bug.
>>
>>  hw/apic.c  |    4 +++-
>>  hw/i8259.c |   10 ++--------
>>  hw/pc.c    |    3 ---
>>  hw/pc.h    |    1 -
>>  4 files changed, 5 insertions(+), 13 deletions(-)
>>
>> diff --git a/hw/apic.c b/hw/apic.c
>> index d8f56c8..22ad635 100644
>> --- a/hw/apic.c
>> +++ b/hw/apic.c
>> @@ -104,6 +104,7 @@ struct APICState {
>>     QEMUTimer *timer;
>>     int sipi_vector;
>>     int wait_for_sipi;
>> +    int pic_level;
>>  };
>>
>>  static APICState *local_apics[MAX_APICS + 1];
>> @@ -186,6 +187,7 @@ void apic_deliver_pic_intr(DeviceState *d, int level)
>>  {
>>     APICState *s = DO_UPCAST(APICState, busdev.qdev, d);
>>
>> +    s->pic_level = level;
>>     if (level) {
>>         apic_local_deliver(s, APIC_LVT_LINT0);
>>     } else {
>> @@ -397,7 +399,7 @@ static void apic_update_irq(APICState *s)
>>     if (!(s->spurious_vec & APIC_SV_ENABLE)) {
>>         return;
>>     }
>> -    if (apic_irq_pending(s) > 0) {
>> +    if (apic_irq_pending(s) > 0 || s->pic_level) {
>>         cpu_interrupt(s->cpu_env, CPU_INTERRUPT_HARD);
>>     }
>>  }
>> diff --git a/hw/i8259.c b/hw/i8259.c
>> index c0b96ab..cc6f76b 100644
>> --- a/hw/i8259.c
>> +++ b/hw/i8259.c
>> @@ -144,8 +144,7 @@ static int pic_get_irq(PicState *s)
>>
>>  /* raise irq to CPU if necessary. must be called every time the active
>>    irq may change */
>> -/* XXX: should not export it, but it is needed for an APIC kludge */
>> -void pic_update_irq(PicState2 *s)
>> +static void pic_update_irq(PicState2 *s)
>>  {
>>     int irq2, irq;
>>
>> @@ -172,14 +171,9 @@ void pic_update_irq(PicState2 *s)
>>         printf("pic: cpu_interrupt\n");
>>  #endif
>>         qemu_irq_raise(s->parent_irq);
>> -    }
>> -
>> -/* all targets should do this rather than acking the IRQ in the cpu */
>> -#if defined(TARGET_MIPS) || defined(TARGET_PPC) || defined(TARGET_ALPHA)
>> -    else {
>> +    } else {
> 
> Nice cleanup, this was pretty ugly.
> 
> Isn't it possible to compile the device in hwlib now? That should save
> about 11 compiles for the full build since it is used by many targets.

Not yet, but at the end of my queue (pic_info has to be refactored first).

Jan
Anthony Liguori - Aug. 29, 2011, 7:25 p.m.
On 08/27/2011 09:16 AM, Jan Kiszka wrote:
> From: Jan Kiszka<jan.kiszka@siemens.com>
>
> The master PIC is connected to the LINTIN0 of the APICs. As the APIC
> currently does not track the state of that line, we have to ask the PIC
> to re-inject its IRQ after the CPU picked up an event from the APIC.
>
> Adds the proper state tracking so that we can already re-assert the CPU
> IRQ at APIC level if there is a pending PIC IRQ. This allows to remove
> all the old workarounds.
>
> The patch also fixes some failures of the kvm unit tests apic and
> eventinj by enabling a proper CPU IRQ deassert when the guest masks some
> pending IRQs at PIC level.
>
> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
> ---
>
> It turned out that this patch from a larger cleanup series has no
> dependencies and can be applied directly to master to fix the observed
> bug.
>
>   hw/apic.c  |    4 +++-
>   hw/i8259.c |   10 ++--------
>   hw/pc.c    |    3 ---
>   hw/pc.h    |    1 -
>   4 files changed, 5 insertions(+), 13 deletions(-)
>
> diff --git a/hw/apic.c b/hw/apic.c
> index d8f56c8..22ad635 100644
> --- a/hw/apic.c
> +++ b/hw/apic.c
> @@ -104,6 +104,7 @@ struct APICState {
>       QEMUTimer *timer;
>       int sipi_vector;
>       int wait_for_sipi;
> +    int pic_level;
>   };

Does this need to be save/restored for migration?

Regards,

Anthony Liguori
Jan Kiszka - Aug. 29, 2011, 9:06 p.m.
On 2011-08-29 21:25, Anthony Liguori wrote:
> On 08/27/2011 09:16 AM, Jan Kiszka wrote:
>> From: Jan Kiszka<jan.kiszka@siemens.com>
>>
>> The master PIC is connected to the LINTIN0 of the APICs. As the APIC
>> currently does not track the state of that line, we have to ask the PIC
>> to re-inject its IRQ after the CPU picked up an event from the APIC.
>>
>> Adds the proper state tracking so that we can already re-assert the CPU
>> IRQ at APIC level if there is a pending PIC IRQ. This allows to remove
>> all the old workarounds.
>>
>> The patch also fixes some failures of the kvm unit tests apic and
>> eventinj by enabling a proper CPU IRQ deassert when the guest masks some
>> pending IRQs at PIC level.
>>
>> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
>> ---
>>
>> It turned out that this patch from a larger cleanup series has no
>> dependencies and can be applied directly to master to fix the observed
>> bug.
>>
>>   hw/apic.c  |    4 +++-
>>   hw/i8259.c |   10 ++--------
>>   hw/pc.c    |    3 ---
>>   hw/pc.h    |    1 -
>>   4 files changed, 5 insertions(+), 13 deletions(-)
>>
>> diff --git a/hw/apic.c b/hw/apic.c
>> index d8f56c8..22ad635 100644
>> --- a/hw/apic.c
>> +++ b/hw/apic.c
>> @@ -104,6 +104,7 @@ struct APICState {
>>       QEMUTimer *timer;
>>       int sipi_vector;
>>       int wait_for_sipi;
>> +    int pic_level;
>>   };
> 
> Does this need to be save/restored for migration?

Nope, but we need some other measure. I thought to remember the pic was
refreshing this after load, but I do not find any traces of this now. We
likely need a post_load handler in the i8259 that re-asserts the IRQ as
required.

Jan
Avi Kivity - Aug. 29, 2011, 9:13 p.m.
On 08/30/2011 12:06 AM, Jan Kiszka wrote:
> >
> >  Does this need to be save/restored for migration?
>
> Nope, but we need some other measure. I thought to remember the pic was
> refreshing this after load, but I do not find any traces of this now. We
> likely need a post_load handler in the i8259 that re-asserts the IRQ as
> required.
>

We need some kind of two phase restore. In the first phase all state is 
restored; since some of that state drivers outputs that are input to 
other devices, they may experience an edge, and we need to supress 
that.  In the second phase edge detection is unsupressed and the device 
goes live.
Jan Kiszka - Aug. 29, 2011, 9:18 p.m.
On 2011-08-29 23:13, Avi Kivity wrote:
> On 08/30/2011 12:06 AM, Jan Kiszka wrote:
>> >
>> >  Does this need to be save/restored for migration?
>>
>> Nope, but we need some other measure. I thought to remember the pic was
>> refreshing this after load, but I do not find any traces of this now. We
>> likely need a post_load handler in the i8259 that re-asserts the IRQ as
>> required.
>>
> 
> We need some kind of two phase restore. In the first phase all state is
> restored; since some of that state drivers outputs that are input to
> other devices, they may experience an edge, and we need to supress
> that.  In the second phase edge detection is unsupressed and the device
> goes live.

Sounds like it's time to open cpu_synchronize_all_post_init for non-kvm
folks.

Jan
Blue Swirl - Aug. 30, 2011, 7:19 p.m.
On Mon, Aug 29, 2011 at 9:13 PM, Avi Kivity <avi@redhat.com> wrote:
> On 08/30/2011 12:06 AM, Jan Kiszka wrote:
>>
>> >
>> >  Does this need to be save/restored for migration?
>>
>> Nope, but we need some other measure. I thought to remember the pic was
>> refreshing this after load, but I do not find any traces of this now. We
>> likely need a post_load handler in the i8259 that re-asserts the IRQ as
>> required.
>>
>
> We need some kind of two phase restore. In the first phase all state is
> restored; since some of that state drivers outputs that are input to other
> devices, they may experience an edge, and we need to supress that.  In the
> second phase edge detection is unsupressed and the device goes live.

No. Devices may not perform any externally visible activities (like
toggle a qemu_irq) during or after load because 1) qemu_irq is
stateless and 2) since the receiving end is also freshly loaded, both
states are already in synch without any calls or toggling.
Jan Kiszka - Aug. 30, 2011, 7:28 p.m.
On 2011-08-30 21:19, Blue Swirl wrote:
> On Mon, Aug 29, 2011 at 9:13 PM, Avi Kivity <avi@redhat.com> wrote:
>> On 08/30/2011 12:06 AM, Jan Kiszka wrote:
>>>
>>>>
>>>>  Does this need to be save/restored for migration?
>>>
>>> Nope, but we need some other measure. I thought to remember the pic was
>>> refreshing this after load, but I do not find any traces of this now. We
>>> likely need a post_load handler in the i8259 that re-asserts the IRQ as
>>> required.
>>>
>>
>> We need some kind of two phase restore. In the first phase all state is
>> restored; since some of that state drivers outputs that are input to other
>> devices, they may experience an edge, and we need to supress that.  In the
>> second phase edge detection is unsupressed and the device goes live.
> 
> No. Devices may not perform any externally visible activities (like
> toggle a qemu_irq) during or after load because 1) qemu_irq is
> stateless and 2) since the receiving end is also freshly loaded, both
> states are already in synch without any calls or toggling.

Yes, that's the current state. Once we have bidirectional IRQ links in
place (pushing downward, querying upward - required to skip IRQ routers
for fast, lockless deliveries), that should change again.

For now, that's what I realized in the meantime as well, we can't help
saving pic_level in the APIC state.

What is the state of substates in pre-1.0? Do we try to use those again
in favor of simple field additions under a new state version?

Jan
Blue Swirl - Aug. 30, 2011, 7:43 p.m.
On Tue, Aug 30, 2011 at 7:28 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2011-08-30 21:19, Blue Swirl wrote:
>> On Mon, Aug 29, 2011 at 9:13 PM, Avi Kivity <avi@redhat.com> wrote:
>>> On 08/30/2011 12:06 AM, Jan Kiszka wrote:
>>>>
>>>>>
>>>>>  Does this need to be save/restored for migration?
>>>>
>>>> Nope, but we need some other measure. I thought to remember the pic was
>>>> refreshing this after load, but I do not find any traces of this now. We
>>>> likely need a post_load handler in the i8259 that re-asserts the IRQ as
>>>> required.
>>>>
>>>
>>> We need some kind of two phase restore. In the first phase all state is
>>> restored; since some of that state drivers outputs that are input to other
>>> devices, they may experience an edge, and we need to supress that.  In the
>>> second phase edge detection is unsupressed and the device goes live.
>>
>> No. Devices may not perform any externally visible activities (like
>> toggle a qemu_irq) during or after load because 1) qemu_irq is
>> stateless and 2) since the receiving end is also freshly loaded, both
>> states are already in synch without any calls or toggling.
>
> Yes, that's the current state. Once we have bidirectional IRQ links in
> place (pushing downward, querying upward - required to skip IRQ routers
> for fast, lockless deliveries), that should change again.

I don't think it should change. If stateful or dynamically routing
IRQs were implemented, then those IRQs should also save and restore
their state, so again the devices would not need to do anything.

> For now, that's what I realized in the meantime as well, we can't help
> saving pic_level in the APIC state.
>
> What is the state of substates in pre-1.0? Do we try to use those again
> in favor of simple field additions under a new state version?
>
> Jan
>
>
Peter Maydell - Aug. 31, 2011, 8:25 a.m.
On 30 August 2011 20:28, Jan Kiszka <jan.kiszka@web.de> wrote:
> Yes, that's the current state. Once we have bidirectional IRQ links in
> place (pushing downward, querying upward - required to skip IRQ routers
> for fast, lockless deliveries), that should change again.

Can you elaborate a bit more on this? I don't think anybody has
proposed links with their own internal state before in the qdev/qom
discussions...

Also a good solid use case for a bidirectional link is worth
discussing because it influences whether we want to make all
links bidi-by-default or try to do it as a special case.

(OTOH your argument does hold for why reset should be two-phase.)

thanks
-- PMM
Avi Kivity - Aug. 31, 2011, 8:28 a.m.
On 08/30/2011 10:19 PM, Blue Swirl wrote:
> >
> >  We need some kind of two phase restore. In the first phase all state is
> >  restored; since some of that state drivers outputs that are input to other
> >  devices, they may experience an edge, and we need to supress that.  In the
> >  second phase edge detection is unsupressed and the device goes live.
>
> No. Devices may not perform any externally visible activities (like
> toggle a qemu_irq) during or after load because 1) qemu_irq is
> stateless and 2) since the receiving end is also freshly loaded, both
> states are already in synch without any calls or toggling.

That makes it impossible to migrate level-triggered irq lines.  Or at 
least, the receiver has to remember the state, instead of (or in 
addition to) the sender.
Jan Kiszka - Aug. 31, 2011, 10:53 a.m.
On 2011-08-31 10:25, Peter Maydell wrote:
> On 30 August 2011 20:28, Jan Kiszka <jan.kiszka@web.de> wrote:
>> Yes, that's the current state. Once we have bidirectional IRQ links in
>> place (pushing downward, querying upward - required to skip IRQ routers
>> for fast, lockless deliveries), that should change again.
> 
> Can you elaborate a bit more on this? I don't think anybody has
> proposed links with their own internal state before in the qdev/qom
> discussions...

That basic idea is to allow

a) a discovery of the currently active IRQ path from source to sink
   (that would be possible via QOM just using forward links)

b) skip updating the states of IRQ routers in the common case, just
   signaling directly the sink from the source (to allow in-kernel IRQ
   delivery or to skip taking some device locks). Whenever some router
   is queried for its current IRQ line state, it would have to ask the
   preceding IRQ source for its state. So we need a backward link.

We haven't thought about how this could be implemented in details yet
though. Among other things, it heavily depends on the final QOM design.

Jan
Blue Swirl - Aug. 31, 2011, 4:59 p.m.
On Wed, Aug 31, 2011 at 8:28 AM, Avi Kivity <avi@redhat.com> wrote:
> On 08/30/2011 10:19 PM, Blue Swirl wrote:
>>
>> >
>> >  We need some kind of two phase restore. In the first phase all state is
>> >  restored; since some of that state drivers outputs that are input to
>> > other
>> >  devices, they may experience an edge, and we need to supress that.  In
>> > the
>> >  second phase edge detection is unsupressed and the device goes live.
>>
>> No. Devices may not perform any externally visible activities (like
>> toggle a qemu_irq) during or after load because 1) qemu_irq is
>> stateless and 2) since the receiving end is also freshly loaded, both
>> states are already in synch without any calls or toggling.
>
> That makes it impossible to migrate level-triggered irq lines.  Or at least,
> the receiver has to remember the state, instead of (or in addition to) the
> sender.

Both ends probably need to remember the state. That should work
without any multiphase restores and transient suppressors.

It might be also possible to introduce stateful signal lines which
save and restore their state, then the receiving end could check what
is the current level. However, if you consider that the devices may be
restored in random order, if the IRQ line device happens to be
restored later, the receiver would still get wrong information. Adding
priorities could solve this, but I think stateless IRQs are the only
sane way.
Blue Swirl - Aug. 31, 2011, 5:41 p.m.
On Wed, Aug 31, 2011 at 10:53 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2011-08-31 10:25, Peter Maydell wrote:
>> On 30 August 2011 20:28, Jan Kiszka <jan.kiszka@web.de> wrote:
>>> Yes, that's the current state. Once we have bidirectional IRQ links in
>>> place (pushing downward, querying upward - required to skip IRQ routers
>>> for fast, lockless deliveries), that should change again.
>>
>> Can you elaborate a bit more on this? I don't think anybody has
>> proposed links with their own internal state before in the qdev/qom
>> discussions...
>
> That basic idea is to allow
>
> a) a discovery of the currently active IRQ path from source to sink
>   (that would be possible via QOM just using forward links)

Why, only for b)? This is not possible with real hardware.

> b) skip updating the states of IRQ routers in the common case, just
>   signaling directly the sink from the source (to allow in-kernel IRQ
>   delivery or to skip taking some device locks). Whenever some router
>   is queried for its current IRQ line state, it would have to ask the
>   preceding IRQ source for its state. So we need a backward link.

I think this would need pretty heavy changes everywhere. At board
level the full path needs to be identified and special versions of
IRQs installed along the way. The routers would need to use callbacks
to inform other parties about routing changes.

> We haven't thought about how this could be implemented in details yet
> though. Among other things, it heavily depends on the final QOM design.

Perhaps a global IRQ manager could help. It would keep track of the
whole IRQ matrix, what are input (x axis) and output (y axis) states
and what each matrix node (router state) looks like (or able to
compute) if asked. I don't think backward links would be needed with
this approach.
Edgar Iglesias - Aug. 31, 2011, 6:04 p.m.
On Wed, Aug 31, 2011 at 04:59:40PM +0000, Blue Swirl wrote:
> On Wed, Aug 31, 2011 at 8:28 AM, Avi Kivity <avi@redhat.com> wrote:
> > On 08/30/2011 10:19 PM, Blue Swirl wrote:
> >>
> >> >
> >> >  We need some kind of two phase restore. In the first phase all state is
> >> >  restored; since some of that state drivers outputs that are input to
> >> > other
> >> >  devices, they may experience an edge, and we need to supress that.  In
> >> > the
> >> >  second phase edge detection is unsupressed and the device goes live.
> >>
> >> No. Devices may not perform any externally visible activities (like
> >> toggle a qemu_irq) during or after load because 1) qemu_irq is
> >> stateless and 2) since the receiving end is also freshly loaded, both
> >> states are already in synch without any calls or toggling.
> >
> > That makes it impossible to migrate level-triggered irq lines.  Or at least,
> > the receiver has to remember the state, instead of (or in addition to) the
> > sender.
> 
> Both ends probably need to remember the state. That should work
> without any multiphase restores and transient suppressors.
> 
> It might be also possible to introduce stateful signal lines which
> save and restore their state, then the receiving end could check what
> is the current level. However, if you consider that the devices may be
> restored in random order, if the IRQ line device happens to be
> restored later, the receiver would still get wrong information. Adding
> priorities could solve this, but I think stateless IRQs are the only
> sane way.

I may be missunderstanding something, but I agree with the stateless part.
Fundamentally changing IRQ lines like this, deviating from how hardware
normally handles it just to squeeze in other features seems wrong. At
least until other alternatives have been proved to be impossible, but
i doubt that is this case here.

Cheers
Jan Kiszka - Aug. 31, 2011, 6:17 p.m.
On 2011-08-31 19:41, Blue Swirl wrote:
> On Wed, Aug 31, 2011 at 10:53 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2011-08-31 10:25, Peter Maydell wrote:
>>> On 30 August 2011 20:28, Jan Kiszka <jan.kiszka@web.de> wrote:
>>>> Yes, that's the current state. Once we have bidirectional IRQ links in
>>>> place (pushing downward, querying upward - required to skip IRQ routers
>>>> for fast, lockless deliveries), that should change again.
>>>
>>> Can you elaborate a bit more on this? I don't think anybody has
>>> proposed links with their own internal state before in the qdev/qom
>>> discussions...
>>
>> That basic idea is to allow
>>
>> a) a discovery of the currently active IRQ path from source to sink
>>   (that would be possible via QOM just using forward links)
> 
> Why, only for b)? This is not possible with real hardware.
> 
>> b) skip updating the states of IRQ routers in the common case, just
>>   signaling directly the sink from the source (to allow in-kernel IRQ
>>   delivery or to skip taking some device locks). Whenever some router
>>   is queried for its current IRQ line state, it would have to ask the
>>   preceding IRQ source for its state. So we need a backward link.
> 
> I think this would need pretty heavy changes everywhere. At board
> level the full path needs to be identified and special versions of
> IRQs installed along the way. The routers would need to use callbacks
> to inform other parties about routing changes.

It already works in practice (based on a hack and minus IRQ router state
updates) for x86 PCI device pass-through. At least I don't want this
upstream but instead a generic solution. The ability to skip IRQ routers
also in pure user space device model scenarios is a useful by-product.

> 
>> We haven't thought about how this could be implemented in details yet
>> though. Among other things, it heavily depends on the final QOM design.
> 
> Perhaps a global IRQ manager could help. It would keep track of the
> whole IRQ matrix, what are input (x axis) and output (y axis) states
> and what each matrix node (router state) looks like (or able to
> compute) if asked. I don't think backward links would be needed with
> this approach.

Well, the backward links would then be moved to that global IRQ manager.
It's just moving the data management, but if it turns out to allow a
cleaner device design, I would surely not vote against it. But that
manager must support lazy updates as well because we cannot call it from
kernel space for each and every event.

Jan
Jan Kiszka - Aug. 31, 2011, 6:28 p.m.
On 2011-08-31 20:04, Edgar E. Iglesias wrote:
> On Wed, Aug 31, 2011 at 04:59:40PM +0000, Blue Swirl wrote:
>> On Wed, Aug 31, 2011 at 8:28 AM, Avi Kivity <avi@redhat.com> wrote:
>>> On 08/30/2011 10:19 PM, Blue Swirl wrote:
>>>>
>>>>>
>>>>>  We need some kind of two phase restore. In the first phase all state is
>>>>>  restored; since some of that state drivers outputs that are input to
>>>>> other
>>>>>  devices, they may experience an edge, and we need to supress that.  In
>>>>> the
>>>>>  second phase edge detection is unsupressed and the device goes live.
>>>>
>>>> No. Devices may not perform any externally visible activities (like
>>>> toggle a qemu_irq) during or after load because 1) qemu_irq is
>>>> stateless and 2) since the receiving end is also freshly loaded, both
>>>> states are already in synch without any calls or toggling.
>>>
>>> That makes it impossible to migrate level-triggered irq lines.  Or at least,
>>> the receiver has to remember the state, instead of (or in addition to) the
>>> sender.
>>
>> Both ends probably need to remember the state. That should work
>> without any multiphase restores and transient suppressors.
>>
>> It might be also possible to introduce stateful signal lines which
>> save and restore their state, then the receiving end could check what
>> is the current level. However, if you consider that the devices may be
>> restored in random order, if the IRQ line device happens to be
>> restored later, the receiver would still get wrong information. Adding
>> priorities could solve this, but I think stateless IRQs are the only
>> sane way.
> 
> I may be missunderstanding something, but I agree with the stateless part.
> Fundamentally changing IRQ lines like this, deviating from how hardware
> normally handles it just to squeeze in other features seems wrong. At
> least until other alternatives have been proved to be impossible, but
> i doubt that is this case here.

Just to clarify: We do not depend on state-full generic IRQ lines to
solve the in-kernel to in-kernel IRQ delivery problem or to cut short
the path for user space device models. But we do need fully explorable
IRQ routes and the ability to lazily update device states that we skip
when delivering via fast path. How that is finally modeled can be
discussed (and has to wait for QOM anyway).

Jan
Blue Swirl - Aug. 31, 2011, 7:44 p.m.
On Wed, Aug 31, 2011 at 6:17 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2011-08-31 19:41, Blue Swirl wrote:
>> On Wed, Aug 31, 2011 at 10:53 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>> On 2011-08-31 10:25, Peter Maydell wrote:
>>>> On 30 August 2011 20:28, Jan Kiszka <jan.kiszka@web.de> wrote:
>>>>> Yes, that's the current state. Once we have bidirectional IRQ links in
>>>>> place (pushing downward, querying upward - required to skip IRQ routers
>>>>> for fast, lockless deliveries), that should change again.
>>>>
>>>> Can you elaborate a bit more on this? I don't think anybody has
>>>> proposed links with their own internal state before in the qdev/qom
>>>> discussions...
>>>
>>> That basic idea is to allow
>>>
>>> a) a discovery of the currently active IRQ path from source to sink
>>>   (that would be possible via QOM just using forward links)
>>
>> Why, only for b)? This is not possible with real hardware.
>>
>>> b) skip updating the states of IRQ routers in the common case, just
>>>   signaling directly the sink from the source (to allow in-kernel IRQ
>>>   delivery or to skip taking some device locks). Whenever some router
>>>   is queried for its current IRQ line state, it would have to ask the
>>>   preceding IRQ source for its state. So we need a backward link.
>>
>> I think this would need pretty heavy changes everywhere. At board
>> level the full path needs to be identified and special versions of
>> IRQs installed along the way. The routers would need to use callbacks
>> to inform other parties about routing changes.
>
> It already works in practice (based on a hack and minus IRQ router state
> updates) for x86 PCI device pass-through. At least I don't want this
> upstream but instead a generic solution. The ability to skip IRQ routers
> also in pure user space device model scenarios is a useful by-product.
>
>>
>>> We haven't thought about how this could be implemented in details yet
>>> though. Among other things, it heavily depends on the final QOM design.
>>
>> Perhaps a global IRQ manager could help. It would keep track of the
>> whole IRQ matrix, what are input (x axis) and output (y axis) states
>> and what each matrix node (router state) looks like (or able to
>> compute) if asked. I don't think backward links would be needed with
>> this approach.
>
> Well, the backward links would then be moved to that global IRQ manager.
> It's just moving the data management, but if it turns out to allow a
> cleaner device design, I would surely not vote against it. But that
> manager must support lazy updates as well because we cannot call it from
> kernel space for each and every event.

The global IRQ switch matrix would take over all routing for the
devices in question. As an example, let's consider a PCI card
(source), PCI host bridge, IO-APIC, LAPIC and CPU (final destination).
The matrix would get input from the PCI card and output directly to
LAPIC since the CPU does not use a qemu_irq yet. These leaf devices
actually need no changes, just the qemu_irq is routed elsewhere. The
matrix would also have access to intermediate devices but the qemu_irq
inputs of those would not be updated in the lazy mode and the outputs
need not go anywhere.

So for a matrix where x0, x1...  is used for inputs and y0, y1... for
outputs, x0 would be connected in the example to PCI card, x1 to PCI
host bridge output and x2 to IO-APIC (ignoring multiple lines). Then
y0 would be the calculated output from PCI host bridge, y1 IO-APIC and
only y2 would be connected to LAPIC.

If the intermediate IRQ state between the nodes is queried (for
example at IO-APIC), the matrix must be able to tell what would be the
true state at that point and then this value would be supplied to the
device. So a method to query device IRQ input state from matrix on
demand is needed. This could be as unspecific as "refresh all my
inputs". A backward link may not be able to tell the state if
information about next backward link is lost (logic OR of several
signals) whereas the matrix should be able to do that even in that
case.

If routes in IO-APIC change, the matrix needs to be recalculated and
states in the affected devices should be updated. This update cycle
could be triggered via normal qemu_irq device input lines or direct
callback (needs more changes). Maybe even this could be handled lazily
so that no updates would be needed until the final output changes or
the state of an intermediate device is queried.

The global manager should handle saving and restoring states, the
device states would not matter. If it pushed the true state to
devices, we're back to the ordering problem.
Avi Kivity - Sept. 1, 2011, 5:58 a.m.
On 08/31/2011 07:59 PM, Blue Swirl wrote:
> >
> >  That makes it impossible to migrate level-triggered irq lines.  Or at least,
> >  the receiver has to remember the state, instead of (or in addition to) the
> >  sender.
>
> Both ends probably need to remember the state. That should work
> without any multiphase restores and transient suppressors.

State should always correspond to real hardware state - a flip flop or 
capacitor.  Input is not state, it is input.

> It might be also possible to introduce stateful signal lines which
> save and restore their state, then the receiving end could check what
> is the current level. However, if you consider that the devices may be
> restored in random order, if the IRQ line device happens to be
> restored later, the receiver would still get wrong information. Adding
> priorities could solve this, but I think stateless IRQs are the only
> sane way.

I agree that irqs should be stateless, since they don't have any memory 
associated.
Anthony Liguori - Sept. 3, 2011, 7:53 p.m.
On 08/31/2011 11:59 AM, Blue Swirl wrote:
> On Wed, Aug 31, 2011 at 8:28 AM, Avi Kivity<avi@redhat.com>  wrote:
>> On 08/30/2011 10:19 PM, Blue Swirl wrote:
>>>
>>>>
>>>>   We need some kind of two phase restore. In the first phase all state is
>>>>   restored; since some of that state drivers outputs that are input to
>>>> other
>>>>   devices, they may experience an edge, and we need to supress that.  In
>>>> the
>>>>   second phase edge detection is unsupressed and the device goes live.
>>>
>>> No. Devices may not perform any externally visible activities (like
>>> toggle a qemu_irq) during or after load because 1) qemu_irq is
>>> stateless and 2) since the receiving end is also freshly loaded, both
>>> states are already in synch without any calls or toggling.
>>
>> That makes it impossible to migrate level-triggered irq lines.  Or at least,
>> the receiver has to remember the state, instead of (or in addition to) the
>> sender.
>
> Both ends probably need to remember the state. That should work
> without any multiphase restores and transient suppressors.
>
> It might be also possible to introduce stateful signal lines which
> save and restore their state, then the receiving end could check what
> is the current level. However, if you consider that the devices may be
> restored in random order, if the IRQ line device happens to be
> restored later, the receiver would still get wrong information. Adding
> priorities could solve this, but I think stateless IRQs are the only
> sane way.

We shouldn't really use the term IRQ as it's confusing.  I like the term 
"pin" better because that describes what we're really talking about.

qemu_irq is designed oddly today because is represents something that is 
intrinsically state (whether a pin is high or low) with an edge 
notification with the assumption that the state is held somewhere else 
(which is usually true).

Modelling stateful pins is useful though for doing something like 
introspecting pin levels, supporting live migration, etc.

The way this works in QOM right now is that the Pin object has a level 
state that can be queried but it also has the ability to register for 
notifications on level change.

The edge change signal isn't registered until realize.  This means that 
you can connect all of the device models, restore all of the pin states, 
and then realize the device model all at once.  At the point of realize, 
all of the devices have the right pin levels so each device can add 
their edge change signals and read the incoming pins and respond 
accordingly.

Regards,

Anthony Liguori

>
Anthony Liguori - Sept. 3, 2011, 7:54 p.m.
On 08/31/2011 05:53 AM, Jan Kiszka wrote:
> On 2011-08-31 10:25, Peter Maydell wrote:
>> On 30 August 2011 20:28, Jan Kiszka<jan.kiszka@web.de>  wrote:
>>> Yes, that's the current state. Once we have bidirectional IRQ links in
>>> place (pushing downward, querying upward - required to skip IRQ routers
>>> for fast, lockless deliveries), that should change again.
>>
>> Can you elaborate a bit more on this? I don't think anybody has
>> proposed links with their own internal state before in the qdev/qom
>> discussions...
>
> That basic idea is to allow
>
> a) a discovery of the currently active IRQ path from source to sink
>     (that would be possible via QOM just using forward links)
>
> b) skip updating the states of IRQ routers in the common case, just
>     signaling directly the sink from the source (to allow in-kernel IRQ
>     delivery or to skip taking some device locks). Whenever some router
>     is queried for its current IRQ line state, it would have to ask the
>     preceding IRQ source for its state. So we need a backward link.

Can you provide some concrete use-cases of this?  I'm not convinced this 
is really all that important and it seems like tremendous amounts of 
ugliness would be needed to support it.

Regards,

Anthony Liguori

>
> We haven't thought about how this could be implemented in details yet
> though. Among other things, it heavily depends on the final QOM design.
>
> Jan
>
Anthony Liguori - Sept. 3, 2011, 8:07 p.m.
On 09/01/2011 12:58 AM, Avi Kivity wrote:
> On 08/31/2011 07:59 PM, Blue Swirl wrote:
>> >
>> > That makes it impossible to migrate level-triggered irq lines. Or at
>> least,
>> > the receiver has to remember the state, instead of (or in addition
>> to) the
>> > sender.
>>
>> Both ends probably need to remember the state. That should work
>> without any multiphase restores and transient suppressors.
>
> State should always correspond to real hardware state - a flip flop or
> capacitor. Input is not state, it is input.
>
>> It might be also possible to introduce stateful signal lines which
>> save and restore their state, then the receiving end could check what
>> is the current level. However, if you consider that the devices may be
>> restored in random order, if the IRQ line device happens to be
>> restored later, the receiver would still get wrong information. Adding
>> priorities could solve this, but I think stateless IRQs are the only
>> sane way.
>
> I agree that irqs should be stateless, since they don't have any memory
> associated.

In Real Life, you can tie a single bit multiple registers together with 
boolean logic to form an output pin.  This is essentially computed 
state.  If we wanted to model a stateless pin, we would need to do 
something like:

struct Serial {
    uint8_t thr;
    uint8_t lsr;
};

static bool serial_get_irq(Serial *s) {
    return (s->thr & THRE) | (s->lsr & LSRE);
}

static void serial_write(Serial *s, uint64_t addr, uint8_t value)
{
    switch (addr) {
    case THR:
       bool old_irq = serial_get_irq(s);
       s->thr = value;
       if (!old_irq && serial_get_irq(s)) {
           notify_edge_change(s);
       }
    ...
}

static void serial_init(Serial *s)
{
     register_pin(s, serial_get_irq);
}

Obviously, this is pretty sucky.  This is what we do today but we don't 
have a way to query irq value which is wrong.  You could fix that by 
adding the get function but that's not terribly fun.  A better way:

struct Serial {
     Pin irq;
     uint8_t thr;
     uint8_t lsr;
};

static void serial_update_irq(Serial *s)
{
    pin_set_level(&s->irq, (s->thr & THRE) | (s->lsr & LSRE));
}

static void serial_write(Serial *s) {
    switch (addr) {
    case THR:
       s->thr = value;
       serial_update_irq(s);
    ...
}

This results in much nicer code.  The edge handling can be done in 
generic code which will make things more robust overall.

Regards,

Anthony Liguori
Blue Swirl - Sept. 3, 2011, 9:01 p.m.
On Sat, Sep 3, 2011 at 7:53 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 08/31/2011 11:59 AM, Blue Swirl wrote:
>>
>> On Wed, Aug 31, 2011 at 8:28 AM, Avi Kivity<avi@redhat.com>  wrote:
>>>
>>> On 08/30/2011 10:19 PM, Blue Swirl wrote:
>>>>
>>>>>
>>>>>  We need some kind of two phase restore. In the first phase all state
>>>>> is
>>>>>  restored; since some of that state drivers outputs that are input to
>>>>> other
>>>>>  devices, they may experience an edge, and we need to supress that.  In
>>>>> the
>>>>>  second phase edge detection is unsupressed and the device goes live.
>>>>
>>>> No. Devices may not perform any externally visible activities (like
>>>> toggle a qemu_irq) during or after load because 1) qemu_irq is
>>>> stateless and 2) since the receiving end is also freshly loaded, both
>>>> states are already in synch without any calls or toggling.
>>>
>>> That makes it impossible to migrate level-triggered irq lines.  Or at
>>> least,
>>> the receiver has to remember the state, instead of (or in addition to)
>>> the
>>> sender.
>>
>> Both ends probably need to remember the state. That should work
>> without any multiphase restores and transient suppressors.
>>
>> It might be also possible to introduce stateful signal lines which
>> save and restore their state, then the receiving end could check what
>> is the current level. However, if you consider that the devices may be
>> restored in random order, if the IRQ line device happens to be
>> restored later, the receiver would still get wrong information. Adding
>> priorities could solve this, but I think stateless IRQs are the only
>> sane way.
>
> We shouldn't really use the term IRQ as it's confusing.  I like the term
> "pin" better because that describes what we're really talking about.
>
> qemu_irq is designed oddly today because is represents something that is
> intrinsically state (whether a pin is high or low) with an edge notification
> with the assumption that the state is held somewhere else (which is usually
> true).
>
> Modelling stateful pins is useful though for doing something like
> introspecting pin levels, supporting live migration, etc.
>
> The way this works in QOM right now is that the Pin object has a level state
> that can be queried but it also has the ability to register for
> notifications on level change.
>
> The edge change signal isn't registered until realize.  This means that you
> can connect all of the device models, restore all of the pin states, and
> then realize the device model all at once.  At the point of realize, all of
> the devices have the right pin levels so each device can add their edge
> change signals and read the incoming pins and respond accordingly.

Even if the devices read the input pins on restore, they shouldn't
make any changes to their output pins because that would propagate to
other devices. To handle this in non-chaotic way would need hacks to
each device, multiphase stuff, priorities or transient suppressors.
From the device point of view, restoring is not a state change and no
edges should be seen at that moment.
Blue Swirl - Sept. 3, 2011, 9:10 p.m.
On Sat, Sep 3, 2011 at 8:07 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 09/01/2011 12:58 AM, Avi Kivity wrote:
>>
>> On 08/31/2011 07:59 PM, Blue Swirl wrote:
>>>
>>> >
>>> > That makes it impossible to migrate level-triggered irq lines. Or at
>>> least,
>>> > the receiver has to remember the state, instead of (or in addition
>>> to) the
>>> > sender.
>>>
>>> Both ends probably need to remember the state. That should work
>>> without any multiphase restores and transient suppressors.
>>
>> State should always correspond to real hardware state - a flip flop or
>> capacitor. Input is not state, it is input.
>>
>>> It might be also possible to introduce stateful signal lines which
>>> save and restore their state, then the receiving end could check what
>>> is the current level. However, if you consider that the devices may be
>>> restored in random order, if the IRQ line device happens to be
>>> restored later, the receiver would still get wrong information. Adding
>>> priorities could solve this, but I think stateless IRQs are the only
>>> sane way.
>>
>> I agree that irqs should be stateless, since they don't have any memory
>> associated.
>
> In Real Life, you can tie a single bit multiple registers together with
> boolean logic to form an output pin.  This is essentially computed state.
>  If we wanted to model a stateless pin, we would need to do something like:
>
> struct Serial {
>   uint8_t thr;
>   uint8_t lsr;
> };
>
> static bool serial_get_irq(Serial *s) {
>   return (s->thr & THRE) | (s->lsr & LSRE);
> }
>
> static void serial_write(Serial *s, uint64_t addr, uint8_t value)
> {
>   switch (addr) {
>   case THR:
>      bool old_irq = serial_get_irq(s);
>      s->thr = value;
>      if (!old_irq && serial_get_irq(s)) {
>          notify_edge_change(s);
>      }
>   ...
> }
>
> static void serial_init(Serial *s)
> {
>    register_pin(s, serial_get_irq);
> }
>
> Obviously, this is pretty sucky.  This is what we do today but we don't have
> a way to query irq value which is wrong.  You could fix that by adding the
> get function but that's not terribly fun.  A better way:
>
> struct Serial {
>    Pin irq;
>    uint8_t thr;
>    uint8_t lsr;
> };
>
> static void serial_update_irq(Serial *s)
> {
>   pin_set_level(&s->irq, (s->thr & THRE) | (s->lsr & LSRE));
> }
>
> static void serial_write(Serial *s) {
>   switch (addr) {
>   case THR:
>      s->thr = value;
>      serial_update_irq(s);
>   ...
> }
>
> This results in much nicer code.  The edge handling can be done in generic
> code which will make things more robust overall.

I'm sorry but I don't see a huge difference, could you elaborate?

Maybe if the internal state of Pin is magically shared between the
endpoint devices (like typedef bool *Pin) and the devices somehow
still save Pin state as part of their own despite the duplication,
this could work. Restoring Pins first and then devices is a sort of
priority scheme.
Anthony Liguori - Sept. 3, 2011, 9:41 p.m.
On 09/03/2011 04:10 PM, Blue Swirl wrote:
> On Sat, Sep 3, 2011 at 8:07 PM, Anthony Liguori<anthony@codemonkey.ws>  wrote:
>> On 09/01/2011 12:58 AM, Avi Kivity wrote:
>>>
>>> On 08/31/2011 07:59 PM, Blue Swirl wrote:
>>>>
>>>>>
>>>>> That makes it impossible to migrate level-triggered irq lines. Or at
>>>> least,
>>>>> the receiver has to remember the state, instead of (or in addition
>>>> to) the
>>>>> sender.
>>>>
>>>> Both ends probably need to remember the state. That should work
>>>> without any multiphase restores and transient suppressors.
>>>
>>> State should always correspond to real hardware state - a flip flop or
>>> capacitor. Input is not state, it is input.
>>>
>>>> It might be also possible to introduce stateful signal lines which
>>>> save and restore their state, then the receiving end could check what
>>>> is the current level. However, if you consider that the devices may be
>>>> restored in random order, if the IRQ line device happens to be
>>>> restored later, the receiver would still get wrong information. Adding
>>>> priorities could solve this, but I think stateless IRQs are the only
>>>> sane way.
>>>
>>> I agree that irqs should be stateless, since they don't have any memory
>>> associated.
>>
>> In Real Life, you can tie a single bit multiple registers together with
>> boolean logic to form an output pin.  This is essentially computed state.
>>   If we wanted to model a stateless pin, we would need to do something like:
>>
>> struct Serial {
>>    uint8_t thr;
>>    uint8_t lsr;
>> };
>>
>> static bool serial_get_irq(Serial *s) {
>>    return (s->thr&  THRE) | (s->lsr&  LSRE);
>> }
>>
>> static void serial_write(Serial *s, uint64_t addr, uint8_t value)
>> {
>>    switch (addr) {
>>    case THR:
>>       bool old_irq = serial_get_irq(s);
>>       s->thr = value;
>>       if (!old_irq&&  serial_get_irq(s)) {
>>           notify_edge_change(s);
>>       }
>>    ...
>> }
>>
>> static void serial_init(Serial *s)
>> {
>>     register_pin(s, serial_get_irq);
>> }
>>
>> Obviously, this is pretty sucky.  This is what we do today but we don't have
>> a way to query irq value which is wrong.  You could fix that by adding the
>> get function but that's not terribly fun.  A better way:
>>
>> struct Serial {
>>     Pin irq;
>>     uint8_t thr;
>>     uint8_t lsr;
>> };
>>
>> static void serial_update_irq(Serial *s)
>> {
>>    pin_set_level(&s->irq, (s->thr&  THRE) | (s->lsr&  LSRE));
>> }
>>
>> static void serial_write(Serial *s) {
>>    switch (addr) {
>>    case THR:
>>       s->thr = value;
>>       serial_update_irq(s);
>>    ...
>> }
>>
>> This results in much nicer code.  The edge handling can be done in generic
>> code which will make things more robust overall.
>
> I'm sorry but I don't see a huge difference, could you elaborate?

The main difference is whether the Pin is capable of determine if there 
was a level change on its own.  It can only do this is if knows the 
current level which implies that its holding state.

>
> Maybe if the internal state of Pin is magically shared between the
> endpoint devices (like typedef bool *Pin) and the devices somehow
> still save Pin state as part of their own despite the duplication,

I'm somewhat confused by what you mean here.

If you have two devices that have a connection, one has an output pin 
and one has an input pin.  This would look like this:

struct Serial {
    Pin irq; // output pin
};

struct PIIX3 {
    Pin *in[16]; // input pins
};

As part of connecting devices, you'd basically do:

PIIX3 piix3;
Serial serial;

piix3.in[4] = &serial.irq;

serial.irq setting it pin level doesn't do anything to piix3.  piix3 has 
to explicitly read the pin state for its behavior to influence anything.

Here's the flow with taking migration into account:

1) PIIX3 maintains some type of state, performs action (A) whenever 
in[3] changes its state based on an edge change notifier.

2) During migration, PIIX3 has its state saved as does Serial.  Pin is 
part of Serial so it also has its state saved.

3) During restore, PIIX3 has its state restored, as does Serial, and 
Pin.  Action (A) is not invoked because notifiers are not fired when a 
device is not realized.  Restore happens before a device is realized.

So the scenario you're concerned about doesn't happen and it doesn't 
require anything funky.

> this could work. Restoring Pins first and then devices is a sort of
> priority scheme.

There is no priority.  But devices have an explicit realize event and in 
general, shouldn't react to other devices until realize happens.  You 
need this behavior to support construction properly.

Regards,

Anthony Liguori
Blue Swirl - Sept. 4, 2011, 9:27 a.m.
On Sat, Sep 3, 2011 at 9:41 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 09/03/2011 04:10 PM, Blue Swirl wrote:
>>
>> On Sat, Sep 3, 2011 at 8:07 PM, Anthony Liguori<anthony@codemonkey.ws>
>>  wrote:
>>>
>>> On 09/01/2011 12:58 AM, Avi Kivity wrote:
>>>>
>>>> On 08/31/2011 07:59 PM, Blue Swirl wrote:
>>>>>
>>>>>>
>>>>>> That makes it impossible to migrate level-triggered irq lines. Or at
>>>>>
>>>>> least,
>>>>>>
>>>>>> the receiver has to remember the state, instead of (or in addition
>>>>>
>>>>> to) the
>>>>>>
>>>>>> sender.
>>>>>
>>>>> Both ends probably need to remember the state. That should work
>>>>> without any multiphase restores and transient suppressors.
>>>>
>>>> State should always correspond to real hardware state - a flip flop or
>>>> capacitor. Input is not state, it is input.
>>>>
>>>>> It might be also possible to introduce stateful signal lines which
>>>>> save and restore their state, then the receiving end could check what
>>>>> is the current level. However, if you consider that the devices may be
>>>>> restored in random order, if the IRQ line device happens to be
>>>>> restored later, the receiver would still get wrong information. Adding
>>>>> priorities could solve this, but I think stateless IRQs are the only
>>>>> sane way.
>>>>
>>>> I agree that irqs should be stateless, since they don't have any memory
>>>> associated.
>>>
>>> In Real Life, you can tie a single bit multiple registers together with
>>> boolean logic to form an output pin.  This is essentially computed state.
>>>  If we wanted to model a stateless pin, we would need to do something
>>> like:
>>>
>>> struct Serial {
>>>   uint8_t thr;
>>>   uint8_t lsr;
>>> };
>>>
>>> static bool serial_get_irq(Serial *s) {
>>>   return (s->thr&  THRE) | (s->lsr&  LSRE);
>>> }
>>>
>>> static void serial_write(Serial *s, uint64_t addr, uint8_t value)
>>> {
>>>   switch (addr) {
>>>   case THR:
>>>      bool old_irq = serial_get_irq(s);
>>>      s->thr = value;
>>>      if (!old_irq&&  serial_get_irq(s)) {
>>>          notify_edge_change(s);
>>>      }
>>>   ...
>>> }
>>>
>>> static void serial_init(Serial *s)
>>> {
>>>    register_pin(s, serial_get_irq);
>>> }
>>>
>>> Obviously, this is pretty sucky.  This is what we do today but we don't
>>> have
>>> a way to query irq value which is wrong.  You could fix that by adding
>>> the
>>> get function but that's not terribly fun.  A better way:
>>>
>>> struct Serial {
>>>    Pin irq;
>>>    uint8_t thr;
>>>    uint8_t lsr;
>>> };
>>>
>>> static void serial_update_irq(Serial *s)
>>> {
>>>   pin_set_level(&s->irq, (s->thr&  THRE) | (s->lsr&  LSRE));
>>> }
>>>
>>> static void serial_write(Serial *s) {
>>>   switch (addr) {
>>>   case THR:
>>>      s->thr = value;
>>>      serial_update_irq(s);
>>>   ...
>>> }
>>>
>>> This results in much nicer code.  The edge handling can be done in
>>> generic
>>> code which will make things more robust overall.
>>
>> I'm sorry but I don't see a huge difference, could you elaborate?
>
> The main difference is whether the Pin is capable of determine if there was
> a level change on its own.  It can only do this is if knows the current
> level which implies that its holding state.
>
>>
>> Maybe if the internal state of Pin is magically shared between the
>> endpoint devices (like typedef bool *Pin) and the devices somehow
>> still save Pin state as part of their own despite the duplication,
>
> I'm somewhat confused by what you mean here.

Pretty similar to what you propose below except the magical sharing.

> If you have two devices that have a connection, one has an output pin and
> one has an input pin.  This would look like this:
>
> struct Serial {
>   Pin irq; // output pin
> };
>
> struct PIIX3 {
>   Pin *in[16]; // input pins
> };
>
> As part of connecting devices, you'd basically do:
>
> PIIX3 piix3;
> Serial serial;
>
> piix3.in[4] = &serial.irq;
>
> serial.irq setting it pin level doesn't do anything to piix3.  piix3 has to
> explicitly read the pin state for its behavior to influence anything.
>
> Here's the flow with taking migration into account:
>
> 1) PIIX3 maintains some type of state, performs action (A) whenever in[3]
> changes its state based on an edge change notifier.
>
> 2) During migration, PIIX3 has its state saved as does Serial.  Pin is part
> of Serial so it also has its state saved.
>
> 3) During restore, PIIX3 has its state restored, as does Serial, and Pin.
>  Action (A) is not invoked because notifiers are not fired when a device is
> not realized.  Restore happens before a device is realized.
>
> So the scenario you're concerned about doesn't happen and it doesn't require
> anything funky.

OK, this should work.

>> this could work. Restoring Pins first and then devices is a sort of
>> priority scheme.
>
> There is no priority.  But devices have an explicit realize event and in
> general, shouldn't react to other devices until realize happens.  You need
> this behavior to support construction properly.
>
> Regards,
>
> Anthony Liguori
>
Blue Swirl - Sept. 4, 2011, 10:33 a.m.
On Wed, Aug 31, 2011 at 7:44 PM, Blue Swirl <blauwirbel@gmail.com> wrote:
> On Wed, Aug 31, 2011 at 6:17 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2011-08-31 19:41, Blue Swirl wrote:
>>> On Wed, Aug 31, 2011 at 10:53 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>>> On 2011-08-31 10:25, Peter Maydell wrote:
>>>>> On 30 August 2011 20:28, Jan Kiszka <jan.kiszka@web.de> wrote:
>>>>>> Yes, that's the current state. Once we have bidirectional IRQ links in
>>>>>> place (pushing downward, querying upward - required to skip IRQ routers
>>>>>> for fast, lockless deliveries), that should change again.
>>>>>
>>>>> Can you elaborate a bit more on this? I don't think anybody has
>>>>> proposed links with their own internal state before in the qdev/qom
>>>>> discussions...
>>>>
>>>> That basic idea is to allow
>>>>
>>>> a) a discovery of the currently active IRQ path from source to sink
>>>>   (that would be possible via QOM just using forward links)
>>>
>>> Why, only for b)? This is not possible with real hardware.
>>>
>>>> b) skip updating the states of IRQ routers in the common case, just
>>>>   signaling directly the sink from the source (to allow in-kernel IRQ
>>>>   delivery or to skip taking some device locks). Whenever some router
>>>>   is queried for its current IRQ line state, it would have to ask the
>>>>   preceding IRQ source for its state. So we need a backward link.
>>>
>>> I think this would need pretty heavy changes everywhere. At board
>>> level the full path needs to be identified and special versions of
>>> IRQs installed along the way. The routers would need to use callbacks
>>> to inform other parties about routing changes.
>>
>> It already works in practice (based on a hack and minus IRQ router state
>> updates) for x86 PCI device pass-through. At least I don't want this
>> upstream but instead a generic solution. The ability to skip IRQ routers
>> also in pure user space device model scenarios is a useful by-product.
>>
>>>
>>>> We haven't thought about how this could be implemented in details yet
>>>> though. Among other things, it heavily depends on the final QOM design.
>>>
>>> Perhaps a global IRQ manager could help. It would keep track of the
>>> whole IRQ matrix, what are input (x axis) and output (y axis) states
>>> and what each matrix node (router state) looks like (or able to
>>> compute) if asked. I don't think backward links would be needed with
>>> this approach.
>>
>> Well, the backward links would then be moved to that global IRQ manager.
>> It's just moving the data management, but if it turns out to allow a
>> cleaner device design, I would surely not vote against it. But that
>> manager must support lazy updates as well because we cannot call it from
>> kernel space for each and every event.
>
> The global IRQ switch matrix would take over all routing for the
> devices in question. As an example, let's consider a PCI card
> (source), PCI host bridge, IO-APIC, LAPIC and CPU (final destination).
> The matrix would get input from the PCI card and output directly to
> LAPIC since the CPU does not use a qemu_irq yet. These leaf devices
> actually need no changes, just the qemu_irq is routed elsewhere. The
> matrix would also have access to intermediate devices but the qemu_irq
> inputs of those would not be updated in the lazy mode and the outputs
> need not go anywhere.
>
> So for a matrix where x0, x1...  is used for inputs and y0, y1... for
> outputs, x0 would be connected in the example to PCI card, x1 to PCI
> host bridge output and x2 to IO-APIC (ignoring multiple lines). Then
> y0 would be the calculated output from PCI host bridge, y1 IO-APIC and
> only y2 would be connected to LAPIC.
>
> If the intermediate IRQ state between the nodes is queried (for
> example at IO-APIC), the matrix must be able to tell what would be the
> true state at that point and then this value would be supplied to the
> device. So a method to query device IRQ input state from matrix on
> demand is needed. This could be as unspecific as "refresh all my
> inputs". A backward link may not be able to tell the state if
> information about next backward link is lost (logic OR of several
> signals) whereas the matrix should be able to do that even in that
> case.
>
> If routes in IO-APIC change, the matrix needs to be recalculated and
> states in the affected devices should be updated. This update cycle
> could be triggered via normal qemu_irq device input lines or direct
> callback (needs more changes). Maybe even this could be handled lazily
> so that no updates would be needed until the final output changes or
> the state of an intermediate device is queried.
>
> The global manager should handle saving and restoring states, the
> device states would not matter. If it pushed the true state to
> devices, we're back to the ordering problem.

I was thinking that the manager would be at board level, but actually
the wiring at the board level can be considered as a second switch
matrix (mostly constant). Then with these two matrices, all signal
routing for the whole machine could be managed. The board level would
use same facilities to register the routing elements as devices.

I think the matrix model is also compatible with Pins. Since this is
pretty much at hand waving level ATM, maybe it's better to wait for
those though my hands are itching. Maybe I'll make a small proof of
concept matrix to see if this can fly.
Jan Kiszka - Sept. 4, 2011, 12:13 p.m.
On 2011-09-03 21:54, Anthony Liguori wrote:
> On 08/31/2011 05:53 AM, Jan Kiszka wrote:
>> On 2011-08-31 10:25, Peter Maydell wrote:
>>> On 30 August 2011 20:28, Jan Kiszka<jan.kiszka@web.de>  wrote:
>>>> Yes, that's the current state. Once we have bidirectional IRQ links in
>>>> place (pushing downward, querying upward - required to skip IRQ routers
>>>> for fast, lockless deliveries), that should change again.
>>>
>>> Can you elaborate a bit more on this? I don't think anybody has
>>> proposed links with their own internal state before in the qdev/qom
>>> discussions...
>>
>> That basic idea is to allow
>>
>> a) a discovery of the currently active IRQ path from source to sink
>>     (that would be possible via QOM just using forward links)
>>
>> b) skip updating the states of IRQ routers in the common case, just
>>     signaling directly the sink from the source (to allow in-kernel IRQ
>>     delivery or to skip taking some device locks). Whenever some router
>>     is queried for its current IRQ line state, it would have to ask the
>>     preceding IRQ source for its state. So we need a backward link.
> 
> Can you provide some concrete use-cases of this?  I'm not convinced this 
> is really all that important and it seems like tremendous amounts of 
> ugliness would be needed to support it.

INTx support for device assignment, vhost, or any other future in-kernel
IRQ sources. And optimized user space IRQ delivery, particularly under
real-time constraints.

When designing those requirements into a new IRQ/pin management model
right from the beginning, that shouldn't be as ugly as you may think. At
least it will be less ugly and more correct than what we already have in
qemu-kvm today.

Jan
Avi Kivity - Sept. 4, 2011, 12:17 p.m.
On 08/31/2011 01:53 PM, Jan Kiszka wrote:
> On 2011-08-31 10:25, Peter Maydell wrote:
> >  On 30 August 2011 20:28, Jan Kiszka<jan.kiszka@web.de>  wrote:
> >>  Yes, that's the current state. Once we have bidirectional IRQ links in
> >>  place (pushing downward, querying upward - required to skip IRQ routers
> >>  for fast, lockless deliveries), that should change again.
> >
> >  Can you elaborate a bit more on this? I don't think anybody has
> >  proposed links with their own internal state before in the qdev/qom
> >  discussions...
>
> That basic idea is to allow
>
> a) a discovery of the currently active IRQ path from source to sink
>     (that would be possible via QOM just using forward links)
>
> b) skip updating the states of IRQ routers in the common case, just
>     signaling directly the sink from the source (to allow in-kernel IRQ
>     delivery or to skip taking some device locks). Whenever some router
>     is queried for its current IRQ line state, it would have to ask the
>     preceding IRQ source for its state. So we need a backward link.
>
> We haven't thought about how this could be implemented in details yet
> though. Among other things, it heavily depends on the final QOM design.
>

Looks like a similar path to the memory API.  A declarative description 
of the interrupt hierarchy allows routes to be precalculated and flattened.

(here it's strictly an optimization; with the memory API it's a 
requirement since kvm requires a flattened representation, and tcg is 
greatly simplified by it).
Gleb Natapov - Sept. 4, 2011, 12:25 p.m.
> On Wed, Aug 31, 2011 at 6:17 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2011-08-31 19:41, Blue Swirl wrote:
>>> On Wed, Aug 31, 2011 at 10:53 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>>> On 2011-08-31 10:25, Peter Maydell wrote:
>>>>> On 30 August 2011 20:28, Jan Kiszka <jan.kiszka@web.de> wrote:
>>>>>> Yes, that's the current state. Once we have bidirectional IRQ links in
>>>>>> place (pushing downward, querying upward - required to skip IRQ routers
>>>>>> for fast, lockless deliveries), that should change again.
>>>>>
>>>>> Can you elaborate a bit more on this? I don't think anybody has
>>>>> proposed links with their own internal state before in the qdev/qom
>>>>> discussions...
>>>>
>>>> That basic idea is to allow
>>>>
>>>> a) a discovery of the currently active IRQ path from source to sink
>>>>   (that would be possible via QOM just using forward links)
>>>
>>> Why, only for b)? This is not possible with real hardware.
>>>
>>>> b) skip updating the states of IRQ routers in the common case, just
>>>>   signaling directly the sink from the source (to allow in-kernel IRQ
>>>>   delivery or to skip taking some device locks). Whenever some router
>>>>   is queried for its current IRQ line state, it would have to ask the
>>>>   preceding IRQ source for its state. So we need a backward link.
>>>
>>> I think this would need pretty heavy changes everywhere. At board
>>> level the full path needs to be identified and special versions of
>>> IRQs installed along the way. The routers would need to use callbacks
>>> to inform other parties about routing changes.
>>
>> It already works in practice (based on a hack and minus IRQ router state
>> updates) for x86 PCI device pass-through. At least I don't want this
>> upstream but instead a generic solution. The ability to skip IRQ routers
>> also in pure user space device model scenarios is a useful by-product.
>>
>>>
>>>> We haven't thought about how this could be implemented in details yet
>>>> though. Among other things, it heavily depends on the final QOM design.
>>>
>>> Perhaps a global IRQ manager could help. It would keep track of the
>>> whole IRQ matrix, what are input (x axis) and output (y axis) states
>>> and what each matrix node (router state) looks like (or able to
>>> compute) if asked. I don't think backward links would be needed with
>>> this approach.
>>
>> Well, the backward links would then be moved to that global IRQ manager.
>> It's just moving the data management, but if it turns out to allow a
>> cleaner device design, I would surely not vote against it. But that
>> manager must support lazy updates as well because we cannot call it from
>> kernel space for each and every event.
>
> The global IRQ switch matrix would take over all routing for the
> devices in question. As an example, let's consider a PCI card
> (source), PCI host bridge, IO-APIC, LAPIC and CPU (final destination).
--------------------------------------^ LAPICs and CPUs

--
			Gleb.
Jan Kiszka - Sept. 4, 2011, 12:37 p.m.
On 2011-09-04 14:17, Avi Kivity wrote:
> On 08/31/2011 01:53 PM, Jan Kiszka wrote:
>> On 2011-08-31 10:25, Peter Maydell wrote:
>> >  On 30 August 2011 20:28, Jan Kiszka<jan.kiszka@web.de>  wrote:
>> >>  Yes, that's the current state. Once we have bidirectional IRQ
>> links in
>> >>  place (pushing downward, querying upward - required to skip IRQ
>> routers
>> >>  for fast, lockless deliveries), that should change again.
>> >
>> >  Can you elaborate a bit more on this? I don't think anybody has
>> >  proposed links with their own internal state before in the qdev/qom
>> >  discussions...
>>
>> That basic idea is to allow
>>
>> a) a discovery of the currently active IRQ path from source to sink
>>     (that would be possible via QOM just using forward links)
>>
>> b) skip updating the states of IRQ routers in the common case, just
>>     signaling directly the sink from the source (to allow in-kernel IRQ
>>     delivery or to skip taking some device locks). Whenever some router
>>     is queried for its current IRQ line state, it would have to ask the
>>     preceding IRQ source for its state. So we need a backward link.
>>
>> We haven't thought about how this could be implemented in details yet
>> though. Among other things, it heavily depends on the final QOM design.
>>
> 
> Looks like a similar path to the memory API.  A declarative description
> of the interrupt hierarchy allows routes to be precalculated and flattened.
> 
> (here it's strictly an optimization; with the memory API it's a
> requirement since kvm requires a flattened representation, and tcg is
> greatly simplified by it).

With current kvm device assignment it's mandatory as it only support
kernel/kernel IRQ delivery. Only vfio's eventfds will make it optional
(but still highly desirable).

Jan
Avi Kivity - Sept. 4, 2011, 12:43 p.m.
On 09/04/2011 03:37 PM, Jan Kiszka wrote:
> >
> >  (here it's strictly an optimization; with the memory API it's a
> >  requirement since kvm requires a flattened representation, and tcg is
> >  greatly simplified by it).
>
> With current kvm device assignment it's mandatory as it only support
> kernel/kernel IRQ delivery. Only vfio's eventfds will make it optional
> (but still highly desirable).
>

Right.

It's nice to have a single model for everything - understand once, 
confuse everywhere.
Anthony Liguori - Sept. 4, 2011, 1:32 p.m.
On 09/04/2011 07:13 AM, Jan Kiszka wrote:
> On 2011-09-03 21:54, Anthony Liguori wrote:
>> On 08/31/2011 05:53 AM, Jan Kiszka wrote:
>>> On 2011-08-31 10:25, Peter Maydell wrote:
>>>> On 30 August 2011 20:28, Jan Kiszka<jan.kiszka@web.de>   wrote:
>>>>> Yes, that's the current state. Once we have bidirectional IRQ links in
>>>>> place (pushing downward, querying upward - required to skip IRQ routers
>>>>> for fast, lockless deliveries), that should change again.
>>>>
>>>> Can you elaborate a bit more on this? I don't think anybody has
>>>> proposed links with their own internal state before in the qdev/qom
>>>> discussions...
>>>
>>> That basic idea is to allow
>>>
>>> a) a discovery of the currently active IRQ path from source to sink
>>>      (that would be possible via QOM just using forward links)
>>>
>>> b) skip updating the states of IRQ routers in the common case, just
>>>      signaling directly the sink from the source (to allow in-kernel IRQ
>>>      delivery or to skip taking some device locks). Whenever some router
>>>      is queried for its current IRQ line state, it would have to ask the
>>>      preceding IRQ source for its state. So we need a backward link.
>>
>> Can you provide some concrete use-cases of this?  I'm not convinced this
>> is really all that important and it seems like tremendous amounts of
>> ugliness would be needed to support it.
>
> INTx support for device assignment, vhost, or any other future in-kernel
> IRQ sources.

I prefer to not think of IRQs as special things.  They're just single 
bits of information that flow through the device model.  Having a higher 
level representation that understands something like paths seems wrong 
to me.

I'd prefer to treat things like device assignment as a hack.  You just 
need code that can walk the device tree to figure out the path (which is 
not generic code, it's very machine specific).  Then you tell the kernel 
the resolution of the path and are otherwise completely oblivious in 
userspace.

Regards,

Anthony Liguori
Anthony Liguori - Sept. 4, 2011, 1:35 p.m.
On 09/04/2011 07:17 AM, Avi Kivity wrote:
> On 08/31/2011 01:53 PM, Jan Kiszka wrote:
>> On 2011-08-31 10:25, Peter Maydell wrote:
>> > On 30 August 2011 20:28, Jan Kiszka<jan.kiszka@web.de> wrote:
>> >> Yes, that's the current state. Once we have bidirectional IRQ links in
>> >> place (pushing downward, querying upward - required to skip IRQ
>> routers
>> >> for fast, lockless deliveries), that should change again.
>> >
>> > Can you elaborate a bit more on this? I don't think anybody has
>> > proposed links with their own internal state before in the qdev/qom
>> > discussions...
>>
>> That basic idea is to allow
>>
>> a) a discovery of the currently active IRQ path from source to sink
>> (that would be possible via QOM just using forward links)
>>
>> b) skip updating the states of IRQ routers in the common case, just
>> signaling directly the sink from the source (to allow in-kernel IRQ
>> delivery or to skip taking some device locks). Whenever some router
>> is queried for its current IRQ line state, it would have to ask the
>> preceding IRQ source for its state. So we need a backward link.
>>
>> We haven't thought about how this could be implemented in details yet
>> though. Among other things, it heavily depends on the final QOM design.
>>
>
> Looks like a similar path to the memory API. A declarative description
> of the interrupt hierarchy allows routes to be precalculated and flattened.
>
> (here it's strictly an optimization; with the memory API it's a
> requirement since kvm requires a flattened representation, and tcg is
> greatly simplified by it).

Let's not be careful to over generalize here.  Memory access is a fairly 
complicated thing involving multiple different types of busses, various 
properties (endianness, alignment, access constraints).  IRQs are 
extremely simple.  They're just single bits of information.

Regards,

Anthony Liguori

>
Jan Kiszka - Sept. 4, 2011, 1:36 p.m.
On 2011-09-04 15:32, Anthony Liguori wrote:
> On 09/04/2011 07:13 AM, Jan Kiszka wrote:
>> On 2011-09-03 21:54, Anthony Liguori wrote:
>>> On 08/31/2011 05:53 AM, Jan Kiszka wrote:
>>>> On 2011-08-31 10:25, Peter Maydell wrote:
>>>>> On 30 August 2011 20:28, Jan Kiszka<jan.kiszka@web.de>   wrote:
>>>>>> Yes, that's the current state. Once we have bidirectional IRQ
>>>>>> links in
>>>>>> place (pushing downward, querying upward - required to skip IRQ
>>>>>> routers
>>>>>> for fast, lockless deliveries), that should change again.
>>>>>
>>>>> Can you elaborate a bit more on this? I don't think anybody has
>>>>> proposed links with their own internal state before in the qdev/qom
>>>>> discussions...
>>>>
>>>> That basic idea is to allow
>>>>
>>>> a) a discovery of the currently active IRQ path from source to sink
>>>>      (that would be possible via QOM just using forward links)
>>>>
>>>> b) skip updating the states of IRQ routers in the common case, just
>>>>      signaling directly the sink from the source (to allow in-kernel
>>>> IRQ
>>>>      delivery or to skip taking some device locks). Whenever some
>>>> router
>>>>      is queried for its current IRQ line state, it would have to ask
>>>> the
>>>>      preceding IRQ source for its state. So we need a backward link.
>>>
>>> Can you provide some concrete use-cases of this?  I'm not convinced this
>>> is really all that important and it seems like tremendous amounts of
>>> ugliness would be needed to support it.
>>
>> INTx support for device assignment, vhost, or any other future in-kernel
>> IRQ sources.
> 
> I prefer to not think of IRQs as special things.  They're just single
> bits of information that flow through the device model.  Having a higher
> level representation that understands something like paths seems wrong
> to me.
> 
> I'd prefer to treat things like device assignment as a hack.  You just
> need code that can walk the device tree to figure out the path (which is
> not generic code, it's very machine specific).  Then you tell the kernel
> the resolution of the path and are otherwise completely oblivious in
> userspace.

See it as you like, but we need the support, not only for device
assigment. And I do not see any gain it hacking this instead of
designing it.

Jan
Anthony Liguori - Sept. 4, 2011, 1:38 p.m.
On 09/04/2011 07:37 AM, Jan Kiszka wrote:
> On 2011-09-04 14:17, Avi Kivity wrote:
>> On 08/31/2011 01:53 PM, Jan Kiszka wrote:
>>> On 2011-08-31 10:25, Peter Maydell wrote:
>>>>   On 30 August 2011 20:28, Jan Kiszka<jan.kiszka@web.de>   wrote:
>>>>>   Yes, that's the current state. Once we have bidirectional IRQ
>>> links in
>>>>>   place (pushing downward, querying upward - required to skip IRQ
>>> routers
>>>>>   for fast, lockless deliveries), that should change again.
>>>>
>>>>   Can you elaborate a bit more on this? I don't think anybody has
>>>>   proposed links with their own internal state before in the qdev/qom
>>>>   discussions...
>>>
>>> That basic idea is to allow
>>>
>>> a) a discovery of the currently active IRQ path from source to sink
>>>      (that would be possible via QOM just using forward links)
>>>
>>> b) skip updating the states of IRQ routers in the common case, just
>>>      signaling directly the sink from the source (to allow in-kernel IRQ
>>>      delivery or to skip taking some device locks). Whenever some router
>>>      is queried for its current IRQ line state, it would have to ask the
>>>      preceding IRQ source for its state. So we need a backward link.
>>>
>>> We haven't thought about how this could be implemented in details yet
>>> though. Among other things, it heavily depends on the final QOM design.
>>>
>>
>> Looks like a similar path to the memory API.  A declarative description
>> of the interrupt hierarchy allows routes to be precalculated and flattened.
>>
>> (here it's strictly an optimization; with the memory API it's a
>> requirement since kvm requires a flattened representation, and tcg is
>> greatly simplified by it).
>
> With current kvm device assignment it's mandatory as it only support
> kernel/kernel IRQ delivery. Only vfio's eventfds will make it optional
> (but still highly desirable).

It's not mandatory.  All you need to be able to do is calculate the APIC 
IRQ for a given PCI device interrupt.  That doesn't mean we need to be 
able to do arbitrary interrupt resolution in generic code.

There is potentially tremendous complexity here because you'll have to 
bake all interrupt rerouting logic into a declarative API and/or call 
into generic code to update routing tables.  Given the fact that we 
can't even generically refer to a device reliably today, this is would 
be a daunting task.

We're making this all more complicated than it needs to be.

Regards,

Anthony Liguori

> Jan
>
Anthony Liguori - Sept. 4, 2011, 1:41 p.m.
On 09/04/2011 08:36 AM, Jan Kiszka wrote:
> On 2011-09-04 15:32, Anthony Liguori wrote:
>> I prefer to not think of IRQs as special things.  They're just single
>> bits of information that flow through the device model.  Having a higher
>> level representation that understands something like paths seems wrong
>> to me.
>>
>> I'd prefer to treat things like device assignment as a hack.  You just
>> need code that can walk the device tree to figure out the path (which is
>> not generic code, it's very machine specific).  Then you tell the kernel
>> the resolution of the path and are otherwise completely oblivious in
>> userspace.
>
> See it as you like, but we need the support, not only for device
> assigment. And I do not see any gain it hacking this instead of
> designing it.

You can design a hack but it's still a hack.

Device state belongs in devices.  Trying to extract device state 
(interrupt routing paths) and externalizing it is by definition a hack.

Having some sort of global interrupt routing table is just going to add 
a layer of complexity for very little obvious gain.

The PCI bus *is* the I/O APIC for all intents and purposes.  Being able 
to ask the i440fx for the interrupt corresponding to a device is a very 
simple task that involves no generic code.  That's more than enough to 
support device assignment.

Why overcomplicate things when the problem can be solved robustly with 
most likely a 10 line function?

Regards,

Anthony Liguori

>
> Jan
>
Jan Kiszka - Sept. 4, 2011, 1:42 p.m.
On 2011-09-04 15:38, Anthony Liguori wrote:
> On 09/04/2011 07:37 AM, Jan Kiszka wrote:
>> On 2011-09-04 14:17, Avi Kivity wrote:
>>> On 08/31/2011 01:53 PM, Jan Kiszka wrote:
>>>> On 2011-08-31 10:25, Peter Maydell wrote:
>>>>>   On 30 August 2011 20:28, Jan Kiszka<jan.kiszka@web.de>   wrote:
>>>>>>   Yes, that's the current state. Once we have bidirectional IRQ
>>>> links in
>>>>>>   place (pushing downward, querying upward - required to skip IRQ
>>>> routers
>>>>>>   for fast, lockless deliveries), that should change again.
>>>>>
>>>>>   Can you elaborate a bit more on this? I don't think anybody has
>>>>>   proposed links with their own internal state before in the qdev/qom
>>>>>   discussions...
>>>>
>>>> That basic idea is to allow
>>>>
>>>> a) a discovery of the currently active IRQ path from source to sink
>>>>      (that would be possible via QOM just using forward links)
>>>>
>>>> b) skip updating the states of IRQ routers in the common case, just
>>>>      signaling directly the sink from the source (to allow in-kernel
>>>> IRQ
>>>>      delivery or to skip taking some device locks). Whenever some
>>>> router
>>>>      is queried for its current IRQ line state, it would have to ask
>>>> the
>>>>      preceding IRQ source for its state. So we need a backward link.
>>>>
>>>> We haven't thought about how this could be implemented in details yet
>>>> though. Among other things, it heavily depends on the final QOM design.
>>>>
>>>
>>> Looks like a similar path to the memory API.  A declarative description
>>> of the interrupt hierarchy allows routes to be precalculated and
>>> flattened.
>>>
>>> (here it's strictly an optimization; with the memory API it's a
>>> requirement since kvm requires a flattened representation, and tcg is
>>> greatly simplified by it).
>>
>> With current kvm device assignment it's mandatory as it only support
>> kernel/kernel IRQ delivery. Only vfio's eventfds will make it optional
>> (but still highly desirable).
> 
> It's not mandatory.  All you need to be able to do is calculate the APIC
> IRQ for a given PCI device interrupt.

...and establish notifies for changes along this line. And allow to
update intermediate states on access.

>  That doesn't mean we need to be
> able to do arbitrary interrupt resolution in generic code.

We will likely have to solve the same problem on none x86 as well.

> 
> There is potentially tremendous complexity here because you'll have to
> bake all interrupt rerouting logic into a declarative API and/or call
> into generic code to update routing tables.  Given the fact that we
> can't even generically refer to a device reliably today, this is would
> be a daunting task.
> 
> We're making this all more complicated than it needs to be.

We can't discuss the problem away, sorry.

Jan
Jan Kiszka - Sept. 4, 2011, 1:49 p.m.
On 2011-09-04 15:41, Anthony Liguori wrote:
> On 09/04/2011 08:36 AM, Jan Kiszka wrote:
>> On 2011-09-04 15:32, Anthony Liguori wrote:
>>> I prefer to not think of IRQs as special things.  They're just single
>>> bits of information that flow through the device model.  Having a higher
>>> level representation that understands something like paths seems wrong
>>> to me.
>>>
>>> I'd prefer to treat things like device assignment as a hack.  You just
>>> need code that can walk the device tree to figure out the path (which is
>>> not generic code, it's very machine specific).  Then you tell the kernel
>>> the resolution of the path and are otherwise completely oblivious in
>>> userspace.
>>
>> See it as you like, but we need the support, not only for device
>> assigment. And I do not see any gain it hacking this instead of
>> designing it.
> 
> You can design a hack but it's still a hack.
> 
> Device state belongs in devices.  Trying to extract device state
> (interrupt routing paths) and externalizing it is by definition a hack.
> 
> Having some sort of global interrupt routing table is just going to add
> a layer of complexity for very little obvious gain.

It's not yet decided how the problem is solved. A global interrupt
matrix is just one proposal, another option is to extend the pin model
in a way that supports routing change notifiers and backward polling.

> 
> The PCI bus *is* the I/O APIC for all intents and purposes.  Being able
> to ask the i440fx for the interrupt corresponding to a device is a very
> simple task that involves no generic code.  That's more than enough to
> support device assignment.
> 
> Why overcomplicate things when the problem can be solved robustly with
> most likely a 10 line function?

Fine, then INTx is solved. And we will hate us for hacking things for
this single purpose when we later on want to support, say, MSI routing
though an emulated interrupt remapper. Or routing of the Q35 chipset.

It is not only INTx for device assignment, and it is not only x86.

Jan
Anthony Liguori - Sept. 4, 2011, 1:55 p.m.
On 09/04/2011 08:42 AM, Jan Kiszka wrote:
> On 2011-09-04 15:38, Anthony Liguori wrote:
>>> With current kvm device assignment it's mandatory as it only support
>>> kernel/kernel IRQ delivery. Only vfio's eventfds will make it optional
>>> (but still highly desirable).
>>
>> It's not mandatory.  All you need to be able to do is calculate the APIC
>> IRQ for a given PCI device interrupt.
>
> ...and establish notifies for changes along this line. And allow to
> update intermediate states on access.

Again, this is all in a single layer.

>
>>   That doesn't mean we need to be
>> able to do arbitrary interrupt resolution in generic code.
>
> We will likely have to solve the same problem on none x86 as well.

But we're already seeing the that device assignment problem is very 
different on other platforms.

Given multiple well known architectures, yes, it makes sense to write 
generic code to handle both.

But we shouldn't just assume that just because we have one use-case that 
it generalizes to arbitrarily complex use-cases.

An IRQ routing table sounds good in theory but in practice it's just not 
that simple.  Image a simple device with an output pin that is high when 
there is data to read.

It looks and smells like an IRQ but there's nothing to say that it's 
ever going to be routed to the CPU as a level interrupt.  It may be 
handled entirely within another device for a certain platform.

But in another platform, it may actually be routed as a level triggered 
interrupt to the CPU.  In fact, as we do more pin level modelling, we'll 
run into a lot of scenarios like this.

Over generalizing is just going to box us into this narrow view of 
anything that's a single bit of information being directly routable to a 
CPU.

>> There is potentially tremendous complexity here because you'll have to
>> bake all interrupt rerouting logic into a declarative API and/or call
>> into generic code to update routing tables.  Given the fact that we
>> can't even generically refer to a device reliably today, this is would
>> be a daunting task.
>>
>> We're making this all more complicated than it needs to be.
>
> We can't discuss the problem away, sorry.

"We may need at some point in the future" is not a sufficient 
justification either.

If we want to talk about concrete scenarios, fine.  I'm all for doing 
what we need to do to support things.  But unless there's an absolutely 
compelling need to introduce a global interrupt routing table, I think 
we're much better off handling it as special cases.

Regards,

Anthony Liguori

> Jan
>
Anthony Liguori - Sept. 4, 2011, 1:57 p.m.
On 09/04/2011 08:49 AM, Jan Kiszka wrote:
> On 2011-09-04 15:41, Anthony Liguori wrote:
>> On 09/04/2011 08:36 AM, Jan Kiszka wrote:
>>> On 2011-09-04 15:32, Anthony Liguori wrote:
>>>> I prefer to not think of IRQs as special things.  They're just single
>>>> bits of information that flow through the device model.  Having a higher
>>>> level representation that understands something like paths seems wrong
>>>> to me.
>>>>
>>>> I'd prefer to treat things like device assignment as a hack.  You just
>>>> need code that can walk the device tree to figure out the path (which is
>>>> not generic code, it's very machine specific).  Then you tell the kernel
>>>> the resolution of the path and are otherwise completely oblivious in
>>>> userspace.
>>>
>>> See it as you like, but we need the support, not only for device
>>> assigment. And I do not see any gain it hacking this instead of
>>> designing it.
>>
>> You can design a hack but it's still a hack.
>>
>> Device state belongs in devices.  Trying to extract device state
>> (interrupt routing paths) and externalizing it is by definition a hack.
>>
>> Having some sort of global interrupt routing table is just going to add
>> a layer of complexity for very little obvious gain.
>
> It's not yet decided how the problem is solved. A global interrupt
> matrix is just one proposal, another option is to extend the pin model
> in a way that supports routing change notifiers and backward polling.

If that's all you need, then you really just want notification on socket 
changes.  Backwards polling can be achieved by just adding state to the 
Pin (which I full heartedly support).

If that's all you're proposing, than I'm entirely happy with it :-)

I'm happy with that because all of the route detection logic can live 
outside of the devices which at least contains the complexity.

Regards,

Anthony Liguori
Avi Kivity - Sept. 4, 2011, 2:12 p.m.
On 09/04/2011 04:41 PM, Anthony Liguori wrote:
>> See it as you like, but we need the support, not only for device
>> assigment. And I do not see any gain it hacking this instead of
>> designing it.
>
>
> You can design a hack but it's still a hack.
>
> Device state belongs in devices.  Trying to extract device state 
> (interrupt routing paths) and externalizing it is by definition a hack.

Pet peeve - saying something is "by definition" a hack is just rhetoric 
unless the definition of device state is "something that cannot be 
extracted and externalized".  Let's avoid this.

In fact it's exactly what we do with the memory API.  Memory routing is 
part of device state, yet we expose it to the memory API and let it do 
its thing instead of going through the hierarchy on every single memory 
access.

Whether it's worthwhile to do this for interrupts as well depends on how 
common the situation is and what we gain from it.  We can only do this 
if the routing is semi-static (depends only on other state but not the 
actual interrupt) - probably the only exception to this is the "least 
priority" mode which means the last leg cannot be computed statically.

I agree the gain is low if we limit ourselves to 440fx, but if we add 
interrupt remapping it becomes considerably more complicated to do this 
backward path computation.
Anthony Liguori - Sept. 4, 2011, 2:37 p.m.
On 09/04/2011 08:57 AM, Anthony Liguori wrote:
> On 09/04/2011 08:49 AM, Jan Kiszka wrote:
>> On 2011-09-04 15:41, Anthony Liguori wrote:
>>> On 09/04/2011 08:36 AM, Jan Kiszka wrote:
>>> Having some sort of global interrupt routing table is just going to add
>>> a layer of complexity for very little obvious gain.
>>
>> It's not yet decided how the problem is solved. A global interrupt
>> matrix is just one proposal, another option is to extend the pin model
>> in a way that supports routing change notifiers and backward polling.
>
> If that's all you need, then you really just want notification on socket
> changes. Backwards polling can be achieved by just adding state to the
> Pin (which I full heartedly support).
>
> If that's all you're proposing, than I'm entirely happy with it :-)

It's not that simple.

Routing paths can change because of device changes, not just socket changes.

I think you would need an interface for irq routing.  Something like:

struct IrqRouter {
     Interface parent;

     void (*foreach_output)(IrqRouter *obj,
                            void (*fn)(const char *out, void *opaque),
                            void *opaque);

     void (*foreach_input)(IrqRouter *obj,
                           void (*fn)(const char *in, void *opaque),
                           void *opaque);

     const char *(*get_mapping)(IrqRouter *obj, const char *in);
};

You could then implement this interface in I440FX or any other 
controller where we want to be able to support device passthrough.

Representing endpoints as strings means that you can correlate inputs to 
outputs throughout the chain provided that you understand how 
inputs/outputs relate to plugs/sockets.

I think this has the property of letting you write reasonably generic 
code to discover routing while only having to add complexity to the bare 
minimum set of devices to enable device passthrough.

It's basically what I suggested for PCI INTx mapping but a little more 
generic as an interface so that it can extend to other types of devices.

Regards,

Anthony Liguori

> I'm happy with that because all of the route detection logic can live
> outside of the devices which at least contains the complexity.
>
> Regards,
>
> Anthony Liguori
Anthony Liguori - Sept. 4, 2011, 2:43 p.m.
On 09/04/2011 09:12 AM, Avi Kivity wrote:
> On 09/04/2011 04:41 PM, Anthony Liguori wrote:
>>> See it as you like, but we need the support, not only for device
>>> assigment. And I do not see any gain it hacking this instead of
>>> designing it.
>>
>>
>> You can design a hack but it's still a hack.
>>
>> Device state belongs in devices. Trying to extract device state
>> (interrupt routing paths) and externalizing it is by definition a hack.
>
> Pet peeve - saying something is "by definition" a hack is just rhetoric
> unless the definition of device state is "something that cannot be
> extracted and externalized". Let's avoid this.

Likewise, I would prefer to avoid stating that something is a hard 
requirement in the absence of concrete use-cases.

I do think the definition of device state is more or less something that 
is opaque and owned by the device.  The more we externalize the device 
state, the more we have to deal with compatibility (even if it's just 
internal compatibility).  This makes modularity hard because there's too 
many things with complex dependencies.

> In fact it's exactly what we do with the memory API. Memory routing is
> part of device state, yet we expose it to the memory API and let it do
> its thing instead of going through the hierarchy on every single memory
> access.

Yes, and the memory API is complicated and invasive :-)  But it's worth 
it at the end of the day (although I think it could be simplified at the 
expensive of not allowing as much flattening).

> Whether it's worthwhile to do this for interrupts as well depends on how
> common the situation is and what we gain from it. We can only do this if
> the routing is semi-static (depends only on other state but not the
> actual interrupt) - probably the only exception to this is the "least
> priority" mode which means the last leg cannot be computed statically.
>
> I agree the gain is low if we limit ourselves to 440fx, but if we add
> interrupt remapping it becomes considerably more complicated to do this
> backward path computation.

If we start with a simple interface that works for the i440fx and then 
expand it as we support more layers, I think we'll be okay.

What I'm concerned about is an attempt to globally track IRQ routing.  I 
can imagine constructing a board where you have two simple devices that 
have level triggered interrupts and wanting to tie them to a single 
input pin.  A simple OR gate would be sufficient to do this.  Having to 
make an OR gate an IRQ router seems like far too much complexity to me.

I think we need to strive to keep simple things simple.

Regards,

Anthony Liguori
Anthony Liguori - Sept. 4, 2011, 2:49 p.m.
On 09/03/2011 04:01 PM, Blue Swirl wrote:
> On Sat, Sep 3, 2011 at 7:53 PM, Anthony Liguori<anthony@codemonkey.ws>  wrote:
>> On 08/31/2011 11:59 AM, Blue Swirl wrote:
>>>
>>> On Wed, Aug 31, 2011 at 8:28 AM, Avi Kivity<avi@redhat.com>    wrote:
>>>>
>>>> On 08/30/2011 10:19 PM, Blue Swirl wrote:
>>>>>
>>>>>>
>>>>>>   We need some kind of two phase restore. In the first phase all state
>>>>>> is
>>>>>>   restored; since some of that state drivers outputs that are input to
>>>>>> other
>>>>>>   devices, they may experience an edge, and we need to supress that.  In
>>>>>> the
>>>>>>   second phase edge detection is unsupressed and the device goes live.
>>>>>
>>>>> No. Devices may not perform any externally visible activities (like
>>>>> toggle a qemu_irq) during or after load because 1) qemu_irq is
>>>>> stateless and 2) since the receiving end is also freshly loaded, both
>>>>> states are already in synch without any calls or toggling.
>>>>
>>>> That makes it impossible to migrate level-triggered irq lines.  Or at
>>>> least,
>>>> the receiver has to remember the state, instead of (or in addition to)
>>>> the
>>>> sender.
>>>
>>> Both ends probably need to remember the state. That should work
>>> without any multiphase restores and transient suppressors.
>>>
>>> It might be also possible to introduce stateful signal lines which
>>> save and restore their state, then the receiving end could check what
>>> is the current level. However, if you consider that the devices may be
>>> restored in random order, if the IRQ line device happens to be
>>> restored later, the receiver would still get wrong information. Adding
>>> priorities could solve this, but I think stateless IRQs are the only
>>> sane way.
>>
>> We shouldn't really use the term IRQ as it's confusing.  I like the term
>> "pin" better because that describes what we're really talking about.
>>
>> qemu_irq is designed oddly today because is represents something that is
>> intrinsically state (whether a pin is high or low) with an edge notification
>> with the assumption that the state is held somewhere else (which is usually
>> true).
>>
>> Modelling stateful pins is useful though for doing something like
>> introspecting pin levels, supporting live migration, etc.
>>
>> The way this works in QOM right now is that the Pin object has a level state
>> that can be queried but it also has the ability to register for
>> notifications on level change.
>>
>> The edge change signal isn't registered until realize.  This means that you
>> can connect all of the device models, restore all of the pin states, and
>> then realize the device model all at once.  At the point of realize, all of
>> the devices have the right pin levels so each device can add their edge
>> change signals and read the incoming pins and respond accordingly.
>
> Even if the devices read the input pins on restore, they shouldn't
> make any changes to their output pins because that would propagate to
> other devices. To handle this in non-chaotic way would need hacks to
> each device, multiphase stuff, priorities or transient suppressors.
>  From the device point of view, restoring is not a state change and no
> edges should be seen at that moment.

A device wouldn't get a signal about an irq edge at realize.  Restoring 
an arbitrarily complex device model wouldn't result in any irq edge 
notifications because all of the devices are created and their state is 
set before any device is realized.  Since an edge event only occurs when 
the state changes of an IRQ after realize, no edge events will happen.

Whether a device reads the level of an IRQ during realize though depends 
on the device.  I don't think there's any strong reason to but I also 
don't think it's fundamentally wrong for a device to read an IRQ level 
at realize.

Regards,

Anthony Liguori

>
Avi Kivity - Sept. 4, 2011, 3:03 p.m.
On 09/04/2011 05:43 PM, Anthony Liguori wrote:
>> Pet peeve - saying something is "by definition" a hack is just rhetoric
>> unless the definition of device state is "something that cannot be
>> extracted and externalized". Let's avoid this.
>
>
> Likewise, I would prefer to avoid stating that something is a hard 
> requirement in the absence of concrete use-cases.

Agree.  But let's not fight rhetoric with rhetoric.

I also agree that if it were just the 440fx, then we could have a 
private channel between device assignment and the ioapic.  But if there 
are more use cases, it calls for a generic solution.

> I do think the definition of device state is more or less something 
> that is opaque and owned by the device.  The more we externalize the 
> device state, the more we have to deal with compatibility (even if 
> it's just internal compatibility).  This makes modularity hard because 
> there's too many things with complex dependencies.

That depends on the number of special cases we'll see.  I really have no 
insight here.

>
>> In fact it's exactly what we do with the memory API. Memory routing is
>> part of device state, yet we expose it to the memory API and let it do
>> its thing instead of going through the hierarchy on every single memory
>> access.
>
> Yes, and the memory API is complicated and invasive :-)  But it's 
> worth it at the end of the day (although I think it could be 
> simplified at the expensive of not allowing as much flattening).

(we should have spent a few hours at kf2011 to convince you that this is 
impossible)

>
> What I'm concerned about is an attempt to globally track IRQ routing.  
> I can imagine constructing a board where you have two simple devices 
> that have level triggered interrupts and wanting to tie them to a 
> single input pin.  A simple OR gate would be sufficient to do this.  
> Having to make an OR gate an IRQ router seems like far too much 
> complexity to me.

Depends on the API.  If Pin has a method pin_add_tristate_input(), then 
it becomes trivial.

> I think we need to strive to keep simple things simple.

Too late.
Anthony Liguori - Sept. 4, 2011, 3:19 p.m.
On 09/04/2011 10:03 AM, Avi Kivity wrote:
> On 09/04/2011 05:43 PM, Anthony Liguori wrote:
>>> In fact it's exactly what we do with the memory API. Memory routing is
>>> part of device state, yet we expose it to the memory API and let it do
>>> its thing instead of going through the hierarchy on every single memory
>>> access.
>>
>> Yes, and the memory API is complicated and invasive :-) But it's worth
>> it at the end of the day (although I think it could be simplified at
>> the expensive of not allowing as much flattening).
>
> (we should have spent a few hours at kf2011 to convince you that this is
> impossible)

I don't mean for RAM, but for device I/O.

Instead of implementing it_shift in the core API, you could implement 
it_shift by having a device that takes an input MemoryRegion and an 
output MemoryRegion and implements the it_shift logic.

I think endianness could also be handled this way too.

For stuff like coalesced I/O and eventfds, I understand the difficulties.

>
>>
>> What I'm concerned about is an attempt to globally track IRQ routing.
>> I can imagine constructing a board where you have two simple devices
>> that have level triggered interrupts and wanting to tie them to a
>> single input pin. A simple OR gate would be sufficient to do this.
>> Having to make an OR gate an IRQ router seems like far too much
>> complexity to me.
>
> Depends on the API. If Pin has a method pin_add_tristate_input(), then
> it becomes trivial.

See my self-reply to Jan.  I think there's a reasonable middle ground 
using an IrqRouter interface.

Regards,

Anthony Liguori
Blue Swirl - Sept. 4, 2011, 3:20 p.m.
On Sun, Sep 4, 2011 at 2:37 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 09/04/2011 08:57 AM, Anthony Liguori wrote:
>>
>> On 09/04/2011 08:49 AM, Jan Kiszka wrote:
>>>
>>> On 2011-09-04 15:41, Anthony Liguori wrote:
>>>>
>>>> On 09/04/2011 08:36 AM, Jan Kiszka wrote:
>>>> Having some sort of global interrupt routing table is just going to add
>>>> a layer of complexity for very little obvious gain.
>>>
>>> It's not yet decided how the problem is solved. A global interrupt
>>> matrix is just one proposal, another option is to extend the pin model
>>> in a way that supports routing change notifiers and backward polling.
>>
>> If that's all you need, then you really just want notification on socket
>> changes. Backwards polling can be achieved by just adding state to the
>> Pin (which I full heartedly support).
>>
>> If that's all you're proposing, than I'm entirely happy with it :-)
>
> It's not that simple.
>
> Routing paths can change because of device changes, not just socket changes.

Yes, that's why callbacks are needed to let the device inform global matrix.

> I think you would need an interface for irq routing.  Something like:
>
> struct IrqRouter {
>    Interface parent;
>
>    void (*foreach_output)(IrqRouter *obj,
>                           void (*fn)(const char *out, void *opaque),
>                           void *opaque);
>
>    void (*foreach_input)(IrqRouter *obj,
>                          void (*fn)(const char *in, void *opaque),
>                          void *opaque);

Are the above a way for a parent to tell this device that its inputs
are changed?

>    const char *(*get_mapping)(IrqRouter *obj, const char *in);

This would be useful for the matrix too, the matrix would use this to
query for initial routing, maybe also for later changes if the change
callback didn't tell the mapping directly.

> };
>
> You could then implement this interface in I440FX or any other controller
> where we want to be able to support device passthrough.
>
> Representing endpoints as strings means that you can correlate inputs to
> outputs throughout the chain provided that you understand how inputs/outputs
> relate to plugs/sockets.

The string format assumes that there is a reliable way to convert Pin
to string and vice versa. Why can't they be array of pointers to Pins?

String representation (e.g. "/i440fx/pci-bridge@2,0/serial@3fc:2")
could be useful for debugging and 'info qtree' ('info irqtree'?)
though.

> I think this has the property of letting you write reasonably generic code
> to discover routing while only having to add complexity to the bare minimum
> set of devices to enable device passthrough.
>
> It's basically what I suggested for PCI INTx mapping but a little more
> generic as an interface so that it can extend to other types of devices.
>
> Regards,
>
> Anthony Liguori
>
>> I'm happy with that because all of the route detection logic can live
>> outside of the devices which at least contains the complexity.
>>
>> Regards,
>>
>> Anthony Liguori
>
>
Blue Swirl - Sept. 4, 2011, 3:27 p.m.
On Sun, Sep 4, 2011 at 2:43 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 09/04/2011 09:12 AM, Avi Kivity wrote:
>>
>> On 09/04/2011 04:41 PM, Anthony Liguori wrote:
>>>>
>>>> See it as you like, but we need the support, not only for device
>>>> assigment. And I do not see any gain it hacking this instead of
>>>> designing it.
>>>
>>>
>>> You can design a hack but it's still a hack.
>>>
>>> Device state belongs in devices. Trying to extract device state
>>> (interrupt routing paths) and externalizing it is by definition a hack.
>>
>> Pet peeve - saying something is "by definition" a hack is just rhetoric
>> unless the definition of device state is "something that cannot be
>> extracted and externalized". Let's avoid this.
>
> Likewise, I would prefer to avoid stating that something is a hard
> requirement in the absence of concrete use-cases.
>
> I do think the definition of device state is more or less something that is
> opaque and owned by the device.  The more we externalize the device state,
> the more we have to deal with compatibility (even if it's just internal
> compatibility).  This makes modularity hard because there's too many things
> with complex dependencies.
>
>> In fact it's exactly what we do with the memory API. Memory routing is
>> part of device state, yet we expose it to the memory API and let it do
>> its thing instead of going through the hierarchy on every single memory
>> access.
>
> Yes, and the memory API is complicated and invasive :-)  But it's worth it
> at the end of the day (although I think it could be simplified at the
> expensive of not allowing as much flattening).
>
>> Whether it's worthwhile to do this for interrupts as well depends on how
>> common the situation is and what we gain from it. We can only do this if
>> the routing is semi-static (depends only on other state but not the
>> actual interrupt) - probably the only exception to this is the "least
>> priority" mode which means the last leg cannot be computed statically.
>>
>> I agree the gain is low if we limit ourselves to 440fx, but if we add
>> interrupt remapping it becomes considerably more complicated to do this
>> backward path computation.
>
> If we start with a simple interface that works for the i440fx and then
> expand it as we support more layers, I think we'll be okay.
>
> What I'm concerned about is an attempt to globally track IRQ routing.  I can
> imagine constructing a board where you have two simple devices that have
> level triggered interrupts and wanting to tie them to a single input pin.  A
> simple OR gate would be sufficient to do this.  Having to make an OR gate an
> IRQ router seems like far too much complexity to me.

For the simple OR gate this is a downside of course, but I don't see a
way to avoid it, same with the memory API or other APIs that need to
cater for wide variety of needs.

> I think we need to strive to keep simple things simple.

If the APIs need only minor changes, the changes to the devices would
not be so big. At this point I think your router API would only need
the change callback to support the matrix and Pin should be
compatible.

> Regards,
>
> Anthony Liguori
>
>
Anthony Liguori - Sept. 4, 2011, 3:31 p.m.
On 09/04/2011 10:20 AM, Blue Swirl wrote:
> On Sun, Sep 4, 2011 at 2:37 PM, Anthony Liguori<anthony@codemonkey.ws>  wrote:
>> On 09/04/2011 08:57 AM, Anthony Liguori wrote:
>>>
>>> On 09/04/2011 08:49 AM, Jan Kiszka wrote:
>>>>
>>>> On 2011-09-04 15:41, Anthony Liguori wrote:
>>>>>
>>>>> On 09/04/2011 08:36 AM, Jan Kiszka wrote:
>>>>> Having some sort of global interrupt routing table is just going to add
>>>>> a layer of complexity for very little obvious gain.
>>>>
>>>> It's not yet decided how the problem is solved. A global interrupt
>>>> matrix is just one proposal, another option is to extend the pin model
>>>> in a way that supports routing change notifiers and backward polling.
>>>
>>> If that's all you need, then you really just want notification on socket
>>> changes. Backwards polling can be achieved by just adding state to the
>>> Pin (which I full heartedly support).
>>>
>>> If that's all you're proposing, than I'm entirely happy with it :-)
>>
>> It's not that simple.
>>
>> Routing paths can change because of device changes, not just socket changes.
>
> Yes, that's why callbacks are needed to let the device inform global matrix.
>
>> I think you would need an interface for irq routing.  Something like:
>>
>> struct IrqRouter {
>>     Interface parent;
>>
>>     void (*foreach_output)(IrqRouter *obj,
>>                            void (*fn)(const char *out, void *opaque),
>>                            void *opaque);
>>
>>     void (*foreach_input)(IrqRouter *obj,
>>                           void (*fn)(const char *in, void *opaque),
>>                           void *opaque);
>
> Are the above a way for a parent to tell this device that its inputs
> are changed?

No... there really isn't a notion of parent/children.

If something is interested in the routing path, it needs to listen for 
mapping changes at every IrqRouter point.  I neglected to add an 
interface for registering for mapping changes so there would also need 
to be a notification registration interface here.

>
>>     const char *(*get_mapping)(IrqRouter *obj, const char *in);
>
> This would be useful for the matrix too, the matrix would use this to
> query for initial routing, maybe also for later changes if the change
> callback didn't tell the mapping directly.

I think the main difference is, do you try to solve the problem 
everywhere, or do you just allow for certain paths to be introspected. 
And then who is reasonable for maintaining the full path.

>
>> };
>>
>> You could then implement this interface in I440FX or any other controller
>> where we want to be able to support device passthrough.
>>
>> Representing endpoints as strings means that you can correlate inputs to
>> outputs throughout the chain provided that you understand how inputs/outputs
>> relate to plugs/sockets.
>
> The string format assumes that there is a reliable way to convert Pin
> to string and vice versa. Why can't they be array of pointers to Pins?

Yes, all Pins are devices and all devices have unique names.  You could 
just as easily use Pin *s.

It's not clear to me whether you want to make the strings device names 
or just property names though.

>
> String representation (e.g. "/i440fx/pci-bridge@2,0/serial@3fc:2")
> could be useful for debugging and 'info qtree' ('info irqtree'?)
> though.

IIUC, then this would be (in QOM)

i440fx::piix3::serial[0]::irq

This is the unique name the device would get.  That's because the PIIX3 
and Serial devices are created via composition.  You could indirectly 
reference it multiple ways:

/i440fx/slot[2]/serial[0]/irq
/i440fx/piix3/serial[0]/irq

Regards,

Anthony Liguori
Avi Kivity - Sept. 4, 2011, 3:34 p.m.
On 09/04/2011 06:19 PM, Anthony Liguori wrote:
>>> Yes, and the memory API is complicated and invasive :-) But it's worth
>>> it at the end of the day (although I think it could be simplified at
>>> the expensive of not allowing as much flattening).
>>
>> (we should have spent a few hours at kf2011 to convince you that this is
>> impossible)
>
>
> I don't mean for RAM, but for device I/O.

It's impossible to make the distinction.  A piece of RAM can overlay an 
mmio region and split it in two, or an mmio region can split a RAM 
region.  This means the machinery cannot consider the region type anyway.

>
> Instead of implementing it_shift in the core API, you could implement 
> it_shift by having a device that takes an input MemoryRegion and an 
> output MemoryRegion and implements the it_shift logic.

We could, but that imposes a burden on users to find that device and 
glue it to the memory API.  I'm not after a mean and lean API, I'm after 
something that is easy enough to use that people manage to get device 
models that work.

>
> I think endianness could also be handled this way too.

On some archs, endianness can find itself in RAM, so we need complete 
control.  And again, I don't want users gluing stuff together the wrong 
way, I want them using a simple interface.
Blue Swirl - Sept. 4, 2011, 3:44 p.m.
On Sun, Sep 4, 2011 at 3:31 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 09/04/2011 10:20 AM, Blue Swirl wrote:
>>
>> On Sun, Sep 4, 2011 at 2:37 PM, Anthony Liguori<anthony@codemonkey.ws>
>>  wrote:
>>>
>>> On 09/04/2011 08:57 AM, Anthony Liguori wrote:
>>>>
>>>> On 09/04/2011 08:49 AM, Jan Kiszka wrote:
>>>>>
>>>>> On 2011-09-04 15:41, Anthony Liguori wrote:
>>>>>>
>>>>>> On 09/04/2011 08:36 AM, Jan Kiszka wrote:
>>>>>> Having some sort of global interrupt routing table is just going to
>>>>>> add
>>>>>> a layer of complexity for very little obvious gain.
>>>>>
>>>>> It's not yet decided how the problem is solved. A global interrupt
>>>>> matrix is just one proposal, another option is to extend the pin model
>>>>> in a way that supports routing change notifiers and backward polling.
>>>>
>>>> If that's all you need, then you really just want notification on socket
>>>> changes. Backwards polling can be achieved by just adding state to the
>>>> Pin (which I full heartedly support).
>>>>
>>>> If that's all you're proposing, than I'm entirely happy with it :-)
>>>
>>> It's not that simple.
>>>
>>> Routing paths can change because of device changes, not just socket
>>> changes.
>>
>> Yes, that's why callbacks are needed to let the device inform global
>> matrix.
>>
>>> I think you would need an interface for irq routing.  Something like:
>>>
>>> struct IrqRouter {
>>>    Interface parent;
>>>
>>>    void (*foreach_output)(IrqRouter *obj,
>>>                           void (*fn)(const char *out, void *opaque),
>>>                           void *opaque);
>>>
>>>    void (*foreach_input)(IrqRouter *obj,
>>>                          void (*fn)(const char *in, void *opaque),
>>>                          void *opaque);
>>
>> Are the above a way for a parent to tell this device that its inputs
>> are changed?
>
> No... there really isn't a notion of parent/children.
>
> If something is interested in the routing path, it needs to listen for
> mapping changes at every IrqRouter point.  I neglected to add an interface
> for registering for mapping changes so there would also need to be a
> notification registration interface here.

With that change I think we're 100% in agreement.

>>
>>>    const char *(*get_mapping)(IrqRouter *obj, const char *in);
>>
>> This would be useful for the matrix too, the matrix would use this to
>> query for initial routing, maybe also for later changes if the change
>> callback didn't tell the mapping directly.
>
> I think the main difference is, do you try to solve the problem everywhere,
> or do you just allow for certain paths to be introspected. And then who is
> reasonable for maintaining the full path.

The matrix should take full responsibility for the devices (and the
connections of those devices) that are under its control. Other
devices can continue to do as before, they can be converted to matrix
if needed.

The benefit from the matrix would be performance and that would come
from lazy update of the devices in addition to the fast direct path
from source to final destination. If all the intermediate devices are
probed by the OS (if there are no vectored interrupts etc. to
determine the IRQ from PC), the lazy updates will not help and benefit
will be lower.

>>
>>> };
>>>
>>> You could then implement this interface in I440FX or any other controller
>>> where we want to be able to support device passthrough.
>>>
>>> Representing endpoints as strings means that you can correlate inputs to
>>> outputs throughout the chain provided that you understand how
>>> inputs/outputs
>>> relate to plugs/sockets.
>>
>> The string format assumes that there is a reliable way to convert Pin
>> to string and vice versa. Why can't they be array of pointers to Pins?
>
> Yes, all Pins are devices and all devices have unique names.  You could just
> as easily use Pin *s.
>
> It's not clear to me whether you want to make the strings device names or
> just property names though.

I think either would work if the string->Pin conversion works.

>>
>> String representation (e.g. "/i440fx/pci-bridge@2,0/serial@3fc:2")
>> could be useful for debugging and 'info qtree' ('info irqtree'?)
>> though.
>
> IIUC, then this would be (in QOM)
>
> i440fx::piix3::serial[0]::irq
>
> This is the unique name the device would get.  That's because the PIIX3 and
> Serial devices are created via composition.  You could indirectly reference
> it multiple ways:
>
> /i440fx/slot[2]/serial[0]/irq
> /i440fx/piix3/serial[0]/irq
>
> Regards,
>
> Anthony Liguori
>
Edgar Iglesias - Sept. 5, 2011, 8:38 a.m.
On Sat, Sep 03, 2011 at 02:53:31PM -0500, Anthony Liguori wrote:
> On 08/31/2011 11:59 AM, Blue Swirl wrote:
> > On Wed, Aug 31, 2011 at 8:28 AM, Avi Kivity<avi@redhat.com>  wrote:
> >> On 08/30/2011 10:19 PM, Blue Swirl wrote:
> >>>
> >>>>
> >>>>   We need some kind of two phase restore. In the first phase all state is
> >>>>   restored; since some of that state drivers outputs that are input to
> >>>> other
> >>>>   devices, they may experience an edge, and we need to supress that.  In
> >>>> the
> >>>>   second phase edge detection is unsupressed and the device goes live.
> >>>
> >>> No. Devices may not perform any externally visible activities (like
> >>> toggle a qemu_irq) during or after load because 1) qemu_irq is
> >>> stateless and 2) since the receiving end is also freshly loaded, both
> >>> states are already in synch without any calls or toggling.
> >>
> >> That makes it impossible to migrate level-triggered irq lines.  Or at least,
> >> the receiver has to remember the state, instead of (or in addition to) the
> >> sender.
> >
> > Both ends probably need to remember the state. That should work
> > without any multiphase restores and transient suppressors.
> >
> > It might be also possible to introduce stateful signal lines which
> > save and restore their state, then the receiving end could check what
> > is the current level. However, if you consider that the devices may be
> > restored in random order, if the IRQ line device happens to be
> > restored later, the receiver would still get wrong information. Adding
> > priorities could solve this, but I think stateless IRQs are the only
> > sane way.
> 
> We shouldn't really use the term IRQ as it's confusing.  I like the term 
> "pin" better because that describes what we're really talking about.
> 
> qemu_irq is designed oddly today because is represents something that is 
> intrinsically state (whether a pin is high or low) with an edge 
> notification with the assumption that the state is held somewhere else 
> (which is usually true).

I don't agree. That's not what qemu_irq represents.
It represents a wire, a mechanism to drive changes through logic paths
between state. It is intrinsically stateless.

It may be the case that it is missused in some places, or that it isn't
always the best thing to use to represent what ever you need to represent,
so that you want to complement with other mechanisms.
But universally replacing it with a stateful alternative seems wrong to me.

Cheers
Avi Kivity - Sept. 5, 2011, 8:51 a.m.
On 09/05/2011 11:38 AM, Edgar E. Iglesias wrote:
> >
> >  We shouldn't really use the term IRQ as it's confusing.  I like the term
> >  "pin" better because that describes what we're really talking about.
> >
> >  qemu_irq is designed oddly today because is represents something that is
> >  intrinsically state (whether a pin is high or low) with an edge
> >  notification with the assumption that the state is held somewhere else
> >  (which is usually true).
>
> I don't agree. That's not what qemu_irq represents.
> It represents a wire, a mechanism to drive changes through logic paths
> between state. It is intrinsically stateless.
>
> It may be the case that it is missused in some places, or that it isn't
> always the best thing to use to represent what ever you need to represent,
> so that you want to complement with other mechanisms.
> But universally replacing it with a stateful alternative seems wrong to me.
>

I agree that qemu_irq is inherently stateless.  But I do think there 
should be a way for the sink to query the line level.  Whether it is 
implemented as a cache of the last qemu_set() level, or with callbacks 
that query the underlying state is not important, but we can't just rely 
on edge triggers.

(real hardware can query a line at any time, yes?)
Peter Maydell - Sept. 5, 2011, 9:02 a.m.
On 5 September 2011 09:51, Avi Kivity <avi@redhat.com> wrote:
> On 09/05/2011 11:38 AM, Edgar E. Iglesias wrote:
>> I don't agree. That's not what qemu_irq represents.
>> It represents a wire, a mechanism to drive changes through logic paths
>> between state. It is intrinsically stateless.
>>
>> It may be the case that it is missused in some places, or that it isn't
>> always the best thing to use to represent what ever you need to represent,
>> so that you want to complement with other mechanisms.
>> But universally replacing it with a stateful alternative seems wrong to
>> me.

> I agree that qemu_irq is inherently stateless.  But I do think there should
> be a way for the sink to query the line level.  Whether it is implemented as
> a cache of the last qemu_set() level, or with callbacks that query the
> underlying state is not important, but we can't just rely on edge triggers.
>
> (real hardware can query a line at any time, yes?)

There are often significant constraints (eg "can only query at a clock
edge", or "not guaranteed to be valid until some other signal has been
high for at least this time"), which often means hardware will latch
the input value internally. Obviously QEMU doesn't have to care about
things to that level of detail. The reason we don't care is that we're
really restricting ourselves to providing a programmer-visible approximation
to the device behaviour. So I think the right question to ask about qemu_irq
semantics is not "does this act like hardware?" but "what is the right
interface between two components to let us conveniently implement the
behaviour we need?" This probably means that sometimes we want a line
with state-querying and sometimes not.

Ideally our device/object model ought to (a) let us model either option
easily (b) let us easily upgrade a "no state querying" line to a "state
querying" line by having the latter's interface be a superset of the
former's.

[IMHO the most important reason not to call it a qemu_irq is that
'irq' is a description of what the signal is used for, not of its
behaviour. 'gpio' is a bit more neutral.]

-- PMM
Avi Kivity - Sept. 5, 2011, 9:14 a.m.
On 09/05/2011 12:02 PM, Peter Maydell wrote:
> >  I agree that qemu_irq is inherently stateless.  But I do think there should
> >  be a way for the sink to query the line level.  Whether it is implemented as
> >  a cache of the last qemu_set() level, or with callbacks that query the
> >  underlying state is not important, but we can't just rely on edge triggers.
> >
> >  (real hardware can query a line at any time, yes?)
>
> There are often significant constraints (eg "can only query at a clock
> edge", or "not guaranteed to be valid until some other signal has been
> high for at least this time"), which often means hardware will latch
> the input value internally. Obviously QEMU doesn't have to care about
> things to that level of detail. The reason we don't care is that we're
> really restricting ourselves to providing a programmer-visible approximation
> to the device behaviour. So I think the right question to ask about qemu_irq
> semantics is not "does this act like hardware?" but "what is the right
> interface between two components to let us conveniently implement the
> behaviour we need?"

Agree.

> This probably means that sometimes we want a line
> with state-querying and sometimes not.

Right, and either can be implemented in terms of the other.  The 
question is which requires less boilerplate.

>
> Ideally our device/object model ought to (a) let us model either option
> easily (b) let us easily upgrade a "no state querying" line to a "state
> querying" line by having the latter's interface be a superset of the
> former's.
>
> [IMHO the most important reason not to call it a qemu_irq is that
> 'irq' is a description of what the signal is used for, not of its
> behaviour. 'gpio' is a bit more neutral.]

Or 'line'.
Edgar Iglesias - Sept. 5, 2011, 9:22 a.m.
On Mon, Sep 05, 2011 at 11:51:01AM +0300, Avi Kivity wrote:
> On 09/05/2011 11:38 AM, Edgar E. Iglesias wrote:
> > >
> > >  We shouldn't really use the term IRQ as it's confusing.  I like the term
> > >  "pin" better because that describes what we're really talking about.
> > >
> > >  qemu_irq is designed oddly today because is represents something that is
> > >  intrinsically state (whether a pin is high or low) with an edge
> > >  notification with the assumption that the state is held somewhere else
> > >  (which is usually true).
> >
> > I don't agree. That's not what qemu_irq represents.
> > It represents a wire, a mechanism to drive changes through logic paths
> > between state. It is intrinsically stateless.
> >
> > It may be the case that it is missused in some places, or that it isn't
> > always the best thing to use to represent what ever you need to represent,
> > so that you want to complement with other mechanisms.
> > But universally replacing it with a stateful alternative seems wrong to me.
> >
> 
> I agree that qemu_irq is inherently stateless.  But I do think there 
> should be a way for the sink to query the line level.  Whether it is 
> implemented as a cache of the last qemu_set() level, or with callbacks 
> that query the underlying state is not important, but we can't just rely 
> on edge triggers.

I think it is important. Because you need to at least be able to mark the
places were there is state. The cache at the sink sounds more right to me.
An IRQ line, can at the same time have multiple states through its
path between source device and the final sink. Every device that needs to
model state should implement/mark it or maybe connect the the generic caching
sink version and devices that manipulate the level but dont have state,
shouldn't.

> (real hardware can query a line at any time, yes?)

IMO, the "query" is just an upside-down way of thinking of it.

What happens is, you change some state, and the state drives changes through
a logic path towards new state that picks up the updated value etc etc.
Quering is like going (for bus accesses): OK, here's my VGA framebuffer
address, lets do a get back through the bus and see what the CPU wants to
write to this location?

According to the query view, you would add state to the memory regions so
that an MMIO device only gets a xxx_access call, and does a query back to
the core to figure out what's going on. Possible? I'm sure it is. Correct?
who knows. But IMO, a very upside-down way of thinking of it.

Cheers
Avi Kivity - Sept. 5, 2011, 9:28 a.m.
On 09/05/2011 12:22 PM, Edgar E. Iglesias wrote:
> >  (real hardware can query a line at any time, yes?)
>
> IMO, the "query" is just an upside-down way of thinking of it.
>
> What happens is, you change some state, and the state drives changes through
> a logic path towards new state that picks up the updated value etc etc.
> Quering is like going (for bus accesses): OK, here's my VGA framebuffer
> address, lets do a get back through the bus and see what the CPU wants to
> write to this location?
>
> According to the query view, you would add state to the memory regions so
> that an MMIO device only gets a xxx_access call, and does a query back to
> the core to figure out what's going on. Possible? I'm sure it is. Correct?
> who knows.

There's no difference really.  Consider reading the 'data' parameter a 
query.

With qemu_irq, there is a difference, because the line level is valid at 
all times, not just when the edge takes place (for memory, the data 
lines are only valid during the transaction).

> But IMO, a very upside-down way of thinking of it.
>

Query is needed when a line is masked internally, or when a device is 
hot-plugged.

We can work around masking by caching the level in the device even 
though the line is masked, and querying the cache when the line is 
unmasked, but that is artificial, and requires live-migrating the cache 
(which isn't true state).  We can't work around the hotplug case.
Edgar Iglesias - Sept. 5, 2011, 10:44 a.m.
On Sun, Sep 04, 2011 at 08:57:31AM -0500, Anthony Liguori wrote:
> On 09/04/2011 08:49 AM, Jan Kiszka wrote:
> > On 2011-09-04 15:41, Anthony Liguori wrote:
> >> On 09/04/2011 08:36 AM, Jan Kiszka wrote:
> >>> On 2011-09-04 15:32, Anthony Liguori wrote:
> >>>> I prefer to not think of IRQs as special things.  They're just single
> >>>> bits of information that flow through the device model.  Having a higher
> >>>> level representation that understands something like paths seems wrong
> >>>> to me.
> >>>>
> >>>> I'd prefer to treat things like device assignment as a hack.  You just
> >>>> need code that can walk the device tree to figure out the path (which is
> >>>> not generic code, it's very machine specific).  Then you tell the kernel
> >>>> the resolution of the path and are otherwise completely oblivious in
> >>>> userspace.
> >>>
> >>> See it as you like, but we need the support, not only for device
> >>> assigment. And I do not see any gain it hacking this instead of
> >>> designing it.
> >>
> >> You can design a hack but it's still a hack.
> >>
> >> Device state belongs in devices.  Trying to extract device state
> >> (interrupt routing paths) and externalizing it is by definition a hack.
> >>
> >> Having some sort of global interrupt routing table is just going to add
> >> a layer of complexity for very little obvious gain.
> >
> > It's not yet decided how the problem is solved. A global interrupt
> > matrix is just one proposal, another option is to extend the pin model
> > in a way that supports routing change notifiers and backward polling.
> 
> If that's all you need, then you really just want notification on socket 
> changes.  Backwards polling can be achieved by just adding state to the 
> Pin (which I full heartedly support).

I'm not a fan of this backwards thing, but seeing the options available,
it might be the simplest way forward.

I agree that the global interrupt manager sounds like overkill...

Cheers
Edgar Iglesias - Sept. 5, 2011, 10:47 a.m.
On Mon, Sep 05, 2011 at 12:28:50PM +0300, Avi Kivity wrote:

... 
> Query is needed when a line is masked internally, or when a device is 
> hot-plugged.
> 
> We can work around masking by caching the level in the device even 
> though the line is masked, and querying the cache when the line is 
> unmasked, but that is artificial, and requires live-migrating the cache 
> (which isn't true state).  We can't work around the hotplug case.

Yup, hotplug messes everything up :) At least if you want to plug into
arbitrary places in the tree.

Cheers
Blue Swirl - Sept. 5, 2011, 7:36 p.m.
On Mon, Sep 5, 2011 at 8:38 AM, Edgar E. Iglesias
<edgar.iglesias@gmail.com> wrote:
> On Sat, Sep 03, 2011 at 02:53:31PM -0500, Anthony Liguori wrote:
>> On 08/31/2011 11:59 AM, Blue Swirl wrote:
>> > On Wed, Aug 31, 2011 at 8:28 AM, Avi Kivity<avi@redhat.com>  wrote:
>> >> On 08/30/2011 10:19 PM, Blue Swirl wrote:
>> >>>
>> >>>>
>> >>>>   We need some kind of two phase restore. In the first phase all state is
>> >>>>   restored; since some of that state drivers outputs that are input to
>> >>>> other
>> >>>>   devices, they may experience an edge, and we need to supress that.  In
>> >>>> the
>> >>>>   second phase edge detection is unsupressed and the device goes live.
>> >>>
>> >>> No. Devices may not perform any externally visible activities (like
>> >>> toggle a qemu_irq) during or after load because 1) qemu_irq is
>> >>> stateless and 2) since the receiving end is also freshly loaded, both
>> >>> states are already in synch without any calls or toggling.
>> >>
>> >> That makes it impossible to migrate level-triggered irq lines.  Or at least,
>> >> the receiver has to remember the state, instead of (or in addition to) the
>> >> sender.
>> >
>> > Both ends probably need to remember the state. That should work
>> > without any multiphase restores and transient suppressors.
>> >
>> > It might be also possible to introduce stateful signal lines which
>> > save and restore their state, then the receiving end could check what
>> > is the current level. However, if you consider that the devices may be
>> > restored in random order, if the IRQ line device happens to be
>> > restored later, the receiver would still get wrong information. Adding
>> > priorities could solve this, but I think stateless IRQs are the only
>> > sane way.
>>
>> We shouldn't really use the term IRQ as it's confusing.  I like the term
>> "pin" better because that describes what we're really talking about.
>>
>> qemu_irq is designed oddly today because is represents something that is
>> intrinsically state (whether a pin is high or low) with an edge
>> notification with the assumption that the state is held somewhere else
>> (which is usually true).
>
> I don't agree. That's not what qemu_irq represents.
> It represents a wire, a mechanism to drive changes through logic paths
> between state. It is intrinsically stateless.
>
> It may be the case that it is missused in some places, or that it isn't
> always the best thing to use to represent what ever you need to represent,
> so that you want to complement with other mechanisms.
> But universally replacing it with a stateful alternative seems wrong to me.

Maybe there could be a stateless version of Pin for optimization:
Line? This would probably save one bool worth of memory and one memory
store access for each IRQ event.
Avi Kivity - Sept. 6, 2011, 7:46 a.m.
On 09/05/2011 10:36 PM, Blue Swirl wrote:
> >  I don't agree. That's not what qemu_irq represents.
> >  It represents a wire, a mechanism to drive changes through logic paths
> >  between state. It is intrinsically stateless.
> >
> >  It may be the case that it is missused in some places, or that it isn't
> >  always the best thing to use to represent what ever you need to represent,
> >  so that you want to complement with other mechanisms.
> >  But universally replacing it with a stateful alternative seems wrong to me.
>
> Maybe there could be a stateless version of Pin for optimization:
> Line? This would probably save one bool worth of memory and one memory
> store access for each IRQ event.

Let's actually measure the effect of the bool/store before we optimize 
it away.

Patch

diff --git a/hw/apic.c b/hw/apic.c
index d8f56c8..22ad635 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -104,6 +104,7 @@  struct APICState {
     QEMUTimer *timer;
     int sipi_vector;
     int wait_for_sipi;
+    int pic_level;
 };
 
 static APICState *local_apics[MAX_APICS + 1];
@@ -186,6 +187,7 @@  void apic_deliver_pic_intr(DeviceState *d, int level)
 {
     APICState *s = DO_UPCAST(APICState, busdev.qdev, d);
 
+    s->pic_level = level;
     if (level) {
         apic_local_deliver(s, APIC_LVT_LINT0);
     } else {
@@ -397,7 +399,7 @@  static void apic_update_irq(APICState *s)
     if (!(s->spurious_vec & APIC_SV_ENABLE)) {
         return;
     }
-    if (apic_irq_pending(s) > 0) {
+    if (apic_irq_pending(s) > 0 || s->pic_level) {
         cpu_interrupt(s->cpu_env, CPU_INTERRUPT_HARD);
     }
 }
diff --git a/hw/i8259.c b/hw/i8259.c
index c0b96ab..cc6f76b 100644
--- a/hw/i8259.c
+++ b/hw/i8259.c
@@ -144,8 +144,7 @@  static int pic_get_irq(PicState *s)
 
 /* raise irq to CPU if necessary. must be called every time the active
    irq may change */
-/* XXX: should not export it, but it is needed for an APIC kludge */
-void pic_update_irq(PicState2 *s)
+static void pic_update_irq(PicState2 *s)
 {
     int irq2, irq;
 
@@ -172,14 +171,9 @@  void pic_update_irq(PicState2 *s)
         printf("pic: cpu_interrupt\n");
 #endif
         qemu_irq_raise(s->parent_irq);
-    }
-
-/* all targets should do this rather than acking the IRQ in the cpu */
-#if defined(TARGET_MIPS) || defined(TARGET_PPC) || defined(TARGET_ALPHA)
-    else {
+    } else {
         qemu_irq_lower(s->parent_irq);
     }
-#endif
 }
 
 #ifdef DEBUG_IRQ_LATENCY
diff --git a/hw/pc.c b/hw/pc.c
index 263fb1a..b7b5d6f 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -156,9 +156,6 @@  int cpu_get_pic_interrupt(CPUState *env)
 
     intno = apic_get_interrupt(env->apic_state);
     if (intno >= 0) {
-        /* set irq request if a PIC irq is still pending */
-        /* XXX: improve that */
-        pic_update_irq(isa_pic);
         return intno;
     }
     /* read the irq from the PIC */
diff --git a/hw/pc.h b/hw/pc.h
index dae736e..f5ff4c0 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -65,7 +65,6 @@  void pic_set_irq(int irq, int level);
 void pic_set_irq_new(void *opaque, int irq, int level);
 qemu_irq *i8259_init(qemu_irq parent_irq);
 int pic_read_irq(PicState2 *s);
-void pic_update_irq(PicState2 *s);
 uint32_t pic_intack_read(PicState2 *s);
 void pic_info(Monitor *mon);
 void irq_info(Monitor *mon);