Patchwork [v4,12/16] e1000: apply fine lock on e1000

login
register
mail settings
Submitter pingfank@linux.vnet.ibm.com
Date Oct. 22, 2012, 9:23 a.m.
Message ID <1350897839-29593-13-git-send-email-pingfank@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/193196/
State New
Headers show

Comments

pingfank@linux.vnet.ibm.com - Oct. 22, 2012, 9:23 a.m.
Use a local lock to protect e1000. When calling a system function, the
fine-grained lock is dropped before acquiring the big lock. This can
leave the device state inconsistent, which needs extra effort to fix.

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 hw/e1000.c |   24 +++++++++++++++++++++++-
 1 files changed, 23 insertions(+), 1 deletions(-)
Jan Kiszka - Oct. 23, 2012, 9:04 a.m.
On 2012-10-22 11:23, Liu Ping Fan wrote:
> Use local lock to protect e1000. When calling the system function,
> dropping the fine lock before acquiring the big lock. This will
> introduce broken device state, which need extra effort to fix.
> 
> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> ---
>  hw/e1000.c |   24 +++++++++++++++++++++++-
>  1 files changed, 23 insertions(+), 1 deletions(-)
> 
> diff --git a/hw/e1000.c b/hw/e1000.c
> index ae8a6c5..5eddab5 100644
> --- a/hw/e1000.c
> +++ b/hw/e1000.c
> @@ -85,6 +85,7 @@ typedef struct E1000State_st {
>      NICConf conf;
>      MemoryRegion mmio;
>      MemoryRegion io;
> +    QemuMutex e1000_lock;
>  
>      uint32_t mac_reg[0x8000];
>      uint16_t phy_reg[0x20];
> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = {
>  static void
>  set_interrupt_cause(E1000State *s, int index, uint32_t val)
>  {
> +    QemuThread *t;
> +
>      if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
>          /* Only for 8257x */
>          val |= E1000_ICR_INT_ASSERTED;
>      }
>      s->mac_reg[ICR] = val;
>      s->mac_reg[ICS] = val;
> -    qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
> +
> +    t = pthread_getspecific(qemu_thread_key);
> +    if (t->context_type == 1) {
> +        qemu_mutex_unlock(&s->e1000_lock);
> +        qemu_mutex_lock_iothread();
> +    }
> +    if (DEVICE(s)->state < DEV_STATE_STOPPING) {
> +        qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
> +    }
> +    if (t->context_type == 1) {
> +        qemu_mutex_unlock_iothread();
> +        qemu_mutex_lock(&s->e1000_lock);
> +    }

This is ugly for many reasons. First of all, it is racy as the register
content may change while dropping the device lock, no? Then you would
raise or clear an IRQ spuriously.

Second, it clearly shows that we need to address lock-less IRQ delivery.
Almost nothing is won if we have to take the global lock again to push
an IRQ event to the guest. I'm repeating myself, but the problem to be
solved here is almost identical to fast IRQ delivery for assigned
devices (which we only address pretty ad-hoc for PCI so far).

And third: too much boilerplate code... :-/

>  }
>  
>  static void
> @@ -268,6 +283,7 @@ static void e1000_reset(void *opaque)
>      E1000State *d = opaque;
>  
>      qemu_del_timer(d->autoneg_timer);
> +
>      memset(d->phy_reg, 0, sizeof d->phy_reg);
>      memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
>      memset(d->mac_reg, 0, sizeof d->mac_reg);
> @@ -448,7 +464,11 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
>      if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) {
>          s->nic->nc.info->receive(&s->nic->nc, buf, size);
>      } else {
> +        qemu_mutex_unlock(&s->e1000_lock);
> +        qemu_mutex_lock_iothread();
>          qemu_send_packet(&s->nic->nc, buf, size);
> +        qemu_mutex_unlock_iothread();
> +        qemu_mutex_lock(&s->e1000_lock);

And that is also a problem to be discussed next: How to handle
locking of backends? Do we want separate locks for backend and frontend?
Although they are typically in a 1:1 relationship? Oh, I'm revealing the
content of my talk... ;)

>      }
>  }
>  
> @@ -1221,6 +1241,8 @@ static int pci_e1000_init(PCIDevice *pci_dev)
>      int i;
>      uint8_t *macaddr;
>  
> +    qemu_mutex_init(&d->e1000_lock);
> +
>      pci_conf = d->dev.config;
>  
>      /* TODO: RST# value should be 0, PCI spec 6.2.4 */
> 

Jan
pingfan liu - Oct. 24, 2012, 6:31 a.m.
On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2012-10-22 11:23, Liu Ping Fan wrote:
>> Use local lock to protect e1000. When calling the system function,
>> dropping the fine lock before acquiring the big lock. This will
>> introduce broken device state, which need extra effort to fix.
>>
>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>> ---
>>  hw/e1000.c |   24 +++++++++++++++++++++++-
>>  1 files changed, 23 insertions(+), 1 deletions(-)
>>
>> diff --git a/hw/e1000.c b/hw/e1000.c
>> index ae8a6c5..5eddab5 100644
>> --- a/hw/e1000.c
>> +++ b/hw/e1000.c
>> @@ -85,6 +85,7 @@ typedef struct E1000State_st {
>>      NICConf conf;
>>      MemoryRegion mmio;
>>      MemoryRegion io;
>> +    QemuMutex e1000_lock;
>>
>>      uint32_t mac_reg[0x8000];
>>      uint16_t phy_reg[0x20];
>> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = {
>>  static void
>>  set_interrupt_cause(E1000State *s, int index, uint32_t val)
>>  {
>> +    QemuThread *t;
>> +
>>      if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
>>          /* Only for 8257x */
>>          val |= E1000_ICR_INT_ASSERTED;
>>      }
>>      s->mac_reg[ICR] = val;
>>      s->mac_reg[ICS] = val;
>> -    qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>> +
>> +    t = pthread_getspecific(qemu_thread_key);
>> +    if (t->context_type == 1) {
>> +        qemu_mutex_unlock(&s->e1000_lock);
>> +        qemu_mutex_lock_iothread();
>> +    }
>> +    if (DEVICE(s)->state < DEV_STATE_STOPPING) {
>> +        qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>> +    }
>> +    if (t->context_type == 1) {
>> +        qemu_mutex_unlock_iothread();
>> +        qemu_mutex_lock(&s->e1000_lock);
>> +    }
>
> This is ugly for many reasons. First of all, it is racy as the register
> content may change while dropping the device lock, no? Then you would
> raise or clear an IRQ spuriously.
>
The device state's integrity is protected by the busy flag, so it will not be broken

> Second, it clearly shows that we need to address lock-less IRQ delivery.
> Almost nothing is won if we have to take the global lock again to push
> an IRQ event to the guest. I'm repeating myself, but the problem to be
> solved here is almost identical to fast IRQ delivery for assigned
> devices (which we only address pretty ad-hoc for PCI so far).
>
Yes, agreed. But this is the first step to show how to move a device out
of big lock protection. It also helps us set up a target for each subsystem. I
think at the next step, we will consider each subsystem.

> And third: too much boilerplate code... :-/
>
Yeah, without a recursive big lock, we need to tell whether the code runs
with or without the big lock. I would like to make the big lock recursive,
but maybe that has more drawbacks.

Thanks and regards,
pingfan
>>  }
>>
>>  static void
>> @@ -268,6 +283,7 @@ static void e1000_reset(void *opaque)
>>      E1000State *d = opaque;
>>
>>      qemu_del_timer(d->autoneg_timer);
>> +
>>      memset(d->phy_reg, 0, sizeof d->phy_reg);
>>      memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
>>      memset(d->mac_reg, 0, sizeof d->mac_reg);
>> @@ -448,7 +464,11 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
>>      if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) {
>>          s->nic->nc.info->receive(&s->nic->nc, buf, size);
>>      } else {
>> +        qemu_mutex_unlock(&s->e1000_lock);
>> +        qemu_mutex_lock_iothread();
>>          qemu_send_packet(&s->nic->nc, buf, size);
>> +        qemu_mutex_unlock_iothread();
>> +        qemu_mutex_lock(&s->e1000_lock);
>
> And that is the also a problem to be discussed next: How to handle
> locking of backends? Do we want separate locks for backend and frontend?
> Although they are typically in a 1:1 relationship? Oh, I'm revealing the
> content of my talk... ;)
>
>>      }
>>  }
>>
>> @@ -1221,6 +1241,8 @@ static int pci_e1000_init(PCIDevice *pci_dev)
>>      int i;
>>      uint8_t *macaddr;
>>
>> +    qemu_mutex_init(&d->e1000_lock);
>> +
>>      pci_conf = d->dev.config;
>>
>>      /* TODO: RST# value should be 0, PCI spec 6.2.4 */
>>
>
> Jan
>
> --
> Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
> Corporate Competence Center Embedded Linux
>
Jan Kiszka - Oct. 24, 2012, 7:17 a.m.
On 2012-10-24 08:31, liu ping fan wrote:
> On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2012-10-22 11:23, Liu Ping Fan wrote:
>>> Use local lock to protect e1000. When calling the system function,
>>> dropping the fine lock before acquiring the big lock. This will
>>> introduce broken device state, which need extra effort to fix.
>>>
>>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>>> ---
>>>  hw/e1000.c |   24 +++++++++++++++++++++++-
>>>  1 files changed, 23 insertions(+), 1 deletions(-)
>>>
>>> diff --git a/hw/e1000.c b/hw/e1000.c
>>> index ae8a6c5..5eddab5 100644
>>> --- a/hw/e1000.c
>>> +++ b/hw/e1000.c
>>> @@ -85,6 +85,7 @@ typedef struct E1000State_st {
>>>      NICConf conf;
>>>      MemoryRegion mmio;
>>>      MemoryRegion io;
>>> +    QemuMutex e1000_lock;
>>>
>>>      uint32_t mac_reg[0x8000];
>>>      uint16_t phy_reg[0x20];
>>> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = {
>>>  static void
>>>  set_interrupt_cause(E1000State *s, int index, uint32_t val)
>>>  {
>>> +    QemuThread *t;
>>> +
>>>      if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
>>>          /* Only for 8257x */
>>>          val |= E1000_ICR_INT_ASSERTED;
>>>      }
>>>      s->mac_reg[ICR] = val;
>>>      s->mac_reg[ICS] = val;
>>> -    qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>>> +
>>> +    t = pthread_getspecific(qemu_thread_key);
>>> +    if (t->context_type == 1) {
>>> +        qemu_mutex_unlock(&s->e1000_lock);
>>> +        qemu_mutex_lock_iothread();
>>> +    }
>>> +    if (DEVICE(s)->state < DEV_STATE_STOPPING) {
>>> +        qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>>> +    }
>>> +    if (t->context_type == 1) {
>>> +        qemu_mutex_unlock_iothread();
>>> +        qemu_mutex_lock(&s->e1000_lock);
>>> +    }
>>
>> This is ugly for many reasons. First of all, it is racy as the register
>> content may change while dropping the device lock, no? Then you would
>> raise or clear an IRQ spuriously.
>>
> Device state's intact is protected by busy flag, and will not broken

Except that the busy flag concept is broken in itself.

I see that we have an all-or-nothing problem here: to address this
properly, we need to convert the IRQ path to lock-less (or at least
compatible with holding per-device locks) as well.

Jan
pingfan liu - Oct. 24, 2012, 7:29 a.m.
On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2012-10-22 11:23, Liu Ping Fan wrote:
>> Use local lock to protect e1000. When calling the system function,
>> dropping the fine lock before acquiring the big lock. This will
>> introduce broken device state, which need extra effort to fix.
>>
>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>> ---
>>  hw/e1000.c |   24 +++++++++++++++++++++++-
>>  1 files changed, 23 insertions(+), 1 deletions(-)
>>
>> diff --git a/hw/e1000.c b/hw/e1000.c
>> index ae8a6c5..5eddab5 100644
>> --- a/hw/e1000.c
>> +++ b/hw/e1000.c
>> @@ -85,6 +85,7 @@ typedef struct E1000State_st {
>>      NICConf conf;
>>      MemoryRegion mmio;
>>      MemoryRegion io;
>> +    QemuMutex e1000_lock;
>>
>>      uint32_t mac_reg[0x8000];
>>      uint16_t phy_reg[0x20];
>> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = {
>>  static void
>>  set_interrupt_cause(E1000State *s, int index, uint32_t val)
>>  {
>> +    QemuThread *t;
>> +
>>      if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
>>          /* Only for 8257x */
>>          val |= E1000_ICR_INT_ASSERTED;
>>      }
>>      s->mac_reg[ICR] = val;
>>      s->mac_reg[ICS] = val;
>> -    qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>> +
>> +    t = pthread_getspecific(qemu_thread_key);
>> +    if (t->context_type == 1) {
>> +        qemu_mutex_unlock(&s->e1000_lock);
>> +        qemu_mutex_lock_iothread();
>> +    }
>> +    if (DEVICE(s)->state < DEV_STATE_STOPPING) {
>> +        qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>> +    }
>> +    if (t->context_type == 1) {
>> +        qemu_mutex_unlock_iothread();
>> +        qemu_mutex_lock(&s->e1000_lock);
>> +    }
>
> This is ugly for many reasons. First of all, it is racy as the register
> content may change while dropping the device lock, no? Then you would
> raise or clear an IRQ spuriously.
>
> Second, it clearly shows that we need to address lock-less IRQ delivery.
> Almost nothing is won if we have to take the global lock again to push
> an IRQ event to the guest. I'm repeating myself, but the problem to be
> solved here is almost identical to fast IRQ delivery for assigned
> devices (which we only address pretty ad-hoc for PCI so far).
>
Interesting, could you show me more detail about it, so I can google...

Thanks,
pingfan
> And third: too much boilerplate code... :-/
>
>>  }
>>
>>  static void
>> @@ -268,6 +283,7 @@ static void e1000_reset(void *opaque)
>>      E1000State *d = opaque;
>>
>>      qemu_del_timer(d->autoneg_timer);
>> +
>>      memset(d->phy_reg, 0, sizeof d->phy_reg);
>>      memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
>>      memset(d->mac_reg, 0, sizeof d->mac_reg);
>> @@ -448,7 +464,11 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
>>      if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) {
>>          s->nic->nc.info->receive(&s->nic->nc, buf, size);
>>      } else {
>> +        qemu_mutex_unlock(&s->e1000_lock);
>> +        qemu_mutex_lock_iothread();
>>          qemu_send_packet(&s->nic->nc, buf, size);
>> +        qemu_mutex_unlock_iothread();
>> +        qemu_mutex_lock(&s->e1000_lock);
>
> And that is the also a problem to be discussed next: How to handle
> locking of backends? Do we want separate locks for backend and frontend?
> Although they are typically in a 1:1 relationship? Oh, I'm revealing the
> content of my talk... ;)
>
>>      }
>>  }
>>
>> @@ -1221,6 +1241,8 @@ static int pci_e1000_init(PCIDevice *pci_dev)
>>      int i;
>>      uint8_t *macaddr;
>>
>> +    qemu_mutex_init(&d->e1000_lock);
>> +
>>      pci_conf = d->dev.config;
>>
>>      /* TODO: RST# value should be 0, PCI spec 6.2.4 */
>>
>
> Jan
>
> --
> Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
> Corporate Competence Center Embedded Linux
>
Avi Kivity - Oct. 25, 2012, 9:01 a.m.
On 10/24/2012 09:17 AM, Jan Kiszka wrote:
>>>
>>> This is ugly for many reasons. First of all, it is racy as the register
>>> content may change while dropping the device lock, no? Then you would
>>> raise or clear an IRQ spuriously.
>>>
>> Device state's intact is protected by busy flag, and will not broken
> 
> Except that the busy flag concept is broken in itself.

How do we fix an mmio that ends up mmio'ing back to itself, perhaps
indirectly?  Note this is broken in mainline too, but in a different way.

Do we introduce clever locks that can detect deadlocks?

> I see that we have a all-or-nothing problem here: to address this
> properly, we need to convert the IRQ path to lock-less (or at least
> compatible with holding per-device locks) as well.

There is a transitional path where writing to a register that can cause
IRQ changes takes both the big lock and the local lock.

Eventually, though, of course all inner subsystems must be threaded for
this work to have value.
Jan Kiszka - Oct. 25, 2012, 9:31 a.m.
On 2012-10-25 11:01, Avi Kivity wrote:
> On 10/24/2012 09:17 AM, Jan Kiszka wrote:
>>>>
>>>> This is ugly for many reasons. First of all, it is racy as the register
>>>> content may change while dropping the device lock, no? Then you would
>>>> raise or clear an IRQ spuriously.
>>>>
>>> Device state's intact is protected by busy flag, and will not broken
>>
>> Except that the busy flag concept is broken in itself.
> 
> How do we fix an mmio that ends up mmio'ing back to itself, perhaps
> indirectly?  Note this is broken in mainline too, but in a different way.
> 
> Do we introduce clever locks that can detect deadlocks?

That problem is already addressed (to my understanding) by blocking
nested MMIO in general. The brokenness of the busy flag is that it
prevents concurrent MMIO by dropping requests.

> 
>> I see that we have a all-or-nothing problem here: to address this
>> properly, we need to convert the IRQ path to lock-less (or at least
>> compatible with holding per-device locks) as well.
> 
> There is a transitional path where writing to a register that can cause
> IRQ changes takes both the big lock and the local lock.
> 
> Eventually, though, of course all inner subsystems must be threaded for
> this work to have value.
> 

But that transitional path must not introduce regressions. Opening a
race window between IRQ cause update and event injection is such a
thing, just like dropping concurrent requests on the floor.

Jan
Jan Kiszka - Oct. 25, 2012, 1:34 p.m.
On 2012-10-24 09:29, liu ping fan wrote:
> On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2012-10-22 11:23, Liu Ping Fan wrote:
>>> Use local lock to protect e1000. When calling the system function,
>>> dropping the fine lock before acquiring the big lock. This will
>>> introduce broken device state, which need extra effort to fix.
>>>
>>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>>> ---
>>>  hw/e1000.c |   24 +++++++++++++++++++++++-
>>>  1 files changed, 23 insertions(+), 1 deletions(-)
>>>
>>> diff --git a/hw/e1000.c b/hw/e1000.c
>>> index ae8a6c5..5eddab5 100644
>>> --- a/hw/e1000.c
>>> +++ b/hw/e1000.c
>>> @@ -85,6 +85,7 @@ typedef struct E1000State_st {
>>>      NICConf conf;
>>>      MemoryRegion mmio;
>>>      MemoryRegion io;
>>> +    QemuMutex e1000_lock;
>>>
>>>      uint32_t mac_reg[0x8000];
>>>      uint16_t phy_reg[0x20];
>>> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = {
>>>  static void
>>>  set_interrupt_cause(E1000State *s, int index, uint32_t val)
>>>  {
>>> +    QemuThread *t;
>>> +
>>>      if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
>>>          /* Only for 8257x */
>>>          val |= E1000_ICR_INT_ASSERTED;
>>>      }
>>>      s->mac_reg[ICR] = val;
>>>      s->mac_reg[ICS] = val;
>>> -    qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>>> +
>>> +    t = pthread_getspecific(qemu_thread_key);
>>> +    if (t->context_type == 1) {
>>> +        qemu_mutex_unlock(&s->e1000_lock);
>>> +        qemu_mutex_lock_iothread();
>>> +    }
>>> +    if (DEVICE(s)->state < DEV_STATE_STOPPING) {
>>> +        qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>>> +    }
>>> +    if (t->context_type == 1) {
>>> +        qemu_mutex_unlock_iothread();
>>> +        qemu_mutex_lock(&s->e1000_lock);
>>> +    }
>>
>> This is ugly for many reasons. First of all, it is racy as the register
>> content may change while dropping the device lock, no? Then you would
>> raise or clear an IRQ spuriously.
>>
>> Second, it clearly shows that we need to address lock-less IRQ delivery.
>> Almost nothing is won if we have to take the global lock again to push
>> an IRQ event to the guest. I'm repeating myself, but the problem to be
>> solved here is almost identical to fast IRQ delivery for assigned
>> devices (which we only address pretty ad-hoc for PCI so far).
>>
> Interesting, could you show me more detail about it, so I can google...

No need to look that far, just grep for pci_device_route_intx_to_irq,
pci_device_set_intx_routing_notifier and related functions in the code.

Jan
Avi Kivity - Oct. 25, 2012, 4:21 p.m.
On 10/25/2012 11:31 AM, Jan Kiszka wrote:
> On 2012-10-25 11:01, Avi Kivity wrote:
>> On 10/24/2012 09:17 AM, Jan Kiszka wrote:
>>>>>
>>>>> This is ugly for many reasons. First of all, it is racy as the register
>>>>> content may change while dropping the device lock, no? Then you would
>>>>> raise or clear an IRQ spuriously.
>>>>>
>>>> Device state's intact is protected by busy flag, and will not broken
>>>
>>> Except that the busy flag concept is broken in itself.
>> 
>> How do we fix an mmio that ends up mmio'ing back to itself, perhaps
>> indirectly?  Note this is broken in mainline too, but in a different way.
>> 
>> Do we introduce clever locks that can detect deadlocks?
> 
> That problem is already addressed (to my understanding) by blocking
> nested MMIO in general. 

That doesn't work cross-thread.

vcpu A: write to device X, dma-ing to device Y
vcpu B: write to device Y, dma-ing to device X

My suggestion was to drop the locks around DMA, then re-acquire the lock
and re-validate data.

> The brokenness of the busy flag is that it
> prevents concurrent MMIO by dropping requests.

Right.

> 
>> 
>>> I see that we have a all-or-nothing problem here: to address this
>>> properly, we need to convert the IRQ path to lock-less (or at least
>>> compatible with holding per-device locks) as well.
>> 
>> There is a transitional path where writing to a register that can cause
>> IRQ changes takes both the big lock and the local lock.
>> 
>> Eventually, though, of course all inner subsystems must be threaded for
>> this work to have value.
>> 
> 
> But that transitional path must not introduce regressions. Opening a
> race window between IRQ cause update and event injection is such a
> thing, just like dropping concurrent requests on the floor.

Can you explain the race?
Avi Kivity - Oct. 25, 2012, 4:23 p.m.
On 10/25/2012 03:34 PM, Jan Kiszka wrote:

>>> Second, it clearly shows that we need to address lock-less IRQ delivery.
>>> Almost nothing is won if we have to take the global lock again to push
>>> an IRQ event to the guest. I'm repeating myself, but the problem to be
>>> solved here is almost identical to fast IRQ delivery for assigned
>>> devices (which we only address pretty ad-hoc for PCI so far).
>>>
>> Interesting, could you show me more detail about it, so I can google...
> 
> No need to look that far, just grep for pci_device_route_intx_to_irq,
> pci_device_set_intx_routing_notifier and related functions in the code.

We can address it in the same way the memory core supports concurrency,
by copying dispatch information into rcu or lock protected data structures.

But I really hope we can avoid doing it now.
Jan Kiszka - Oct. 25, 2012, 4:39 p.m.
On 2012-10-25 18:21, Avi Kivity wrote:
> On 10/25/2012 11:31 AM, Jan Kiszka wrote:
>> On 2012-10-25 11:01, Avi Kivity wrote:
>>> On 10/24/2012 09:17 AM, Jan Kiszka wrote:
>>>>>>
>>>>>> This is ugly for many reasons. First of all, it is racy as the register
>>>>>> content may change while dropping the device lock, no? Then you would
>>>>>> raise or clear an IRQ spuriously.
>>>>>>
>>>>> Device state's intact is protected by busy flag, and will not broken
>>>>
>>>> Except that the busy flag concept is broken in itself.
>>>
>>> How do we fix an mmio that ends up mmio'ing back to itself, perhaps
>>> indirectly?  Note this is broken in mainline too, but in a different way.
>>>
>>> Do we introduce clever locks that can detect deadlocks?
>>
>> That problem is already addressed (to my understanding) by blocking
>> nested MMIO in general. 
> 
> That doesn't work cross-thread.
> 
> vcpu A: write to device X, dma-ing to device Y
> vcpu B: write to device Y, dma-ing to device X

We will deny DMA-ing from device X on behalf of a VCPU, ie. in dispatch
context, to Y.

What we do not deny, though, is DMA-ing from an I/O thread that
processes an event for device X. If the invoked callback of device X
holds the device lock across some DMA request to Y, then we risk to run
into the same ABBA issue. Hmm...

> 
> My suggestion was to drop the locks around DMA, then re-acquire the lock
> and re-validate data.

Maybe possible, but hairy depending on the device model.

> 
>> The brokenness of the busy flag is that it
>> prevents concurrent MMIO by dropping requests.
> 
> Right.
> 
>>
>>>
>>>> I see that we have a all-or-nothing problem here: to address this
>>>> properly, we need to convert the IRQ path to lock-less (or at least
>>>> compatible with holding per-device locks) as well.
>>>
>>> There is a transitional path where writing to a register that can cause
>>> IRQ changes takes both the big lock and the local lock.
>>>
>>> Eventually, though, of course all inner subsystems must be threaded for
>>> this work to have value.
>>>
>>
>> But that transitional path must not introduce regressions. Opening a
>> race window between IRQ cause update and event injection is such a
>> thing, just like dropping concurrent requests on the floor.
> 
> Can you explain the race?

Context A				Context B

device.lock
...
device.set interrupt_cause = 0
lower_irq = true
...
device.unlock
					device.lock
					...
					device.interrupt_cause = 42
					raise_irq = true
					...
					device.unlock
					if (raise_irq)
						bql.lock
						set_irq(device.irqno)
						bql.unlock
if (lower_irq)
	bql.lock
	clear_irq(device.irqno)
	bql.unlock


And there it goes, our interrupt event.

Jan
Jan Kiszka - Oct. 25, 2012, 4:41 p.m.
On 2012-10-25 18:23, Avi Kivity wrote:
> On 10/25/2012 03:34 PM, Jan Kiszka wrote:
> 
>>>> Second, it clearly shows that we need to address lock-less IRQ delivery.
>>>> Almost nothing is won if we have to take the global lock again to push
>>>> an IRQ event to the guest. I'm repeating myself, but the problem to be
>>>> solved here is almost identical to fast IRQ delivery for assigned
>>>> devices (which we only address pretty ad-hoc for PCI so far).
>>>>
>>> Interesting, could you show me more detail about it, so I can google...
>>
>> No need to look that far, just grep for pci_device_route_intx_to_irq,
>> pci_device_set_intx_routing_notifier and related functions in the code.
> 
> We can address it in the same way the memory core supports concurrency,
> by copying dispatch information into rcu or lock protected data structures.
> 
> But I really hope we can avoid doing it now.

I doubt so as the alternative is taking the BQL while (still) holding
the device lock. But that creates ABBA risks.

Jan
Avi Kivity - Oct. 25, 2012, 5:02 p.m.
On 10/25/2012 06:39 PM, Jan Kiszka wrote:
>> 
>> That doesn't work cross-thread.
>> 
>> vcpu A: write to device X, dma-ing to device Y
>> vcpu B: write to device Y, dma-ing to device X
> 
> We will deny DMA-ing from device X on behalf of a VCPU, ie. in dispatch
> context, to Y.
> 
> What we do not deny, though, is DMA-ing from an I/O thread that
> processes an event for device X. 

I would really like to avoid depending on the context.  In real hardware, there is no such thing.

> If the invoked callback of device X
> holds the device lock across some DMA request to Y, then we risk to run
> into the same ABBA issue. Hmm...

Yup.

> 
>> 
>> My suggestion was to drop the locks around DMA, then re-acquire the lock
>> and re-validate data.
> 
> Maybe possible, but hairy depending on the device model.

It's unpleasant, yes.

Note depending on the device, we may not need to re-validate data, it may be sufficient to load it into local variables so we know it is consistent at some point.  But all those solutions suffer from requiring device model authors to understand all those issues, rather than just add a simple lock around access to their data structures.

>>>>> I see that we have a all-or-nothing problem here: to address this
>>>>> properly, we need to convert the IRQ path to lock-less (or at least
>>>>> compatible with holding per-device locks) as well.
>>>>
>>>> There is a transitional path where writing to a register that can cause
>>>> IRQ changes takes both the big lock and the local lock.
>>>>
>>>> Eventually, though, of course all inner subsystems must be threaded for
>>>> this work to have value.
>>>>
>>>
>>> But that transitional path must not introduce regressions. Opening a
>>> race window between IRQ cause update and event injection is such a
>>> thing, just like dropping concurrent requests on the floor.
>> 
>> Can you explain the race?
> 
> Context A				Context B
> 
> device.lock
> ...
> device.set interrupt_cause = 0
> lower_irq = true
> ...
> device.unlock
> 					device.lock
> 					...
> 					device.interrupt_cause = 42
> 					raise_irq = true
> 					...
> 					device.unlock
> 					if (raise_irq)
> 						bql.lock
> 						set_irq(device.irqno)
> 						bql.unlock
> if (lower_irq)
> 	bql.lock
> 	clear_irq(device.irqno)
> 	bql.unlock
> 
> 
> And there it goes, our interrupt event.

Obviously you'll need to reacquire the device lock after taking bql and revalidate stuff.  But that is not what I am suggesting.  Instead, any path that can lead to an irq update (or timer update etc) will take both the bql and the device lock.  This will leave after the first pass only side effect free register reads and writes, which is silly if we keep it that way, but we will follow with a threaded timer and irq subsystem and we'll peel away those big locks.

  device_mmio_write:
    if register is involved in irq or timers or block layer or really anything that matters:
      bql.acquire
    device.lock.acquire
    do stuff
    device.lock.release
    if that big condition from above was true:
      bql.release
Avi Kivity - Oct. 25, 2012, 5:03 p.m.
On 10/25/2012 06:41 PM, Jan Kiszka wrote:
> On 2012-10-25 18:23, Avi Kivity wrote:
>> On 10/25/2012 03:34 PM, Jan Kiszka wrote:
>> 
>>>>> Second, it clearly shows that we need to address lock-less IRQ delivery.
>>>>> Almost nothing is won if we have to take the global lock again to push
>>>>> an IRQ event to the guest. I'm repeating myself, but the problem to be
>>>>> solved here is almost identical to fast IRQ delivery for assigned
>>>>> devices (which we only address pretty ad-hoc for PCI so far).
>>>>>
>>>> Interesting, could you show me more detail about it, so I can google...
>>>
>>> No need to look that far, just grep for pci_device_route_intx_to_irq,
>>> pci_device_set_intx_routing_notifier and related functions in the code.
>> 
>> We can address it in the same way the memory core supports concurrency,
>> by copying dispatch information into rcu or lock protected data structures.
>> 
>> But I really hope we can avoid doing it now.
> 
> I doubt so as the alternative is taking the BQL while (still) holding
> the device lock. 

Sorry, doesn't parse.

> But that creates ABBA risks.
Jan Kiszka - Oct. 25, 2012, 6:48 p.m.
On 2012-10-25 19:02, Avi Kivity wrote:
> On 10/25/2012 06:39 PM, Jan Kiszka wrote:
>>>
>>> That doesn't work cross-thread.
>>>
>>> vcpu A: write to device X, dma-ing to device Y
>>> vcpu B: write to device Y, dma-ing to device X
>>
>> We will deny DMA-ing from device X on behalf of a VCPU, ie. in dispatch
>> context, to Y.
>>
>> What we do not deny, though, is DMA-ing from an I/O thread that
>> processes an event for device X. 
> 
> I would really like to avoid depending on the context.  In real hardware, there is no such thing.

The point is how we deal with any kind of access to a device that
requires taking that device's lock while holding another lock, provided
that scenario can also take place in reverse order at the same time.
Known scenarios are:

 - vcpu 1 -> access device A -> access device B
 - vcpu 2 -> access device B -> access device A

 - event 1 -> device A event processing -> access device B
 - event 2 -> device B event processing -> access device A

and combinations of those pairs.

> 
>> If the invoked callback of device X
>> holds the device lock across some DMA request to Y, then we risk to run
>> into the same ABBA issue. Hmm...
> 
> Yup.
> 
>>
>>>
>>> My suggestion was to drop the locks around DMA, then re-acquire the lock
>>> and re-validate data.
>>
>> Maybe possible, but hairy depending on the device model.
> 
> It's unpleasant, yes.
> 
> Note depending on the device, we may not need to re-validate data, it may be sufficient to load it into local variables so we know it is consistent at some point.  But all those solutions suffer from requiring device model authors to understand all those issues, rather than just add a simple lock around access to their data structures.

Right. And therefore it is a suboptimal way to start (patching).

> 
>>>>>> I see that we have an all-or-nothing problem here: to address this
>>>>>> properly, we need to convert the IRQ path to lock-less (or at least
>>>>>> compatible with holding per-device locks) as well.
>>>>>
>>>>> There is a transitional path where writing to a register that can cause
>>>>> IRQ changes takes both the big lock and the local lock.
>>>>>
>>>>> Eventually, though, of course all inner subsystems must be threaded for
>>>>> this work to have value.
>>>>>
>>>>
>>>> But that transitional path must not introduce regressions. Opening a
>>>> race window between IRQ cause update and event injection is such a
>>>> thing, just like dropping concurrent requests on the floor.
>>>
>>> Can you explain the race?
>>
>> Context A				Context B
>>
>> device.lock
>> ...
>> device.set interrupt_cause = 0
>> lower_irq = true
>> ...
>> device.unlock
>> 					device.lock
>> 					...
>> 					device.interrupt_cause = 42
>> 					raise_irq = true
>> 					...
>> 					device.unlock
>> 					if (raise_irq)
>> 						bql.lock
>> 						set_irq(device.irqno)
>> 						bql.unlock
>> if (lower_irq)
>> 	bql.lock
>> 	clear_irq(device.irqno)
>> 	bql.unlock
>>
>>
>> And there it goes, our interrupt event.
> 
> Obviously you'll need to reacquire the device lock after taking bql and revalidate stuff.  But that is not what I am suggesting.  Instead, any path that can lead to an irq update (or timer update etc) will take both the bql and the device lock.  This will leave after the first pass only side effect free register reads and writes, which is silly if we keep it that way, but we will follow with a threaded timer and irq subsystem and we'll peel away those big locks.
> 
>   device_mmio_write:
>     if register is involved in irq or timers or block layer or really anything that matters:
>       bql.acquire
>     device.lock.acquire
>     do stuff
>     device.lock.release
>     if that big condition from above was true:
>       bql.release

Looks simpler than it is as you cannot wrap complete handlers with that
pattern. An example where it would fail (until we solved the locking
issues above):

mmio_write:
  bql.conditional_lock
  device.lock
  device.check_state
  issue_dma
  device.update_state
  update_irq, play_with_timers, etc.
  device.unlock
  bql.conditional_unlock

If that DMA request hits an unconverted MMIO region or one that takes
BQL conditionally as above, we will lock up (or bail out as our mutexes
detect the error). E1000's start_xmit looks like this so far, and that's
a pretty important service.

Moreover, I prefer having a representative cut-through over enjoying to
merge a first step that excludes some 80% of the problems. For that
reason I would be even be inclined to start with addressing the IRQ
injection topic first (patch-wise), then the other necessary backend
services for the e1000 or whatever and convert some device(s) last.

IOW: cut out anything from this series that touches e1000 until the
building blocks for converting it reasonably are finished. Carrying
experimental, partially broken conversion on top is fine; trying to merge
pieces of that is not, IMHO.

Jan
pingfan liu - Oct. 29, 2012, 5:24 a.m.
On Fri, Oct 26, 2012 at 2:48 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2012-10-25 19:02, Avi Kivity wrote:
>> On 10/25/2012 06:39 PM, Jan Kiszka wrote:
>>>>
>>>> That doesn't work cross-thread.
>>>>
>>>> vcpu A: write to device X, dma-ing to device Y
>>>> vcpu B: write to device Y, dma-ing to device X
>>>
>>> We will deny DMA-ing from device X on behalf of a VCPU, ie. in dispatch
>>> context, to Y.
>>>
>>> What we do not deny, though, is DMA-ing from an I/O thread that
>>> processes an event for device X.
>>
>> I would really like to avoid depending on the context.  In real hardware, there is no such thing.
>
> The point is how we deal with any kind of access to a device that
> requires taking that device's lock while holding another lock, provided
> that scenario can also take place in reverse order at the same time.
> Known scenarios are:
>
>  - vcpu 1 -> access device A -> access device B
>  - vcpu 2 -> access device B -> access device A
>
>  - event 1 -> device A event processing -> access device B
>  - event 2 -> device B event processing -> access device A
>
> and combinations of those pairs.
>
>>
>>> If the invoked callback of device X
>>> holds the device lock across some DMA request to Y, then we risk to run
>>> into the same ABBA issue. Hmm...
>>
>> Yup.
>>
>>>
>>>>
>>>> My suggestion was to drop the locks around DMA, then re-acquire the lock
>>>> and re-validate data.
>>>
>>> Maybe possible, but hairy depending on the device model.
>>
>> It's unpleasant, yes.
>>
>> Note depending on the device, we may not need to re-validate data, it may be sufficient to load it into local variables so we know it is consistent at some point.  But all those solutions suffer from requiring device model authors to understand all those issues, rather than just add a simple lock around access to their data structures.
>
> Right. And therefore it is a suboptimal way to start (patching).
>
>>
>>>>>>> I see that we have an all-or-nothing problem here: to address this
>>>>>>> properly, we need to convert the IRQ path to lock-less (or at least
>>>>>>> compatible with holding per-device locks) as well.
>>>>>>
>>>>>> There is a transitional path where writing to a register that can cause
>>>>>> IRQ changes takes both the big lock and the local lock.
>>>>>>
>>>>>> Eventually, though, of course all inner subsystems must be threaded for
>>>>>> this work to have value.
>>>>>>
>>>>>
>>>>> But that transitional path must not introduce regressions. Opening a
>>>>> race window between IRQ cause update and event injection is such a
>>>>> thing, just like dropping concurrent requests on the floor.
>>>>
>>>> Can you explain the race?
>>>
>>> Context A                            Context B
>>>
>>> device.lock
>>> ...
>>> device.set interrupt_cause = 0
>>> lower_irq = true
>>> ...
>>> device.unlock
>>>                                      device.lock
>>>                                      ...
>>>                                      device.interrupt_cause = 42
>>>                                      raise_irq = true
>>>                                      ...
>>>                                      device.unlock
>>>                                      if (raise_irq)
>>>                                              bql.lock
>>>                                              set_irq(device.irqno)
>>>                                              bql.unlock
>>> if (lower_irq)
>>>      bql.lock
>>>      clear_irq(device.irqno)
>>>      bql.unlock
>>>
>>>
>>> And there it goes, our interrupt event.
>>
>> Obviously you'll need to reacquire the device lock after taking bql and revalidate stuff.  But that is not what I am suggesting.  Instead, any path that can lead to an irq update (or timer update etc) will take both the bql and the device lock.  This will leave after the first pass only side effect free register reads and writes, which is silly if we keep it that way, but we will follow with a threaded timer and irq subsystem and we'll peel away those big locks.
>>
>>   device_mmio_write:
>>     if register is involved in irq or timers or block layer or really anything that matters:
>>       bql.acquire
>>     device.lock.acquire
>>     do stuff
>>     device.lock.release
>>     if that big condition from above was true:
>>       bql.release
>
> Looks simpler than it is as you cannot wrap complete handlers with that
> pattern. An example where it would fail (until we solved the locking
> issues above):
>
> mmio_write:
>   bql.conditional_lock
>   device.lock
>   device.check_state
>   issue_dma
>   device.update_state
>   update_irq, play_with_timers, etc.
>   device.unlock
>   bql.conditional_unlock
>
> If that DMA request hits an unconverted MMIO region or one that takes
> BQL conditionally as above, we will lock up (or bail out as our mutexes
> detect the error). E1000's start_xmit looks like this so far, and that's
> a pretty important service.
>
> Moreover, I prefer having a representative cut-through over enjoying to
> merge a first step that excludes some 80% of the problems. For that
> reason I would be even be inclined to start with addressing the IRQ
> injection topic first (patch-wise), then the other necessary backend
> services for the e1000 or whatever and convert some device(s) last.
>
> IOW: cut out anything from this series that touches e1000 until the
> building blocks for converting it reasonably are finished. Carrying
> experimental, partially broken conversion on top is fine; trying to merge
> pieces of that is not, IMHO.
>
Agreed.  Just want to take this opportunity to discuss what is
next and what is not.

Regards,
pingfan

> Jan
>
> --
> Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
> Corporate Competence Center Embedded Linux
pingfan liu - Oct. 29, 2012, 5:24 a.m.
On Thu, Oct 25, 2012 at 9:34 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2012-10-24 09:29, liu ping fan wrote:
>> On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>> On 2012-10-22 11:23, Liu Ping Fan wrote:
>>>> Use local lock to protect e1000. When calling the system function,
>>>> dropping the fine lock before acquiring the big lock. This will
>>>> introduce broken device state, which need extra effort to fix.
>>>>
>>>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>>>> ---
>>>>  hw/e1000.c |   24 +++++++++++++++++++++++-
>>>>  1 files changed, 23 insertions(+), 1 deletions(-)
>>>>
>>>> diff --git a/hw/e1000.c b/hw/e1000.c
>>>> index ae8a6c5..5eddab5 100644
>>>> --- a/hw/e1000.c
>>>> +++ b/hw/e1000.c
>>>> @@ -85,6 +85,7 @@ typedef struct E1000State_st {
>>>>      NICConf conf;
>>>>      MemoryRegion mmio;
>>>>      MemoryRegion io;
>>>> +    QemuMutex e1000_lock;
>>>>
>>>>      uint32_t mac_reg[0x8000];
>>>>      uint16_t phy_reg[0x20];
>>>> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = {
>>>>  static void
>>>>  set_interrupt_cause(E1000State *s, int index, uint32_t val)
>>>>  {
>>>> +    QemuThread *t;
>>>> +
>>>>      if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
>>>>          /* Only for 8257x */
>>>>          val |= E1000_ICR_INT_ASSERTED;
>>>>      }
>>>>      s->mac_reg[ICR] = val;
>>>>      s->mac_reg[ICS] = val;
>>>> -    qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>>>> +
>>>> +    t = pthread_getspecific(qemu_thread_key);
>>>> +    if (t->context_type == 1) {
>>>> +        qemu_mutex_unlock(&s->e1000_lock);
>>>> +        qemu_mutex_lock_iothread();
>>>> +    }
>>>> +    if (DEVICE(s)->state < DEV_STATE_STOPPING) {
>>>> +        qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>>>> +    }
>>>> +    if (t->context_type == 1) {
>>>> +        qemu_mutex_unlock_iothread();
>>>> +        qemu_mutex_lock(&s->e1000_lock);
>>>> +    }
>>>
>>> This is ugly for many reasons. First of all, it is racy as the register
>>> content may change while dropping the device lock, no? Then you would
>>> raise or clear an IRQ spuriously.
>>>
>>> Second, it clearly shows that we need to address lock-less IRQ delivery.
>>> Almost nothing is won if we have to take the global lock again to push
>>> an IRQ event to the guest. I'm repeating myself, but the problem to be
>>> solved here is almost identical to fast IRQ delivery for assigned
>>> devices (which we only address pretty ad-hoc for PCI so far).
>>>
>> Interesting, could you show me more detail about it, so I can google...
>
> No need to look that far, just grep for pci_device_route_intx_to_irq,
> pci_device_set_intx_routing_notifier and related functions in the code.
>
I think, the major point here is to bypass the delivery process among
the irq chipset during runtime. Right?

Thanks and regards,

pingfan
> Jan
>
> --
> Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
> Corporate Competence Center Embedded Linux
Jan Kiszka - Oct. 31, 2012, 7:03 a.m.
On 2012-10-29 06:24, liu ping fan wrote:
> On Thu, Oct 25, 2012 at 9:34 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2012-10-24 09:29, liu ping fan wrote:
>>> On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>>> On 2012-10-22 11:23, Liu Ping Fan wrote:
>>>>> Use local lock to protect e1000. When calling the system function,
>>>>> dropping the fine lock before acquiring the big lock. This will
>>>>> introduce broken device state, which need extra effort to fix.
>>>>>
>>>>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>>>>> ---
>>>>>  hw/e1000.c |   24 +++++++++++++++++++++++-
>>>>>  1 files changed, 23 insertions(+), 1 deletions(-)
>>>>>
>>>>> diff --git a/hw/e1000.c b/hw/e1000.c
>>>>> index ae8a6c5..5eddab5 100644
>>>>> --- a/hw/e1000.c
>>>>> +++ b/hw/e1000.c
>>>>> @@ -85,6 +85,7 @@ typedef struct E1000State_st {
>>>>>      NICConf conf;
>>>>>      MemoryRegion mmio;
>>>>>      MemoryRegion io;
>>>>> +    QemuMutex e1000_lock;
>>>>>
>>>>>      uint32_t mac_reg[0x8000];
>>>>>      uint16_t phy_reg[0x20];
>>>>> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = {
>>>>>  static void
>>>>>  set_interrupt_cause(E1000State *s, int index, uint32_t val)
>>>>>  {
>>>>> +    QemuThread *t;
>>>>> +
>>>>>      if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
>>>>>          /* Only for 8257x */
>>>>>          val |= E1000_ICR_INT_ASSERTED;
>>>>>      }
>>>>>      s->mac_reg[ICR] = val;
>>>>>      s->mac_reg[ICS] = val;
>>>>> -    qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>>>>> +
>>>>> +    t = pthread_getspecific(qemu_thread_key);
>>>>> +    if (t->context_type == 1) {
>>>>> +        qemu_mutex_unlock(&s->e1000_lock);
>>>>> +        qemu_mutex_lock_iothread();
>>>>> +    }
>>>>> +    if (DEVICE(s)->state < DEV_STATE_STOPPING) {
>>>>> +        qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>>>>> +    }
>>>>> +    if (t->context_type == 1) {
>>>>> +        qemu_mutex_unlock_iothread();
>>>>> +        qemu_mutex_lock(&s->e1000_lock);
>>>>> +    }
>>>>
>>>> This is ugly for many reasons. First of all, it is racy as the register
>>>> content may change while dropping the device lock, no? Then you would
>>>> raise or clear an IRQ spuriously.
>>>>
>>>> Second, it clearly shows that we need to address lock-less IRQ delivery.
>>>> Almost nothing is won if we have to take the global lock again to push
>>>> an IRQ event to the guest. I'm repeating myself, but the problem to be
>>>> solved here is almost identical to fast IRQ delivery for assigned
>>>> devices (which we only address pretty ad-hoc for PCI so far).
>>>>
>>> Interesting, could you show me more detail about it, so I can google...
>>
>> No need to look that far, just grep for pci_device_route_intx_to_irq,
>> pci_device_set_intx_routing_notifier and related functions in the code.
>>
> I think, the major point here is to bypass the delivery process among
> the irq chipset during runtime. Right?

Right.

Jan

Patch

diff --git a/hw/e1000.c b/hw/e1000.c
index ae8a6c5..5eddab5 100644
--- a/hw/e1000.c
+++ b/hw/e1000.c
@@ -85,6 +85,7 @@  typedef struct E1000State_st {
     NICConf conf;
     MemoryRegion mmio;
     MemoryRegion io;
+    QemuMutex e1000_lock;
 
     uint32_t mac_reg[0x8000];
     uint16_t phy_reg[0x20];
@@ -223,13 +224,27 @@  static const uint32_t mac_reg_init[] = {
 static void
 set_interrupt_cause(E1000State *s, int index, uint32_t val)
 {
+    QemuThread *t;
+
     if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
         /* Only for 8257x */
         val |= E1000_ICR_INT_ASSERTED;
     }
     s->mac_reg[ICR] = val;
     s->mac_reg[ICS] = val;
-    qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
+
+    t = pthread_getspecific(qemu_thread_key);
+    if (t->context_type == 1) {
+        qemu_mutex_unlock(&s->e1000_lock);
+        qemu_mutex_lock_iothread();
+    }
+    if (DEVICE(s)->state < DEV_STATE_STOPPING) {
+        qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
+    }
+    if (t->context_type == 1) {
+        qemu_mutex_unlock_iothread();
+        qemu_mutex_lock(&s->e1000_lock);
+    }
 }
 
 static void
@@ -268,6 +283,7 @@  static void e1000_reset(void *opaque)
     E1000State *d = opaque;
 
     qemu_del_timer(d->autoneg_timer);
+
     memset(d->phy_reg, 0, sizeof d->phy_reg);
     memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
     memset(d->mac_reg, 0, sizeof d->mac_reg);
@@ -448,7 +464,11 @@  e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
     if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) {
         s->nic->nc.info->receive(&s->nic->nc, buf, size);
     } else {
+        qemu_mutex_unlock(&s->e1000_lock);
+        qemu_mutex_lock_iothread();
         qemu_send_packet(&s->nic->nc, buf, size);
+        qemu_mutex_unlock_iothread();
+        qemu_mutex_lock(&s->e1000_lock);
     }
 }
 
@@ -1221,6 +1241,8 @@  static int pci_e1000_init(PCIDevice *pci_dev)
     int i;
     uint8_t *macaddr;
 
+    qemu_mutex_init(&d->e1000_lock);
+
     pci_conf = d->dev.config;
 
     /* TODO: RST# value should be 0, PCI spec 6.2.4 */