Message ID | 1350897839-29593-13-git-send-email-pingfank@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
On 2012-10-22 11:23, Liu Ping Fan wrote: > Use local lock to protect e1000. When calling the system function, > dropping the fine lock before acquiring the big lock. This will > introduce broken device state, which need extra effort to fix. > > Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com> > --- > hw/e1000.c | 24 +++++++++++++++++++++++- > 1 files changed, 23 insertions(+), 1 deletions(-) > > diff --git a/hw/e1000.c b/hw/e1000.c > index ae8a6c5..5eddab5 100644 > --- a/hw/e1000.c > +++ b/hw/e1000.c > @@ -85,6 +85,7 @@ typedef struct E1000State_st { > NICConf conf; > MemoryRegion mmio; > MemoryRegion io; > + QemuMutex e1000_lock; > > uint32_t mac_reg[0x8000]; > uint16_t phy_reg[0x20]; > @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = { > static void > set_interrupt_cause(E1000State *s, int index, uint32_t val) > { > + QemuThread *t; > + > if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) { > /* Only for 8257x */ > val |= E1000_ICR_INT_ASSERTED; > } > s->mac_reg[ICR] = val; > s->mac_reg[ICS] = val; > - qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); > + > + t = pthread_getspecific(qemu_thread_key); > + if (t->context_type == 1) { > + qemu_mutex_unlock(&s->e1000_lock); > + qemu_mutex_lock_iothread(); > + } > + if (DEVICE(s)->state < DEV_STATE_STOPPING) { > + qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); > + } > + if (t->context_type == 1) { > + qemu_mutex_unlock_iothread(); > + qemu_mutex_lock(&s->e1000_lock); > + } This is ugly for many reasons. First of all, it is racy as the register content may change while dropping the device lock, no? Then you would raise or clear an IRQ spuriously. Second, it clearly shows that we need to address lock-less IRQ delivery. Almost nothing is won if we have to take the global lock again to push an IRQ event to the guest. 
I'm repeating myself, but the problem to be solved here is almost identical to fast IRQ delivery for assigned devices (which we only address pretty ad-hoc for PCI so far). And third: too much boilerplate code... :-/ > } > > static void > @@ -268,6 +283,7 @@ static void e1000_reset(void *opaque) > E1000State *d = opaque; > > qemu_del_timer(d->autoneg_timer); > + > memset(d->phy_reg, 0, sizeof d->phy_reg); > memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init); > memset(d->mac_reg, 0, sizeof d->mac_reg); > @@ -448,7 +464,11 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size) > if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) { > s->nic->nc.info->receive(&s->nic->nc, buf, size); > } else { > + qemu_mutex_unlock(&s->e1000_lock); > + qemu_mutex_lock_iothread(); > qemu_send_packet(&s->nic->nc, buf, size); > + qemu_mutex_unlock_iothread(); > + qemu_mutex_lock(&s->e1000_lock); And that is the also a problem to be discussed next: How to handle locking of backends? Do we want separate locks for backend and frontend? Although they are typically in a 1:1 relationship? Oh, I'm revealing the content of my talk... ;) > } > } > > @@ -1221,6 +1241,8 @@ static int pci_e1000_init(PCIDevice *pci_dev) > int i; > uint8_t *macaddr; > > + qemu_mutex_init(&d->e1000_lock); > + > pci_conf = d->dev.config; > > /* TODO: RST# value should be 0, PCI spec 6.2.4 */ > Jan
On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote: > On 2012-10-22 11:23, Liu Ping Fan wrote: >> Use local lock to protect e1000. When calling the system function, >> dropping the fine lock before acquiring the big lock. This will >> introduce broken device state, which need extra effort to fix. >> >> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com> >> --- >> hw/e1000.c | 24 +++++++++++++++++++++++- >> 1 files changed, 23 insertions(+), 1 deletions(-) >> >> diff --git a/hw/e1000.c b/hw/e1000.c >> index ae8a6c5..5eddab5 100644 >> --- a/hw/e1000.c >> +++ b/hw/e1000.c >> @@ -85,6 +85,7 @@ typedef struct E1000State_st { >> NICConf conf; >> MemoryRegion mmio; >> MemoryRegion io; >> + QemuMutex e1000_lock; >> >> uint32_t mac_reg[0x8000]; >> uint16_t phy_reg[0x20]; >> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = { >> static void >> set_interrupt_cause(E1000State *s, int index, uint32_t val) >> { >> + QemuThread *t; >> + >> if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) { >> /* Only for 8257x */ >> val |= E1000_ICR_INT_ASSERTED; >> } >> s->mac_reg[ICR] = val; >> s->mac_reg[ICS] = val; >> - qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >> + >> + t = pthread_getspecific(qemu_thread_key); >> + if (t->context_type == 1) { >> + qemu_mutex_unlock(&s->e1000_lock); >> + qemu_mutex_lock_iothread(); >> + } >> + if (DEVICE(s)->state < DEV_STATE_STOPPING) { >> + qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >> + } >> + if (t->context_type == 1) { >> + qemu_mutex_unlock_iothread(); >> + qemu_mutex_lock(&s->e1000_lock); >> + } > > This is ugly for many reasons. First of all, it is racy as the register > content may change while dropping the device lock, no? Then you would > raise or clear an IRQ spuriously. > Device state's intact is protected by busy flag, and will not broken > Second, it clearly shows that we need to address lock-less IRQ delivery. 
> Almost nothing is won if we have to take the global lock again to push > an IRQ event to the guest. I'm repeating myself, but the problem to be > solved here is almost identical to fast IRQ delivery for assigned > devices (which we only address pretty ad-hoc for PCI so far). > Yes, agreed. But here is the 1st step to show how play device out of big lock protection. Also help us set up target for each subsystem. I think at the next step, we will consider each subsystem. > And third: too much boilerplate code... :-/ > Yeah, without the recursive big lock, we need to tell the code is with or w/o big lock. I like to make big lock recursive, but maybe it has more drawbacks. Thanks and regards, pingfan >> } >> >> static void >> @@ -268,6 +283,7 @@ static void e1000_reset(void *opaque) >> E1000State *d = opaque; >> >> qemu_del_timer(d->autoneg_timer); >> + >> memset(d->phy_reg, 0, sizeof d->phy_reg); >> memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init); >> memset(d->mac_reg, 0, sizeof d->mac_reg); >> @@ -448,7 +464,11 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size) >> if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) { >> s->nic->nc.info->receive(&s->nic->nc, buf, size); >> } else { >> + qemu_mutex_unlock(&s->e1000_lock); >> + qemu_mutex_lock_iothread(); >> qemu_send_packet(&s->nic->nc, buf, size); >> + qemu_mutex_unlock_iothread(); >> + qemu_mutex_lock(&s->e1000_lock); > > And that is the also a problem to be discussed next: How to handle > locking of backends? Do we want separate locks for backend and frontend? > Although they are typically in a 1:1 relationship? Oh, I'm revealing the > content of my talk... 
;) > >> } >> } >> >> @@ -1221,6 +1241,8 @@ static int pci_e1000_init(PCIDevice *pci_dev) >> int i; >> uint8_t *macaddr; >> >> + qemu_mutex_init(&d->e1000_lock); >> + >> pci_conf = d->dev.config; >> >> /* TODO: RST# value should be 0, PCI spec 6.2.4 */ >> > > Jan > > -- > Siemens AG, Corporate Technology, CT RTC ITP SDP-DE > Corporate Competence Center Embedded Linux >
On 2012-10-24 08:31, liu ping fan wrote: > On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote: >> On 2012-10-22 11:23, Liu Ping Fan wrote: >>> Use local lock to protect e1000. When calling the system function, >>> dropping the fine lock before acquiring the big lock. This will >>> introduce broken device state, which need extra effort to fix. >>> >>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com> >>> --- >>> hw/e1000.c | 24 +++++++++++++++++++++++- >>> 1 files changed, 23 insertions(+), 1 deletions(-) >>> >>> diff --git a/hw/e1000.c b/hw/e1000.c >>> index ae8a6c5..5eddab5 100644 >>> --- a/hw/e1000.c >>> +++ b/hw/e1000.c >>> @@ -85,6 +85,7 @@ typedef struct E1000State_st { >>> NICConf conf; >>> MemoryRegion mmio; >>> MemoryRegion io; >>> + QemuMutex e1000_lock; >>> >>> uint32_t mac_reg[0x8000]; >>> uint16_t phy_reg[0x20]; >>> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = { >>> static void >>> set_interrupt_cause(E1000State *s, int index, uint32_t val) >>> { >>> + QemuThread *t; >>> + >>> if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) { >>> /* Only for 8257x */ >>> val |= E1000_ICR_INT_ASSERTED; >>> } >>> s->mac_reg[ICR] = val; >>> s->mac_reg[ICS] = val; >>> - qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >>> + >>> + t = pthread_getspecific(qemu_thread_key); >>> + if (t->context_type == 1) { >>> + qemu_mutex_unlock(&s->e1000_lock); >>> + qemu_mutex_lock_iothread(); >>> + } >>> + if (DEVICE(s)->state < DEV_STATE_STOPPING) { >>> + qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >>> + } >>> + if (t->context_type == 1) { >>> + qemu_mutex_unlock_iothread(); >>> + qemu_mutex_lock(&s->e1000_lock); >>> + } >> >> This is ugly for many reasons. First of all, it is racy as the register >> content may change while dropping the device lock, no? Then you would >> raise or clear an IRQ spuriously. 
>> > Device state's intact is protected by busy flag, and will not broken Except that the busy flag concept is broken in itself. I see that we have an all-or-nothing problem here: to address this properly, we need to convert the IRQ path to lock-less (or at least compatible with holding per-device locks) as well. Jan
On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote: > On 2012-10-22 11:23, Liu Ping Fan wrote: >> Use local lock to protect e1000. When calling the system function, >> dropping the fine lock before acquiring the big lock. This will >> introduce broken device state, which need extra effort to fix. >> >> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com> >> --- >> hw/e1000.c | 24 +++++++++++++++++++++++- >> 1 files changed, 23 insertions(+), 1 deletions(-) >> >> diff --git a/hw/e1000.c b/hw/e1000.c >> index ae8a6c5..5eddab5 100644 >> --- a/hw/e1000.c >> +++ b/hw/e1000.c >> @@ -85,6 +85,7 @@ typedef struct E1000State_st { >> NICConf conf; >> MemoryRegion mmio; >> MemoryRegion io; >> + QemuMutex e1000_lock; >> >> uint32_t mac_reg[0x8000]; >> uint16_t phy_reg[0x20]; >> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = { >> static void >> set_interrupt_cause(E1000State *s, int index, uint32_t val) >> { >> + QemuThread *t; >> + >> if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) { >> /* Only for 8257x */ >> val |= E1000_ICR_INT_ASSERTED; >> } >> s->mac_reg[ICR] = val; >> s->mac_reg[ICS] = val; >> - qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >> + >> + t = pthread_getspecific(qemu_thread_key); >> + if (t->context_type == 1) { >> + qemu_mutex_unlock(&s->e1000_lock); >> + qemu_mutex_lock_iothread(); >> + } >> + if (DEVICE(s)->state < DEV_STATE_STOPPING) { >> + qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >> + } >> + if (t->context_type == 1) { >> + qemu_mutex_unlock_iothread(); >> + qemu_mutex_lock(&s->e1000_lock); >> + } > > This is ugly for many reasons. First of all, it is racy as the register > content may change while dropping the device lock, no? Then you would > raise or clear an IRQ spuriously. > > Second, it clearly shows that we need to address lock-less IRQ delivery. 
> Almost nothing is won if we have to take the global lock again to push > an IRQ event to the guest. I'm repeating myself, but the problem to be > solved here is almost identical to fast IRQ delivery for assigned > devices (which we only address pretty ad-hoc for PCI so far). > Interesting, could you show me more detail about it, so I can google... Thanks, pingfan > And third: too much boilerplate code... :-/ > >> } >> >> static void >> @@ -268,6 +283,7 @@ static void e1000_reset(void *opaque) >> E1000State *d = opaque; >> >> qemu_del_timer(d->autoneg_timer); >> + >> memset(d->phy_reg, 0, sizeof d->phy_reg); >> memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init); >> memset(d->mac_reg, 0, sizeof d->mac_reg); >> @@ -448,7 +464,11 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size) >> if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) { >> s->nic->nc.info->receive(&s->nic->nc, buf, size); >> } else { >> + qemu_mutex_unlock(&s->e1000_lock); >> + qemu_mutex_lock_iothread(); >> qemu_send_packet(&s->nic->nc, buf, size); >> + qemu_mutex_unlock_iothread(); >> + qemu_mutex_lock(&s->e1000_lock); > > And that is the also a problem to be discussed next: How to handle > locking of backends? Do we want separate locks for backend and frontend? > Although they are typically in a 1:1 relationship? Oh, I'm revealing the > content of my talk... ;) > >> } >> } >> >> @@ -1221,6 +1241,8 @@ static int pci_e1000_init(PCIDevice *pci_dev) >> int i; >> uint8_t *macaddr; >> >> + qemu_mutex_init(&d->e1000_lock); >> + >> pci_conf = d->dev.config; >> >> /* TODO: RST# value should be 0, PCI spec 6.2.4 */ >> > > Jan > > -- > Siemens AG, Corporate Technology, CT RTC ITP SDP-DE > Corporate Competence Center Embedded Linux >
On 10/24/2012 09:17 AM, Jan Kiszka wrote: >>> >>> This is ugly for many reasons. First of all, it is racy as the register >>> content may change while dropping the device lock, no? Then you would >>> raise or clear an IRQ spuriously. >>> >> Device state's intact is protected by busy flag, and will not broken > > Except that the busy flag concept is broken in itself. How do we fix an mmio that ends up mmio'ing back to itself, perhaps indirectly? Note this is broken in mainline too, but in a different way. Do we introduce clever locks that can detect deadlocks? > I see that we have a all-or-nothing problem here: to address this > properly, we need to convert the IRQ path to lock-less (or at least > compatible with holding per-device locks) as well. There is a transitional path where writing to a register that can cause IRQ changes takes both the big lock and the local lock. Eventually, though, of course all inner subsystems must be threaded for this work to have value.
On 2012-10-25 11:01, Avi Kivity wrote: > On 10/24/2012 09:17 AM, Jan Kiszka wrote: >>>> >>>> This is ugly for many reasons. First of all, it is racy as the register >>>> content may change while dropping the device lock, no? Then you would >>>> raise or clear an IRQ spuriously. >>>> >>> Device state's intact is protected by busy flag, and will not broken >> >> Except that the busy flag concept is broken in itself. > > How do we fix an mmio that ends up mmio'ing back to itself, perhaps > indirectly? Note this is broken in mainline too, but in a different way. > > Do we introduce clever locks that can detect deadlocks? That problem is already addressed (to my understanding) by blocking nested MMIO in general. The brokenness of the busy flag is that it prevents concurrent MMIO by dropping requests. > >> I see that we have a all-or-nothing problem here: to address this >> properly, we need to convert the IRQ path to lock-less (or at least >> compatible with holding per-device locks) as well. > > There is a transitional path where writing to a register that can cause > IRQ changes takes both the big lock and the local lock. > > Eventually, though, of course all inner subsystems must be threaded for > this work to have value. > But that transitional path must not introduce regressions. Opening a race window between IRQ cause update and event injection is such a thing, just like dropping concurrent requests on the floor. Jan
On 2012-10-24 09:29, liu ping fan wrote: > On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote: >> On 2012-10-22 11:23, Liu Ping Fan wrote: >>> Use local lock to protect e1000. When calling the system function, >>> dropping the fine lock before acquiring the big lock. This will >>> introduce broken device state, which need extra effort to fix. >>> >>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com> >>> --- >>> hw/e1000.c | 24 +++++++++++++++++++++++- >>> 1 files changed, 23 insertions(+), 1 deletions(-) >>> >>> diff --git a/hw/e1000.c b/hw/e1000.c >>> index ae8a6c5..5eddab5 100644 >>> --- a/hw/e1000.c >>> +++ b/hw/e1000.c >>> @@ -85,6 +85,7 @@ typedef struct E1000State_st { >>> NICConf conf; >>> MemoryRegion mmio; >>> MemoryRegion io; >>> + QemuMutex e1000_lock; >>> >>> uint32_t mac_reg[0x8000]; >>> uint16_t phy_reg[0x20]; >>> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = { >>> static void >>> set_interrupt_cause(E1000State *s, int index, uint32_t val) >>> { >>> + QemuThread *t; >>> + >>> if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) { >>> /* Only for 8257x */ >>> val |= E1000_ICR_INT_ASSERTED; >>> } >>> s->mac_reg[ICR] = val; >>> s->mac_reg[ICS] = val; >>> - qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >>> + >>> + t = pthread_getspecific(qemu_thread_key); >>> + if (t->context_type == 1) { >>> + qemu_mutex_unlock(&s->e1000_lock); >>> + qemu_mutex_lock_iothread(); >>> + } >>> + if (DEVICE(s)->state < DEV_STATE_STOPPING) { >>> + qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >>> + } >>> + if (t->context_type == 1) { >>> + qemu_mutex_unlock_iothread(); >>> + qemu_mutex_lock(&s->e1000_lock); >>> + } >> >> This is ugly for many reasons. First of all, it is racy as the register >> content may change while dropping the device lock, no? Then you would >> raise or clear an IRQ spuriously. >> >> Second, it clearly shows that we need to address lock-less IRQ delivery. 
>> Almost nothing is won if we have to take the global lock again to push >> an IRQ event to the guest. I'm repeating myself, but the problem to be >> solved here is almost identical to fast IRQ delivery for assigned >> devices (which we only address pretty ad-hoc for PCI so far). >> > Interesting, could you show me more detail about it, so I can google... No need to look that far, just grep for pci_device_route_intx_to_irq, pci_device_set_intx_routing_notifier and related functions in the code. Jan
On 10/25/2012 11:31 AM, Jan Kiszka wrote: > On 2012-10-25 11:01, Avi Kivity wrote: >> On 10/24/2012 09:17 AM, Jan Kiszka wrote: >>>>> >>>>> This is ugly for many reasons. First of all, it is racy as the register >>>>> content may change while dropping the device lock, no? Then you would >>>>> raise or clear an IRQ spuriously. >>>>> >>>> Device state's intact is protected by busy flag, and will not broken >>> >>> Except that the busy flag concept is broken in itself. >> >> How do we fix an mmio that ends up mmio'ing back to itself, perhaps >> indirectly? Note this is broken in mainline too, but in a different way. >> >> Do we introduce clever locks that can detect deadlocks? > > That problem is already addressed (to my understanding) by blocking > nested MMIO in general. That doesn't work cross-thread. vcpu A: write to device X, dma-ing to device Y vcpu B: write to device Y, dma-ing to device X My suggestion was to drop the locks around DMA, then re-acquire the lock and re-validate data. > The brokenness of the busy flag is that it > prevents concurrent MMIO by dropping requests. Right. > >> >>> I see that we have a all-or-nothing problem here: to address this >>> properly, we need to convert the IRQ path to lock-less (or at least >>> compatible with holding per-device locks) as well. >> >> There is a transitional path where writing to a register that can cause >> IRQ changes takes both the big lock and the local lock. >> >> Eventually, though, of course all inner subsystems must be threaded for >> this work to have value. >> > > But that transitional path must not introduce regressions. Opening a > race window between IRQ cause update and event injection is such a > thing, just like dropping concurrent requests on the floor. Can you explain the race?
On 10/25/2012 03:34 PM, Jan Kiszka wrote: >>> Second, it clearly shows that we need to address lock-less IRQ delivery. >>> Almost nothing is won if we have to take the global lock again to push >>> an IRQ event to the guest. I'm repeating myself, but the problem to be >>> solved here is almost identical to fast IRQ delivery for assigned >>> devices (which we only address pretty ad-hoc for PCI so far). >>> >> Interesting, could you show me more detail about it, so I can google... > > No need to look that far, just grep for pci_device_route_intx_to_irq, > pci_device_set_intx_routing_notifier and related functions in the code. We can address it in the same way the memory core supports concurrency, by copying dispatch information into rcu or lock protected data structures. But I really hope we can avoid doing it now.
On 2012-10-25 18:21, Avi Kivity wrote: > On 10/25/2012 11:31 AM, Jan Kiszka wrote: >> On 2012-10-25 11:01, Avi Kivity wrote: >>> On 10/24/2012 09:17 AM, Jan Kiszka wrote: >>>>>> >>>>>> This is ugly for many reasons. First of all, it is racy as the register >>>>>> content may change while dropping the device lock, no? Then you would >>>>>> raise or clear an IRQ spuriously. >>>>>> >>>>> Device state's intact is protected by busy flag, and will not broken >>>> >>>> Except that the busy flag concept is broken in itself. >>> >>> How do we fix an mmio that ends up mmio'ing back to itself, perhaps >>> indirectly? Note this is broken in mainline too, but in a different way. >>> >>> Do we introduce clever locks that can detect deadlocks? >> >> That problem is already addressed (to my understanding) by blocking >> nested MMIO in general. > > That doesn't work cross-thread. > > vcpu A: write to device X, dma-ing to device Y > vcpu B: write to device Y, dma-ing to device X We will deny DMA-ing from device X on behalf of a VCPU, ie. in dispatch context, to Y. What we do not deny, though, is DMA-ing from an I/O thread that processes an event for device X. If the invoked callback of device X holds the device lock across some DMA request to Y, then we risk to run into the same ABBA issue. Hmm... > > My suggestion was to drop the locks around DMA, then re-acquire the lock > and re-validate data. Maybe possible, but hairy depending on the device model. > >> The brokenness of the busy flag is that it >> prevents concurrent MMIO by dropping requests. > > Right. > >> >>> >>>> I see that we have a all-or-nothing problem here: to address this >>>> properly, we need to convert the IRQ path to lock-less (or at least >>>> compatible with holding per-device locks) as well. >>> >>> There is a transitional path where writing to a register that can cause >>> IRQ changes takes both the big lock and the local lock. 
>>> >>> Eventually, though, of course all inner subsystems must be threaded for >>> this work to have value. >>> >> >> But that transitional path must not introduce regressions. Opening a >> race window between IRQ cause update and event injection is such a >> thing, just like dropping concurrent requests on the floor. > > Can you explain the race? Context A Context B device.lock ... device.set interrupt_cause = 0 lower_irq = true ... device.unlock device.lock ... device.interrupt_cause = 42 raise_irq = true ... device.unlock if (raise_irq) bql.lock set_irq(device.irqno) bql.unlock if (lower_irq) bql.lock clear_irq(device.irqno) bql.unlock And there it goes, our interrupt event. Jan
On 2012-10-25 18:23, Avi Kivity wrote: > On 10/25/2012 03:34 PM, Jan Kiszka wrote: > >>>> Second, it clearly shows that we need to address lock-less IRQ delivery. >>>> Almost nothing is won if we have to take the global lock again to push >>>> an IRQ event to the guest. I'm repeating myself, but the problem to be >>>> solved here is almost identical to fast IRQ delivery for assigned >>>> devices (which we only address pretty ad-hoc for PCI so far). >>>> >>> Interesting, could you show me more detail about it, so I can google... >> >> No need to look that far, just grep for pci_device_route_intx_to_irq, >> pci_device_set_intx_routing_notifier and related functions in the code. > > We can address it in the same way the memory core supports concurrency, > by copying dispatch information into rcu or lock protected data structures. > > But I really hope we can avoid doing it now. I doubt so as the alternative is taking the BQL while (still) holding the device lock. But that creates ABBA risks. Jan
On 10/25/2012 06:39 PM, Jan Kiszka wrote: >> >> That doesn't work cross-thread. >> >> vcpu A: write to device X, dma-ing to device Y >> vcpu B: write to device Y, dma-ing to device X > > We will deny DMA-ing from device X on behalf of a VCPU, ie. in dispatch > context, to Y. > > What we do not deny, though, is DMA-ing from an I/O thread that > processes an event for device X. I would really like to avoid depending on the context. In real hardware, there is no such thing. > If the invoked callback of device X > holds the device lock across some DMA request to Y, then we risk to run > into the same ABBA issue. Hmm... Yup. > >> >> My suggestion was to drop the locks around DMA, then re-acquire the lock >> and re-validate data. > > Maybe possible, but hairy depending on the device model. It's unpleasant, yes. Note depending on the device, we may not need to re-validate data, it may be sufficient to load it into local variables to we know it is consistent at some point. But all those solutions suffer from requiring device model authors to understand all those issues, rather than just add a simple lock around access to their data structures. >>>>> I see that we have a all-or-nothing problem here: to address this >>>>> properly, we need to convert the IRQ path to lock-less (or at least >>>>> compatible with holding per-device locks) as well. >>>> >>>> There is a transitional path where writing to a register that can cause >>>> IRQ changes takes both the big lock and the local lock. >>>> >>>> Eventually, though, of course all inner subsystems must be threaded for >>>> this work to have value. >>>> >>> >>> But that transitional path must not introduce regressions. Opening a >>> race window between IRQ cause update and event injection is such a >>> thing, just like dropping concurrent requests on the floor. >> >> Can you explain the race? > > Context A Context B > > device.lock > ... > device.set interrupt_cause = 0 > lower_irq = true > ... 
> device.unlock > device.lock > ... > device.interrupt_cause = 42 > raise_irq = true > ... > device.unlock > if (raise_irq) > bql.lock > set_irq(device.irqno) > bql.unlock > if (lower_irq) > bql.lock > clear_irq(device.irqno) > bql.unlock > > > And there it goes, our interrupt event. Obviously you'll need to reacquire the device lock after taking bql and revalidate stuff. But that is not what I am suggesting. Instead, any path that can lead to an irq update (or timer update etc) will take both the bql and the device lock. This will leave after the first pass only side effect free register reads and writes, which is silly if we keep it that way, but we will follow with a threaded timer and irq subsystem and we'll peel away those big locks. device_mmio_write: if register is involved in irq or timers or block layer or really anything that matters: bql.acquire device.lock.acquire do stuff device.lock.release if that big condition from above was true: bql.release
On 10/25/2012 06:41 PM, Jan Kiszka wrote: > On 2012-10-25 18:23, Avi Kivity wrote: >> On 10/25/2012 03:34 PM, Jan Kiszka wrote: >> >>>>> Second, it clearly shows that we need to address lock-less IRQ delivery. >>>>> Almost nothing is won if we have to take the global lock again to push >>>>> an IRQ event to the guest. I'm repeating myself, but the problem to be >>>>> solved here is almost identical to fast IRQ delivery for assigned >>>>> devices (which we only address pretty ad-hoc for PCI so far). >>>>> >>>> Interesting, could you show me more detail about it, so I can google... >>> >>> No need to look that far, just grep for pci_device_route_intx_to_irq, >>> pci_device_set_intx_routing_notifier and related functions in the code. >> >> We can address it in the same way the memory core supports concurrency, >> by copying dispatch information into rcu or lock protected data structures. >> >> But I really hope we can avoid doing it now. > > I doubt so as the alternative is taking the BQL while (still) holding > the device lock. Sorry, doesn't parse. > But that creates ABBA risks.
On 2012-10-25 19:02, Avi Kivity wrote: > On 10/25/2012 06:39 PM, Jan Kiszka wrote: >>> >>> That doesn't work cross-thread. >>> >>> vcpu A: write to device X, dma-ing to device Y >>> vcpu B: write to device Y, dma-ing to device X >> >> We will deny DMA-ing from device X on behalf of a VCPU, ie. in dispatch >> context, to Y. >> >> What we do not deny, though, is DMA-ing from an I/O thread that >> processes an event for device X. > > I would really like to avoid depending on the context. In real hardware, there is no such thing. The point is how we deal with any kind of access to a device that requires taking that device's lock while holding another lock, provided that scenario can also take place in reverse order at the same time. Known scenarios are: - vcpu 1 -> access device A -> access device B - vcpu 2 -> access device B -> access device A - event 1 -> device A event processing -> access device B - event 2 -> device B event processing -> access device A and combinations of those pairs. > >> If the invoked callback of device X >> holds the device lock across some DMA request to Y, then we risk to run >> into the same ABBA issue. Hmm... > > Yup. > >> >>> >>> My suggestion was to drop the locks around DMA, then re-acquire the lock >>> and re-validate data. >> >> Maybe possible, but hairy depending on the device model. > > It's unpleasant, yes. > > Note depending on the device, we may not need to re-validate data, it may be sufficient to load it into local variables to we know it is consistent at some point. But all those solutions suffer from requiring device model authors to understand all those issues, rather than just add a simple lock around access to their data structures. Right. And therefor it is a suboptimal way to start (patching). > >>>>>> I see that we have a all-or-nothing problem here: to address this >>>>>> properly, we need to convert the IRQ path to lock-less (or at least >>>>>> compatible with holding per-device locks) as well. 
>>>>> >>>>> There is a transitional path where writing to a register that can cause >>>>> IRQ changes takes both the big lock and the local lock. >>>>> >>>>> Eventually, though, of course all inner subsystems must be threaded for >>>>> this work to have value. >>>>> >>>> >>>> But that transitional path must not introduce regressions. Opening a >>>> race window between IRQ cause update and event injection is such a >>>> thing, just like dropping concurrent requests on the floor. >>> >>> Can you explain the race? >> >> Context A Context B >> >> device.lock >> ... >> device.set interrupt_cause = 0 >> lower_irq = true >> ... >> device.unlock >> device.lock >> ... >> device.interrupt_cause = 42 >> raise_irq = true >> ... >> device.unlock >> if (raise_irq) >> bql.lock >> set_irq(device.irqno) >> bql.unlock >> if (lower_irq) >> bql.lock >> clear_irq(device.irqno) >> bql.unlock >> >> >> And there it goes, our interrupt event. > > Obviously you'll need to reacquire the device lock after taking bql and revalidate stuff. But that is not what I am suggesting. Instead, any path that can lead to an irq update (or timer update etc) will take both the bql and the device lock. This will leave after the first pass only side effect free register reads and writes, which is silly if we keep it that way, but we will follow with a threaded timer and irq subsystem and we'll peel away those big locks. > > device_mmio_write: > if register is involved in irq or timers or block layer or really anything that matters: > bql.acquire > device.lock.acquire > do stuff > device.lock.release > if that big condition from above was true: > bql.release Looks simpler than it is as you cannot wrap complete handlers with that pattern. An example where it would fail (until we solved the locking issues above): mmio_write: bql.conditional_lock device.lock device.check_state issue_dma device.update_state update_irq, play_with_timers, etc. 
device.unlock bql.conditional_unlock If that DMA request hits an unconverted MMIO region or one that takes BQL conditionally as above, we will lock up (or bail out as our mutexes detect the error). E1000's start_xmit looks like this so far, and that's a pretty important service. Moreover, I prefer having a representative cut-through over enjoying to merge a first step that excludes some 80% of the problems. For that reason I would even be inclined to start with addressing the IRQ injection topic first (patch-wise), then the other necessary backend services for the e1000 or whatever and convert some device(s) last. IOW: cut out anything from this series that touches e1000 until the building blocks for converting it reasonably are finished. Carrying experimental, partially broken conversion on top is fine; trying to merge pieces of that is not, IMHO. Jan
On Fri, Oct 26, 2012 at 2:48 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote: > On 2012-10-25 19:02, Avi Kivity wrote: >> On 10/25/2012 06:39 PM, Jan Kiszka wrote: >>>> >>>> That doesn't work cross-thread. >>>> >>>> vcpu A: write to device X, dma-ing to device Y >>>> vcpu B: write to device Y, dma-ing to device X >>> >>> We will deny DMA-ing from device X on behalf of a VCPU, ie. in dispatch >>> context, to Y. >>> >>> What we do not deny, though, is DMA-ing from an I/O thread that >>> processes an event for device X. >> >> I would really like to avoid depending on the context. In real hardware, there is no such thing. > > The point is how we deal with any kind of access to a device that > requires taking that device's lock while holding another lock, provided > that scenario can also take place in reverse order at the same time. > Known scenarios are: > > - vcpu 1 -> access device A -> access device B > - vcpu 2 -> access device B -> access device A > > - event 1 -> device A event processing -> access device B > - event 2 -> device B event processing -> access device A > > and combinations of those pairs. > >> >>> If the invoked callback of device X >>> holds the device lock across some DMA request to Y, then we risk to run >>> into the same ABBA issue. Hmm... >> >> Yup. >> >>> >>>> >>>> My suggestion was to drop the locks around DMA, then re-acquire the lock >>>> and re-validate data. >>> >>> Maybe possible, but hairy depending on the device model. >> >> It's unpleasant, yes. >> >> Note depending on the device, we may not need to re-validate data, it may be sufficient to load it into local variables to we know it is consistent at some point. But all those solutions suffer from requiring device model authors to understand all those issues, rather than just add a simple lock around access to their data structures. > > Right. And therefor it is a suboptimal way to start (patching). 
> >> >>>>>>> I see that we have a all-or-nothing problem here: to address this >>>>>>> properly, we need to convert the IRQ path to lock-less (or at least >>>>>>> compatible with holding per-device locks) as well. >>>>>> >>>>>> There is a transitional path where writing to a register that can cause >>>>>> IRQ changes takes both the big lock and the local lock. >>>>>> >>>>>> Eventually, though, of course all inner subsystems must be threaded for >>>>>> this work to have value. >>>>>> >>>>> >>>>> But that transitional path must not introduce regressions. Opening a >>>>> race window between IRQ cause update and event injection is such a >>>>> thing, just like dropping concurrent requests on the floor. >>>> >>>> Can you explain the race? >>> >>> Context A Context B >>> >>> device.lock >>> ... >>> device.set interrupt_cause = 0 >>> lower_irq = true >>> ... >>> device.unlock >>> device.lock >>> ... >>> device.interrupt_cause = 42 >>> raise_irq = true >>> ... >>> device.unlock >>> if (raise_irq) >>> bql.lock >>> set_irq(device.irqno) >>> bql.unlock >>> if (lower_irq) >>> bql.lock >>> clear_irq(device.irqno) >>> bql.unlock >>> >>> >>> And there it goes, our interrupt event. >> >> Obviously you'll need to reacquire the device lock after taking bql and revalidate stuff. But that is not what I am suggesting. Instead, any path that can lead to an irq update (or timer update etc) will take both the bql and the device lock. This will leave after the first pass only side effect free register reads and writes, which is silly if we keep it that way, but we will follow with a threaded timer and irq subsystem and we'll peel away those big locks. >> >> device_mmio_write: >> if register is involved in irq or timers or block layer or really anything that matters: >> bql.acquire >> device.lock.acquire >> do stuff >> device.lock.release >> if that big condition from above was true: >> bql.release > > Looks simpler than it is as you cannot wrap complete handlers with that > pattern. 
An example where it would fail > issues above): > > mmio_write: > bql.conditional_lock > device.lock > device.check_state > issue_dma > device.update_state > update_irq, play_with_timers, etc. > device.unlock > bql.conditional_unlock > > If that DMA request hits an unconverted MMIO region or one that takes > BQL conditionally as above, we will lock up (or bail out as our mutexes > detect the error). E1000's start_xmit looks like this so far, and that's > a pretty import service. > > Moreover, I prefer having a representative cut-through over enjoying to > merge a first step that excludes some 80% of the problems. For that > reason I would be even be inclined to start with addressing the IRQ > injection topic first (patch-wise), then the other necessary backend > services for the e1000 or whatever and convert some device(s) last. > > IOW: cut out anything from this series that touches e1000 until the > building blocks for converting it reasonably are finished. Carrying > experimental, partially broken conversion on top is fine, try to merge > pieces of that not, IMHO. > Agreed. Just want to take this opportunity to discuss what is next and what is not. Regards, pingfan > Jan > > -- > Siemens AG, Corporate Technology, CT RTC ITP SDP-DE > Corporate Competence Center Embedded Linux
On Thu, Oct 25, 2012 at 9:34 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote: > On 2012-10-24 09:29, liu ping fan wrote: >> On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote: >>> On 2012-10-22 11:23, Liu Ping Fan wrote: >>>> Use local lock to protect e1000. When calling the system function, >>>> dropping the fine lock before acquiring the big lock. This will >>>> introduce broken device state, which need extra effort to fix. >>>> >>>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com> >>>> --- >>>> hw/e1000.c | 24 +++++++++++++++++++++++- >>>> 1 files changed, 23 insertions(+), 1 deletions(-) >>>> >>>> diff --git a/hw/e1000.c b/hw/e1000.c >>>> index ae8a6c5..5eddab5 100644 >>>> --- a/hw/e1000.c >>>> +++ b/hw/e1000.c >>>> @@ -85,6 +85,7 @@ typedef struct E1000State_st { >>>> NICConf conf; >>>> MemoryRegion mmio; >>>> MemoryRegion io; >>>> + QemuMutex e1000_lock; >>>> >>>> uint32_t mac_reg[0x8000]; >>>> uint16_t phy_reg[0x20]; >>>> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = { >>>> static void >>>> set_interrupt_cause(E1000State *s, int index, uint32_t val) >>>> { >>>> + QemuThread *t; >>>> + >>>> if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) { >>>> /* Only for 8257x */ >>>> val |= E1000_ICR_INT_ASSERTED; >>>> } >>>> s->mac_reg[ICR] = val; >>>> s->mac_reg[ICS] = val; >>>> - qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >>>> + >>>> + t = pthread_getspecific(qemu_thread_key); >>>> + if (t->context_type == 1) { >>>> + qemu_mutex_unlock(&s->e1000_lock); >>>> + qemu_mutex_lock_iothread(); >>>> + } >>>> + if (DEVICE(s)->state < DEV_STATE_STOPPING) { >>>> + qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >>>> + } >>>> + if (t->context_type == 1) { >>>> + qemu_mutex_unlock_iothread(); >>>> + qemu_mutex_lock(&s->e1000_lock); >>>> + } >>> >>> This is ugly for many reasons. First of all, it is racy as the register >>> content may change while dropping the device lock, no? 
Then you would >>> raise or clear an IRQ spuriously. >>> >>> Second, it clearly shows that we need to address lock-less IRQ delivery. >>> Almost nothing is won if we have to take the global lock again to push >>> an IRQ event to the guest. I'm repeating myself, but the problem to be >>> solved here is almost identical to fast IRQ delivery for assigned >>> devices (which we only address pretty ad-hoc for PCI so far). >>> >> Interesting, could you show me more detail about it, so I can google... > > No need to look that far, just grep for pci_device_route_intx_to_irq, > pci_device_set_intx_routing_notifier and related functions in the code. > I think, the major point here is to bypass the delivery process among the irq chipset during runtime. Right? Thanks and regards, pingfan > Jan > > -- > Siemens AG, Corporate Technology, CT RTC ITP SDP-DE > Corporate Competence Center Embedded Linux
On 2012-10-29 06:24, liu ping fan wrote: > On Thu, Oct 25, 2012 at 9:34 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote: >> On 2012-10-24 09:29, liu ping fan wrote: >>> On Tue, Oct 23, 2012 at 5:04 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote: >>>> On 2012-10-22 11:23, Liu Ping Fan wrote: >>>>> Use local lock to protect e1000. When calling the system function, >>>>> dropping the fine lock before acquiring the big lock. This will >>>>> introduce broken device state, which need extra effort to fix. >>>>> >>>>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com> >>>>> --- >>>>> hw/e1000.c | 24 +++++++++++++++++++++++- >>>>> 1 files changed, 23 insertions(+), 1 deletions(-) >>>>> >>>>> diff --git a/hw/e1000.c b/hw/e1000.c >>>>> index ae8a6c5..5eddab5 100644 >>>>> --- a/hw/e1000.c >>>>> +++ b/hw/e1000.c >>>>> @@ -85,6 +85,7 @@ typedef struct E1000State_st { >>>>> NICConf conf; >>>>> MemoryRegion mmio; >>>>> MemoryRegion io; >>>>> + QemuMutex e1000_lock; >>>>> >>>>> uint32_t mac_reg[0x8000]; >>>>> uint16_t phy_reg[0x20]; >>>>> @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = { >>>>> static void >>>>> set_interrupt_cause(E1000State *s, int index, uint32_t val) >>>>> { >>>>> + QemuThread *t; >>>>> + >>>>> if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) { >>>>> /* Only for 8257x */ >>>>> val |= E1000_ICR_INT_ASSERTED; >>>>> } >>>>> s->mac_reg[ICR] = val; >>>>> s->mac_reg[ICS] = val; >>>>> - qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >>>>> + >>>>> + t = pthread_getspecific(qemu_thread_key); >>>>> + if (t->context_type == 1) { >>>>> + qemu_mutex_unlock(&s->e1000_lock); >>>>> + qemu_mutex_lock_iothread(); >>>>> + } >>>>> + if (DEVICE(s)->state < DEV_STATE_STOPPING) { >>>>> + qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >>>>> + } >>>>> + if (t->context_type == 1) { >>>>> + qemu_mutex_unlock_iothread(); >>>>> + qemu_mutex_lock(&s->e1000_lock); >>>>> + } >>>> >>>> This is ugly for many reasons. 
First of all, it is racy as the register >>>> content may change while dropping the device lock, no? Then you would >>>> raise or clear an IRQ spuriously. >>>> >>>> Second, it clearly shows that we need to address lock-less IRQ delivery. >>>> Almost nothing is won if we have to take the global lock again to push >>>> an IRQ event to the guest. I'm repeating myself, but the problem to be >>>> solved here is almost identical to fast IRQ delivery for assigned >>>> devices (which we only address pretty ad-hoc for PCI so far). >>>> >>> Interesting, could you show me more detail about it, so I can google... >> >> No need to look that far, just grep for pci_device_route_intx_to_irq, >> pci_device_set_intx_routing_notifier and related functions in the code. >> > I think, the major point here is to bypass the delivery process among > the irq chipset during runtime. Right? Right. Jan
diff --git a/hw/e1000.c b/hw/e1000.c index ae8a6c5..5eddab5 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -85,6 +85,7 @@ typedef struct E1000State_st { NICConf conf; MemoryRegion mmio; MemoryRegion io; + QemuMutex e1000_lock; uint32_t mac_reg[0x8000]; uint16_t phy_reg[0x20]; @@ -223,13 +224,27 @@ static const uint32_t mac_reg_init[] = { static void set_interrupt_cause(E1000State *s, int index, uint32_t val) { + QemuThread *t; + if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) { /* Only for 8257x */ val |= E1000_ICR_INT_ASSERTED; } s->mac_reg[ICR] = val; s->mac_reg[ICS] = val; - qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); + + t = pthread_getspecific(qemu_thread_key); + if (t->context_type == 1) { + qemu_mutex_unlock(&s->e1000_lock); + qemu_mutex_lock_iothread(); + } + if (DEVICE(s)->state < DEV_STATE_STOPPING) { + qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); + } + if (t->context_type == 1) { + qemu_mutex_unlock_iothread(); + qemu_mutex_lock(&s->e1000_lock); + } } static void @@ -268,6 +283,7 @@ static void e1000_reset(void *opaque) E1000State *d = opaque; qemu_del_timer(d->autoneg_timer); + memset(d->phy_reg, 0, sizeof d->phy_reg); memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init); memset(d->mac_reg, 0, sizeof d->mac_reg); @@ -448,7 +464,11 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size) if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) { s->nic->nc.info->receive(&s->nic->nc, buf, size); } else { + qemu_mutex_unlock(&s->e1000_lock); + qemu_mutex_lock_iothread(); qemu_send_packet(&s->nic->nc, buf, size); + qemu_mutex_unlock_iothread(); + qemu_mutex_lock(&s->e1000_lock); } } @@ -1221,6 +1241,8 @@ static int pci_e1000_init(PCIDevice *pci_dev) int i; uint8_t *macaddr; + qemu_mutex_init(&d->e1000_lock); + pci_conf = d->dev.config; /* TODO: RST# value should be 0, PCI spec 6.2.4 */
Use local lock to protect e1000. When calling the system function, dropping the fine lock before acquiring the big lock. This will introduce broken device state, which needs extra effort to fix. Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com> --- hw/e1000.c | 24 +++++++++++++++++++++++- 1 files changed, 23 insertions(+), 1 deletions(-)