
[5/9] memory: let address_space_rw/ld*/st* run outside the BQL

Message ID 1434646046-27150-6-git-send-email-pbonzini@redhat.com
State New

Commit Message

Paolo Bonzini June 18, 2015, 4:47 p.m. UTC
From: Jan Kiszka <jan.kiszka@siemens.com>

The MMIO case is further broken up into two cases: if the caller does not
hold the BQL on invocation, the unlocked one takes or avoids BQL depending
on the locking strategy of the target memory region and its coalesced
MMIO handling.  In this case, the caller should not hold _any_ lock
(a friendly suggestion which is disregarded by virtio-scsi-dataplane).

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 exec.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 9 deletions(-)

Comments

Alex Bennée June 24, 2015, 4:56 p.m. UTC | #1
Paolo Bonzini <pbonzini@redhat.com> writes:

> From: Jan Kiszka <jan.kiszka@siemens.com>
>
> The MMIO case is further broken up into two cases: if the caller does not
> hold the BQL on invocation, the unlocked one takes or avoids BQL depending
> on the locking strategy of the target memory region and its coalesced
> MMIO handling.  In this case, the caller should not hold _any_ lock
> (a friendly suggestion which is disregarded by virtio-scsi-dataplane).
>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  exec.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 57 insertions(+), 9 deletions(-)
>
> diff --git a/exec.c b/exec.c
> index 094f87e..78c99f6 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -48,6 +48,7 @@
>  #endif
>  #include "exec/cpu-all.h"
>  #include "qemu/rcu_queue.h"
> +#include "qemu/main-loop.h"
>  #include "exec/cputlb.h"
>  #include "translate-all.h"
>  
> @@ -2318,11 +2319,27 @@ static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
>      return l;
>  }
>  
> -static void prepare_mmio_access(MemoryRegion *mr)
> +static bool prepare_mmio_access(MemoryRegion *mr)
>  {
> +    bool unlocked = !qemu_mutex_iothread_locked();
> +    bool release_lock = false;
> +
> +    if (unlocked && mr->global_locking) {
> +        qemu_mutex_lock_iothread();
> +        unlocked = false;
> +        release_lock = true;
> +    }
>      if (mr->flush_coalesced_mmio) {
> +        if (unlocked) {
> +            qemu_mutex_lock_iothread();
> +        }
>          qemu_flush_coalesced_mmio_buffer();
> +        if (unlocked) {
> +            qemu_mutex_unlock_iothread();
> +        }
>      }
> +
> +    return release_lock;
>  }

This is where I get confused about the advantage of this over
same-pid recursive locking. If you use recursive locking you don't need
to add a bunch of state to remind you of when to release the lock. Then
you'd just need:

static void prepare_mmio_access(MemoryRegion *mr)
{
    if (mr->global_locking) {
        qemu_mutex_lock_iothread();
    }
    if (mr->flush_coalesced_mmio) {
        qemu_mutex_lock_iothread();
        qemu_flush_coalesced_mmio_buffer();
        qemu_mutex_unlock_iothread();
    }
}

and a bunch of:

if (mr->global_locking)
   qemu_mutex_unlock_iothread();

in the access functions. Although I suspect you could push the
mr->global_locking up to the dispatch functions.

Am I missing something here?

>  
>  MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
> @@ -2334,6 +2351,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
>      hwaddr addr1;
>      MemoryRegion *mr;
>      MemTxResult result = MEMTX_OK;
> +    bool release_lock = false;
>  
>      rcu_read_lock();
>      while (len > 0) {
> @@ -2342,7 +2360,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
>  
>          if (is_write) {
>              if (!memory_access_is_direct(mr, is_write)) {
> -                prepare_mmio_access(mr);
> +                release_lock |= prepare_mmio_access(mr);
>                  l = memory_access_size(mr, l, addr1);
>                  /* XXX: could force current_cpu to NULL to avoid
>                     potential bugs */
> @@ -2384,7 +2402,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
>          } else {
>              if (!memory_access_is_direct(mr, is_write)) {
>                  /* I/O case */
> -                prepare_mmio_access(mr);
> +                release_lock |= prepare_mmio_access(mr);
>                  l = memory_access_size(mr, l, addr1);
>                  switch (l) {
>                  case 8:
> @@ -2420,6 +2438,12 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
>                  memcpy(buf, ptr, l);
>              }
>          }
> +
> +        if (release_lock) {
> +            qemu_mutex_unlock_iothread();
> +            release_lock = false;
> +        }
> +
>          len -= l;
>          buf += l;
>          addr += l;
> @@ -2746,11 +2770,12 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr,
>      hwaddr l = 4;
>      hwaddr addr1;
>      MemTxResult r;
> +    bool release_lock = false;
>  
>      rcu_read_lock();
>      mr = address_space_translate(as, addr, &addr1, &l, false);
>      if (l < 4 || !memory_access_is_direct(mr, false)) {
> -        prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr);
>  
>          /* I/O case */
>          r = memory_region_dispatch_read(mr, addr1, &val, 4, attrs);
> @@ -2784,6 +2809,9 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr,
>      if (result) {
>          *result = r;
>      }
> +    if (release_lock) {
> +        qemu_mutex_unlock_iothread();
> +    }
>      rcu_read_unlock();
>      return val;
>  }
> @@ -2836,12 +2864,13 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr,
>      hwaddr l = 8;
>      hwaddr addr1;
>      MemTxResult r;
> +    bool release_lock = false;
>  
>      rcu_read_lock();
>      mr = address_space_translate(as, addr, &addr1, &l,
>                                   false);
>      if (l < 8 || !memory_access_is_direct(mr, false)) {
> -        prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr);
>  
>          /* I/O case */
>          r = memory_region_dispatch_read(mr, addr1, &val, 8, attrs);
> @@ -2875,6 +2904,9 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr,
>      if (result) {
>          *result = r;
>      }
> +    if (release_lock) {
> +        qemu_mutex_unlock_iothread();
> +    }
>      rcu_read_unlock();
>      return val;
>  }
> @@ -2947,12 +2979,13 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as,
>      hwaddr l = 2;
>      hwaddr addr1;
>      MemTxResult r;
> +    bool release_lock = false;
>  
>      rcu_read_lock();
>      mr = address_space_translate(as, addr, &addr1, &l,
>                                   false);
>      if (l < 2 || !memory_access_is_direct(mr, false)) {
> -        prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr);
>  
>          /* I/O case */
>          r = memory_region_dispatch_read(mr, addr1, &val, 2, attrs);
> @@ -2986,6 +3019,9 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as,
>      if (result) {
>          *result = r;
>      }
> +    if (release_lock) {
> +        qemu_mutex_unlock_iothread();
> +    }
>      rcu_read_unlock();
>      return val;
>  }
> @@ -3037,12 +3073,13 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val,
>      hwaddr l = 4;
>      hwaddr addr1;
>      MemTxResult r;
> +    bool release_lock = false;
>  
>      rcu_read_lock();
>      mr = address_space_translate(as, addr, &addr1, &l,
>                                   true);
>      if (l < 4 || !memory_access_is_direct(mr, true)) {
> -        prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr);
>  
>          r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
>      } else {
> @@ -3063,6 +3100,9 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val,
>      if (result) {
>          *result = r;
>      }
> +    if (release_lock) {
> +        qemu_mutex_unlock_iothread();
> +    }
>      rcu_read_unlock();
>  }
>  
> @@ -3083,12 +3123,13 @@ static inline void address_space_stl_internal(AddressSpace *as,
>      hwaddr l = 4;
>      hwaddr addr1;
>      MemTxResult r;
> +    bool release_lock = false;
>  
>      rcu_read_lock();
>      mr = address_space_translate(as, addr, &addr1, &l,
>                                   true);
>      if (l < 4 || !memory_access_is_direct(mr, true)) {
> -        prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr);
>  
>  #if defined(TARGET_WORDS_BIGENDIAN)
>          if (endian == DEVICE_LITTLE_ENDIAN) {
> @@ -3121,6 +3162,9 @@ static inline void address_space_stl_internal(AddressSpace *as,
>      if (result) {
>          *result = r;
>      }
> +    if (release_lock) {
> +        qemu_mutex_unlock_iothread();
> +    }
>      rcu_read_unlock();
>  }
>  
> @@ -3190,11 +3234,12 @@ static inline void address_space_stw_internal(AddressSpace *as,
>      hwaddr l = 2;
>      hwaddr addr1;
>      MemTxResult r;
> +    bool release_lock = false;
>  
>      rcu_read_lock();
>      mr = address_space_translate(as, addr, &addr1, &l, true);
>      if (l < 2 || !memory_access_is_direct(mr, true)) {
> -        prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr);
>  
>  #if defined(TARGET_WORDS_BIGENDIAN)
>          if (endian == DEVICE_LITTLE_ENDIAN) {
> @@ -3227,6 +3272,9 @@ static inline void address_space_stw_internal(AddressSpace *as,
>      if (result) {
>          *result = r;
>      }
> +    if (release_lock) {
> +        qemu_mutex_unlock_iothread();
> +    }
>      rcu_read_unlock();
>  }
Paolo Bonzini June 24, 2015, 5:21 p.m. UTC | #2
On 24/06/2015 18:56, Alex Bennée wrote:
> This is where I get confused about the advantage of this over
> same-pid recursive locking. If you use recursive locking you don't need
> to add a bunch of state to remind you of when to release the lock. Then
> you'd just need:
> 
> static void prepare_mmio_access(MemoryRegion *mr)
> {
>     if (mr->global_locking) {
>         qemu_mutex_lock_iothread();
>     }
>     if (mr->flush_coalesced_mmio) {
>         qemu_mutex_lock_iothread();
>         qemu_flush_coalesced_mmio_buffer();
>         qemu_mutex_unlock_iothread();
>     }
> }
> 
> and a bunch of:
> 
> if (mr->global_locking)
>    qemu_mutex_unlock_iothread();
> 
> in the access functions. Although I suspect you could push the
> mr->global_locking up to the dispatch functions.
> 
> Am I missing something here?

The semantics of recursive locking with respect to condition variables
are not clear.  Either cond_wait releases all locks, and then the mutex
can be released when the code doesn't expect it to be, or cond_wait
doesn't release all locks and then you have deadlocks.

POSIX says to do the latter:

	It is advised that an application should not use a
	PTHREAD_MUTEX_RECURSIVE mutex with condition variables because
	the implicit unlock performed for a pthread_cond_timedwait() or
	pthread_cond_wait() may not actually release the mutex (if it
	had been locked multiple times). If this happens, no other
	thread can satisfy the condition of the predicate."

So, recursive locking is okay if you don't have condition variables
attached to the lock (and if you cannot do without it), but
qemu_global_mutex does have them.
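
To make that failure mode concrete, here is a minimal standalone sketch
(not part of the patch; the thread, predicate and variable names are
invented) of the situation the POSIX text above warns about:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool ready;

static void *signaler(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);   /* blocks forever: the waiter still owns the mutex */
    ready = true;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_mutexattr_t attr;
    pthread_t t;

    pthread_mutexattr_init(&attr);
    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
    pthread_mutex_init(&lock, &attr);

    pthread_mutex_lock(&lock);   /* depth 1, e.g. an outer caller  */
    pthread_mutex_lock(&lock);   /* depth 2, e.g. a nested helper  */

    pthread_create(&t, NULL, signaler, NULL);
    while (!ready) {
        /* On typical implementations this releases only one of the two
         * levels, so we sleep while still holding the mutex and the
         * signaler can never make the predicate true: deadlock. */
        pthread_cond_wait(&cond, &lock);
    }
    pthread_mutex_unlock(&lock);
    pthread_mutex_unlock(&lock);
    return 0;
}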

QEMU has so far tried to use the solution that Stevens outlines here:
https://books.google.it/books?id=kCTMFpEcIOwC&pg=PA434

	... Leave the interfaces to func1 and func2 unchanged, and avoid
	a recursive mutex by providing a private version of func2,
	called func2_locked.  To call func2_locked, hold the mutex
	embedded in the data structure whose address we pass as the
	argument.

as a way to avoid recursive locking.  This is much better because a) it
is more efficient---taking locks can be expensive even if they're
uncontended, especially if your VM spans multiple NUMA nodes on the
host; b) it is always clear when a lock is taken and when it isn't.

Note that Stevens has another example right after this one of recursive
locking, involving callbacks, but it's ill-defined.  There's no reason
for the "timeout" function in page 437 to hold the mutex when it calls
"func".  It can unlock before and re-lock afterwards, like QEMU's own
timerlist_run_timers function.
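
The unlock-around-callback shape that timerlist_run_timers follows is
roughly the following (a sketch; the Timer/TimerList types and names are
invented, again using QemuMutex from "qemu/thread.h"):

typedef struct Timer Timer;
struct Timer {
    Timer *next;
    void (*cb)(void *opaque);
    void *opaque;
};

typedef struct {
    QemuMutex lock;
    Timer *head;
} TimerList;

static void timer_list_run(TimerList *tl)
{
    qemu_mutex_lock(&tl->lock);
    while (tl->head) {
        Timer *t = tl->head;
        tl->head = t->next;
        /* Drop the lock around the callback so the callback itself may
         * take locks -- including this one -- without any recursion. */
        qemu_mutex_unlock(&tl->lock);
        t->cb(t->opaque);
        qemu_mutex_lock(&tl->lock);
    }
    qemu_mutex_unlock(&tl->lock);
}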

Paolo
Alex Bennée June 24, 2015, 6:50 p.m. UTC | #3
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 24/06/2015 18:56, Alex Bennée wrote:
>> This is where I get confused about the advantage of this over
>> same-pid recursive locking. If you use recursive locking you don't need
>> to add a bunch of state to remind you of when to release the lock. Then
>> you'd just need:
>> 
>> static void prepare_mmio_access(MemoryRegion *mr)
>> {
>>     if (mr->global_locking) {
>>         qemu_mutex_lock_iothread();
>>     }
>>     if (mr->flush_coalesced_mmio) {
>>         qemu_mutex_lock_iothread();
>>         qemu_flush_coalesced_mmio_buffer();
>>         qemu_mutex_unlock_iothread();
>>     }
>> }
>> 
>> and a bunch of:
>> 
>> if (mr->global_locking)
>>    qemu_mutex_unlock_iothread();
>> 
>> in the access functions. Although I suspect you could push the
>> mr->global_locking up to the dispatch functions.
>> 
>> Am I missing something here?
>
> The semantics of recursive locking with respect to condition variables
> are not clear.  Either cond_wait releases all locks, and then the mutex
> can be released when the code doesn't expect it to be, or cond_wait
> doesn't release all locks and then you have deadlocks.
>
> POSIX says to do the latter:
>
> 	It is advised that an application should not use a
> 	PTHREAD_MUTEX_RECURSIVE mutex with condition variables because
> 	the implicit unlock performed for a pthread_cond_timedwait() or
> 	pthread_cond_wait() may not actually release the mutex (if it
> 	had been locked multiple times). If this happens, no other
> 	thread can satisfy the condition of the predicate."
>
> So, recursive locking is okay if you don't have condition variables
> attached to the lock (and if you cannot do without it), but
> qemu_global_mutex does have them.

Ahh OK, so I was missing something ;-)

>
> QEMU has so far tried to use the solution that Stevens outlines here:
> https://books.google.it/books?id=kCTMFpEcIOwC&pg=PA434
>
> 	... Leave the interfaces to func1 and func2 unchanged, and avoid
> 	a recursive mutex by providing a private version of func2,
> 	called func2_locked.  To call func2_locked, hold the mutex
> 	embedded in the data structure whose address we pass as the
> 	argument.
>
> as a way to avoid recursive locking.  This is much better because a) it
> is more efficient---taking locks can be expensive even if they're
> uncontended, especially if your VM spans multiple NUMA nodes on the
> host; b) it is always clear when a lock is taken and when it isn't.
>
> Note that Stevens has another example right after this one of recursive
> locking, involving callbacks, but it's ill-defined.  There's no reason
> for the "timeout" function in page 437 to hold the mutex when it calls
> "func".  It can unlock before and re-lock afterwards, like QEMU's own
> timerlist_run_timers function.

Unfortunately I can't read that link but it sounds like I should get
myself a copy of the book. I take it that approach wouldn't approve of:

static __thread int iothread_lock_count;

void qemu_mutex_lock_iothread(void)
{
    if (iothread_lock_count == 0) {
        qemu_mutex_lock(&qemu_global_mutex);
    }
    iothread_lock_count++;
}

void qemu_mutex_unlock_iothread(void)
{
    iothread_lock_count--;
    if (iothread_lock_count==0) {
        qemu_mutex_unlock(&qemu_global_mutex);
    }
    if (iothread_lock_count < 0) {
        fprintf(stderr,"%s: error, too many unlocks %d\n", __func__,
                iothread_lock_count);
    }
}

Which should achieve the same "only one lock held" semantics but still
make the calling code a little less worried about tracking the state.

I guess it depends if there is ever going to be a situation where we say
"lock is held, do something different"?

>
> Paolo
Paolo Bonzini June 25, 2015, 8:13 a.m. UTC | #4
On 24/06/2015 20:50, Alex Bennée wrote:
> 
> Paolo Bonzini <pbonzini@redhat.com> writes:
> 
>> On 24/06/2015 18:56, Alex Bennée wrote:
>>> This is where I get confused about the advantage of this over
>>> same-pid recursive locking. If you use recursive locking you don't need
>>> to add a bunch of state to remind you of when to release the lock.
>>>
>>> Am I missing something here?
>>
>> The semantics of recursive locking with respect to condition variables
>> are not clear.  Either cond_wait releases all locks, and then the mutex
>> can be released when the code doesn't expect it to be, or cond_wait
>> doesn't release all locks and then you have deadlocks.
>>
>> So, recursive locking is okay if you don't have condition variables
>> attached to the lock (and if you cannot do without it), but
>> qemu_global_mutex does have them.
> 
> Ahh OK, so I was missing something ;-)
> 
>> QEMU has so far tried to use the solution that Stevens outlines here:
>> https://books.google.it/books?id=kCTMFpEcIOwC&pg=PA434
> 
> Unfortunately I can't read that link but it sounds like I should get
> myself a copy of the book.

Try following the link from
https://en.wikipedia.org/wiki/Reentrant_mutex#References.

> I take it that approach wouldn't approve of:
> 
> static __thread int iothread_lock_count;
> 
> void qemu_mutex_lock_iothread(void)
> {
>     if (iothread_lock_count == 0) {
>         qemu_mutex_lock(&qemu_global_mutex);
>     }
>     iothread_lock_count++;
> }
> 
> void qemu_mutex_unlock_iothread(void)
> {
>     iothread_lock_count--;
>     if (iothread_lock_count==0) {
>         qemu_mutex_unlock(&qemu_global_mutex);
>     }
>     if (iothread_lock_count < 0) {
>         fprintf(stderr,"%s: error, too many unlocks %d\n", __func__,
>                 iothread_lock_count);
>     }
> }
> 
> Which should achieve the same "only one lock held" semantics but still
> make the calling code a little less worried about tracking the state.

This is effectively implementing the "other" semantics: cond_wait always
drops the lock.
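
Concretely (a sketch with an invented helper, predicate and condition
variable, assumed to be declared elsewhere; qemu_cond_wait and
qemu_global_mutex are the real names):

void wait_for_io_done(void)          /* called with the BQL already held once */
{
    qemu_mutex_lock_iothread();      /* count 1 -> 2, the real mutex is untouched */
    while (!io_done) {
        /* There is only one real mutex underneath, so this releases
         * qemu_global_mutex completely even though the counter still
         * says 2 -- every caller up the stack silently loses the BQL. */
        qemu_cond_wait(&io_done_cond, &qemu_global_mutex);
    }
    qemu_mutex_unlock_iothread();    /* count 2 -> 1, the mutex stays held */
}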

BTW, fine-grained recursive mutexes are bad for another reason: you can
think of "getting the mutex" as "ensuring all the data structure's
invariants are respected" (at the time you acquire the lock, no other
thread is modifying the state, so any invariant left at unlock time must
still be there).  This is not true if you can get the mutex recursively.
But I'm honestly not sure how much of this argument applies to
something as coarse as the iothread lock.

The best argument I have against recursive mutexes is that it's really a
one-way street.  Once you've decided to make a mutex recursive, it's
really hard to make it non-recursive.

Paolo

>>
>> Paolo
>

Patch

diff --git a/exec.c b/exec.c
index 094f87e..78c99f6 100644
--- a/exec.c
+++ b/exec.c
@@ -48,6 +48,7 @@ 
 #endif
 #include "exec/cpu-all.h"
 #include "qemu/rcu_queue.h"
+#include "qemu/main-loop.h"
 #include "exec/cputlb.h"
 #include "translate-all.h"
 
@@ -2318,11 +2319,27 @@  static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
     return l;
 }
 
-static void prepare_mmio_access(MemoryRegion *mr)
+static bool prepare_mmio_access(MemoryRegion *mr)
 {
+    bool unlocked = !qemu_mutex_iothread_locked();
+    bool release_lock = false;
+
+    if (unlocked && mr->global_locking) {
+        qemu_mutex_lock_iothread();
+        unlocked = false;
+        release_lock = true;
+    }
     if (mr->flush_coalesced_mmio) {
+        if (unlocked) {
+            qemu_mutex_lock_iothread();
+        }
         qemu_flush_coalesced_mmio_buffer();
+        if (unlocked) {
+            qemu_mutex_unlock_iothread();
+        }
     }
+
+    return release_lock;
 }
 
 MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
@@ -2334,6 +2351,7 @@  MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
     hwaddr addr1;
     MemoryRegion *mr;
     MemTxResult result = MEMTX_OK;
+    bool release_lock = false;
 
     rcu_read_lock();
     while (len > 0) {
@@ -2342,7 +2360,7 @@  MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
 
         if (is_write) {
             if (!memory_access_is_direct(mr, is_write)) {
-                prepare_mmio_access(mr);
+                release_lock |= prepare_mmio_access(mr);
                 l = memory_access_size(mr, l, addr1);
                 /* XXX: could force current_cpu to NULL to avoid
                    potential bugs */
@@ -2384,7 +2402,7 @@  MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
         } else {
             if (!memory_access_is_direct(mr, is_write)) {
                 /* I/O case */
-                prepare_mmio_access(mr);
+                release_lock |= prepare_mmio_access(mr);
                 l = memory_access_size(mr, l, addr1);
                 switch (l) {
                 case 8:
@@ -2420,6 +2438,12 @@  MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                 memcpy(buf, ptr, l);
             }
         }
+
+        if (release_lock) {
+            qemu_mutex_unlock_iothread();
+            release_lock = false;
+        }
+
         len -= l;
         buf += l;
         addr += l;
@@ -2746,11 +2770,12 @@  static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr,
     hwaddr l = 4;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l, false);
     if (l < 4 || !memory_access_is_direct(mr, false)) {
-        prepare_mmio_access(mr);
+        release_lock |= prepare_mmio_access(mr);
 
         /* I/O case */
         r = memory_region_dispatch_read(mr, addr1, &val, 4, attrs);
@@ -2784,6 +2809,9 @@  static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr,
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
     return val;
 }
@@ -2836,12 +2864,13 @@  static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr,
     hwaddr l = 8;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l,
                                  false);
     if (l < 8 || !memory_access_is_direct(mr, false)) {
-        prepare_mmio_access(mr);
+        release_lock |= prepare_mmio_access(mr);
 
         /* I/O case */
         r = memory_region_dispatch_read(mr, addr1, &val, 8, attrs);
@@ -2875,6 +2904,9 @@  static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr,
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
     return val;
 }
@@ -2947,12 +2979,13 @@  static inline uint32_t address_space_lduw_internal(AddressSpace *as,
     hwaddr l = 2;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l,
                                  false);
     if (l < 2 || !memory_access_is_direct(mr, false)) {
-        prepare_mmio_access(mr);
+        release_lock |= prepare_mmio_access(mr);
 
         /* I/O case */
         r = memory_region_dispatch_read(mr, addr1, &val, 2, attrs);
@@ -2986,6 +3019,9 @@  static inline uint32_t address_space_lduw_internal(AddressSpace *as,
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
     return val;
 }
@@ -3037,12 +3073,13 @@  void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val,
     hwaddr l = 4;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l,
                                  true);
     if (l < 4 || !memory_access_is_direct(mr, true)) {
-        prepare_mmio_access(mr);
+        release_lock |= prepare_mmio_access(mr);
 
         r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
     } else {
@@ -3063,6 +3100,9 @@  void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val,
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
 }
 
@@ -3083,12 +3123,13 @@  static inline void address_space_stl_internal(AddressSpace *as,
     hwaddr l = 4;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l,
                                  true);
     if (l < 4 || !memory_access_is_direct(mr, true)) {
-        prepare_mmio_access(mr);
+        release_lock |= prepare_mmio_access(mr);
 
 #if defined(TARGET_WORDS_BIGENDIAN)
         if (endian == DEVICE_LITTLE_ENDIAN) {
@@ -3121,6 +3162,9 @@  static inline void address_space_stl_internal(AddressSpace *as,
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
 }
 
@@ -3190,11 +3234,12 @@  static inline void address_space_stw_internal(AddressSpace *as,
     hwaddr l = 2;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l, true);
     if (l < 2 || !memory_access_is_direct(mr, true)) {
-        prepare_mmio_access(mr);
+        release_lock |= prepare_mmio_access(mr);
 
 #if defined(TARGET_WORDS_BIGENDIAN)
         if (endian == DEVICE_LITTLE_ENDIAN) {
@@ -3227,6 +3272,9 @@  static inline void address_space_stw_internal(AddressSpace *as,
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
 }