diff mbox series

[v3,1/1] oslib-posix: initialize backend memory objects in parallel

Message ID 20240131134843.3074922-2-mark.kanda@oracle.com
State New
Headers show
Series Initialize backend memory objects in parallel | expand

Commit Message

Mark Kanda Jan. 31, 2024, 1:48 p.m. UTC
QEMU initializes preallocated backend memory as the objects are parsed from
the command line. This is not optimal in some cases (e.g. memory spanning
multiple NUMA nodes) because the memory objects are initialized in series.

Allow the initialization to occur in parallel (asynchronously). In order to
ensure optimal thread placement, asynchronous initialization requires prealloc
context threads to be in use.

Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 backends/hostmem.c     |   8 ++-
 hw/virtio/virtio-mem.c |   4 +-
 include/qemu/osdep.h   |  18 +++++-
 system/vl.c            |   8 +++
 util/oslib-posix.c     | 131 +++++++++++++++++++++++++++++++----------
 util/oslib-win32.c     |   8 ++-
 6 files changed, 140 insertions(+), 37 deletions(-)

Comments

David Hildenbrand Jan. 31, 2024, 2:04 p.m. UTC | #1
On 31.01.24 14:48, Mark Kanda wrote:
> QEMU initializes preallocated backend memory as the objects are parsed from
> the command line. This is not optimal in some cases (e.g. memory spanning
> multiple NUMA nodes) because the memory objects are initialized in series.
> 
> Allow the initialization to occur in parallel (asynchronously). In order to
> ensure optimal thread placement, asynchronous initialization requires prealloc
> context threads to be in use.
> 
> Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>   backends/hostmem.c     |   8 ++-
>   hw/virtio/virtio-mem.c |   4 +-
>   include/qemu/osdep.h   |  18 +++++-
>   system/vl.c            |   8 +++
>   util/oslib-posix.c     | 131 +++++++++++++++++++++++++++++++----------
>   util/oslib-win32.c     |   8 ++-
>   6 files changed, 140 insertions(+), 37 deletions(-)
> 
> diff --git a/backends/hostmem.c b/backends/hostmem.c
> index 30f69b2cb5..8f602dc86f 100644
> --- a/backends/hostmem.c
> +++ b/backends/hostmem.c
> @@ -20,6 +20,7 @@
>   #include "qom/object_interfaces.h"
>   #include "qemu/mmap-alloc.h"
>   #include "qemu/madvise.h"
> +#include "hw/qdev-core.h"
>   
>   #ifdef CONFIG_NUMA
>   #include <numaif.h>
> @@ -235,9 +236,10 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
>           int fd = memory_region_get_fd(&backend->mr);
>           void *ptr = memory_region_get_ram_ptr(&backend->mr);
>           uint64_t sz = memory_region_size(&backend->mr);
> +        bool async = !phase_check(PHASE_MACHINE_INITIALIZED);
>   
>           if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
> -                               backend->prealloc_context, errp)) {
> +                               backend->prealloc_context, async, errp)) {
>               return;
>           }

I think we will never trigger that case: we would have to set the 
property after the device was already initialized, which shouldn't happen.

So I guess we can simplify and drop that.

>           backend->prealloc = true;


[...]

> +++ b/include/qemu/osdep.h
> @@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext;
>    * @area: start address of the are to preallocate
>    * @sz: the size of the area to preallocate
>    * @max_threads: maximum number of threads to use
> + * @tc: prealloc context threads pointer, NULL if not in use
> + * @async: request asynchronous preallocation, requires @tc
>    * @errp: returns an error if this function fails
>    *
>    * Preallocate memory (populate/prefault page tables writable) for the virtual
> @@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext;
>    * each page in the area was faulted in writable at least once, for example,
>    * after allocating file blocks for mapped files.
>    *
> + * When setting @async, allocation might be performed asynchronously.
> + * qemu_finish_async_mem_prealloc() must be called to finish any asynchronous
> + * preallocation.
> + *
>    * Return: true on success, else false setting @errp with error.
>    */
>   bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
> -                       ThreadContext *tc, Error **errp);
> +                       ThreadContext *tc, bool async, Error **errp);
> +
> +/**
> + * qemu_finish_async_mem_prealloc:
> + * @errp: returns an error if this function fails
> + *
> + * Finish all outstanding asynchronous memory preallocation.
> + *
> + * Return: true on success, else false setting @errp with error.
> + */
> +bool qemu_finish_async_mem_prealloc(Error **errp);

Suboptimal suggestion from my side, guess it would be better to call this

"qemu_finish_async_prealloc_mem" to match "qemu_prealloc_mem"

>   
>   /**
>    * qemu_get_pid_name:
> diff --git a/system/vl.c b/system/vl.c
> index 788d88ea03..290bb3232b 100644
> --- a/system/vl.c
> +++ b/system/vl.c
> @@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)
>   
>       object_option_foreach_add(object_create_late);
>   
> +    /*
> +     * Wait for any outstanding memory prealloc from created memory
> +     * backends to complete.
> +     */
> +    if (!qemu_finish_async_mem_prealloc(&error_fatal)) {
> +        exit(1);
> +    }
> +

I'm wondering if we should have a new phase instead, like

PHASE_LATE_OBJECTS_CREATED.

and do here

phase_advance(PHASE_LATE_OBJECTS_CREATED);

and use that instead. Currently, there is a "gap" between both things. I 
don't think anything is actually broken right now (because any internal 
memory backend wouldn't have a thread context), but it might be much 
cleaner and obvious that way.


Apart from that LGTM!
Mark Kanda Jan. 31, 2024, 2:27 p.m. UTC | #2
On 1/31/24 8:04 AM, David Hildenbrand wrote:
> On 31.01.24 14:48, Mark Kanda wrote:
>> QEMU initializes preallocated backend memory as the objects are 
>> parsed from
>> the command line. This is not optimal in some cases (e.g. memory 
>> spanning
>> multiple NUMA nodes) because the memory objects are initialized in 
>> series.
>>
>> Allow the initialization to occur in parallel (asynchronously). In 
>> order to
>> ensure optimal thread placement, asynchronous initialization requires 
>> prealloc
>> context threads to be in use.
>>
>> Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>   backends/hostmem.c     |   8 ++-
>>   hw/virtio/virtio-mem.c |   4 +-
>>   include/qemu/osdep.h   |  18 +++++-
>>   system/vl.c            |   8 +++
>>   util/oslib-posix.c     | 131 +++++++++++++++++++++++++++++++----------
>>   util/oslib-win32.c     |   8 ++-
>>   6 files changed, 140 insertions(+), 37 deletions(-)
>>
>> diff --git a/backends/hostmem.c b/backends/hostmem.c
>> index 30f69b2cb5..8f602dc86f 100644
>> --- a/backends/hostmem.c
>> +++ b/backends/hostmem.c
>> @@ -20,6 +20,7 @@
>>   #include "qom/object_interfaces.h"
>>   #include "qemu/mmap-alloc.h"
>>   #include "qemu/madvise.h"
>> +#include "hw/qdev-core.h"
>>     #ifdef CONFIG_NUMA
>>   #include <numaif.h>
>> @@ -235,9 +236,10 @@ static void 
>> host_memory_backend_set_prealloc(Object *obj, bool value,
>>           int fd = memory_region_get_fd(&backend->mr);
>>           void *ptr = memory_region_get_ram_ptr(&backend->mr);
>>           uint64_t sz = memory_region_size(&backend->mr);
>> +        bool async = !phase_check(PHASE_MACHINE_INITIALIZED);
>>             if (!qemu_prealloc_mem(fd, ptr, sz, 
>> backend->prealloc_threads,
>> -                               backend->prealloc_context, errp)) {
>> +                               backend->prealloc_context, async, 
>> errp)) {
>>               return;
>>           }
>
> I think we will never trigger that case: we would have to set the 
> propertly after the device was already initialized, which shouldn't 
> happen.
>
> So I guess we can simplify and drop that.
>

Will fix.

>>           backend->prealloc = true;
>
>
> [...]
>
>> +++ b/include/qemu/osdep.h
>> @@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext;
>>    * @area: start address of the are to preallocate
>>    * @sz: the size of the area to preallocate
>>    * @max_threads: maximum number of threads to use
>> + * @tc: prealloc context threads pointer, NULL if not in use
>> + * @async: request asynchronous preallocation, requires @tc
>>    * @errp: returns an error if this function fails
>>    *
>>    * Preallocate memory (populate/prefault page tables writable) for 
>> the virtual
>> @@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext;
>>    * each page in the area was faulted in writable at least once, for 
>> example,
>>    * after allocating file blocks for mapped files.
>>    *
>> + * When setting @async, allocation might be performed asynchronously.
>> + * qemu_finish_async_mem_prealloc() must be called to finish any 
>> asynchronous
>> + * preallocation.
>> + *
>>    * Return: true on success, else false setting @errp with error.
>>    */
>>   bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
>> -                       ThreadContext *tc, Error **errp);
>> +                       ThreadContext *tc, bool async, Error **errp);
>> +
>> +/**
>> + * qemu_finish_async_mem_prealloc:
>> + * @errp: returns an error if this function fails
>> + *
>> + * Finish all outstanding asynchronous memory preallocation.
>> + *
>> + * Return: true on success, else false setting @errp with error.
>> + */
>> +bool qemu_finish_async_mem_prealloc(Error **errp);
>
> Suboptimal suggestion from my side, guess it woud be better to call this
>
> "qemu_finish_async_prealloc_mem" to match "qemu_prealloc_mem"
>

Will fix.

>>     /**
>>    * qemu_get_pid_name:
>> diff --git a/system/vl.c b/system/vl.c
>> index 788d88ea03..290bb3232b 100644
>> --- a/system/vl.c
>> +++ b/system/vl.c
>> @@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)
>>         object_option_foreach_add(object_create_late);
>>   +    /*
>> +     * Wait for any outstanding memory prealloc from created memory
>> +     * backends to complete.
>> +     */
>> +    if (!qemu_finish_async_mem_prealloc(&error_fatal)) {
>> +        exit(1);
>> +    }
>> +
>
> I'm wondering if we should have a new phase instead, like
>
> PHASE_LATE_OBJECTS_CREATED.
>
> and do here
>
> phase_advance(PHASE_LATE_OBJECTS_CREATED);
>
> and use that instead. Currently, there is a "gap" between both things. 
> I don't think anything is actually broken right now (because any 
> internal memory abckend wouldn't have a thread context), but it might 
> be much cleaner and obvious that way.
>

OK. I'll call it 'PHASE_LATE_BACKENDS_CREATED' (to make it consistent 
with code comments/function name).

> Apart from that LGTM!
>

Thanks/regards,
-Mark
David Hildenbrand Jan. 31, 2024, 2:30 p.m. UTC | #3
> 
> OK. I'll call it 'PHASE_LATE_BACKENDS_CREATED' (to make it consistent
> with code comments/function name).

But then, you should set it at the very end of the function (not sure if 
that would be a problem with the other devices that are getting created 
in between -- if they would be using one of these memory backends; 
likely not).
Mark Kanda Jan. 31, 2024, 2:48 p.m. UTC | #4
On 1/31/24 8:30 AM, David Hildenbrand wrote:
>>
>> OK. I'll call it 'PHASE_LATE_BACKENDS_CREATED' (to make it consistent
>> with code comments/function name).
>
> But then, you should set it at the very end of the function (not sure 
> if that would be a problem with the other devices that are getting 
> created in between -- if they would be using one of these memory 
> backends; likely not).
>

I think I misunderstood your suggestion. I was planning to add a 
'phase_advance(PHASE_LATE_BACKENDS_CREATED)' to qemu_init():

    @@ -3703,6 +3703,7 @@ void qemu_init(int argc, char **argv)
           * over memory-backend-file objects).
           */
          qemu_create_late_backends();
    +    phase_advance(PHASE_LATE_BACKENDS_CREATED);

And use PHASE_LATE_BACKENDS_CREATED (instead of 
PHASE_MACHINE_INITIALIZED) for the async bool in 
host_memory_backend_memory_complete().

I was planning to leave this call where it is:

    @@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)

          object_option_foreach_add(object_create_late);

    +    /*
    +     * Wait for any outstanding memory prealloc from created memory
    +     * backends to complete.
    +     */
    +    if (!qemu_finish_async_mem_prealloc(&error_fatal)) {
    +        exit(1);
    +    }
    +
          if (tpm_init() < 0) {
              exit(1);
          }

Is this what you had in mind?

Thanks/regards,
-Mark
David Hildenbrand Jan. 31, 2024, 2:57 p.m. UTC | #5
On 31.01.24 15:48, Mark Kanda wrote:
> On 1/31/24 8:30 AM, David Hildenbrand wrote:
>>>
>>> OK. I'll call it 'PHASE_LATE_BACKENDS_CREATED' (to make it consistent
>>> with code comments/function name).
>>
>> But then, you should set it at the very end of the function (not sure
>> if that would be a problem with the other devices that are getting
>> created in between -- if they would be using one of these memory
>> backends; likely not).
>>
> 
> I think I misunderstood your suggestion. I was planning to add it a
> 'phase_advance(PHASE_LATE_BACKENDS_CREATED)' to qemu_init():
> 
>      @@ -3703,6 +3703,7 @@ void qemu_init(int argc, char **argv)
>             * over memory-backend-file objects).
>             */
>            qemu_create_late_backends();
>      +    phase_advance(PHASE_LATE_BACKENDS_CREATED);
> 
> And use PHASE_LATE_BACKENDS_CREATED (instead of
> PHASE_MACHINE_INITIALIZED) for the async bool in
> host_memory_backend_memory_complete().
> 
> I was planning to leave this call where it is:
> 
>      @@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)
> 
>            object_option_foreach_add(object_create_late);
> 
>      +    /*
>      +     * Wait for any outstanding memory prealloc from created memory
>      +     * backends to complete.
>      +     */
>      +    if (!qemu_finish_async_mem_prealloc(&error_fatal)) {
>      +        exit(1);
>      +    }
>      +
>            if (tpm_init() < 0) {
>                exit(1);
>            }
> 

Yes. The only "suboptimal" thing is that if someone were to create a 
memory backend between qemu_finish_async_mem_prealloc() and 
phase_advance(PHASE_LATE_BACKENDS_CREATED), it would never get preallocated.

That likely won't ever happen by any of the remaining stuff in 
qemu_create_late_backends(), especially not with "prealloc=on" and 
thread-contexts set.
Mark Kanda Jan. 31, 2024, 3:02 p.m. UTC | #6
On 1/31/24 8:57 AM, David Hildenbrand wrote:
> On 31.01.24 15:48, Mark Kanda wrote:
>> On 1/31/24 8:30 AM, David Hildenbrand wrote:
>>>>
>>>> OK. I'll call it 'PHASE_LATE_BACKENDS_CREATED' (to make it consistent
>>>> with code comments/function name).
>>>
>>> But then, you should set it at the very end of the function (not sure
>>> if that would be a problem with the other devices that are getting
>>> created in between -- if they would be using one of these memory
>>> backends; likely not).
>>>
>>
>> I think I misunderstood your suggestion. I was planning to add it a
>> 'phase_advance(PHASE_LATE_BACKENDS_CREATED)' to qemu_init():
>>
>>      @@ -3703,6 +3703,7 @@ void qemu_init(int argc, char **argv)
>>             * over memory-backend-file objects).
>>             */
>>            qemu_create_late_backends();
>>      +    phase_advance(PHASE_LATE_BACKENDS_CREATED);
>>
>> And use PHASE_LATE_BACKENDS_CREATED (instead of
>> PHASE_MACHINE_INITIALIZED) for the async bool in
>> host_memory_backend_memory_complete().
>>
>> I was planning to leave this call where it is:
>>
>>      @@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)
>>
>>            object_option_foreach_add(object_create_late);
>>
>>      +    /*
>>      +     * Wait for any outstanding memory prealloc from created 
>> memory
>>      +     * backends to complete.
>>      +     */
>>      +    if (!qemu_finish_async_mem_prealloc(&error_fatal)) {
>>      +        exit(1);
>>      +    }
>>      +
>>            if (tpm_init() < 0) {
>>                exit(1);
>>            }
>>
>
> Yes. The only "suboptimal" things is that if someone where to create a 
> memory backend between qemu_finish_async_mem_prealloc() and 
> phase_advance(PHASE_LATE_BACKENDS_CREATED), it would never get 
> preallocated.
>
> That likely won't ever happen by any of the remaining stuff in 
> qemu_create_late_backends(), especially not with "prealloc=on" and 
> thread-contexts set.
>

Yep. OK, I'll go with that.

Thanks again!
diff mbox series

Patch

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 30f69b2cb5..8f602dc86f 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -20,6 +20,7 @@ 
 #include "qom/object_interfaces.h"
 #include "qemu/mmap-alloc.h"
 #include "qemu/madvise.h"
+#include "hw/qdev-core.h"
 
 #ifdef CONFIG_NUMA
 #include <numaif.h>
@@ -235,9 +236,10 @@  static void host_memory_backend_set_prealloc(Object *obj, bool value,
         int fd = memory_region_get_fd(&backend->mr);
         void *ptr = memory_region_get_ram_ptr(&backend->mr);
         uint64_t sz = memory_region_size(&backend->mr);
+        bool async = !phase_check(PHASE_MACHINE_INITIALIZED);
 
         if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
-                               backend->prealloc_context, errp)) {
+                               backend->prealloc_context, async, errp)) {
             return;
         }
         backend->prealloc = true;
@@ -323,6 +325,7 @@  host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
     HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
     void *ptr;
     uint64_t sz;
+    bool async = !phase_check(PHASE_MACHINE_INITIALIZED);
 
     if (!bc->alloc) {
         return;
@@ -398,7 +401,8 @@  host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
     if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
                                                 ptr, sz,
                                                 backend->prealloc_threads,
-                                                backend->prealloc_context, errp)) {
+                                                backend->prealloc_context,
+                                                async, errp)) {
         return;
     }
 }
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index 99ab989852..ffd119ebac 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -605,7 +605,7 @@  static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
         int fd = memory_region_get_fd(&vmem->memdev->mr);
         Error *local_err = NULL;
 
-        if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
+        if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
             static bool warned;
 
             /*
@@ -1248,7 +1248,7 @@  static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
     int fd = memory_region_get_fd(&vmem->memdev->mr);
     Error *local_err = NULL;
 
-    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
+    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
         error_report_err(local_err);
         return -ENOMEM;
     }
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index c9692cc314..f45954b512 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -680,6 +680,8 @@  typedef struct ThreadContext ThreadContext;
  * @area: start address of the are to preallocate
  * @sz: the size of the area to preallocate
  * @max_threads: maximum number of threads to use
+ * @tc: prealloc context threads pointer, NULL if not in use
+ * @async: request asynchronous preallocation, requires @tc
  * @errp: returns an error if this function fails
  *
  * Preallocate memory (populate/prefault page tables writable) for the virtual
@@ -687,10 +689,24 @@  typedef struct ThreadContext ThreadContext;
  * each page in the area was faulted in writable at least once, for example,
  * after allocating file blocks for mapped files.
  *
+ * When setting @async, allocation might be performed asynchronously.
+ * qemu_finish_async_mem_prealloc() must be called to finish any asynchronous
+ * preallocation.
+ *
  * Return: true on success, else false setting @errp with error.
  */
 bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
-                       ThreadContext *tc, Error **errp);
+                       ThreadContext *tc, bool async, Error **errp);
+
+/**
+ * qemu_finish_async_mem_prealloc:
+ * @errp: returns an error if this function fails
+ *
+ * Finish all outstanding asynchronous memory preallocation.
+ *
+ * Return: true on success, else false setting @errp with error.
+ */
+bool qemu_finish_async_mem_prealloc(Error **errp);
 
 /**
  * qemu_get_pid_name:
diff --git a/system/vl.c b/system/vl.c
index 788d88ea03..290bb3232b 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -2009,6 +2009,14 @@  static void qemu_create_late_backends(void)
 
     object_option_foreach_add(object_create_late);
 
+    /*
+     * Wait for any outstanding memory prealloc from created memory
+     * backends to complete.
+     */
+    if (!qemu_finish_async_mem_prealloc(&error_fatal)) {
+        exit(1);
+    }
+
     if (tpm_init() < 0) {
         exit(1);
     }
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 7c297003b9..74493e3cf7 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -42,6 +42,7 @@ 
 #include "qemu/cutils.h"
 #include "qemu/units.h"
 #include "qemu/thread-context.h"
+#include "qemu/main-loop.h"
 
 #ifdef CONFIG_LINUX
 #include <sys/syscall.h>
@@ -63,11 +64,15 @@ 
 
 struct MemsetThread;
 
+static QLIST_HEAD(, MemsetContext) memset_contexts =
+    QLIST_HEAD_INITIALIZER(memset_contexts);
+
 typedef struct MemsetContext {
     bool all_threads_created;
     bool any_thread_failed;
     struct MemsetThread *threads;
     int num_threads;
+    QLIST_ENTRY(MemsetContext) next;
 } MemsetContext;
 
 struct MemsetThread {
@@ -412,19 +417,44 @@  static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
     return ret;
 }
 
+static int wait_and_free_mem_prealloc_context(MemsetContext *context)
+{
+    int i, ret = 0, tmp;
+
+    for (i = 0; i < context->num_threads; i++) {
+        tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);
+
+        if (tmp) {
+            ret = tmp;
+        }
+    }
+    g_free(context->threads);
+    g_free(context);
+    return ret;
+}
+
 static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
-                           int max_threads, ThreadContext *tc,
+                           int max_threads, ThreadContext *tc, bool async,
                            bool use_madv_populate_write)
 {
     static gsize initialized = 0;
-    MemsetContext context = {
-        .num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
-    };
+    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
     size_t numpages_per_thread, leftover;
     void *(*touch_fn)(void *);
-    int ret = 0, i = 0;
+    int ret, i = 0;
     char *addr = area;
 
+    /*
+     * Asynchronous preallocation is only allowed when using MADV_POPULATE_WRITE
+     * and prealloc context for thread placement.
+     */
+    if (!use_madv_populate_write || !tc) {
+        async = false;
+    }
+
+    context->num_threads =
+        get_memset_num_threads(hpagesize, numpages, max_threads);
+
     if (g_once_init_enter(&initialized)) {
         qemu_mutex_init(&page_mutex);
         qemu_cond_init(&page_cond);
@@ -432,8 +462,11 @@  static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
     }
 
     if (use_madv_populate_write) {
-        /* Avoid creating a single thread for MADV_POPULATE_WRITE */
-        if (context.num_threads == 1) {
+        /*
+         * Avoid creating a single thread for MADV_POPULATE_WRITE when
+         * preallocating synchronously.
+         */
+        if (context->num_threads == 1 && !async) {
             if (qemu_madvise(area, hpagesize * numpages,
                              QEMU_MADV_POPULATE_WRITE)) {
                 return -errno;
@@ -445,50 +478,86 @@  static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
         touch_fn = do_touch_pages;
     }
 
-    context.threads = g_new0(MemsetThread, context.num_threads);
-    numpages_per_thread = numpages / context.num_threads;
-    leftover = numpages % context.num_threads;
-    for (i = 0; i < context.num_threads; i++) {
-        context.threads[i].addr = addr;
-        context.threads[i].numpages = numpages_per_thread + (i < leftover);
-        context.threads[i].hpagesize = hpagesize;
-        context.threads[i].context = &context;
+    context->threads = g_new0(MemsetThread, context->num_threads);
+    numpages_per_thread = numpages / context->num_threads;
+    leftover = numpages % context->num_threads;
+    for (i = 0; i < context->num_threads; i++) {
+        context->threads[i].addr = addr;
+        context->threads[i].numpages = numpages_per_thread + (i < leftover);
+        context->threads[i].hpagesize = hpagesize;
+        context->threads[i].context = context;
         if (tc) {
-            thread_context_create_thread(tc, &context.threads[i].pgthread,
+            thread_context_create_thread(tc, &context->threads[i].pgthread,
                                          "touch_pages",
-                                         touch_fn, &context.threads[i],
+                                         touch_fn, &context->threads[i],
                                          QEMU_THREAD_JOINABLE);
         } else {
-            qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
-                               touch_fn, &context.threads[i],
+            qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
+                               touch_fn, &context->threads[i],
                                QEMU_THREAD_JOINABLE);
         }
-        addr += context.threads[i].numpages * hpagesize;
+        addr += context->threads[i].numpages * hpagesize;
+    }
+
+    if (async) {
+        /*
+         * async requests currently require the BQL. Add it to the list and kick
+         * preallocation off during qemu_finish_async_mem_prealloc().
+         */
+        assert(bql_locked());
+        QLIST_INSERT_HEAD(&memset_contexts, context, next);
+        return 0;
     }
 
     if (!use_madv_populate_write) {
-        sigbus_memset_context = &context;
+        sigbus_memset_context = context;
     }
 
     qemu_mutex_lock(&page_mutex);
-    context.all_threads_created = true;
+    context->all_threads_created = true;
     qemu_cond_broadcast(&page_cond);
     qemu_mutex_unlock(&page_mutex);
 
-    for (i = 0; i < context.num_threads; i++) {
-        int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
+    ret = wait_and_free_mem_prealloc_context(context);
 
+    if (!use_madv_populate_write) {
+        sigbus_memset_context = NULL;
+    }
+    return ret;
+}
+
+bool qemu_finish_async_mem_prealloc(Error **errp)
+{
+    int ret = 0, tmp; /* ret stays 0 unless a context reports a failure */
+    MemsetContext *context, *next_context;
+
+    /* Waiting for preallocation requires the BQL. */
+    assert(bql_locked());
+    if (QLIST_EMPTY(&memset_contexts)) {
+        return true;
+    }
+
+    qemu_mutex_lock(&page_mutex);
+    QLIST_FOREACH(context, &memset_contexts, next) {
+        context->all_threads_created = true;
+    }
+    qemu_cond_broadcast(&page_cond);
+    qemu_mutex_unlock(&page_mutex);
+
+    QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
+        QLIST_REMOVE(context, next);
+        tmp = wait_and_free_mem_prealloc_context(context);
         if (tmp) {
             ret = tmp;
         }
     }
 
-    if (!use_madv_populate_write) {
-        sigbus_memset_context = NULL;
+    if (ret) {
+        error_setg_errno(errp, -ret,
+                         "qemu_prealloc_mem: preallocating memory failed");
+        return false;
     }
-    g_free(context.threads);
-
-    return ret;
+    return true;
 }
 
 static bool madv_populate_write_possible(char *area, size_t pagesize)
@@ -498,7 +567,7 @@  static bool madv_populate_write_possible(char *area, size_t pagesize)
 }
 
 bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
-                       ThreadContext *tc, Error **errp)
+                       ThreadContext *tc, bool async, Error **errp)
 {
     static gsize initialized;
     int ret;
@@ -540,7 +609,7 @@  bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
     }
 
     /* touch pages simultaneously */
-    ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc,
+    ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
                           use_madv_populate_write);
     if (ret) {
         error_setg_errno(errp, -ret,
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index c4a5f05a49..107f0efe37 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -265,7 +265,7 @@  int getpagesize(void)
 }
 
 bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
-                       ThreadContext *tc, Error **errp)
+                       ThreadContext *tc, bool async, Error **errp)
 {
     int i;
     size_t pagesize = qemu_real_host_page_size();
@@ -278,6 +278,12 @@  bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
     return true;
 }
 
+bool qemu_finish_async_mem_prealloc(Error **errp)
+{
+    /* async prealloc not supported, there is nothing to finish */
+    return true;
+}
+
 char *qemu_get_pid_name(pid_t pid)
 {
     /* XXX Implement me */