
[RFC,v1,07/12] cpus: introduce async_safe_run_on_cpu.

Message ID 1460730231-1184-9-git-send-email-alex.bennee@linaro.org
State New

Commit Message

Alex Bennée April 15, 2016, 2:23 p.m. UTC
From: KONRAD Frederic <fred.konrad@greensocs.com>

We already have async_run_on_cpu but for some tasks we need to ensure
all vCPUs have stopped running before updating shared structures. We
call this safe work as it is then safe to make the modifications.

Work is scheduled with async_safe_run_on_cpu(), which is passed the
CPUState, the helper function and a pointer to the relevant data for
the task. Once the work is queued, all vCPUs are kicked to bring them
out of the main execution loop.

The main difference from the other run_on_cpu functions is that it
operates on a single shared queue. This ensures fairness, as all
pending tasks get drained whichever vCPU is nominally doing the work.
The queue is implemented as a GArray, so the need to malloc memory
while adding tasks is minimised.

While async_safe_work_pending() is true, cpu_exec returns and the
vCPUs cannot enter the execution loop. Once all scheduled vCPUs have
exited the loop, the last one to exit processes the work queue.
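
As an illustration of the intended usage (the helper and caller below
are hypothetical, not part of this patch), deferring a global TB flush
would look roughly like:

  /* Hypothetical example: defer a global TB flush until every vCPU is
   * out of its execution loop. The names do_tb_flush_safe() and
   * schedule_tb_flush() are illustrative only. */
  static void do_tb_flush_safe(CPUState *cpu, void *data)
  {
      /* runs on the last vCPU to leave the loop, once quiescent */
      tb_flush(cpu);
  }

  void schedule_tb_flush(CPUState *cpu)
  {
      async_safe_run_on_cpu(cpu, do_tb_flush_safe, NULL);
  }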

Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
[AJB: Name change, single queue, atomic counter for active vCPUs]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
v1 (arm-v1)
  - now async_safe_run_on_cpu
  - single GArray based queue
  - use atomic counter to bring all vCPUs to a halt
  - wording for async safe_work
---
 cpu-exec-common.c |   1 +
 cpu-exec.c        |  11 ++++++
 cpus.c            | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 include/qom/cpu.h |  19 ++++++++++
 4 files changed, 131 insertions(+), 2 deletions(-)

Comments

Sergey Fedorov June 5, 2016, 4:01 p.m. UTC | #1
On 15/04/16 17:23, Alex Bennée wrote:
> diff --git a/cpu-exec-common.c b/cpu-exec-common.c
> index 3d7eaa3..c2f7c29 100644
> --- a/cpu-exec-common.c
> +++ b/cpu-exec-common.c
> @@ -79,3 +79,4 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
>      cpu->current_tb = NULL;
>      siglongjmp(cpu->jmp_env, 1);
>  }
> +

Do we really want this extra line at the end of the file? :)

> diff --git a/cpu-exec.c b/cpu-exec.c
> index 42cec05..2f362f8 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -365,6 +365,17 @@ int cpu_exec(CPUState *cpu)
>      uintptr_t next_tb;
>      SyncClocks sc;
>  
> +    /*
> +     * This happen when somebody doesn't want this CPU to start
> +     * In case of MTTCG.
> +     */
> +#ifdef CONFIG_SOFTMMU
> +    if (async_safe_work_pending()) {
> +        cpu->exit_request = 1;
> +        return 0;
> +    }
> +#endif
> +

I can't see the point of this change. We check for
"async_safe_work_pending()" each time round the loop in
qemu_tcg_cpu_thread_fn() before entering tcg_cpu_exec() and we do that
after 'cpu->exit_request' gets reset.

>      /* replay_interrupt may need current_cpu */
>      current_cpu = cpu;
>  
> diff --git a/cpus.c b/cpus.c
> index 9177161..860e2a9 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -928,6 +928,19 @@ static QemuCond qemu_cpu_cond;
>  static QemuCond qemu_pause_cond;
>  static QemuCond qemu_work_cond;
>  
> +/* safe work */
> +static int safe_work_pending;
> +static int tcg_scheduled_cpus;
> +
> +typedef struct {
> +    CPUState            *cpu;  /* CPU affected */

Do we really need to pass CPU state to the safe work? Async safe work
seems to be supposed to handle cross-CPU tasks rather than CPU-specific
ones. TB flush, for example, doesn't really require CPU to be passed to
the async safe work handler: we should be able to check for code buffer
overflow before scheduling the job. Because of this, it seems to be
reasonable to rename async_safe_run_on_cpu() to something like
request_async_cpu_safe_work().

> +    run_on_cpu_func     func;  /* Helper function */
> +    void                *data; /* Helper data */
> +} qemu_safe_work_item;
> +
> +static GArray *safe_work;       /* array of qemu_safe_work_items */

I think we shouldn't bother with "static" memory allocation for the list
of work items. We should be fine with dynamic allocation in
async_safe_run_on_cpu(). Halting all the CPUs doesn't seem to be cheap
anyhow, thus it shouldn't be used frequently. The only use so far is to
make tb_flush() safe. If we look at how tb_flush() is used we can see
that this is either code buffer exhaustion or some rare operation which
needs all the previous translations to be flushed. The former shouldn't
happen often, the latter isn't supposed to be cheap and fast anyway.
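
For illustration, a dynamically allocated variant along those lines
might look something like this (a sketch only; the list type and every
name below are assumptions, not code from the series):

  typedef struct safe_work_entry {
      CPUState *cpu;
      run_on_cpu_func func;
      void *data;
      QSIMPLEQ_ENTRY(safe_work_entry) entry;
  } safe_work_entry;

  static QSIMPLEQ_HEAD(, safe_work_entry) safe_work_list =
      QSIMPLEQ_HEAD_INITIALIZER(safe_work_list);

  void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
  {
      safe_work_entry *wi = g_new0(safe_work_entry, 1);
      CPUState *iter;

      wi->cpu = cpu;
      wi->func = func;
      wi->data = data;

      qemu_mutex_lock(&safe_work_mutex);
      QSIMPLEQ_INSERT_TAIL(&safe_work_list, wi, entry);
      atomic_inc(&safe_work_pending);
      qemu_mutex_unlock(&safe_work_mutex);

      CPU_FOREACH(iter) {
          qemu_cpu_kick(iter);
      }
  }

  /* Drain side: each item is freed once its helper has run. */
  static void flush_queued_safe_work_list(void)
  {
      safe_work_entry *wi;

      qemu_mutex_lock(&safe_work_mutex);
      while ((wi = QSIMPLEQ_FIRST(&safe_work_list))) {
          QSIMPLEQ_REMOVE_HEAD(&safe_work_list, entry);
          wi->func(wi->cpu, wi->data);
          g_free(wi);
      }
      atomic_set(&safe_work_pending, 0);
      qemu_mutex_unlock(&safe_work_mutex);
  }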

> +static QemuMutex safe_work_mutex;
> +
>  void qemu_init_cpu_loop(void)
>  {
>      qemu_init_sigbus();
> @@ -937,6 +950,9 @@ void qemu_init_cpu_loop(void)
>      qemu_mutex_init(&qemu_global_mutex);
>  
>      qemu_thread_get_self(&io_thread);
> +
> +    safe_work = g_array_sized_new(TRUE, TRUE, sizeof(qemu_safe_work_item), 128);
> +    qemu_mutex_init(&safe_work_mutex);
>  }
>  
>  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
> @@ -997,6 +1013,81 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>      qemu_cpu_kick(cpu);
>  }
>  
> +/*
> + * Safe work interface
> + *
> + * Safe work is defined as work that requires the system to be
> + * quiescent before making changes.
> + */
> +
> +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
> +{
> +    CPUState *iter;
> +    qemu_safe_work_item wi;
> +    wi.cpu = cpu;
> +    wi.func = func;
> +    wi.data = data;
> +
> +    qemu_mutex_lock(&safe_work_mutex);
> +    g_array_append_val(safe_work, wi);
> +    atomic_inc(&safe_work_pending);
> +    qemu_mutex_unlock(&safe_work_mutex);
> +
> +    /* Signal all vCPUs to halt */
> +    CPU_FOREACH(iter) {
> +        qemu_cpu_kick(iter);
> +    }
> +}
> +
> +/**
> + * flush_queued_safe_work:
> + *
> + * @scheduled_cpu_count
> + *
> + * If not 0 will signal the other vCPUs and sleep. The last vCPU to
> + * get to the function then drains the queue while the system is in a
> + * quiescent state. This allows the operations to change shared
> + * structures.
> + *
> + * @see async_run_safe_work_on_cpu
> + */
> +static void flush_queued_safe_work(int scheduled_cpu_count)
> +{
> +    qemu_safe_work_item *wi;
> +    int i;
> +
> +    /* bail out if there is nothing to do */
> +    if (!async_safe_work_pending()) {
> +        return;
> +    }
> +
> +    if (scheduled_cpu_count) {
> +
> +        /* Nothing to do but sleep */
> +        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);

We'll not be able to extend this to user-mode emulation because we don't
have BQL there. I think we should consider something like
exclusive_idle()/start_exclusive()/end_exclusive()/cpu_exec_start()/cpu_exec_end().
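
For reference, that pattern fits together roughly like this (a much
simplified sketch, not the actual linux-user code; the 'running' flag
and 'pending_cpus' counter are the user-mode-only pieces):

  static QemuMutex exclusive_lock;
  static QemuCond exclusive_cond;    /* signalled when all vCPUs have stopped */
  static QemuCond exclusive_resume;  /* broadcast when the exclusive section ends */
  static int pending_cpus;

  /* Called by a vCPU thread around its execution loop. */
  static void cpu_exec_start(CPUState *cpu)
  {
      qemu_mutex_lock(&exclusive_lock);
      while (pending_cpus) {             /* i.e. exclusive_idle() */
          qemu_cond_wait(&exclusive_resume, &exclusive_lock);
      }
      cpu->running = true;
      qemu_mutex_unlock(&exclusive_lock);
  }

  static void cpu_exec_end(CPUState *cpu)
  {
      qemu_mutex_lock(&exclusive_lock);
      cpu->running = false;
      if (pending_cpus > 1) {
          pending_cpus--;
          if (pending_cpus == 1) {
              /* last vCPU out wakes whoever requested exclusivity */
              qemu_cond_signal(&exclusive_cond);
          }
      }
      qemu_mutex_unlock(&exclusive_lock);
  }

  /* Called by the thread that wants to run safe work. */
  static void start_exclusive(void)
  {
      CPUState *other;

      qemu_mutex_lock(&exclusive_lock);
      while (pending_cpus) {
          qemu_cond_wait(&exclusive_resume, &exclusive_lock);
      }
      pending_cpus = 1;
      CPU_FOREACH(other) {
          if (other->running) {
              pending_cpus++;
              qemu_cpu_kick(other);
          }
      }
      while (pending_cpus > 1) {
          qemu_cond_wait(&exclusive_cond, &exclusive_lock);
      }
      /* now no other vCPU is inside its execution loop */
  }

  static void end_exclusive(void)
  {
      pending_cpus = 0;
      qemu_cond_broadcast(&exclusive_resume);
      qemu_mutex_unlock(&exclusive_lock);
  }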

Kind regards,
Sergey

> +
> +    } else {
> +
> +        /* We can now do the work */
> +        qemu_mutex_lock(&safe_work_mutex);
> +        for (i = 0; i < safe_work->len; i++) {
> +            wi = &g_array_index(safe_work, qemu_safe_work_item, i);
> +            wi->func(wi->cpu, wi->data);
> +        }
> +        g_array_remove_range(safe_work, 0, safe_work->len);
> +        atomic_set(&safe_work_pending, 0);
> +        qemu_mutex_unlock(&safe_work_mutex);
> +
> +        /* Wake everyone up */
> +        qemu_cond_broadcast(&qemu_work_cond);
> +    }
> +}
> +
> +bool async_safe_work_pending(void)
> +{
> +    return (atomic_read(&safe_work_pending) != 0);
> +}
> +
>  static void flush_queued_work(CPUState *cpu)
>  {
>      struct qemu_work_item *wi;
Sergey Fedorov June 5, 2016, 4:44 p.m. UTC | #2
On 15/04/16 17:23, Alex Bennée wrote:
> +/*
> + * Safe work interface
> + *
> + * Safe work is defined as work that requires the system to be
> + * quiescent before making changes.
> + */
> +
> +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
> +{
> +    CPUState *iter;
> +    qemu_safe_work_item wi;

We can call the function right away if MTTCG is disabled or there's only
one CPU emulated.
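
A possible shape for that early-out (a sketch only; the
qemu_tcg_mttcg_enabled() helper is assumed here for the MTTCG check
and is not part of this series):

  void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
  {
      CPUState *iter;
      qemu_safe_work_item wi;

      /* If MTTCG is disabled or only one vCPU exists, nothing else
       * should be running TBs, so do the work right away (per the
       * suggestion above). qemu_tcg_mttcg_enabled() is an assumed
       * helper. */
      if (!qemu_tcg_mttcg_enabled() || !CPU_NEXT(first_cpu)) {
          func(cpu, data);
          return;
      }

      wi.cpu = cpu;
      wi.func = func;
      wi.data = data;

      qemu_mutex_lock(&safe_work_mutex);
      g_array_append_val(safe_work, wi);
      atomic_inc(&safe_work_pending);
      qemu_mutex_unlock(&safe_work_mutex);

      /* Signal all vCPUs to halt */
      CPU_FOREACH(iter) {
          qemu_cpu_kick(iter);
      }
  }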

Kind regards,
Sergey

> +    wi.cpu = cpu;
> +    wi.func = func;
> +    wi.data = data;
> +
> +    qemu_mutex_lock(&safe_work_mutex);
> +    g_array_append_val(safe_work, wi);
> +    atomic_inc(&safe_work_pending);
> +    qemu_mutex_unlock(&safe_work_mutex);
> +
> +    /* Signal all vCPUs to halt */
> +    CPU_FOREACH(iter) {
> +        qemu_cpu_kick(iter);
> +    }
> +}
Alex Bennée June 6, 2016, 8:50 a.m. UTC | #3
Sergey Fedorov <serge.fdrv@gmail.com> writes:

> On 15/04/16 17:23, Alex Bennée wrote:
>> diff --git a/cpu-exec-common.c b/cpu-exec-common.c
>> index 3d7eaa3..c2f7c29 100644
>> --- a/cpu-exec-common.c
>> +++ b/cpu-exec-common.c
>> @@ -79,3 +79,4 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
>>      cpu->current_tb = NULL;
>>      siglongjmp(cpu->jmp_env, 1);
>>  }
>> +
>
> Do we really want this extra line at the end of the file? :)
>
>> diff --git a/cpu-exec.c b/cpu-exec.c
>> index 42cec05..2f362f8 100644
>> --- a/cpu-exec.c
>> +++ b/cpu-exec.c
>> @@ -365,6 +365,17 @@ int cpu_exec(CPUState *cpu)
>>      uintptr_t next_tb;
>>      SyncClocks sc;
>>
>> +    /*
>> +     * This happen when somebody doesn't want this CPU to start
>> +     * In case of MTTCG.
>> +     */
>> +#ifdef CONFIG_SOFTMMU
>> +    if (async_safe_work_pending()) {
>> +        cpu->exit_request = 1;
>> +        return 0;
>> +    }
>> +#endif
>> +
>
> I can't see the point of this change. We check for
> "async_safe_work_pending()" each time round the loop in
> qemu_tcg_cpu_thread_fn() before entering tcg_cpu_exec() and we do that
> after 'cpu->exit_request' gets reset.

I can't remember what this was meant to fix now so I'll remove it as suggested.

>
>>      /* replay_interrupt may need current_cpu */
>>      current_cpu = cpu;
>>
>> diff --git a/cpus.c b/cpus.c
>> index 9177161..860e2a9 100644
>> --- a/cpus.c
>> +++ b/cpus.c
>> @@ -928,6 +928,19 @@ static QemuCond qemu_cpu_cond;
>>  static QemuCond qemu_pause_cond;
>>  static QemuCond qemu_work_cond;
>>
>> +/* safe work */
>> +static int safe_work_pending;
>> +static int tcg_scheduled_cpus;
>> +
>> +typedef struct {
>> +    CPUState            *cpu;  /* CPU affected */
>
> Do we really need to pass CPU state to the safe work? Async safe work
> seems to be supposed to handle cross-CPU tasks rather than CPU-specific
> ones. TB flush, for example, doesn't really require CPU to be passed to
> the async safe work handler: we should be able to check for code buffer
> overflow before scheduling the job. Because of this, it seems to be
> reasonable to rename async_safe_run_on_cpu() to something like
> request_async_cpu_safe_work().

CPUState is common enough to be passed around and this is a shared array
for all queued work. I wanted to ensure the second most common operation
of CPUState + one extra bit of information was covered without having to
resort to memory allocation.

The previous iterations of this work had allocations for structures and
the linked list and I was working hard to make the common case
allocation free.

>
>> +    run_on_cpu_func     func;  /* Helper function */
>> +    void                *data; /* Helper data */
>> +} qemu_safe_work_item;
>> +
>> +static GArray *safe_work;       /* array of qemu_safe_work_items */
>
> I think we shouldn't bother with "static" memory allocation for the list
> of work items. We should be fine with dynamic allocation in
> async_safe_run_on_cpu().

Why? Under heavy load these add up fast; some TLB flush operations can
queue CPU x ASID deferred operations. It's not exactly a large array to
allocate at the start. We could make the initial size smaller if you
want.

> Halting all the CPUs doesn't seem to be cheap
> anyhow, thus it shouldn't be used frequently. The only use so far is to
> make tb_flush() safe. If we look at how tb_flush() is used we can see
> that this is either code buffer exhaustion or some rare operation which
> needs all the previous translations to be flushed. The former shouldn't
> happen often, the latter isn't supposed to be cheap and fast anyway.

The various flushes are more common under ARMv8; we are not just talking
about the big tb_flush using this mechanism.

>
>> +static QemuMutex safe_work_mutex;
>> +
>>  void qemu_init_cpu_loop(void)
>>  {
>>      qemu_init_sigbus();
>> @@ -937,6 +950,9 @@ void qemu_init_cpu_loop(void)
>>      qemu_mutex_init(&qemu_global_mutex);
>>
>>      qemu_thread_get_self(&io_thread);
>> +
>> +    safe_work = g_array_sized_new(TRUE, TRUE, sizeof(qemu_safe_work_item), 128);
>> +    qemu_mutex_init(&safe_work_mutex);
>>  }
>>
>>  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>> @@ -997,6 +1013,81 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>>      qemu_cpu_kick(cpu);
>>  }
>>
>> +/*
>> + * Safe work interface
>> + *
>> + * Safe work is defined as work that requires the system to be
>> + * quiescent before making changes.
>> + */
>> +
>> +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>> +{
>> +    CPUState *iter;
>> +    qemu_safe_work_item wi;
>> +    wi.cpu = cpu;
>> +    wi.func = func;
>> +    wi.data = data;
>> +
>> +    qemu_mutex_lock(&safe_work_mutex);
>> +    g_array_append_val(safe_work, wi);
>> +    atomic_inc(&safe_work_pending);
>> +    qemu_mutex_unlock(&safe_work_mutex);
>> +
>> +    /* Signal all vCPUs to halt */
>> +    CPU_FOREACH(iter) {
>> +        qemu_cpu_kick(iter);
>> +    }
>> +}
>> +
>> +/**
>> + * flush_queued_safe_work:
>> + *
>> + * @scheduled_cpu_count
>> + *
>> + * If not 0 will signal the other vCPUs and sleep. The last vCPU to
>> + * get to the function then drains the queue while the system is in a
>> + * quiescent state. This allows the operations to change shared
>> + * structures.
>> + *
>> + * @see async_run_safe_work_on_cpu
>> + */
>> +static void flush_queued_safe_work(int scheduled_cpu_count)
>> +{
>> +    qemu_safe_work_item *wi;
>> +    int i;
>> +
>> +    /* bail out if there is nothing to do */
>> +    if (!async_safe_work_pending()) {
>> +        return;
>> +    }
>> +
>> +    if (scheduled_cpu_count) {
>> +
>> +        /* Nothing to do but sleep */
>> +        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
>
> We'll not be able to extend this to user-mode emulation because we don't
> have BQL there. I think we should consider something like
> exclusive_idle()/start_exclusive()/end_exclusive()/cpu_exec_start()/cpu_exec_end().

This is true, sacrificed on the altar of making things simpler. It would
be nice if we had a single simple dispatch point for linux-user too, so
we could make things sane there as well.

I'll have another look at the exclusive code although at first blush it
seemed a little more complex because it didn't have the luxury of
knowing how many threads are running.

>
> Kind regards,
> Sergey
>
>> +
>> +    } else {
>> +
>> +        /* We can now do the work */
>> +        qemu_mutex_lock(&safe_work_mutex);
>> +        for (i = 0; i < safe_work->len; i++) {
>> +            wi = &g_array_index(safe_work, qemu_safe_work_item, i);
>> +            wi->func(wi->cpu, wi->data);
>> +        }
>> +        g_array_remove_range(safe_work, 0, safe_work->len);
>> +        atomic_set(&safe_work_pending, 0);
>> +        qemu_mutex_unlock(&safe_work_mutex);
>> +
>> +        /* Wake everyone up */
>> +        qemu_cond_broadcast(&qemu_work_cond);
>> +    }
>> +}
>> +
>> +bool async_safe_work_pending(void)
>> +{
>> +    return (atomic_read(&safe_work_pending) != 0);
>> +}
>> +
>>  static void flush_queued_work(CPUState *cpu)
>>  {
>>      struct qemu_work_item *wi;


--
Alex Bennée
Sergey Fedorov June 6, 2016, 9:38 a.m. UTC | #4
On 06/06/16 11:50, Alex Bennée wrote:
> Sergey Fedorov <serge.fdrv@gmail.com> writes:
>
>> On 15/04/16 17:23, Alex Bennée wrote:
>>> diff --git a/cpus.c b/cpus.c
>>> index 9177161..860e2a9 100644
>>> --- a/cpus.c
>>> +++ b/cpus.c
>>> @@ -928,6 +928,19 @@ static QemuCond qemu_cpu_cond;
>>>  static QemuCond qemu_pause_cond;
>>>  static QemuCond qemu_work_cond;
>>>
>>> +/* safe work */
>>> +static int safe_work_pending;
>>> +static int tcg_scheduled_cpus;
>>> +
>>> +typedef struct {
>>> +    CPUState            *cpu;  /* CPU affected */
>> Do we really need to pass CPU state to the safe work? Async safe work
>> seems to be supposed to handle cross-CPU tasks rather than CPU-specific
>> ones. TB flush, for example, doesn't really require CPU to be passed to
>> the async safe work handler: we should be able to check for code buffer
>> overflow before scheduling the job. Because of this, it seems to be
>> reasonable to rename async_safe_run_on_cpu() to something like
>> request_async_cpu_safe_work().
> CPUState is common enough to be passed around and this is a shared array
> for all queued work. I wanted to ensure the second most common operation
> of CPUState + one extra bit of information was covered without having to
> resort to memory allocation.
>
> The previous iterations of this work had allocations for structures and
> the linked list and I was working hard to make the common case
> allocation free.

The point was that this mechanism is supposed to be used for operations
requiring all CPUs to be halted, so there should be little chance it
would require a specific CPUState to be passed.

>
>>> +    run_on_cpu_func     func;  /* Helper function */
>>> +    void                *data; /* Helper data */
>>> +} qemu_safe_work_item;
>>> +
>>> +static GArray *safe_work;       /* array of qemu_safe_work_items */
>> I think we shouldn't bother with "static" memory allocation for the list
>> of work items. We should be fine with dynamic allocation in
>> async_safe_run_on_cpu().
> Why? Under heavy load these add up fast; some TLB flush operations can
> queue CPU x ASID deferred operations. It's not exactly a large array to
> allocate at the start. We could make the initial size smaller if you
> want.

We are going to use async_run_on_cpu() for TLB invalidation, aren't we?

>
>> Halting all the CPUs doesn't seem to be cheap
>> anyhow, thus it shouldn't be used frequently. The only use so far is to
>> make tb_flush() safe. If we look at how tb_flush() is used we can see
>> that this is either code buffer exhaustion or some rare operation which
>> needs all the previous translations to be flushed. The former shouldn't
>> happen often, the latter isn't supposed to be cheap and fast anyway.
> The various flushes are more common under ARMv8; we are not just talking
> about the big tb_flush using this mechanism.

Which of them would really require a quiescent state? I suppose all
frequent operations are to be handled in a per-CPU manner and don't
require this expensive "halt all the CPUs" mechanism.

>
>>
>>> +static QemuMutex safe_work_mutex;
>>> +
>>>  void qemu_init_cpu_loop(void)
>>>  {
>>>      qemu_init_sigbus();
(snip)
>>> @@ -997,6 +1013,81 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>>>      qemu_cpu_kick(cpu);
>>>  }
>>>
>>> +/*
>>> + * Safe work interface
>>> + *
>>> + * Safe work is defined as work that requires the system to be
>>> + * quiescent before making changes.
>>> + */
>>> +
>>> +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>>> +{
>>> +    CPUState *iter;
>>> +    qemu_safe_work_item wi;
>>> +    wi.cpu = cpu;
>>> +    wi.func = func;
>>> +    wi.data = data;
>>> +
>>> +    qemu_mutex_lock(&safe_work_mutex);
>>> +    g_array_append_val(safe_work, wi);
>>> +    atomic_inc(&safe_work_pending);
>>> +    qemu_mutex_unlock(&safe_work_mutex);
>>> +
>>> +    /* Signal all vCPUs to halt */
>>> +    CPU_FOREACH(iter) {
>>> +        qemu_cpu_kick(iter);
>>> +    }
>>> +}
>>> +
>>> +/**
>>> + * flush_queued_safe_work:
>>> + *
>>> + * @scheduled_cpu_count
>>> + *
>>> + * If not 0 will signal the other vCPUs and sleep. The last vCPU to
>>> + * get to the function then drains the queue while the system is in a
>>> + * quiescent state. This allows the operations to change shared
>>> + * structures.
>>> + *
>>> + * @see async_run_safe_work_on_cpu
>>> + */
>>> +static void flush_queued_safe_work(int scheduled_cpu_count)
>>> +{
>>> +    qemu_safe_work_item *wi;
>>> +    int i;
>>> +
>>> +    /* bail out if there is nothing to do */
>>> +    if (!async_safe_work_pending()) {
>>> +        return;
>>> +    }
>>> +
>>> +    if (scheduled_cpu_count) {
>>> +
>>> +        /* Nothing to do but sleep */
>>> +        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
>> We'll not be able to extend this to user-mode emulation because we don't
>> have BQL there. I think we should consider something like
>> exclusive_idle()/start_exclusive()/end_exclusive()/cpu_exec_start()/cpu_exec_end().
> This is true, sacrificed on the altar of making things simpler. It would
> be nice if we had a single simple dispatch point for linux-user too, so
> we could make things sane there as well.
>
> I'll have another look at the exclusive code although at first blush it
> seemed a little more complex because it didn't have the luxury of
> knowing how many threads are running.

Actually, it does know that, by making use of the user-mode-only
'CPUState::running' flag and its global counter 'pending_cpus'.

Kind regards,
Sergey

Patch

diff --git a/cpu-exec-common.c b/cpu-exec-common.c
index 3d7eaa3..c2f7c29 100644
--- a/cpu-exec-common.c
+++ b/cpu-exec-common.c
@@ -79,3 +79,4 @@  void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
     cpu->current_tb = NULL;
     siglongjmp(cpu->jmp_env, 1);
 }
+
diff --git a/cpu-exec.c b/cpu-exec.c
index 42cec05..2f362f8 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -365,6 +365,17 @@  int cpu_exec(CPUState *cpu)
     uintptr_t next_tb;
     SyncClocks sc;
 
+    /*
+     * This happen when somebody doesn't want this CPU to start
+     * In case of MTTCG.
+     */
+#ifdef CONFIG_SOFTMMU
+    if (async_safe_work_pending()) {
+        cpu->exit_request = 1;
+        return 0;
+    }
+#endif
+
     /* replay_interrupt may need current_cpu */
     current_cpu = cpu;
 
diff --git a/cpus.c b/cpus.c
index 9177161..860e2a9 100644
--- a/cpus.c
+++ b/cpus.c
@@ -928,6 +928,19 @@  static QemuCond qemu_cpu_cond;
 static QemuCond qemu_pause_cond;
 static QemuCond qemu_work_cond;
 
+/* safe work */
+static int safe_work_pending;
+static int tcg_scheduled_cpus;
+
+typedef struct {
+    CPUState            *cpu;  /* CPU affected */
+    run_on_cpu_func     func;  /* Helper function */
+    void                *data; /* Helper data */
+} qemu_safe_work_item;
+
+static GArray *safe_work;       /* array of qemu_safe_work_items */
+static QemuMutex safe_work_mutex;
+
 void qemu_init_cpu_loop(void)
 {
     qemu_init_sigbus();
@@ -937,6 +950,9 @@  void qemu_init_cpu_loop(void)
     qemu_mutex_init(&qemu_global_mutex);
 
     qemu_thread_get_self(&io_thread);
+
+    safe_work = g_array_sized_new(TRUE, TRUE, sizeof(qemu_safe_work_item), 128);
+    qemu_mutex_init(&safe_work_mutex);
 }
 
 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
@@ -997,6 +1013,81 @@  void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
     qemu_cpu_kick(cpu);
 }
 
+/*
+ * Safe work interface
+ *
+ * Safe work is defined as work that requires the system to be
+ * quiescent before making changes.
+ */
+
+void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
+{
+    CPUState *iter;
+    qemu_safe_work_item wi;
+    wi.cpu = cpu;
+    wi.func = func;
+    wi.data = data;
+
+    qemu_mutex_lock(&safe_work_mutex);
+    g_array_append_val(safe_work, wi);
+    atomic_inc(&safe_work_pending);
+    qemu_mutex_unlock(&safe_work_mutex);
+
+    /* Signal all vCPUs to halt */
+    CPU_FOREACH(iter) {
+        qemu_cpu_kick(iter);
+    }
+}
+
+/**
+ * flush_queued_safe_work:
+ *
+ * @scheduled_cpu_count
+ *
+ * If not 0 will signal the other vCPUs and sleep. The last vCPU to
+ * get to the function then drains the queue while the system is in a
+ * quiescent state. This allows the operations to change shared
+ * structures.
+ *
+ * @see async_run_safe_work_on_cpu
+ */
+static void flush_queued_safe_work(int scheduled_cpu_count)
+{
+    qemu_safe_work_item *wi;
+    int i;
+
+    /* bail out if there is nothing to do */
+    if (!async_safe_work_pending()) {
+        return;
+    }
+
+    if (scheduled_cpu_count) {
+
+        /* Nothing to do but sleep */
+        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
+
+    } else {
+
+        /* We can now do the work */
+        qemu_mutex_lock(&safe_work_mutex);
+        for (i = 0; i < safe_work->len; i++) {
+            wi = &g_array_index(safe_work, qemu_safe_work_item, i);
+            wi->func(wi->cpu, wi->data);
+        }
+        g_array_remove_range(safe_work, 0, safe_work->len);
+        atomic_set(&safe_work_pending, 0);
+        qemu_mutex_unlock(&safe_work_mutex);
+
+        /* Wake everyone up */
+        qemu_cond_broadcast(&qemu_work_cond);
+    }
+}
+
+bool async_safe_work_pending(void)
+{
+    return (atomic_read(&safe_work_pending) != 0);
+}
+
 static void flush_queued_work(CPUState *cpu)
 {
     struct qemu_work_item *wi;
@@ -1259,6 +1350,7 @@  static void *qemu_tcg_single_cpu_thread_fn(void *arg)
 
         if (cpu) {
             g_assert(cpu->exit_request);
+            flush_queued_safe_work(0);
             /* Pairs with smp_wmb in qemu_cpu_kick.  */
             atomic_mb_set(&cpu->exit_request, 0);
             qemu_tcg_wait_io_event(cpu);
@@ -1300,8 +1392,13 @@  static void *qemu_tcg_cpu_thread_fn(void *arg)
     while (1) {
         bool sleep = false;
 
-        if (cpu_can_run(cpu)) {
-            int r = tcg_cpu_exec(cpu);
+        if (cpu_can_run(cpu) && !async_safe_work_pending()) {
+            int r;
+
+            atomic_inc(&tcg_scheduled_cpus);
+            r = tcg_cpu_exec(cpu);
+            flush_queued_safe_work(atomic_dec_fetch(&tcg_scheduled_cpus));
+
             switch (r)
             {
             case EXCP_DEBUG:
@@ -1319,6 +1416,7 @@  static void *qemu_tcg_cpu_thread_fn(void *arg)
                 /* Ignore everything else? */
                 break;
             }
+
         } else {
             sleep = true;
         }
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 385d5bb..8ab969e 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -642,6 +642,25 @@  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
 void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
 
 /**
+ * async_safe_run_on_cpu:
+ * @cpu: The vCPU to run on.
+ * @func: The function to be executed.
+ * @data: Data to pass to the function.
+ *
+ * Schedules the function @func for execution on the vCPU @cpu asynchronously
+ * when all the VCPUs are outside their loop.
+ */
+void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
+
+/**
+ * async_safe_work_pending:
+ *
+ * Check whether any safe work is pending on any VCPUs.
+ * Returns: @true if a safe work is pending, @false otherwise.
+ */
+bool async_safe_work_pending(void);
+
+/**
  * qemu_get_cpu:
  * @index: The CPUState@cpu_index value of the CPU to obtain.
  *