Patchwork [4/7] qemu_flush_work for remote vcpu execution

login
register
mail settings
Submitter Glauber Costa
Date Nov. 26, 2009, 5:24 p.m.
Message ID <1259256300-23937-5-git-send-email-glommer@redhat.com>
Download mbox | patch
Permalink /patch/39576/
State New
Headers show

Comments

Glauber Costa - Nov. 26, 2009, 5:24 p.m.
This function is similar to qemu-kvm's on_vcpu mechanism. Totally synchronous,
and guarantees that a given function will be executed at the specified vcpu.

The approach I am taking is to put it under the hood, in kvm_vcpu_ioctl.
This way, the kvm_vcpu_ioctl can be used anywhere, and we guarantee it will never
be executed outside its realm.

This is not much of a problem, since remote execution is rare. It does happen at
lot at machine bootup, because saving/restoring registers spans a lot of ioctls,
but this should get better if we move to Jan's patch of doing it all at once.

Signed-off-by: Glauber Costa <glommer@redhat.com>
---
 cpu-all.h  |    3 ++
 cpu-defs.h |   14 +++++++++++
 kvm-all.c  |   75 ++++++++++++++++++++++++-----------------------------------
 kvm.h      |    7 +++++
 vl.c       |   63 ++++++++++++++++++++++++++++++++++++++++++++++----
 5 files changed, 113 insertions(+), 49 deletions(-)
Avi Kivity - Nov. 29, 2009, 3:32 p.m.
On 11/26/2009 07:24 PM, Glauber Costa wrote:
> This function is similar to qemu-kvm's on_vcpu mechanism. Totally synchronous,
> and guarantees that a given function will be executed at the specified vcpu.
>
> The approach I am taking is to put it under the hood, in kvm_vcpu_ioctl.
> This way, the kvm_vcpu_ioctl can be used anywhere, and we guarantee it will never
> be executed outside its realm.
>
> This is not much of a problem, since remote execution is rare. It does happen at
> lot at machine bootup, because saving/restoring registers spans a lot of ioctls,
> but this should get better if we move to Jan's patch of doing it all at once.
>    

I really dislike this.  In general vcpu ioctls are used as components of 
some work to be done, for example RMW of some state.  In this case it is 
meaningless to execute the ioctls remotely, you need to execute the 
entire RMW remotely instead.
Glauber Costa - Nov. 30, 2009, 11:44 a.m.
>
> I really dislike this.  In general vcpu ioctls are used as components of
> some work to be done, for example RMW of some state.  In this case it is
> meaningless to execute the ioctls remotely, you need to execute the entire
> RMW remotely instead.
>

Why? The "M" part of RMW is executed in shared memory.
Only the R and W parts have any restrictions on where to be executed.
Paolo Bonzini - Nov. 30, 2009, 11:48 a.m.
On 11/26/2009 06:24 PM, Glauber Costa wrote:
> +CPUState *qemu_get_current_env(void);

Useless forward reference (to a function that doesn't exist in that file 
at all).

Paolo
Avi Kivity - Nov. 30, 2009, 12:06 p.m.
On 11/30/2009 01:44 PM, Glauber Costa wrote:
>> I really dislike this.  In general vcpu ioctls are used as components of
>> some work to be done, for example RMW of some state.  In this case it is
>> meaningless to execute the ioctls remotely, you need to execute the entire
>> RMW remotely instead.
>>
>>      
> Why? The "M" part of RMW is executed in shared memory.
> Only the R and W parts have any restrictions on where to be executed.
>
>    

If the guest continues to run during the RMW, you will get inconsistent 
results.

This may be prevented by qemu_mutex, but I'd rather not rely on it.

Also, I'd like remote operations to be visible.

Patch

diff --git a/cpu-all.h b/cpu-all.h
index ebe8bfb..3eb865d 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -763,6 +763,9 @@  extern CPUState *cpu_single_env;
 extern int64_t qemu_icount;
 extern int use_icount;
 
+void qemu_queue_work(CPUState *env, void (*func)(void *data), void *data);
+void qemu_flush_work(CPUState *env);
+
 #define CPU_INTERRUPT_HARD   0x02 /* hardware interrupt pending */
 #define CPU_INTERRUPT_EXITTB 0x04 /* exit the current TB (use for x86 a20 case) */
 #define CPU_INTERRUPT_TIMER  0x08 /* internal timer exception pending */
diff --git a/cpu-defs.h b/cpu-defs.h
index 95068b5..18792fc 100644
--- a/cpu-defs.h
+++ b/cpu-defs.h
@@ -31,6 +31,8 @@ 
 #include "qemu-queue.h"
 #include "targphys.h"
 
+#include "qemu-thread.h"
+
 #ifndef TARGET_LONG_BITS
 #error TARGET_LONG_BITS must be defined before including this header
 #endif
@@ -134,6 +136,13 @@  typedef struct CPUWatchpoint {
     QTAILQ_ENTRY(CPUWatchpoint) entry;
 } CPUWatchpoint;
 
+typedef struct QemuWorkItem {
+    void (*func)(void *data);
+    void *data;
+    int done;
+} QemuWorkItem;
+
+
 #define CPU_TEMP_BUF_NLONGS 128
 #define CPU_COMMON                                                      \
     struct TranslationBlock *current_tb; /* currently executing TB  */  \
@@ -175,6 +184,10 @@  typedef struct CPUWatchpoint {
     QTAILQ_HEAD(watchpoints_head, CPUWatchpoint) watchpoints;            \
     CPUWatchpoint *watchpoint_hit;                                      \
                                                                         \
+    QemuWorkItem queued_work;                                           \
+    uint64_t queued_local, queued_total;                                \
+    struct QemuMutex queue_lock;                                        \
+                                                                        \
     struct GDBRegisterState *gdb_regs;                                  \
                                                                         \
     /* Core interrupt code */                                           \
@@ -194,6 +207,7 @@  typedef struct CPUWatchpoint {
     uint32_t created;                                                   \
     struct QemuThread *thread;                                          \
     struct QemuCond *halt_cond;                                         \
+    struct QemuCond work_cond;                                          \
     const char *cpu_model_str;                                          \
     struct KVMState *kvm_state;                                         \
     struct kvm_run *kvm_run;                                            \
diff --git a/kvm-all.c b/kvm-all.c
index 0c0b3c3..28f7ab7 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -636,7 +636,7 @@  int kvm_cpu_exec(CPUState *env)
     struct kvm_run *run = env->kvm_run;
     int ret;
 
-    dprintf("kvm_cpu_exec()\n");
+    dprintf("kvm_cpu_exec() %d\n", env->cpu_index);
 
     do {
         if (env->exit_request) {
@@ -904,9 +904,22 @@  int kvm_vm_ioctl(KVMState *s, int type, ...)
     return ret;
 }
 
-int kvm_vcpu_ioctl(CPUState *env, int type, ...)
+CPUState *qemu_get_current_env(void);
+static void kvm_remote_ioctl(void *data)
 {
+    KVMIoctl *arg = data;
     int ret;
+
+    ret = ioctl(arg->fd, arg->type, arg->data);
+    if (ret == -1)
+        ret = -errno;
+    arg->ret = ret;
+}
+
+KVMIoctl data;
+
+int kvm_vcpu_ioctl(CPUState *env, int type, ...)
+{
     void *arg;
     va_list ap;
 
@@ -914,11 +927,12 @@  int kvm_vcpu_ioctl(CPUState *env, int type, ...)
     arg = va_arg(ap, void *);
     va_end(ap);
 
-    ret = ioctl(env->kvm_fd, type, arg);
-    if (ret == -1)
-        ret = -errno;
+    data.type = type;
+    data.data = arg;
+    data.fd = env->kvm_fd;
 
-    return ret;
+    qemu_queue_work(env, kvm_remote_ioctl, (void *)&data);
+    return data.ret;
 }
 
 int kvm_has_sync_mmu(void)
@@ -951,19 +965,6 @@  void kvm_setup_guest_memory(void *start, size_t size)
 }
 
 #ifdef KVM_CAP_SET_GUEST_DEBUG
-static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
-{
-#ifdef CONFIG_IOTHREAD
-    if (env == cpu_single_env) {
-        func(data);
-        return;
-    }
-    abort();
-#else
-    func(data);
-#endif
-}
-
 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
                                                  target_ulong pc)
 {
@@ -981,38 +982,24 @@  int kvm_sw_breakpoints_active(CPUState *env)
     return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
 }
 
-struct kvm_set_guest_debug_data {
-    struct kvm_guest_debug dbg;
-    CPUState *env;
-    int err;
-};
-
-static void kvm_invoke_set_guest_debug(void *data)
+int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
 {
-    struct kvm_set_guest_debug_data *dbg_data = data;
-    CPUState *env = dbg_data->env;
+    struct kvm_guest_debug dbg;
+  
+    dbg.control = 0;
+    if (env->singlestep_enabled)
+        dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
+  
+    kvm_arch_update_guest_debug(env, &dbg);
+    dbg.control |= reinject_trap;
 
     if (env->kvm_state->regs_modified) {
         kvm_arch_put_registers(env);
         env->kvm_state->regs_modified = 0;
     }
-    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
-}
-
-int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
-{
-    struct kvm_set_guest_debug_data data;
-
-    data.dbg.control = 0;
-    if (env->singlestep_enabled)
-        data.dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
-
-    kvm_arch_update_guest_debug(env, &data.dbg);
-    data.dbg.control |= reinject_trap;
-    data.env = env;
+ 
+    return kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg);
 
-    on_vcpu(env, kvm_invoke_set_guest_debug, &data);
-    return data.err;
 }
 
 int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
diff --git a/kvm.h b/kvm.h
index a474d95..d42b720 100644
--- a/kvm.h
+++ b/kvm.h
@@ -149,4 +149,11 @@  static inline void cpu_synchronize_state(CPUState *env)
     }
 }
 
+typedef struct KVMIoctl {
+    int fd;
+    int type;
+    int ret;
+    void *data;
+} KVMIoctl;
+
 #endif
diff --git a/vl.c b/vl.c
index 9afe4b6..91e2264 100644
--- a/vl.c
+++ b/vl.c
@@ -3405,6 +3405,11 @@  void qemu_notify_event(void)
     }
 }
 
+void qemu_queue_work(CPUState *env, void (*func)(void *data), void *data, int wait)
+{
+        func(data);
+}
+
 void qemu_mutex_lock_iothread(void) {}
 void qemu_mutex_unlock_iothread(void) {}
 
@@ -3438,8 +3443,7 @@  static int tcg_has_work(void);
 
 static pthread_key_t current_env;
 
-CPUState *qemu_get_current_env(void);
-CPUState *qemu_get_current_env(void)
+static CPUState *qemu_get_current_env(void)
 {
     return pthread_getspecific(current_env);
 }
@@ -3476,8 +3480,10 @@  static int qemu_init_main_loop(void)
 
 static void qemu_wait_io_event(CPUState *env)
 {
-    while (!tcg_has_work())
+    while (!tcg_has_work()) {
+	qemu_flush_work(env);
         qemu_cond_timedwait(env->halt_cond, &qemu_global_mutex, 1000);
+    }
 
     qemu_mutex_unlock(&qemu_global_mutex);
 
@@ -3490,6 +3496,7 @@  static void qemu_wait_io_event(CPUState *env)
     qemu_mutex_unlock(&qemu_fair_mutex);
 
     qemu_mutex_lock(&qemu_global_mutex);
+
     if (env->stop) {
         env->stop = 0;
         env->stopped = 1;
@@ -3516,8 +3523,11 @@  static void *kvm_cpu_thread_fn(void *arg)
     qemu_cond_signal(&qemu_cpu_cond);
 
     /* and wait for machine initialization */
-    while (!qemu_system_ready)
+    while (!qemu_system_ready) {
+        /* system reset and initialization is a heavy caller of queue_work() */
+        qemu_flush_work(env);
         qemu_cond_timedwait(&qemu_system_cond, &qemu_global_mutex, 100);
+    }
 
     while (1) {
         if (cpu_can_run(env))
@@ -3544,8 +3554,10 @@  static void *tcg_cpu_thread_fn(void *arg)
     qemu_cond_signal(&qemu_cpu_cond);
 
     /* and wait for machine initialization */
-    while (!qemu_system_ready)
+    while (!qemu_system_ready) {
+        qemu_flush_work(env);
         qemu_cond_timedwait(&qemu_system_cond, &qemu_global_mutex, 100);
+    }
 
     while (1) {
         tcg_cpu_exec();
@@ -3563,6 +3575,45 @@  void qemu_cpu_kick(void *_env)
         qemu_thread_signal(env->thread, SIGUSR1);
 }
 
+void qemu_queue_work(CPUState *env, void (*func)(void *data), void *data)
+{
+    QemuWorkItem *wii;
+
+    env->queued_total++;
+
+    if (env == qemu_get_current_env()) {
+        env->queued_local++;
+        func(data);
+        return;
+    }
+
+    wii = &env->queued_work;
+    wii->func = func;
+    wii->data = data;
+    wii->done = 0;
+
+    qemu_thread_signal(env->thread, SIGUSR1);
+
+    while (!wii->done) {
+        qemu_cond_wait(&env->work_cond, &qemu_global_mutex);
+    }
+
+    wii->func = NULL;
+}
+
+void qemu_flush_work(CPUState *env)
+{
+    QemuWorkItem *wi;
+
+    wi = &env->queued_work;
+    if (wi->func) {
+     //   printf("executing func %p\n", wi->func);
+        wi->func(wi->data);
+        wi->done = 1;
+    }
+    qemu_cond_broadcast(&env->work_cond);
+}
+
 int qemu_cpu_self(void *_env)
 {
     CPUState *env = _env;
@@ -3721,6 +3772,8 @@  void qemu_init_vcpu(void *_env)
 {
     CPUState *env = _env;
 
+    qemu_cond_init(&env->work_cond);
+
     if (kvm_enabled())
         kvm_start_vcpu(env);
     else