Patchwork [2/2] kvm: use per-cpu lock to free vcpu thread out of the big lock

login
register
mail settings
Submitter pingfan liu
Date June 21, 2012, 3:06 p.m.
Message ID <1340291218-11669-3-git-send-email-qemulist@gmail.com>
Download mbox | patch
Permalink /patch/166343/
State New
Headers show

Comments

pingfan liu - June 21, 2012, 3:06 p.m.
In order to break the big lock, using per-cpu_lock in kvm_cpu_exec()
to protect the race from other cpu's access to env->apic_state & related
field in env.
Also, we need to protect agaist run_on_cpu().

Race condition can be like this:
1.  vcpu-1 IPI vcpu-2
    vcpu-3 IPI vcpu-2
    Open window exists for accessing to vcpu-2's apic_state & env

2. run_on_cpu() write env->queued_work_last, while flush_queued_work()
   read

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 cpus.c    |    6 ++++--
 hw/apic.c |   58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 hw/pc.c   |    8 +++++++-
 kvm-all.c |   13 +++++++++++--
 4 files changed, 76 insertions(+), 9 deletions(-)
陳韋任 - June 22, 2012, 2:29 a.m.
Hi Liu,

On Thu, Jun 21, 2012 at 11:06:58PM +0800, Liu Ping Fan wrote:
> In order to break the big lock, using per-cpu_lock in kvm_cpu_exec()
> to protect the race from other cpu's access to env->apic_state & related
> field in env.

  Can this also be applied on tcg_cpu_exec(), too?

Regards,
chenwj
pingfan liu - June 22, 2012, 10:26 a.m.
On Fri, Jun 22, 2012 at 10:29 AM, 陳韋任 (Wei-Ren Chen)
<chenwj@iis.sinica.edu.tw> wrote:
> Hi Liu,
>
> On Thu, Jun 21, 2012 at 11:06:58PM +0800, Liu Ping Fan wrote:
>> In order to break the big lock, using per-cpu_lock in kvm_cpu_exec()
>> to protect the race from other cpu's access to env->apic_state & related
>> field in env.
>
>  Can this also be applied on tcg_cpu_exec(), too?
>
No, currently, I am focusing on kvm branch

Regards,
pingfan

> Regards,
> chenwj
>
> --
> Wei-Ren Chen (陳韋任)
> Computer Systems Lab, Institute of Information Science,
> Academia Sinica, Taiwan (R.O.C.)
> Tel:886-2-2788-3799 #1667
> Homepage: http://people.cs.nctu.edu.tw/~chenwj

Patch

diff --git a/cpus.c b/cpus.c
index 554f7bc..ac99afe 100644
--- a/cpus.c
+++ b/cpus.c
@@ -649,6 +649,7 @@  void run_on_cpu(CPUArchState *env, void (*func)(void *data), void *data)
 
     wi.func = func;
     wi.data = data;
+    qemu_mutex_lock(env->cpu_lock);
     if (!env->queued_work_first) {
         env->queued_work_first = &wi;
     } else {
@@ -657,6 +658,7 @@  void run_on_cpu(CPUArchState *env, void (*func)(void *data), void *data)
     env->queued_work_last = &wi;
     wi.next = NULL;
     wi.done = false;
+    qemu_mutex_unlock(env->cpu_lock);
 
     qemu_cpu_kick(env);
     while (!wi.done) {
@@ -718,7 +720,7 @@  static void qemu_tcg_wait_io_event(void)
 static void qemu_kvm_wait_io_event(CPUArchState *env)
 {
     while (cpu_thread_is_idle(env)) {
-        qemu_cond_wait(env->halt_cond, &qemu_global_mutex);
+        qemu_cond_wait(env->halt_cond, env->cpu_lock);
     }
 
     qemu_kvm_eat_signals(env);
@@ -729,8 +731,8 @@  static void *qemu_kvm_cpu_thread_fn(void *arg)
 {
     CPUArchState *env = arg;
     int r;
+    qemu_mutex_lock_cpu(env);
 
-    qemu_mutex_lock(&qemu_global_mutex);
     qemu_thread_get_self(env->thread);
     env->thread_id = qemu_get_thread_id();
     cpu_single_env = env;
diff --git a/hw/apic.c b/hw/apic.c
index 4eeaf88..b999a40 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -22,6 +22,7 @@ 
 #include "host-utils.h"
 #include "trace.h"
 #include "pc.h"
+#include "qemu-thread.h"
 
 #define MAX_APIC_WORDS 8
 
@@ -94,6 +95,7 @@  static int get_highest_priority_int(uint32_t *tab)
     return -1;
 }
 
+/* Caller must hold the lock */
 static void apic_sync_vapic(APICCommonState *s, int sync_type)
 {
     VAPICState vapic_state;
@@ -141,11 +143,13 @@  static void apic_sync_vapic(APICCommonState *s, int sync_type)
     }
 }
 
+/* Caller must hold lock */
 static void apic_vapic_base_update(APICCommonState *s)
 {
     apic_sync_vapic(s, SYNC_TO_VAPIC);
 }
 
+/* Caller must hold the lock */
 static void apic_local_deliver(APICCommonState *s, int vector)
 {
     uint32_t lvt = s->lvt[vector];
@@ -175,9 +179,11 @@  static void apic_local_deliver(APICCommonState *s, int vector)
             (lvt & APIC_LVT_LEVEL_TRIGGER))
             trigger_mode = APIC_TRIGGER_LEVEL;
         apic_set_irq(s, lvt & 0xff, trigger_mode);
+        break;
     }
 }
 
+/* Caller must hold the lock */
 void apic_deliver_pic_intr(DeviceState *d, int level)
 {
     APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
@@ -200,9 +206,12 @@  void apic_deliver_pic_intr(DeviceState *d, int level)
     }
 }
 
+/* Must hold lock */
 static void apic_external_nmi(APICCommonState *s)
 {
+    qemu_mutex_lock_cpu(s->cpu_env);
     apic_local_deliver(s, APIC_LVT_LINT1);
+    qemu_mutex_unlock_cpu(s->cpu_env);
 }
 
 #define foreach_apic(apic, deliver_bitmask, code) \
@@ -215,7 +224,9 @@  static void apic_external_nmi(APICCommonState *s)
                 if (__mask & (1 << __j)) {\
                     apic = local_apics[__i * 32 + __j];\
                     if (apic) {\
+                        qemu_mutex_lock_cpu(apic->cpu_env);\
                         code;\
+                        qemu_mutex_unlock_cpu(apic->cpu_env);\
                     }\
                 }\
             }\
@@ -244,7 +255,9 @@  static void apic_bus_deliver(const uint32_t *deliver_bitmask,
                 if (d >= 0) {
                     apic_iter = local_apics[d];
                     if (apic_iter) {
+                        qemu_mutex_lock_cpu(apic_iter->cpu_env);
                         apic_set_irq(apic_iter, vector_num, trigger_mode);
+                        qemu_mutex_unlock_cpu(apic_iter->cpu_env);
                     }
                 }
             }
@@ -293,6 +306,7 @@  void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode,
     apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num, trigger_mode);
 }
 
+/* Caller must hold lock */
 static void apic_set_base(APICCommonState *s, uint64_t val)
 {
     s->apicbase = (val & 0xfffff000) |
@@ -305,6 +319,7 @@  static void apic_set_base(APICCommonState *s, uint64_t val)
     }
 }
 
+/* caller must hold lock */
 static void apic_set_tpr(APICCommonState *s, uint8_t val)
 {
     /* Updates from cr8 are ignored while the VAPIC is active */
@@ -314,12 +329,14 @@  static void apic_set_tpr(APICCommonState *s, uint8_t val)
     }
 }
 
+/* caller must hold lock */
 static uint8_t apic_get_tpr(APICCommonState *s)
 {
     apic_sync_vapic(s, SYNC_FROM_VAPIC);
     return s->tpr >> 4;
 }
 
+/* Caller must hold the lock */
 static int apic_get_ppr(APICCommonState *s)
 {
     int tpr, isrv, ppr;
@@ -348,6 +365,7 @@  static int apic_get_arb_pri(APICCommonState *s)
  * 0  - no interrupt,
  * >0 - interrupt number
  */
+/* Caller must hold cpu_lock */
 static int apic_irq_pending(APICCommonState *s)
 {
     int irrv, ppr;
@@ -363,6 +381,7 @@  static int apic_irq_pending(APICCommonState *s)
     return irrv;
 }
 
+/* caller must ensure the lock has been taken */
 /* signal the CPU if an irq is pending */
 static void apic_update_irq(APICCommonState *s)
 {
@@ -380,11 +399,13 @@  static void apic_update_irq(APICCommonState *s)
 void apic_poll_irq(DeviceState *d)
 {
     APICCommonState *s = APIC_COMMON(d);
-
+    qemu_mutex_lock_cpu(s->cpu_env);
     apic_sync_vapic(s, SYNC_FROM_VAPIC);
     apic_update_irq(s);
+    qemu_mutex_unlock_cpu(s->cpu_env);
 }
 
+/* caller must hold the lock */
 static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode)
 {
     apic_report_irq_delivered(!get_bit(s->irr, vector_num));
@@ -407,6 +428,7 @@  static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode)
     apic_update_irq(s);
 }
 
+/* caller must hold the lock */
 static void apic_eoi(APICCommonState *s)
 {
     int isrv;
@@ -415,6 +437,7 @@  static void apic_eoi(APICCommonState *s)
         return;
     reset_bit(s->isr, isrv);
     if (!(s->spurious_vec & APIC_SV_DIRECTED_IO) && get_bit(s->tmr, isrv)) {
+        //fix me,need to take extra lock for the bus?
         ioapic_eoi_broadcast(isrv);
     }
     apic_sync_vapic(s, SYNC_FROM_VAPIC | SYNC_TO_VAPIC);
@@ -480,18 +503,23 @@  static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
 static void apic_startup(APICCommonState *s, int vector_num)
 {
     s->sipi_vector = vector_num;
+    qemu_mutex_lock_cpu(s->cpu_env);
     cpu_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI);
+    qemu_mutex_unlock_cpu(s->cpu_env);
 }
 
 void apic_sipi(DeviceState *d)
 {
     APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
-
+    qemu_mutex_lock_cpu(s->cpu_env);
     cpu_reset_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI);
 
-    if (!s->wait_for_sipi)
+    if (!s->wait_for_sipi) {
+        qemu_mutex_unlock_cpu(s->cpu_env);
         return;
+    }
     cpu_x86_load_seg_cache_sipi(s->cpu_env, s->sipi_vector);
+    qemu_mutex_unlock_cpu(s->cpu_env);
     s->wait_for_sipi = 0;
 }
 
@@ -543,6 +571,7 @@  static void apic_deliver(DeviceState *d, uint8_t dest, uint8_t dest_mode,
     apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num, trigger_mode);
 }
 
+/* Caller must take lock */
 int apic_get_interrupt(DeviceState *d)
 {
     APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
@@ -572,6 +601,7 @@  int apic_get_interrupt(DeviceState *d)
     return intno;
 }
 
+/* Caller must hold the cpu_lock */
 int apic_accept_pic_intr(DeviceState *d)
 {
     APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
@@ -589,6 +619,7 @@  int apic_accept_pic_intr(DeviceState *d)
     return 0;
 }
 
+/* Caller hold lock */
 static uint32_t apic_get_current_count(APICCommonState *s)
 {
     int64_t d;
@@ -619,9 +650,10 @@  static void apic_timer_update(APICCommonState *s, int64_t current_time)
 static void apic_timer(void *opaque)
 {
     APICCommonState *s = opaque;
-
+    qemu_mutex_lock_cpu(s->cpu_env);
     apic_local_deliver(s, APIC_LVT_TIMER);
     apic_timer_update(s, s->next_time);
+    qemu_mutex_unlock_cpu(s->cpu_env);
 }
 
 static uint32_t apic_mem_readb(void *opaque, target_phys_addr_t addr)
@@ -664,18 +696,22 @@  static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
         val = 0x11 | ((APIC_LVT_NB - 1) << 16); /* version 0x11 */
         break;
     case 0x08:
+        qemu_mutex_lock_cpu(s->cpu_env);
         apic_sync_vapic(s, SYNC_FROM_VAPIC);
         if (apic_report_tpr_access) {
             cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_READ);
         }
         val = s->tpr;
+        qemu_mutex_unlock_cpu(s->cpu_env);
         break;
     case 0x09:
         val = apic_get_arb_pri(s);
         break;
     case 0x0a:
         /* ppr */
+        qemu_mutex_lock_cpu(s->cpu_env);
         val = apic_get_ppr(s);
+        qemu_mutex_unlock_cpu(s->cpu_env);
         break;
     case 0x0b:
         val = 0;
@@ -712,7 +748,9 @@  static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
         val = s->initial_count;
         break;
     case 0x39:
+        qemu_mutex_lock_cpu(s->cpu_env);
         val = apic_get_current_count(s);
+        qemu_mutex_unlock_cpu(s->cpu_env);
         break;
     case 0x3e:
         val = s->divide_conf;
@@ -767,18 +805,22 @@  static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
     case 0x03:
         break;
     case 0x08:
+       qemu_mutex_lock_cpu(s->cpu_env);
         if (apic_report_tpr_access) {
             cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_WRITE);
         }
         s->tpr = val;
         apic_sync_vapic(s, SYNC_TO_VAPIC);
         apic_update_irq(s);
+        qemu_mutex_unlock_cpu(s->cpu_env);
         break;
     case 0x09:
     case 0x0a:
         break;
     case 0x0b: /* EOI */
+        qemu_mutex_lock_cpu(s->cpu_env);
         apic_eoi(s);
+        qemu_mutex_unlock_cpu(s->cpu_env);
         break;
     case 0x0d:
         s->log_dest = val >> 24;
@@ -787,8 +829,10 @@  static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
         s->dest_mode = val >> 28;
         break;
     case 0x0f:
+        qemu_mutex_lock_cpu(s->cpu_env);
         s->spurious_vec = val & 0x1ff;
         apic_update_irq(s);
+        qemu_mutex_unlock_cpu(s->cpu_env);
         break;
     case 0x10 ... 0x17:
     case 0x18 ... 0x1f:
@@ -807,24 +851,30 @@  static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
     case 0x32 ... 0x37:
         {
             int n = index - 0x32;
+            qemu_mutex_lock_cpu(s->cpu_env);
             s->lvt[n] = val;
             if (n == APIC_LVT_TIMER)
                 apic_timer_update(s, qemu_get_clock_ns(vm_clock));
+            qemu_mutex_unlock_cpu(s->cpu_env);
         }
         break;
     case 0x38:
+       qemu_mutex_lock_cpu(s->cpu_env);
         s->initial_count = val;
         s->initial_count_load_time = qemu_get_clock_ns(vm_clock);
         apic_timer_update(s, s->initial_count_load_time);
+        qemu_mutex_unlock_cpu(s->cpu_env);
         break;
     case 0x39:
         break;
     case 0x3e:
         {
             int v;
+            qemu_mutex_lock_cpu(s->cpu_env);
             s->divide_conf = val & 0xb;
             v = (s->divide_conf & 3) | ((s->divide_conf >> 1) & 4);
             s->count_shift = (v + 1) & 7;
+            qemu_mutex_unlock_cpu(s->cpu_env);
         }
         break;
     default:
diff --git a/hw/pc.c b/hw/pc.c
index 4d34a33..5e7350c 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -147,7 +147,7 @@  void cpu_smm_update(CPUX86State *env)
         smm_set(!!(env->hflags & HF_SMM_MASK), smm_arg);
 }
 
-
+/* Called by kvm_cpu_exec(), already with lock protection */
 /* IRQ handling */
 int cpu_get_pic_interrupt(CPUX86State *env)
 {
@@ -173,9 +173,15 @@  static void pic_irq_request(void *opaque, int irq, int level)
     DPRINTF("pic_irqs: %s irq %d\n", level? "raise" : "lower", irq);
     if (env->apic_state) {
         while (env) {
+            if (!qemu_cpu_is_self(env)) {
+                qemu_mutex_lock_cpu(env);
+            }
             if (apic_accept_pic_intr(env->apic_state)) {
                 apic_deliver_pic_intr(env->apic_state, level);
             }
+            if (!qemu_cpu_is_self(env)) {
+                qemu_mutex_unlock_cpu(env);
+            }
             env = env->next_cpu;
         }
     } else {
diff --git a/kvm-all.c b/kvm-all.c
index 9b73ccf..dc9aa54 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -29,6 +29,7 @@ 
 #include "bswap.h"
 #include "memory.h"
 #include "exec-memory.h"
+#include "qemu-thread.h"
 
 /* This check must be after config-host.h is included */
 #ifdef CONFIG_EVENTFD
@@ -1252,12 +1253,15 @@  int kvm_cpu_exec(CPUArchState *env)
              */
             qemu_cpu_kick_self();
         }
-        qemu_mutex_unlock_iothread();
+        qemu_mutex_unlock(env->cpu_lock);
 
         run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
 
-        qemu_mutex_lock_iothread();
+        qemu_mutex_lock(env->cpu_lock);
         kvm_arch_post_run(env, run);
+        qemu_mutex_unlock_cpu(env);
+
+        qemu_mutex_lock_iothread();
 
         kvm_flush_coalesced_mmio_buffer();
 
@@ -1265,6 +1269,8 @@  int kvm_cpu_exec(CPUArchState *env)
             if (run_ret == -EINTR || run_ret == -EAGAIN) {
                 DPRINTF("io window exit\n");
                 ret = EXCP_INTERRUPT;
+                qemu_mutex_unlock_iothread();
+                qemu_mutex_lock_cpu(env);
                 break;
             }
             fprintf(stderr, "error: kvm run failed %s\n",
@@ -1312,6 +1318,9 @@  int kvm_cpu_exec(CPUArchState *env)
             ret = kvm_arch_handle_exit(env, run);
             break;
         }
+
+        qemu_mutex_unlock_iothread();
+        qemu_mutex_lock_cpu(env);
     } while (ret == 0);
 
     if (ret < 0) {