Patchwork [v3,05/10] KVM: x86: Restrict writeback of VCPU state

login
register
mail settings
Submitter Jan Kiszka
Date Feb. 24, 2010, 2:17 p.m.
Message ID <09d3dc5c4cc2a36e61d2584b1b706143b9ce7765.1267021065.git.jan.kiszka@siemens.com>
Download mbox | patch
Permalink /patch/46128/
State New
Headers show

Comments

Jan Kiszka - Feb. 24, 2010, 2:17 p.m.
Do not write nmi_pending, sipi_vector, and mpstate unless we at least go
through a reset. And TSC as well as KVM wallclocks should only be
written on full sync; otherwise we risk dropping some time during the
state read-modify-write.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 kvm.h                 |    2 +-
 qemu-kvm-x86.c        |    2 +-
 target-i386/kvm.c     |   32 ++++++++++++++++++++------------
 target-i386/machine.c |    2 +-
 4 files changed, 23 insertions(+), 15 deletions(-)
Marcelo Tosatti - Feb. 24, 2010, 10:59 p.m.
On Wed, Feb 24, 2010 at 03:17:53PM +0100, Jan Kiszka wrote:
> Do not write nmi_pending, sipi_vector, and mpstate unless we at least go
> through a reset. And TSC as well as KVM wallclocks should only be
> written on full sync, otherwise we risk to drop some time on during
> state read-modify-write.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>  kvm.h                 |    2 +-
>  qemu-kvm-x86.c        |    2 +-
>  target-i386/kvm.c     |   32 ++++++++++++++++++++------------
>  target-i386/machine.c |    2 +-
>  4 files changed, 23 insertions(+), 15 deletions(-)
> 
> diff --git a/kvm.h b/kvm.h
> index 3ec5b59..3ee307d 100644
> --- a/kvm.h
> +++ b/kvm.h
> @@ -44,7 +44,7 @@ int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size);
>  int kvm_has_sync_mmu(void);
>  int kvm_has_vcpu_events(void);
>  int kvm_has_robust_singlestep(void);
> -int kvm_put_vcpu_events(CPUState *env);
> +int kvm_put_vcpu_events(CPUState *env, int level);
>  int kvm_get_vcpu_events(CPUState *env);
>  
>  void kvm_cpu_register_phys_memory_client(void);
> diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c
> index 4e6ae70..b0f9670 100644
> --- a/qemu-kvm-x86.c
> +++ b/qemu-kvm-x86.c
> @@ -1391,7 +1391,7 @@ void kvm_arch_push_nmi(void *opaque)
>  void kvm_arch_cpu_reset(CPUState *env)
>  {
>      kvm_arch_reset_vcpu(env);
> -    kvm_put_vcpu_events(env);
> +    kvm_put_vcpu_events(env, KVM_PUT_RESET_STATE);
>      kvm_reset_mpstate(env);
>      if (!cpu_is_bsp(env) && !kvm_irqchip_in_kernel()) {
>          env->interrupt_request &= ~CPU_INTERRUPT_HARD;
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 5f0829b..f1f44d3 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -541,7 +541,7 @@ static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
>      entry->data = value;
>  }
>  
> -static int kvm_put_msrs(CPUState *env)
> +static int kvm_put_msrs(CPUState *env, int level)
>  {
>      struct {
>          struct kvm_msrs info;
> @@ -555,7 +555,6 @@ static int kvm_put_msrs(CPUState *env)
>      kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
>      if (kvm_has_msr_star(env))
>  	kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
> -    kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
>      kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
>  #ifdef TARGET_X86_64
>      /* FIXME if lm capable */
> @@ -564,8 +563,12 @@ static int kvm_put_msrs(CPUState *env)
>      kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
>      kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
>  #endif
> -    kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,  env->system_time_msr);
> -    kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK,  env->wall_clock_msr);
> +    if (level == KVM_PUT_FULL_STATE) {
> +        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
> +        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
> +                          env->system_time_msr);
> +        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
> +    }
>  
>      msr_data.info.nmsrs = n;
>  
> @@ -783,7 +786,7 @@ static int kvm_get_mp_state(CPUState *env)
>  }
>  #endif
>  
> -int kvm_put_vcpu_events(CPUState *env)
> +int kvm_put_vcpu_events(CPUState *env, int level)
>  {
>  #ifdef KVM_CAP_VCPU_EVENTS
>      struct kvm_vcpu_events events;
> @@ -807,8 +810,11 @@ int kvm_put_vcpu_events(CPUState *env)
>  
>      events.sipi_vector = env->sipi_vector;
>  
> -    events.flags =
> -        KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
> +    events.flags = 0;
> +    if (level >= KVM_PUT_RESET_STATE) {
> +        events.flags |=
> +            KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
> +    }
>  
>      return kvm_vcpu_ioctl(env, KVM_SET_VCPU_EVENTS, &events);

What is the reason for write-back of any vcpu-event state for RUNTIME 
case again?

The debug workaround?
Jan Kiszka - Feb. 24, 2010, 11:51 p.m.
Marcelo Tosatti wrote:
> On Wed, Feb 24, 2010 at 03:17:53PM +0100, Jan Kiszka wrote:
>> Do not write nmi_pending, sipi_vector, and mpstate unless we at least go
>> through a reset. And TSC as well as KVM wallclocks should only be
>> written on full sync, otherwise we risk to drop some time on during
>> state read-modify-write.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>  kvm.h                 |    2 +-
>>  qemu-kvm-x86.c        |    2 +-
>>  target-i386/kvm.c     |   32 ++++++++++++++++++++------------
>>  target-i386/machine.c |    2 +-
>>  4 files changed, 23 insertions(+), 15 deletions(-)
>>
>> diff --git a/kvm.h b/kvm.h
>> index 3ec5b59..3ee307d 100644
>> --- a/kvm.h
>> +++ b/kvm.h
>> @@ -44,7 +44,7 @@ int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size);
>>  int kvm_has_sync_mmu(void);
>>  int kvm_has_vcpu_events(void);
>>  int kvm_has_robust_singlestep(void);
>> -int kvm_put_vcpu_events(CPUState *env);
>> +int kvm_put_vcpu_events(CPUState *env, int level);
>>  int kvm_get_vcpu_events(CPUState *env);
>>  
>>  void kvm_cpu_register_phys_memory_client(void);
>> diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c
>> index 4e6ae70..b0f9670 100644
>> --- a/qemu-kvm-x86.c
>> +++ b/qemu-kvm-x86.c
>> @@ -1391,7 +1391,7 @@ void kvm_arch_push_nmi(void *opaque)
>>  void kvm_arch_cpu_reset(CPUState *env)
>>  {
>>      kvm_arch_reset_vcpu(env);
>> -    kvm_put_vcpu_events(env);
>> +    kvm_put_vcpu_events(env, KVM_PUT_RESET_STATE);
>>      kvm_reset_mpstate(env);
>>      if (!cpu_is_bsp(env) && !kvm_irqchip_in_kernel()) {
>>          env->interrupt_request &= ~CPU_INTERRUPT_HARD;
>> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
>> index 5f0829b..f1f44d3 100644
>> --- a/target-i386/kvm.c
>> +++ b/target-i386/kvm.c
>> @@ -541,7 +541,7 @@ static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
>>      entry->data = value;
>>  }
>>  
>> -static int kvm_put_msrs(CPUState *env)
>> +static int kvm_put_msrs(CPUState *env, int level)
>>  {
>>      struct {
>>          struct kvm_msrs info;
>> @@ -555,7 +555,6 @@ static int kvm_put_msrs(CPUState *env)
>>      kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
>>      if (kvm_has_msr_star(env))
>>  	kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
>> -    kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
>>      kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
>>  #ifdef TARGET_X86_64
>>      /* FIXME if lm capable */
>> @@ -564,8 +563,12 @@ static int kvm_put_msrs(CPUState *env)
>>      kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
>>      kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
>>  #endif
>> -    kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,  env->system_time_msr);
>> -    kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK,  env->wall_clock_msr);
>> +    if (level == KVM_PUT_FULL_STATE) {
>> +        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
>> +        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
>> +                          env->system_time_msr);
>> +        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
>> +    }
>>  
>>      msr_data.info.nmsrs = n;
>>  
>> @@ -783,7 +786,7 @@ static int kvm_get_mp_state(CPUState *env)
>>  }
>>  #endif
>>  
>> -int kvm_put_vcpu_events(CPUState *env)
>> +int kvm_put_vcpu_events(CPUState *env, int level)
>>  {
>>  #ifdef KVM_CAP_VCPU_EVENTS
>>      struct kvm_vcpu_events events;
>> @@ -807,8 +810,11 @@ int kvm_put_vcpu_events(CPUState *env)
>>  
>>      events.sipi_vector = env->sipi_vector;
>>  
>> -    events.flags =
>> -        KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
>> +    events.flags = 0;
>> +    if (level >= KVM_PUT_RESET_STATE) {
>> +        events.flags |=
>> +            KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
>> +    }
>>  
>>      return kvm_vcpu_ioctl(env, KVM_SET_VCPU_EVENTS, &events);
> 
> What is the reason for write-back of any vcpu-event state for RUNTIME 
> case again?
> 
> The debug workaround?

Consistency and maximum flexibility.

I don't want to start fiddling with this again when we start to
manipulate some VCPU runtime state that may not require writeback yet
(workarounds like the guest debugging stuff can be a reason for that).
Instead, we should now establish a clean concept that only knows those
three types and their well-defined writeback points.

Jan

Patch

diff --git a/kvm.h b/kvm.h
index 3ec5b59..3ee307d 100644
--- a/kvm.h
+++ b/kvm.h
@@ -44,7 +44,7 @@  int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size);
 int kvm_has_sync_mmu(void);
 int kvm_has_vcpu_events(void);
 int kvm_has_robust_singlestep(void);
-int kvm_put_vcpu_events(CPUState *env);
+int kvm_put_vcpu_events(CPUState *env, int level);
 int kvm_get_vcpu_events(CPUState *env);
 
 void kvm_cpu_register_phys_memory_client(void);
diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c
index 4e6ae70..b0f9670 100644
--- a/qemu-kvm-x86.c
+++ b/qemu-kvm-x86.c
@@ -1391,7 +1391,7 @@  void kvm_arch_push_nmi(void *opaque)
 void kvm_arch_cpu_reset(CPUState *env)
 {
     kvm_arch_reset_vcpu(env);
-    kvm_put_vcpu_events(env);
+    kvm_put_vcpu_events(env, KVM_PUT_RESET_STATE);
     kvm_reset_mpstate(env);
     if (!cpu_is_bsp(env) && !kvm_irqchip_in_kernel()) {
         env->interrupt_request &= ~CPU_INTERRUPT_HARD;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 5f0829b..f1f44d3 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -541,7 +541,7 @@  static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
     entry->data = value;
 }
 
-static int kvm_put_msrs(CPUState *env)
+static int kvm_put_msrs(CPUState *env, int level)
 {
     struct {
         struct kvm_msrs info;
@@ -555,7 +555,6 @@  static int kvm_put_msrs(CPUState *env)
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
     if (kvm_has_msr_star(env))
 	kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
-    kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
     kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
 #ifdef TARGET_X86_64
     /* FIXME if lm capable */
@@ -564,8 +563,12 @@  static int kvm_put_msrs(CPUState *env)
     kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
     kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
 #endif
-    kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,  env->system_time_msr);
-    kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK,  env->wall_clock_msr);
+    if (level == KVM_PUT_FULL_STATE) {
+        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
+        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
+                          env->system_time_msr);
+        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
+    }
 
     msr_data.info.nmsrs = n;
 
@@ -783,7 +786,7 @@  static int kvm_get_mp_state(CPUState *env)
 }
 #endif
 
-int kvm_put_vcpu_events(CPUState *env)
+int kvm_put_vcpu_events(CPUState *env, int level)
 {
 #ifdef KVM_CAP_VCPU_EVENTS
     struct kvm_vcpu_events events;
@@ -807,8 +810,11 @@  int kvm_put_vcpu_events(CPUState *env)
 
     events.sipi_vector = env->sipi_vector;
 
-    events.flags =
-        KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
+    events.flags = 0;
+    if (level >= KVM_PUT_RESET_STATE) {
+        events.flags |=
+            KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
+    }
 
     return kvm_vcpu_ioctl(env, KVM_SET_VCPU_EVENTS, &events);
 #else
@@ -901,15 +907,17 @@  int kvm_arch_put_registers(CPUState *env, int level)
     if (ret < 0)
         return ret;
 
-    ret = kvm_put_msrs(env);
+    ret = kvm_put_msrs(env, level);
     if (ret < 0)
         return ret;
 
-    ret = kvm_put_mp_state(env);
-    if (ret < 0)
-        return ret;
+    if (level >= KVM_PUT_RESET_STATE) {
+        ret = kvm_put_mp_state(env);
+        if (ret < 0)
+            return ret;
+    }
 
-    ret = kvm_put_vcpu_events(env);
+    ret = kvm_put_vcpu_events(env, level);
     if (ret < 0)
         return ret;
 
diff --git a/target-i386/machine.c b/target-i386/machine.c
index 61e6a87..6fca559 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -362,7 +362,7 @@  static int cpu_post_load(void *opaque, int version_id)
 
     if (kvm_enabled()) {
         kvm_load_tsc(env);
-        kvm_put_vcpu_events(env);
+        kvm_put_vcpu_events(env, KVM_PUT_FULL_STATE);
     }
 
     return 0;