Patchwork [4/7,v2] KVM regsync: Add register bitmap parameter to do_kvm_cpu_synchronize_state

login
register
mail settings
Submitter Jason J. Herne
Date Jan. 10, 2013, 3:29 p.m.
Message ID <1357831744-3950-1-git-send-email-jjherne@us.ibm.com>
Download mbox | patch
Permalink /patch/211107/
State New
Headers show

Comments

Jason J. Herne - Jan. 10, 2013, 3:29 p.m.
From: "Jason J. Herne" <jjherne@us.ibm.com>

do_kvm_cpu_synchronize_state is called via run_on_cpu, so we can only pass
a single argument.  Create SyncStateArgs struct for this purpose and add
register bitmap data member to it.

Signed-off-by: Jason J. Herne <jjherne@us.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 include/sysemu/kvm.h |    6 ++++++
 kvm-all.c            |   27 +++++++++++++++++----------
 2 files changed, 23 insertions(+), 10 deletions(-)
Blue Swirl - Jan. 12, 2013, 11:08 a.m.
On Thu, Jan 10, 2013 at 3:29 PM, Jason J. Herne <jjherne@us.ibm.com> wrote:
> From: "Jason J. Herne" <jjherne@us.ibm.com>
>
> do_kvm_cpu_synchronize_state is called via run_on_cpu, so we can only pass
> a single argument.  Create SyncStateArgs struct for this purpose and add
> register bitmap data member to it.
>
> Signed-off-by: Jason J. Herne <jjherne@us.ibm.com>
> Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  include/sysemu/kvm.h |    6 ++++++
>  kvm-all.c            |   27 +++++++++++++++++----------
>  2 files changed, 23 insertions(+), 10 deletions(-)
>
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index e0738ba..193d1f4 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -223,6 +223,12 @@ int kvm_check_extension(KVMState *s, unsigned int extension);
>
>  uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function,
>                                        uint32_t index, int reg);
> +
> +struct kvm_cpu_syncstate_args {

KVMCPUSyncStateArgs, don't forget the typedef.

> +    CPUState *cpu;
> +    int regmap;
> +};
> +
>  void kvm_cpu_synchronize_state(CPUArchState *env);
>  void kvm_cpu_synchronize_post_reset(CPUArchState *env);
>  void kvm_cpu_synchronize_post_init(CPUArchState *env);
> diff --git a/kvm-all.c b/kvm-all.c
> index 1aa61bb..77ab72a 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -231,7 +231,7 @@ int kvm_init_vcpu(CPUArchState *env)
>
>      cpu->kvm_fd = ret;
>      cpu->kvm_state = s;
> -    cpu->kvm_vcpu_dirty = true;
> +    cpu->kvm_vcpu_dirty = KVM_REGSYNC_FULL_STATE;
>
>      mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>      if (mmap_size < 0) {
> @@ -1491,20 +1491,27 @@ void kvm_flush_coalesced_mmio_buffer(void)
>
>  static void do_kvm_cpu_synchronize_state(void *arg)
>  {
> -    CPUState *cpu = arg;
> +    struct kvm_cpu_syncstate_args *args = arg;
>
> -    if (!cpu->kvm_vcpu_dirty) {
> -        kvm_arch_get_registers(cpu, KVM_REGSYNC_FULL_STATE);
> -        cpu->kvm_vcpu_dirty = true;
> +    /* Do not sync regs that are already dirty */
> +    int regs_to_get = args->regmap & ~args->cpu->kvm_vcpu_dirty;
> +
> +    if (regs_to_get) {
> +        kvm_arch_get_registers(args->cpu, regs_to_get);
> +        args->cpu->kvm_vcpu_dirty |= regs_to_get;
>      }
>  }
>
>  void kvm_cpu_synchronize_state(CPUArchState *env)
>  {
>      CPUState *cpu = ENV_GET_CPU(env);
> +    struct kvm_cpu_syncstate_args args;
> +
> +    args.cpu = cpu;
> +    args.regmap = KVM_REGSYNC_FULL_STATE;
>
> -    if (!cpu->kvm_vcpu_dirty) {
> -        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
> +    if (args.regmap & ~cpu->kvm_vcpu_dirty) {
> +        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, &args);
>      }
>  }
>
> @@ -1513,7 +1520,7 @@ void kvm_cpu_synchronize_post_reset(CPUArchState *env)
>      CPUState *cpu = ENV_GET_CPU(env);
>
>      kvm_arch_put_registers(cpu, KVM_REGSYNC_RESET_STATE);
> -    cpu->kvm_vcpu_dirty = false;
> +    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RESET_STATE;
>  }
>
>  void kvm_cpu_synchronize_post_init(CPUArchState *env)
> @@ -1521,7 +1528,7 @@ void kvm_cpu_synchronize_post_init(CPUArchState *env)
>      CPUState *cpu = ENV_GET_CPU(env);
>
>      kvm_arch_put_registers(cpu, KVM_REGSYNC_FULL_STATE);
> -    cpu->kvm_vcpu_dirty = false;
> +    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_FULL_STATE;
>  }
>
>  int kvm_cpu_exec(CPUArchState *env)
> @@ -1540,7 +1547,7 @@ int kvm_cpu_exec(CPUArchState *env)
>      do {
>          if (cpu->kvm_vcpu_dirty) {
>              kvm_arch_put_registers(cpu, KVM_REGSYNC_RUNTIME_STATE);
> -            cpu->kvm_vcpu_dirty = false;
> +            cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RUNTIME_STATE;
>          }
>
>          kvm_arch_pre_run(cpu, run);
> --
> 1.7.9.5
>
>
Bharat Bhushan - Jan. 14, 2013, 11:32 a.m.
> -----Original Message-----
> From: Jason J. Herne [mailto:jjherne@us.ibm.com]
> Sent: Thursday, January 10, 2013 8:59 PM
> To: agraf@suse.de; borntraeger@de.ibm.com; aliguori@us.ibm.com;
> mtosatti@redhat.com; qemu-devel@nongnu.org; Bhushan Bharat-R65777;
> jan.kiszka@siemens.com
> Cc: Jason J. Herne
> Subject: [PATCH 4/7 v2] KVM regsync: Add register bitmap parameter to
> do_kvm_cpu_synchronize_state
> 
> From: "Jason J. Herne" <jjherne@us.ibm.com>
> 
> do_kvm_cpu_synchronize_state is called via run_on_cpu, so we can only pass
> a single argument.  Create SyncStateArgs struct for this purpose and add
> register bitmap data member to it.
> 
> Signed-off-by: Jason J. Herne <jjherne@us.ibm.com>
> Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  include/sysemu/kvm.h |    6 ++++++
>  kvm-all.c            |   27 +++++++++++++++++----------
>  2 files changed, 23 insertions(+), 10 deletions(-)
> 
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index e0738ba..193d1f4 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -223,6 +223,12 @@ int kvm_check_extension(KVMState *s, unsigned int
> extension);
> 
>  uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function,
>                                        uint32_t index, int reg);
> +
> +struct kvm_cpu_syncstate_args {
> +    CPUState *cpu;
> +    int regmap;
> +};
> +
>  void kvm_cpu_synchronize_state(CPUArchState *env);
>  void kvm_cpu_synchronize_post_reset(CPUArchState *env);
>  void kvm_cpu_synchronize_post_init(CPUArchState *env);
> diff --git a/kvm-all.c b/kvm-all.c
> index 1aa61bb..77ab72a 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -231,7 +231,7 @@ int kvm_init_vcpu(CPUArchState *env)
> 
>      cpu->kvm_fd = ret;
>      cpu->kvm_state = s;
> -    cpu->kvm_vcpu_dirty = true;
> +    cpu->kvm_vcpu_dirty = KVM_REGSYNC_FULL_STATE;
> 
>      mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>      if (mmap_size < 0) {
> @@ -1491,20 +1491,27 @@ void kvm_flush_coalesced_mmio_buffer(void)
> 
>  static void do_kvm_cpu_synchronize_state(void *arg)
>  {
> -    CPUState *cpu = arg;
> +    struct kvm_cpu_syncstate_args *args = arg;
> 
> -    if (!cpu->kvm_vcpu_dirty) {
> -        kvm_arch_get_registers(cpu, KVM_REGSYNC_FULL_STATE);
> -        cpu->kvm_vcpu_dirty = true;
> +    /* Do not sync regs that are already dirty */
> +    int regs_to_get = args->regmap & ~args->cpu->kvm_vcpu_dirty;
> +
> +    if (regs_to_get) {
> +        kvm_arch_get_registers(args->cpu, regs_to_get);
> +        args->cpu->kvm_vcpu_dirty |= regs_to_get;
>      }
>  }
> 
>  void kvm_cpu_synchronize_state(CPUArchState *env)
>  {
>      CPUState *cpu = ENV_GET_CPU(env);
> +    struct kvm_cpu_syncstate_args args;
> +
> +    args.cpu = cpu;
> +    args.regmap = KVM_REGSYNC_FULL_STATE;
> 
> -    if (!cpu->kvm_vcpu_dirty) {
> -        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
> +    if (args.regmap & ~cpu->kvm_vcpu_dirty) {
> +        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, &args);
>      }
>  }
> 
> @@ -1513,7 +1520,7 @@ void kvm_cpu_synchronize_post_reset(CPUArchState *env)
>      CPUState *cpu = ENV_GET_CPU(env);
> 
>      kvm_arch_put_registers(cpu, KVM_REGSYNC_RESET_STATE);
> -    cpu->kvm_vcpu_dirty = false;
> +    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RESET_STATE;
>  }
> 
>  void kvm_cpu_synchronize_post_init(CPUArchState *env)
> @@ -1521,7 +1528,7 @@ void kvm_cpu_synchronize_post_init(CPUArchState *env)
>      CPUState *cpu = ENV_GET_CPU(env);
> 
>      kvm_arch_put_registers(cpu, KVM_REGSYNC_FULL_STATE);
> -    cpu->kvm_vcpu_dirty = false;
> +    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_FULL_STATE;
>  }
> 
>  int kvm_cpu_exec(CPUArchState *env)
> @@ -1540,7 +1547,7 @@ int kvm_cpu_exec(CPUArchState *env)
>      do {
>          if (cpu->kvm_vcpu_dirty) {
>              kvm_arch_put_registers(cpu, KVM_REGSYNC_RUNTIME_STATE);

s/KVM_REGSYNC_RUNTIME_STATE/cpu->kvm_vcpu_dirty


> -            cpu->kvm_vcpu_dirty = false;
> +            cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RUNTIME_STATE;

cpu->kvm_vcpu_dirty = 0;

-Bharat

>          }
> 
>          kvm_arch_pre_run(cpu, run);
> --
> 1.7.9.5
>
Marcelo Tosatti - Jan. 16, 2013, 4:05 p.m.
On Thu, Jan 10, 2013 at 10:29:04AM -0500, Jason J. Herne wrote:
> From: "Jason J. Herne" <jjherne@us.ibm.com>
> 
> do_kvm_cpu_synchronize_state is called via run_on_cpu, so we can only pass
> a single argument.  Create SyncStateArgs struct for this purpose and add
> register bitmap data member to it.
> 
> Signed-off-by: Jason J. Herne <jjherne@us.ibm.com>
> Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  include/sysemu/kvm.h |    6 ++++++
>  kvm-all.c            |   27 +++++++++++++++++----------
>  2 files changed, 23 insertions(+), 10 deletions(-)
> 
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index e0738ba..193d1f4 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -223,6 +223,12 @@ int kvm_check_extension(KVMState *s, unsigned int extension);
>  
>  uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function,
>                                        uint32_t index, int reg);
> +
> +struct kvm_cpu_syncstate_args {
> +    CPUState *cpu;
> +    int regmap;
> +};
> +
>  void kvm_cpu_synchronize_state(CPUArchState *env);
>  void kvm_cpu_synchronize_post_reset(CPUArchState *env);
>  void kvm_cpu_synchronize_post_init(CPUArchState *env);
> diff --git a/kvm-all.c b/kvm-all.c
> index 1aa61bb..77ab72a 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -231,7 +231,7 @@ int kvm_init_vcpu(CPUArchState *env)
>  
>      cpu->kvm_fd = ret;
>      cpu->kvm_state = s;
> -    cpu->kvm_vcpu_dirty = true;
> +    cpu->kvm_vcpu_dirty = KVM_REGSYNC_FULL_STATE;
>  
>      mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>      if (mmap_size < 0) {
> @@ -1491,20 +1491,27 @@ void kvm_flush_coalesced_mmio_buffer(void)
>  
>  static void do_kvm_cpu_synchronize_state(void *arg)
>  {
> -    CPUState *cpu = arg;
> +    struct kvm_cpu_syncstate_args *args = arg;
>  
> -    if (!cpu->kvm_vcpu_dirty) {
> -        kvm_arch_get_registers(cpu, KVM_REGSYNC_FULL_STATE);
> -        cpu->kvm_vcpu_dirty = true;
> +    /* Do not sync regs that are already dirty */
> +    int regs_to_get = args->regmap & ~args->cpu->kvm_vcpu_dirty;
> +
> +    if (regs_to_get) {
> +        kvm_arch_get_registers(args->cpu, regs_to_get);
> +        args->cpu->kvm_vcpu_dirty |= regs_to_get;
>      }
>  }
>  
>  void kvm_cpu_synchronize_state(CPUArchState *env)
>  {
>      CPUState *cpu = ENV_GET_CPU(env);
> +    struct kvm_cpu_syncstate_args args;
> +
> +    args.cpu = cpu;
> +    args.regmap = KVM_REGSYNC_FULL_STATE;
>  
> -    if (!cpu->kvm_vcpu_dirty) {
> -        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
> +    if (args.regmap & ~cpu->kvm_vcpu_dirty) {
> +        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, &args);
>      }
>  }
>  
> @@ -1513,7 +1520,7 @@ void kvm_cpu_synchronize_post_reset(CPUArchState *env)
>      CPUState *cpu = ENV_GET_CPU(env);
>  
>      kvm_arch_put_registers(cpu, KVM_REGSYNC_RESET_STATE);
> -    cpu->kvm_vcpu_dirty = false;
> +    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RESET_STATE;
>  }
>  
>  void kvm_cpu_synchronize_post_init(CPUArchState *env)
> @@ -1521,7 +1528,7 @@ void kvm_cpu_synchronize_post_init(CPUArchState *env)
>      CPUState *cpu = ENV_GET_CPU(env);
>  
>      kvm_arch_put_registers(cpu, KVM_REGSYNC_FULL_STATE);
> -    cpu->kvm_vcpu_dirty = false;
> +    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_FULL_STATE;
>  }
>  
>  int kvm_cpu_exec(CPUArchState *env)
> @@ -1540,7 +1547,7 @@ int kvm_cpu_exec(CPUArchState *env)
>      do {
>          if (cpu->kvm_vcpu_dirty) {
>              kvm_arch_put_registers(cpu, KVM_REGSYNC_RUNTIME_STATE);
> -            cpu->kvm_vcpu_dirty = false;
> +            cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RUNTIME_STATE;
>          }

1) This implies a vcpu can enter guest mode with kvm_vcpu_dirty non-zero.

Unrelated:

2) Also, what is the reason for specifying sets of registers in
arch-specific code? Is that because it allows PPC to fix their sync-timer
register problem? 

When you are writing generic code, what does it mean to
use 'KVM_REGSYNC_{RUNTIME,RESET,FULL}_STATE' ?
Answer: it depends on the architecture.

3) On x86, kvm_arch_get_registers(GET_FULL) must not imply 
kvm_arch_put_registers(PUT_FULL).

The S/390 problem, from
http://lists.nongnu.org/archive/html/qemu-devel/2012-11/msg02213.html:

">>> The kvm register sync needs to happen in the kvm register sync
>>> function :)
>> That would eliminate the whole purpose of sync regs and forces us to
>> have an
>> expensive ioctl on lots of exits (again). I would prefer to sync the 
>> registers
>> that we never need in qemu just here.
> 
> That's why the register sync has different stages.

Not the get_register. Which is called on every synchronize_state. Which
happen 
quite often
on s390."

But wait: on these S/390 codepaths, you do GET_REGS already, via
cpu_synchronize_state.

So on S/390

- cpu_synchronize_state(env)
- read any register from env

Is not valid? This is what generic code assumes.


Bhushan Bharat, the PPC problem, can you describe it clearly: from what
i understood, an in-kernel register cannot be read/written back because
that register value can change in the meantime. When is it necessary 
to write it back? (there is a similar problem with TSC on x86, which 
is "fixed" by only writing TSC on FULL_STATE arch_put_registers).
Marcelo Tosatti - Jan. 16, 2013, 4:12 p.m.
On Wed, Jan 16, 2013 at 02:05:33PM -0200, Marcelo Tosatti wrote:
> On Thu, Jan 10, 2013 at 10:29:04AM -0500, Jason J. Herne wrote:
> > From: "Jason J. Herne" <jjherne@us.ibm.com>
> > 
> > do_kvm_cpu_synchronize_state is called via run_on_cpu, so we can only pass
> > a single argument.  Create SyncStateArgs struct for this purpose and add
> > register bitmap data member to it.
> > 
> > Signed-off-by: Jason J. Herne <jjherne@us.ibm.com>
> > Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
> > ---
> >  include/sysemu/kvm.h |    6 ++++++
> >  kvm-all.c            |   27 +++++++++++++++++----------
> >  2 files changed, 23 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> > index e0738ba..193d1f4 100644
> > --- a/include/sysemu/kvm.h
> > +++ b/include/sysemu/kvm.h
> > @@ -223,6 +223,12 @@ int kvm_check_extension(KVMState *s, unsigned int extension);
> >  
> >  uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function,
> >                                        uint32_t index, int reg);
> > +
> > +struct kvm_cpu_syncstate_args {
> > +    CPUState *cpu;
> > +    int regmap;
> > +};
> > +
> >  void kvm_cpu_synchronize_state(CPUArchState *env);
> >  void kvm_cpu_synchronize_post_reset(CPUArchState *env);
> >  void kvm_cpu_synchronize_post_init(CPUArchState *env);
> > diff --git a/kvm-all.c b/kvm-all.c
> > index 1aa61bb..77ab72a 100644
> > --- a/kvm-all.c
> > +++ b/kvm-all.c
> > @@ -231,7 +231,7 @@ int kvm_init_vcpu(CPUArchState *env)
> >  
> >      cpu->kvm_fd = ret;
> >      cpu->kvm_state = s;
> > -    cpu->kvm_vcpu_dirty = true;
> > +    cpu->kvm_vcpu_dirty = KVM_REGSYNC_FULL_STATE;
> >  
> >      mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
> >      if (mmap_size < 0) {
> > @@ -1491,20 +1491,27 @@ void kvm_flush_coalesced_mmio_buffer(void)
> >  
> >  static void do_kvm_cpu_synchronize_state(void *arg)
> >  {
> > -    CPUState *cpu = arg;
> > +    struct kvm_cpu_syncstate_args *args = arg;
> >  
> > -    if (!cpu->kvm_vcpu_dirty) {
> > -        kvm_arch_get_registers(cpu, KVM_REGSYNC_FULL_STATE);
> > -        cpu->kvm_vcpu_dirty = true;
> > +    /* Do not sync regs that are already dirty */
> > +    int regs_to_get = args->regmap & ~args->cpu->kvm_vcpu_dirty;
> > +
> > +    if (regs_to_get) {
> > +        kvm_arch_get_registers(args->cpu, regs_to_get);
> > +        args->cpu->kvm_vcpu_dirty |= regs_to_get;
> >      }
> >  }
> >  
> >  void kvm_cpu_synchronize_state(CPUArchState *env)
> >  {
> >      CPUState *cpu = ENV_GET_CPU(env);
> > +    struct kvm_cpu_syncstate_args args;
> > +
> > +    args.cpu = cpu;
> > +    args.regmap = KVM_REGSYNC_FULL_STATE;
> >  
> > -    if (!cpu->kvm_vcpu_dirty) {
> > -        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
> > +    if (args.regmap & ~cpu->kvm_vcpu_dirty) {
> > +        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, &args);
> >      }
> >  }
> >  
> > @@ -1513,7 +1520,7 @@ void kvm_cpu_synchronize_post_reset(CPUArchState *env)
> >      CPUState *cpu = ENV_GET_CPU(env);
> >  
> >      kvm_arch_put_registers(cpu, KVM_REGSYNC_RESET_STATE);
> > -    cpu->kvm_vcpu_dirty = false;
> > +    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RESET_STATE;
> >  }
> >  
> >  void kvm_cpu_synchronize_post_init(CPUArchState *env)
> > @@ -1521,7 +1528,7 @@ void kvm_cpu_synchronize_post_init(CPUArchState *env)
> >      CPUState *cpu = ENV_GET_CPU(env);
> >  
> >      kvm_arch_put_registers(cpu, KVM_REGSYNC_FULL_STATE);
> > -    cpu->kvm_vcpu_dirty = false;
> > +    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_FULL_STATE;
> >  }
> >  
> >  int kvm_cpu_exec(CPUArchState *env)
> > @@ -1540,7 +1547,7 @@ int kvm_cpu_exec(CPUArchState *env)
> >      do {
> >          if (cpu->kvm_vcpu_dirty) {
> >              kvm_arch_put_registers(cpu, KVM_REGSYNC_RUNTIME_STATE);
> > -            cpu->kvm_vcpu_dirty = false;
> > +            cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RUNTIME_STATE;
> >          }
> 
> 1) This implies a vcpu can enter guest mode with kvm_vcpu_dirty non-zero.
> 
> Unrelated:
> 
> 2) Also, what is the reason for specifying sets of registers in
> arch-specific code? Is that because it allows PPC to fix their sync-timer
> register problem? 
> 
> When you are writing generic code, what does it mean to
> use 'KVM_REGSYNC_{RUNTIME,RESET,FULL}_STATE' ?
> Answer: it depends on the architecture.
> 
> 3) On x86, kvm_arch_get_registers(GET_FULL) must not imply 
> kvm_arch_put_registers(PUT_FULL).
> 
> The S/390 problem, from
> http://lists.nongnu.org/archive/html/qemu-devel/2012-11/msg02213.html:
> 
> ">>> The kvm register sync needs to happen in the kvm register sync
> >>> function :)
> >> That would eliminate the whole purpose of sync regs and forces us to
> >> have an
> >> expensive ioctl on lots of exits (again). I would prefer to sync the 
> >> registers
> >> that we never need in qemu just here.
> > 
> > That's why the register sync has different stages.
> 
> Not the get_register. Which is called on every synchronize_state. Which
> happen 
> quite often
> on s390."
> 
> But wait: on these S/390 codepaths, you do GET_REGS already, via
> cpu_synchronize_state.
> 
> So on S/390
> 
> - cpu_synchronize_state(env)
> - read any register from env
> 
> Is not valid? This is what generic code assumes.

One possible alternative would be:

- Full synchronization on cpu_synchronize_state(env).
- Use of custom GET_REGS on these special codepaths which apparently
  are:
	- always vcpu local (no need for cpu_synchronize_state).
	- need a particular register read.
	- writeback nothing?

Is to read/write registers directly.
Bharat Bhushan - Jan. 16, 2013, 5 p.m.
> -----Original Message-----
> From: Marcelo Tosatti [mailto:mtosatti@redhat.com]
> Sent: Wednesday, January 16, 2013 9:36 PM
> To: Jason J. Herne
> Cc: agraf@suse.de; borntraeger@de.ibm.com; aliguori@us.ibm.com; qemu-
> devel@nongnu.org; Bhushan Bharat-R65777; jan.kiszka@siemens.com
> Subject: Re: [PATCH 4/7 v2] KVM regsync: Add register bitmap parameter to
> do_kvm_cpu_synchronize_state
> 
> On Thu, Jan 10, 2013 at 10:29:04AM -0500, Jason J. Herne wrote:
> > From: "Jason J. Herne" <jjherne@us.ibm.com>
> >
> > do_kvm_cpu_synchronize_state is called via run_on_cpu, so we can only
> > pass a single argument.  Create SyncStateArgs struct for this purpose
> > and add register bitmap data member to it.
> >
> > Signed-off-by: Jason J. Herne <jjherne@us.ibm.com>
> > Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
> > ---
> >  include/sysemu/kvm.h |    6 ++++++
> >  kvm-all.c            |   27 +++++++++++++++++----------
> >  2 files changed, 23 insertions(+), 10 deletions(-)
> >
> > diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index
> > e0738ba..193d1f4 100644
> > --- a/include/sysemu/kvm.h
> > +++ b/include/sysemu/kvm.h
> > @@ -223,6 +223,12 @@ int kvm_check_extension(KVMState *s, unsigned int
> > extension);
> >
> >  uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function,
> >                                        uint32_t index, int reg);
> > +
> > +struct kvm_cpu_syncstate_args {
> > +    CPUState *cpu;
> > +    int regmap;
> > +};
> > +
> >  void kvm_cpu_synchronize_state(CPUArchState *env);  void
> > kvm_cpu_synchronize_post_reset(CPUArchState *env);  void
> > kvm_cpu_synchronize_post_init(CPUArchState *env); diff --git
> > a/kvm-all.c b/kvm-all.c index 1aa61bb..77ab72a 100644
> > --- a/kvm-all.c
> > +++ b/kvm-all.c
> > @@ -231,7 +231,7 @@ int kvm_init_vcpu(CPUArchState *env)
> >
> >      cpu->kvm_fd = ret;
> >      cpu->kvm_state = s;
> > -    cpu->kvm_vcpu_dirty = true;
> > +    cpu->kvm_vcpu_dirty = KVM_REGSYNC_FULL_STATE;
> >
> >      mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
> >      if (mmap_size < 0) {
> > @@ -1491,20 +1491,27 @@ void kvm_flush_coalesced_mmio_buffer(void)
> >
> >  static void do_kvm_cpu_synchronize_state(void *arg)  {
> > -    CPUState *cpu = arg;
> > +    struct kvm_cpu_syncstate_args *args = arg;
> >
> > -    if (!cpu->kvm_vcpu_dirty) {
> > -        kvm_arch_get_registers(cpu, KVM_REGSYNC_FULL_STATE);
> > -        cpu->kvm_vcpu_dirty = true;
> > +    /* Do not sync regs that are already dirty */
> > +    int regs_to_get = args->regmap & ~args->cpu->kvm_vcpu_dirty;
> > +
> > +    if (regs_to_get) {
> > +        kvm_arch_get_registers(args->cpu, regs_to_get);
> > +        args->cpu->kvm_vcpu_dirty |= regs_to_get;
> >      }
> >  }
> >
> >  void kvm_cpu_synchronize_state(CPUArchState *env)  {
> >      CPUState *cpu = ENV_GET_CPU(env);
> > +    struct kvm_cpu_syncstate_args args;
> > +
> > +    args.cpu = cpu;
> > +    args.regmap = KVM_REGSYNC_FULL_STATE;
> >
> > -    if (!cpu->kvm_vcpu_dirty) {
> > -        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
> > +    if (args.regmap & ~cpu->kvm_vcpu_dirty) {
> > +        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, &args);
> >      }
> >  }
> >
> > @@ -1513,7 +1520,7 @@ void kvm_cpu_synchronize_post_reset(CPUArchState *env)
> >      CPUState *cpu = ENV_GET_CPU(env);
> >
> >      kvm_arch_put_registers(cpu, KVM_REGSYNC_RESET_STATE);
> > -    cpu->kvm_vcpu_dirty = false;
> > +    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RESET_STATE;
> >  }
> >
> >  void kvm_cpu_synchronize_post_init(CPUArchState *env) @@ -1521,7
> > +1528,7 @@ void kvm_cpu_synchronize_post_init(CPUArchState *env)
> >      CPUState *cpu = ENV_GET_CPU(env);
> >
> >      kvm_arch_put_registers(cpu, KVM_REGSYNC_FULL_STATE);
> > -    cpu->kvm_vcpu_dirty = false;
> > +    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_FULL_STATE;
> >  }
> >
> >  int kvm_cpu_exec(CPUArchState *env)
> > @@ -1540,7 +1547,7 @@ int kvm_cpu_exec(CPUArchState *env)
> >      do {
> >          if (cpu->kvm_vcpu_dirty) {
> >              kvm_arch_put_registers(cpu, KVM_REGSYNC_RUNTIME_STATE);
> > -            cpu->kvm_vcpu_dirty = false;
> > +            cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RUNTIME_STATE;
> >          }
> 
> 1) This implies a vcpu can enter guest mode with kvm_vcpu_dirty non-zero.

I think above code should be:
            kvm_arch_put_registers(cpu, cpu->kvm_vcpu_dirty);
            cpu->kvm_vcpu_dirty = false;

so vcpu will not enter guest state with dirty registers in qemu.

> 
> Unrelated:
> 
> 2) Also, what is the reason for specifying sets of registers in arch-specific
> code? Is that because it allows PPC to fix their sync-timer register problem?
> 
> When you are writing generic code, what does it mean to use
> 'KVM_REGSYNC_{RUNTIME,RESET,FULL}_STATE' ?
> Answer: it depends on the architecture.
> 
> 3) On x86, kvm_arch_get_registers(GET_FULL) must not imply
> kvm_arch_put_registers(PUT_FULL).
> 
> The S/390 problem, from
> http://lists.nongnu.org/archive/html/qemu-devel/2012-11/msg02213.html:
> 
> ">>> The kvm register sync needs to happen in the kvm register sync
> >>> function :)
> >> That would eliminate the whole purpose of sync regs and forces us to
> >> have an expensive ioctl on lots of exits (again). I would prefer to
> >> sync the registers that we never need in qemu just here.
> >
> > That's why the register sync has different stages.
> 
> Not the get_register. Which is called on every synchronize_state. Which happen
> quite often on s390."
> 
> But wait: on these S/390 codepaths, you do GET_REGS already, via
> cpu_synchronize_state.
> 
> So on S/390
> 
> - cpu_synchronize_state(env)
> - read any register from env
> 
> Is not valid? This is what generic code assumes.
> 
> 
> Bhushan Bharat, the PPC problem, can you describe it clearly: from what i
> understood, an in-kernel register cannot be read/written back because that
> register value can change in the meantime. When is it necessary to write it
> back? (there is a similar problem with TSC on x86, which is "fixed" by only
> writing TSC on FULL_STATE arch_put_registers).

There are two things:

First-)
For timer related changes on PowerPC, some registers needed to be changed from QEMU, so we have to get the registers via KVM_GET_SREGS and then set those registers back to KVM via KVM_SET_SREGS. cpu_synchronize_state() will get registers but kvm_arch_put_registers() works on level based mechanism and does not provide a good way of setting a register-set. So we wrote a separate function that will push these registers back to KVM and this also uses KVM_SET_SREGS ioctl. This solves what is needed for PPC.

Second-)
Currently kvm_arch_get_registers() is not optimized in two sense; one, it always get all registers from KVM; two, in kvm_arch_get_registers() it copies all registers to env->. This patch-set handles the second issue of optimization, copy only the requested registers to env-> in kvm_arch_get_registers(), plus when kvm_arch_put_registers() is called then it copies only the modified registers for KVM_SET_SREGS.

This optimization is looking good to me and allows sync of registers via one common kvm_arch_get/set_registers() and no separate function definition for setting is needed for timer related changes.

Thanks
-Bharat
Marcelo Tosatti - Jan. 16, 2013, 5:23 p.m.
On Wed, Jan 16, 2013 at 05:00:52PM +0000, Bhushan Bharat-R65777 wrote:
> I think above code should be:
>             kvm_arch_put_registers(cpu, cpu->kvm_vcpu_dirty);
>             cpu->kvm_vcpu_dirty = false;
> 
> so vcpu will not enter guest state with dirty registers in qemu.

Not so clear - currently PUT_FULL/PUT_RESET are performed on
pre-defined points.

> > Unrelated:
> > 
> > 2) Also, what is the reason for specifying sets of registers in arch-specific
> > code? Is that because it allows PPC to fix their sync-timer register problem?
> > 
> > When you are writing generic code, what does it mean to use
> > 'KVM_REGSYNC_{RUNTIME,RESET,FULL}_STATE' ?
> > Answer: it depends on the architecture.
> > 
> > 3) On x86, kvm_arch_get_registers(GET_FULL) must not imply
> > kvm_arch_put_registers(PUT_FULL).
> > 
> > The S/390 problem, from
> > http://lists.nongnu.org/archive/html/qemu-devel/2012-11/msg02213.html:
> > 
> > ">>> The kvm register sync needs to happen in the kvm register sync
> > >>> function :)
> > >> That would eliminate the whole purpose of sync regs and forces us to
> > >> have an expensive ioctl on lots of exits (again). I would prefer to
> > >> sync the registers that we never need in qemu just here.
> > >
> > > That's why the register sync has different stages.
> > 
> > Not the get_register. Which is called on every synchronize_state. Which happen
> > quite often on s390."
> > 
> > But wait: on these S/390 codepaths, you do GET_REGS already, via
> > cpu_synchronize_state.
> > 
> > So on S/390
> > 
> > - cpu_synchronize_state(env)
> > - read any register from env
> > 
> > Is not valid? This is what generic code assumes.
> > 
> > 
> > Bhushan Bharat, the PPC problem, can you describe it clearly: from what i
> > understood, an in-kernel register cannot be read/written back because that
> > register value can change in the meantime. When is it necessary to write it
> > back? (there is a similar problem with TSC on x86, which is "fixed" by only
> > writing TSC on FULL_STATE arch_put_registers).
> 
> There are two things:
> 
> First-)
> For timer related changes on PowerPC, some registers needed to be changed from QEMU, so we have to get the registers via KVM_GET_SREGS and then set those registers back to KVM via KVM_SET_SREGS. cpu_synchronize_state() will get registers but kvm_arch_put_registers() works on level based mechanism and does not provide a good way of setting a register-set. So we wrote a separate function that will push these registers back to KVM and this also uses KVM_SET_SREGS ioctl. This solves what is needed for PPC.

Can you describe the problem in detail? You must sync a particular
timer register only on special conditions, not during normal
cpu_synchronize_state() runs?

What register is that and why it cannot be synced normally? When is it
necessary to sync it?

> Second-)
> Currently kvm_arch_get_registers() is not optimized in two sense; one, it always get all registers from KVM; two, in kvm_arch_get_registers() it copies all registers to env->. This patch-set handles the second issue of optimization, copy only the requested registers to env-> in kvm_arch_get_registers(), plus when kvm_arch_put_registers() is called then it copies only the modified registers for KVM_SET_SREGS.
> 
> This optimization is looking good to me and allows sync of registers via one common kvm_arch_get/set_registers() and no separate function definition for setting is needed for timer related changes.

The problem with S390 is not clear to me (besides
cpu_synchronize_state must sync all state as mentioned).

> 
> Thanks
> -Bharat
>
Christian Borntraeger - Jan. 16, 2013, 8:03 p.m.
On 16/01/13 17:05, Marcelo Tosatti wrote:

> The S/390 problem, from
> http://lists.nongnu.org/archive/html/qemu-devel/2012-11/msg02213.html:
> 
> ">>> The kvm register sync needs to happen in the kvm register sync
>>>> function :)
>>> That would eliminate the whole purpose of sync regs and forces us to
>>> have an
>>> expensive ioctl on lots of exits (again). I would prefer to sync the 
>>> registers
>>> that we never need in qemu just here.
>>
>> That's why the register sync has different stages.
> 
> Not the get_register. Which is called on every synchronize_state. Which
> happen 
> quite often
> on s390."
> 
> But wait: on these S/390 codepaths, you do GET_REGS already, via
> cpu_synchronize_state.
> 
> So on S/390
> 
> - cpu_synchronize_state(env)
> - read any register from env
> 
> Is not valid? This is what generic code assumes.

TO recap the motiviation:

cpu_synchronize_state on s390 currently updates any register in env that is
used by qemu (general purpose, prefix, psw, control and access) in the normal
runtime. it turns out we have all of these regs in kvm_run, so we can do 
synchronize states without doing an additional ioctl call.
Now, for life migration and dump we need some additional registers (which are
only accessable via onereg interface). So synchronize_state would need to
do 3 or 4 additional system calls on the hot path, only to take care of 
something that is not on the hot path at all.
For historic reasons, we have one exit code for almost all exits. Therefore,
we need to call synchronize_states almost always.
We could now start to have a poor mans synchronize_state in arch code, but
that would collide with common code synchronize_state if done at the wrong
time. Thus we want to make common code capable of having only a subset of
the register synched - by making it possible to sync the other regs later
on if needed without wiping the former sync.

Makes sense?

Christian
Marcelo Tosatti - Jan. 16, 2013, 8:09 p.m.
On Wed, Jan 16, 2013 at 05:00:52PM +0000, Bhushan Bharat-R65777 wrote:
> Second-)
> Currently kvm_arch_get_registers() is not optimized in two sense; one, it always get all registers from KVM; two, in kvm_arch_get_registers() it copies all registers to env->. This patch-set handles the second issue of optimization, copy only the requested registers to env-> in kvm_arch_get_registers(), plus when kvm_arch_put_registers() is called then it copies only the modified registers for KVM_SET_SREGS.
> 
> This optimization is looking good to me and allows sync of registers via one common kvm_arch_get/set_registers() and no separate function definition for setting is needed for timer related changes.

To be clearer. For example, executing:

- kvm_get_regs(FULL_STATE)
- kvm_put_regs(FULL_STATE)

Is reading and writing the full register set while
in runtime state.

/* full state set, modified during initialization or on vmload */
#define KVM_PUT_FULL_STATE      3

Again, its necessary to read the full state on "cpu_synchronize_state" 
because its not specified which registers can be modified
during ioctl(KVM_RUN) (therefore separating get registers operation
in full/reset/runtime sets is awkward).

But direct read/write of registers is also bad: if a given register is
read and modified (but not written back), then reading through ioctl()
returns a wrong value.

Apparently accessors are the best option.
Marcelo Tosatti - Jan. 16, 2013, 8:21 p.m.
On Wed, Jan 16, 2013 at 09:03:20PM +0100, Christian Borntraeger wrote:
> On 16/01/13 17:05, Marcelo Tosatti wrote:
> 
> > The S/390 problem, from
> > http://lists.nongnu.org/archive/html/qemu-devel/2012-11/msg02213.html:
> > 
> > ">>> The kvm register sync needs to happen in the kvm register sync
> >>>> function :)
> >>> That would eliminate the whole purpose of sync regs and forces us to
> >>> have an
> >>> expensive ioctl on lots of exits (again). I would prefer to sync the 
> >>> registers
> >>> that we never need in qemu just here.
> >>
> >> That's why the register sync has different stages.
> > 
> > Not the get_register. Which is called on every synchronize_state. Which
> > happen 
> > quite often
> > on s390."
> > 
> > But wait: on these S/390 codepaths, you do GET_REGS already, via
> > cpu_synchronize_state.
> > 
> > So on S/390
> > 
> > - cpu_synchronize_state(env)
> > - read any register from env
> > 
> > Is not valid? This is what generic code assumes.
> 
> TO recap the motiviation:
> 
> cpu_synchronize_state on s390 currently updates any register in env that is
> used by qemu (general purpose, prefix, psw, control and access) in the normal
> runtime. it turns out we have all of these regs in kvm_run, so we can do 
> synchronize states without doing an additional ioctl call.
> Now, for life migration and dump we need some additional registers (which are
> only accessable via onereg interface). So synchronize_state would need to
> do 3 or 4 additional system calls on the hot path, only to take care of 
> something that is not on the hot path at all.
> For historic reasons, we have one exit code for almost all exits. Therefore,
> we need to call synchronize_states almost always.
> We could now start to have a poor mans synchronize_state in arch code, but
> that would collide with common code synchronize_state if done at the wrong
> time. Thus we want to make common code capable of having only a subset of
> the register synched - by making it possible to sync the other regs later
> on if needed without wiping the former sync.
> 
> Makes sense?
> 
> Christian

Yes. As noted in the last email on the thread, runtime/reset/full are to
serapate sets of registers when writing _to_ kernel. When reading _from_
kernel, reset and full distinctions are not appropriate (any register
can change, as far as knowledge goes).

Accessors for reading/writing shared (between userspace and kernel)
registers in CPUState is one option.
Christian Borntraeger - Jan. 16, 2013, 8:41 p.m.
On 16/01/13 21:21, Marcelo Tosatti wrote:
> On Wed, Jan 16, 2013 at 09:03:20PM +0100, Christian Borntraeger wrote:
>> On 16/01/13 17:05, Marcelo Tosatti wrote:
>>
>>> The S/390 problem, from
>>> http://lists.nongnu.org/archive/html/qemu-devel/2012-11/msg02213.html:
>>>
>>> ">>> The kvm register sync needs to happen in the kvm register sync
>>>>>> function :)
>>>>> That would eliminate the whole purpose of sync regs and forces us to
>>>>> have an
>>>>> expensive ioctl on lots of exits (again). I would prefer to sync the 
>>>>> registers
>>>>> that we never need in qemu just here.
>>>>
>>>> That's why the register sync has different stages.
>>>
>>> Not the get_register. Which is called on every synchronize_state. Which
>>> happen 
>>> quite often
>>> on s390."
>>>
>>> But wait: on these S/390 codepaths, you do GET_REGS already, via
>>> cpu_synchronize_state.
>>>
>>> So on S/390
>>>
>>> - cpu_synchronize_state(env)
>>> - read any register from env
>>>
>>> Is not valid? This is what generic code assumes.
>>
>> TO recap the motiviation:
>>
>> cpu_synchronize_state on s390 currently updates any register in env that is
>> used by qemu (general purpose, prefix, psw, control and access) in the normal
>> runtime. it turns out we have all of these regs in kvm_run, so we can do 
>> synchronize states without doing an additional ioctl call.
>> Now, for life migration and dump we need some additional registers (which are
>> only accessable via onereg interface). So synchronize_state would need to
>> do 3 or 4 additional system calls on the hot path, only to take care of 
>> something that is not on the hot path at all.
>> For historic reasons, we have one exit code for almost all exits. Therefore,
>> we need to call synchronize_states almost always.
>> We could now start to have a poor mans synchronize_state in arch code, but
>> that would collide with common code synchronize_state if done at the wrong
>> time. Thus we want to make common code capable of having only a subset of
>> the register synched - by making it possible to sync the other regs later
>> on if needed without wiping the former sync.
>>
>> Makes sense?
>>
>> Christian
> 
> Yes. As noted in the last email on the thread, runtime/reset/full are to
> serapate sets of registers when writing _to_ kernel. When reading _from_
> kernel, reset and full distinctions are not appropriate (any register
> can change, as far as knowledge goes).

Hmm, I probably did not understood your point, so I will try to explain mine
and see what you respond :-)

The point of the patch set, is to allow this distinction when reading. 
In other words it allows code to state: I am only interested in regxy and dont
care if the other regs in env are out of sync.
If a full sync is necessary later on the other regs are synched as well.
If a full sync was already done before a partial get becomes a no-op. 
Why should that be not possible.

Christian
Marcelo Tosatti - Jan. 16, 2013, 11:01 p.m.
On Wed, Jan 16, 2013 at 09:41:54PM +0100, Christian Borntraeger wrote:
> On 16/01/13 21:21, Marcelo Tosatti wrote:
> > On Wed, Jan 16, 2013 at 09:03:20PM +0100, Christian Borntraeger wrote:
> >> On 16/01/13 17:05, Marcelo Tosatti wrote:
> >>
> >>> The S/390 problem, from
> >>> http://lists.nongnu.org/archive/html/qemu-devel/2012-11/msg02213.html:
> >>>
> >>> ">>> The kvm register sync needs to happen in the kvm register sync
> >>>>>> function :)
> >>>>> That would eliminate the whole purpose of sync regs and forces us to
> >>>>> have an
> >>>>> expensive ioctl on lots of exits (again). I would prefer to sync the 
> >>>>> registers
> >>>>> that we never need in qemu just here.
> >>>>
> >>>> That's why the register sync has different stages.
> >>>
> >>> Not the get_register. Which is called on every synchronize_state. Which
> >>> happen 
> >>> quite often
> >>> on s390."
> >>>
> >>> But wait: on these S/390 codepaths, you do GET_REGS already, via
> >>> cpu_synchronize_state.
> >>>
> >>> So on S/390
> >>>
> >>> - cpu_synchronize_state(env)
> >>> - read any register from env
> >>>
> >>> Is not valid? This is what generic code assumes.
> >>
> >> TO recap the motiviation:
> >>
> >> cpu_synchronize_state on s390 currently updates any register in env that is
> >> used by qemu (general purpose, prefix, psw, control and access) in the normal
> >> runtime. it turns out we have all of these regs in kvm_run, so we can do 
> >> synchronize states without doing an additional ioctl call.
> >> Now, for life migration and dump we need some additional registers (which are
> >> only accessable via onereg interface). So synchronize_state would need to
> >> do 3 or 4 additional system calls on the hot path, only to take care of 
> >> something that is not on the hot path at all.
> >> For historic reasons, we have one exit code for almost all exits. Therefore,
> >> we need to call synchronize_states almost always.
> >> We could now start to have a poor mans synchronize_state in arch code, but
> >> that would collide with common code synchronize_state if done at the wrong
> >> time. Thus we want to make common code capable of having only a subset of
> >> the register synched - by making it possible to sync the other regs later
> >> on if needed without wiping the former sync.
> >>
> >> Makes sense?
> >>
> >> Christian
> > 
> > Yes. As noted in the last email on the thread, runtime/reset/full are to
> > serapate sets of registers when writing _to_ kernel. When reading _from_
> > kernel, reset and full distinctions are not appropriate (any register
> > can change, as far as knowledge goes).
> 
> Hmm, I probably did not understood your point, so I will try to explain mine
> and see what you respond :-)
> 
> The point of the patch set, is to allow this distinction when reading. 
> In other words it allows code to state: I am only interested in regxy and dont
> care if the other regs in env are out of sync.

Fine.

> If a full sync is necessary later on the other regs are synched as well.
> If a full sync was already done before a partial get becomes a no-op.

-> FULL is the set of registers written when loadvm/initialization is
performed.
-> RESET, a subset of full, is a set of registers written on SYSTEM
RESET.
-> RUNTIME, a subset of RESET, is a set of registers written during
RUNTIME.

To write both the RESET and FULL set of registers during runtime,
contradicts the description above for both RESET and FULL.

Two examples from i386:

    if (level == KVM_PUT_FULL_STATE) {
        /*
         * KVM is yet unable to synchronize TSC values of multiple VCPUs
         * on
         * writeback. Until this is fixed, we only write the offset to
         * SMP
         * guests after migration, desynchronizing the VCPUs, but
         * avoiding
         * huge jump-backs that would occur without any writeback at
         * all.
         */
	...
    }

And:

    /*
     * The following paravirtual MSRs have side effects on the guest or
     * are
     * too heavy for normal writeback. Limit them to reset or full state
     * updates.
     */

> Why should that be not possible.

It should, but separately from FULL/RESET/RUNTIME distinction.
This sequence

	get_regs(FULLSTATE)
	put_regs(FULLSTATE)

During runtime is not allowed. And only syncing the RUNTIME set of
registers during and leaving the FULL set of registers marked as
dirty is confusing also. 

So perhaps what you'd want is selective read/write of RUNTIME registers
as suggested.


Date: Fri, 4 Jan 2013 23:49:42 -0200
From: Marcelo Tosatti <mtosatti@redhat.com>
To: "Jason J. Herne" <jjherne@linux.vnet.ibm.com>
Cc: Alexander Graf <agraf@suse.de>,
	Bhushan Bharat-R65777 <R65777@freescale.com>,
	Christian Borntraeger <borntraeger@de.ibm.com>,
	Anthony Liguori <aliguori@us.ibm.com>,
	"qemu-devel@nongnu.org qemu-devel" <qemu-devel@nongnu.org>
Subject: Re: [Qemu-devel] [PATCH 7/7] KVM regsync: Fix
 do_kvm_cpu_synchronize_state data integrity issue

On Fri, Jan 04, 2013 at 10:25:45AM -0500, Jason J. Herne wrote:
> If I've followed the conversation correctly this is what needs to be done:
> 
> 1. Remove the level parameters from kvm_arch_get_registers and
> kvm_arch_put_registers.
> 
> 2. Add a new bitmap parameter to kvm_arch_get_registers and
> kvm_arch_put_registers.
> 
> 3. Define a bit that correlates to our current notion of "all
> runtime registers".  This bit, and all bits in this bitmap, would be
> architecture specific.
> 
> 4. Remove the cpustate->kvm_sync_dirty field.  Replace it with a
> bitmap that tracks which bits are dirty and need to be synced back
> to KVM-land.
> 
> 5. As we do today, we'll assume registers are dirty and turn on
> their corresponding bit in this new bitmap whenever we "get" the
> registers from KVM.
> 
> 6. Add other bits as needed on a case by case basis.
> 
> Does this seem to match what was discussed, and what we want to do?
> 
> 
> -- 
> -- Jason J. Herne (jjherne@linux.vnet.ibm.com)

The s390 change to use runtime_state is self-contained and could be
integrated now from my pov.

The suggestion was based on the observation that the ppc guys are trying 
to fix their timer problem and that should be a generic solution,
per-register granularity.

Perhaps something along the lines of 

init
all registers marked as cached (read/write accessors don't 
synchronize state)

Around ioctl(KVM_RUN) have:

if regs dirty
	writeback
r = ioctl(KVM_RUN)
mark runstate registers as not cached

Then

read_reg(x)
	if x not cached
		arch_get_regs(RUNTIME_STATE) (*)

write_reg(x, val)
	read_reg(x)
	cpustate->x = val;
	mark_dirty(x)

Which is basically the pattern used in KVM x86 (but instead of
ioctl(KVM_RUN) there is VMENTRY).

But talk is cheap, how that would work with RESET/FULL states
is unclear. Jan?

For those archs that need, of course, (*) would use ONE_REG or whatever
other interface. For x86, or those archs that are not converted, keep
using kvm_arch_get_regs/kvm_arch_put_regs.
Alexander Graf - Jan. 24, 2013, 12:40 p.m.
On 17.01.2013, at 00:01, Marcelo Tosatti wrote:

> On Wed, Jan 16, 2013 at 09:41:54PM +0100, Christian Borntraeger wrote:
>> On 16/01/13 21:21, Marcelo Tosatti wrote:
>>> On Wed, Jan 16, 2013 at 09:03:20PM +0100, Christian Borntraeger wrote:
>>>> On 16/01/13 17:05, Marcelo Tosatti wrote:
>>>> 
>>>>> The S/390 problem, from
>>>>> http://lists.nongnu.org/archive/html/qemu-devel/2012-11/msg02213.html:
>>>>> 
>>>>> ">>> The kvm register sync needs to happen in the kvm register sync
>>>>>>>> function :)
>>>>>>> That would eliminate the whole purpose of sync regs and forces us to
>>>>>>> have an
>>>>>>> expensive ioctl on lots of exits (again). I would prefer to sync the 
>>>>>>> registers
>>>>>>> that we never need in qemu just here.
>>>>>> 
>>>>>> That's why the register sync has different stages.
>>>>> 
>>>>> Not the get_register. Which is called on every synchronize_state. Which
>>>>> happen 
>>>>> quite often
>>>>> on s390."
>>>>> 
>>>>> But wait: on these S/390 codepaths, you do GET_REGS already, via
>>>>> cpu_synchronize_state.
>>>>> 
>>>>> So on S/390
>>>>> 
>>>>> - cpu_synchronize_state(env)
>>>>> - read any register from env
>>>>> 
>>>>> Is not valid? This is what generic code assumes.
>>>> 
>>>> TO recap the motiviation:
>>>> 
>>>> cpu_synchronize_state on s390 currently updates any register in env that is
>>>> used by qemu (general purpose, prefix, psw, control and access) in the normal
>>>> runtime. it turns out we have all of these regs in kvm_run, so we can do 
>>>> synchronize states without doing an additional ioctl call.
>>>> Now, for life migration and dump we need some additional registers (which are
>>>> only accessable via onereg interface). So synchronize_state would need to
>>>> do 3 or 4 additional system calls on the hot path, only to take care of 
>>>> something that is not on the hot path at all.
>>>> For historic reasons, we have one exit code for almost all exits. Therefore,
>>>> we need to call synchronize_states almost always.
>>>> We could now start to have a poor mans synchronize_state in arch code, but
>>>> that would collide with common code synchronize_state if done at the wrong
>>>> time. Thus we want to make common code capable of having only a subset of
>>>> the register synched - by making it possible to sync the other regs later
>>>> on if needed without wiping the former sync.
>>>> 
>>>> Makes sense?
>>>> 
>>>> Christian
>>> 
>>> Yes. As noted in the last email on the thread, runtime/reset/full are to
>>> serapate sets of registers when writing _to_ kernel. When reading _from_
>>> kernel, reset and full distinctions are not appropriate (any register
>>> can change, as far as knowledge goes).
>> 
>> Hmm, I probably did not understood your point, so I will try to explain mine
>> and see what you respond :-)
>> 
>> The point of the patch set, is to allow this distinction when reading. 
>> In other words it allows code to state: I am only interested in regxy and dont
>> care if the other regs in env are out of sync.
> 
> Fine.
> 
>> If a full sync is necessary later on the other regs are synched as well.
>> If a full sync was already done before a partial get becomes a no-op.
> 
> -> FULL is the set of registers written when loadvm/initialization is
> performed.
> -> RESET, a subset of full, is a set of registers written on SYSTEM
> RESET.
> -> RUNTIME, a subset of RESET, is a set of registers written during
> RUNTIME.
> 
> To write both the RESET and FULL set of registers during runtime,
> contradicts the description above for both RESET and FULL.
> 
> Two examples from i386:
> 
>    if (level == KVM_PUT_FULL_STATE) {
>        /*
>         * KVM is yet unable to synchronize TSC values of multiple VCPUs
>         * on
>         * writeback. Until this is fixed, we only write the offset to
>         * SMP
>         * guests after migration, desynchronizing the VCPUs, but
>         * avoiding
>         * huge jump-backs that would occur without any writeback at
>         * all.
>         */
> 	...
>    }
> 
> And:
> 
>    /*
>     * The following paravirtual MSRs have side effects on the guest or
>     * are
>     * too heavy for normal writeback. Limit them to reset or full state
>     * updates.
>     */
> 
>> Why should that be not possible.
> 
> It should, but separately from FULL/RESET/RUNTIME distinction.
> This sequence
> 
> 	get_regs(FULLSTATE)
> 	put_regs(FULLSTATE)
> 
> During runtime is not allowed. And only syncing the RUNTIME set of
> registers during and leaving the FULL set of registers marked as
> dirty is confusing also. 
> 
> So perhaps what you'd want is selective read/write of RUNTIME registers
> as suggested.
> 
> 
> Date: Fri, 4 Jan 2013 23:49:42 -0200
> From: Marcelo Tosatti <mtosatti@redhat.com>
> To: "Jason J. Herne" <jjherne@linux.vnet.ibm.com>
> Cc: Alexander Graf <agraf@suse.de>,
> 	Bhushan Bharat-R65777 <R65777@freescale.com>,
> 	Christian Borntraeger <borntraeger@de.ibm.com>,
> 	Anthony Liguori <aliguori@us.ibm.com>,
> 	"qemu-devel@nongnu.org qemu-devel" <qemu-devel@nongnu.org>
> Subject: Re: [Qemu-devel] [PATCH 7/7] KVM regsync: Fix
> do_kvm_cpu_synchronize_state data integrity issue
> 
> On Fri, Jan 04, 2013 at 10:25:45AM -0500, Jason J. Herne wrote:
>> If I've followed the conversation correctly this is what needs to be done:
>> 
>> 1. Remove the level parameters from kvm_arch_get_registers and
>> kvm_arch_put_registers.
>> 
>> 2. Add a new bitmap parameter to kvm_arch_get_registers and
>> kvm_arch_put_registers.
>> 
>> 3. Define a bit that correlates to our current notion of "all
>> runtime registers".  This bit, and all bits in this bitmap, would be
>> architecture specific.
>> 
>> 4. Remove the cpustate->kvm_sync_dirty field.  Replace it with a
>> bitmap that tracks which bits are dirty and need to be synced back
>> to KVM-land.
>> 
>> 5. As we do today, we'll assume registers are dirty and turn on
>> their corresponding bit in this new bitmap whenever we "get" the
>> registers from KVM.
>> 
>> 6. Add other bits as needed on a case by case basis.
>> 
>> Does this seem to match what was discussed, and what we want to do?
>> 
>> 
>> -- 
>> -- Jason J. Herne (jjherne@linux.vnet.ibm.com)
> 
> The s390 change to use runtime_state is self-contained and could be
> integrated now from my pov.
> 
> The suggestion was based on the observation that the ppc guys are trying 
> to fix their timer problem and that should be a generic solution,
> per-register granularity.
> 
> Perhaps something along the lines of 
> 
> init
> all registers marked as cached (read/write accessors don't 
> synchronize state)
> 
> Around ioctl(KVM_RUN) have:
> 
> if regs dirty
> 	writeback
> r = ioctl(KVM_RUN)
> mark runstate registers as not cached
> 
> Then
> 
> read_reg(x)
> 	if x not cached
> 		arch_get_regs(RUNTIME_STATE) (*)
> 
> write_reg(x, val)
> 	read_reg(x)
> 	cpustate->x = val;
> 	mark_dirty(x)
> 
> Which is basically the pattern used in KVM x86 (but instead of
> ioctl(KVM_RUN) there is VMENTRY).

But that would mean that any code in QEMU that accesses registers can't access env-> ,but instead needs to go through an accessor function. That's a lot of potential for subtile error, no?

I think for now the best choice for get_regs() would be to ignore the FULL/RESET bits and always keep the syncing as it happens today under the RUNTIME umbrella only. So all of get_regs() only checks for RUNTIME.

Whenever get_xxx() happens, a bit gets set for set_xxx(). Up to this point, only the RUNTIME bit is ever set, because that's what cpu_synchronize_registers() sets.

Then s390 can add special separate bits for "sync GPRs" and "sync CRs". SYNC_RUNTIME would include those bits. The kvm hypercall exit calls a new synchronize_registers() function with a parameter telling it to only sync GPRs. This marks GPRs dirty, but not RUNTIME. The set_registers() function in s390 specific code could handle this particular case specially.

That way everything's solved and scalable, no?


Alex
Alexander Graf - Jan. 24, 2013, 12:41 p.m.
On 16.01.2013, at 18:23, Marcelo Tosatti wrote:

> On Wed, Jan 16, 2013 at 05:00:52PM +0000, Bhushan Bharat-R65777 wrote:
>> I think above code should be:
>>            kvm_arch_put_registers(cpu, cpu->kvm_vcpu_dirty);
>>            cpu->kvm_vcpu_dirty = false;
>> 
>> so vcpu will not enter guest state with dirty registers in qemu.
> 
> Not so clear - currently PUT_FULL/PUT_RESET are performed on
> pre-defined points.
> 
>>> Unrelated:
>>> 
>>> 2) Also, what is the reason for specifying sets of registers in arch-specific
>>> code? Is that because it allows PPC to fix their sync-timer register problem?
>>> 
>>> When you are writing generic code, what does it mean to use
>>> 'KVM_REGSYNC_{RUNTIME,RESET,FULL}_STATE' ?
>>> Answer: it depends on the architecture.
>>> 
>>> 3) On x86, kvm_arch_get_registers(GET_FULL) must not imply
>>> kvm_arch_put_registers(PUT_FULL).
>>> 
>>> The S/390 problem, from
>>> http://lists.nongnu.org/archive/html/qemu-devel/2012-11/msg02213.html:
>>> 
>>> ">>> The kvm register sync needs to happen in the kvm register sync
>>>>>> function :)
>>>>> That would eliminate the whole purpose of sync regs and forces us to
>>>>> have an expensive ioctl on lots of exits (again). I would prefer to
>>>>> sync the registers that we never need in qemu just here.
>>>> 
>>>> That's why the register sync has different stages.
>>> 
>>> Not the get_register. Which is called on every synchronize_state. Which happen
>>> quite often on s390."
>>> 
>>> But wait: on these S/390 codepaths, you do GET_REGS already, via
>>> cpu_synchronize_state.
>>> 
>>> So on S/390
>>> 
>>> - cpu_synchronize_state(env)
>>> - read any register from env
>>> 
>>> Is not valid? This is what generic code assumes.
>>> 
>>> 
>>> Bhushan Bharat, the PPC problem, can you describe it clearly: from what i
>>> understood, an in-kernel register cannot be read/written back because that
>>> register value can change in the meantime. When is it necessary to write it
>>> back? (there is a similar problem with TSC on x86, which is "fixed" by only
>>> writing TSC on FULL_STATE arch_put_registers).
>> 
>> There are two things:
>> 
>> First-)
>> For timer related changes on PowerPC, some registers needed to be changed from QEMU, so we have to get the registers via KVM_GET_SREGS and then set those registers back to KVM via KVM_SET_SREGS. cpu_synchronize_state() will get registers but kvm_arch_put_registers() works on level based mechanism and does not provide a good way of setting a register-set. So we wrote a separate function that will push these registers back to KVM and this also uses KVM_SET_SREGS ioctl. This solves what is needed for PPC.
> 
> Can you describe the problem in detail? You must sync a particular
> timer register only on special conditions, not during normal
> cpu_synchronize_state() runs?

We basically have a "core interrupt pending" register. This register can be

  * written from kernel space when a timer expires
  * written from user space on reset
  * written from user space on watchdog expiry

> What register is that and why it cannot be synced normally? When is it
> necessary to sync it?

We need to sync it on the above 2 occasions.

Thinking about this a bit more, we're trying to keep the synchronization window short to not get into conflicts with the kernel timer kicking in in between. Imagine this race:

  * user space reads TSR
  * kernel timer expires, sets bit in TSR
  * user space writes TSR

That's why we don't want this to be synced every time. We would only set TSR when we reset the counter. At that point in time it doesn't hurt to lose the kernel timer set, because we cleared the bit anyways.

But maybe the better solution would be a special "write to clear" ONE_REG register to clear specific bits and a big hammer "set" ONE_REG (which we have already) for reset only.

That would make things easier, right? Scott, any ideas on this?


Alex
Scott Wood - Jan. 24, 2013, 6:17 p.m.
On 01/24/2013 06:41:05 AM, Alexander Graf wrote:
> 
> On 16.01.2013, at 18:23, Marcelo Tosatti wrote:
> 
> > What register is that and why it cannot be synced normally? When is  
> it
> > necessary to sync it?
> 
> We need to sync it on the above 2 occasions.
> 
> Thinking about this a bit more, we're trying to keep the  
> synchronization window short to not get into conflicts with the  
> kernel timer kicking in in between. Imagine this race:
> 
>   * user space reads TSR
>   * kernel timer expires, sets bit in TSR
>   * user space writes TSR
> 
> That's why we don't want this to be synced every time. We would only  
> set TSR when we reset the counter. At that point in time it doesn't  
> hurt to lose the kernel timer set, because we cleared the bit anyways.
> 
> But maybe the better solution would be a special "write to clear"  
> ONE_REG register to clear specific bits and a big hammer "set"  
> ONE_REG (which we have already) for reset only.
> 
> That would make things easier, right? Scott, any ideas on this?

Yes, we should have a ONE_REG write-to-clear TSR.  Maybe a write-to-set  
version as well, in case userspace wants to inject something.  Likewise  
for MCSR.

-Scott
Peter Maydell - Jan. 24, 2013, 6:22 p.m.
On 24 January 2013 18:17, Scott Wood <scottwood@freescale.com> wrote:
> On 01/24/2013 06:41:05 AM, Alexander Graf wrote:
>> But maybe the better solution would be a special "write to clear" ONE_REG
>> register to clear specific bits and a big hammer "set" ONE_REG (which we
>> have already) for reset only.
>>
>> That would make things easier, right? Scott, any ideas on this?

> Yes, we should have a ONE_REG write-to-clear TSR.  Maybe a write-to-set
> version as well, in case userspace wants to inject something.  Likewise for
> MCSR.

This would go against the principle we adopted for ARM ONE_REG,
which is that the "registers" accessed this way never have side
effects, so userspace could do a complete migration via
"get complete list of regs; get each reg in list; transfer
all values to remote end; remote end sets each reg".

Obviously PPC isn't bound to abide by the same ground rules
for the ioctl, but I figured I'd mention the principle so
you can decide if you like it :-)

-- PMM
Alexander Graf - Jan. 24, 2013, 6:25 p.m.
On 24.01.2013, at 19:22, Peter Maydell wrote:

> On 24 January 2013 18:17, Scott Wood <scottwood@freescale.com> wrote:
>> On 01/24/2013 06:41:05 AM, Alexander Graf wrote:
>>> But maybe the better solution would be a special "write to clear" ONE_REG
>>> register to clear specific bits and a big hammer "set" ONE_REG (which we
>>> have already) for reset only.
>>> 
>>> That would make things easier, right? Scott, any ideas on this?
> 
>> Yes, we should have a ONE_REG write-to-clear TSR.  Maybe a write-to-set
>> version as well, in case userspace wants to inject something.  Likewise for
>> MCSR.
> 
> This would go against the principle we adopted for ARM ONE_REG,
> which is that the "registers" accessed this way never have side
> effects, so userspace could do a complete migration via
> "get complete list of regs; get each reg in list; transfer
> all values to remote end; remote end sets each reg".
> 
> Obviously PPC isn't bound to abide by the same ground rules
> for the ioctl, but I figured I'd mention the principle so
> you can decide if you like it :-)

The "special" register access types wouldn't show up in the "get me all sync registers" list. There, we would get the full "read" and "write" accesses to TSR and TCR.


Alex
Alexander Graf - Jan. 24, 2013, 6:26 p.m.
On 24.01.2013, at 19:17, Scott Wood wrote:

> On 01/24/2013 06:41:05 AM, Alexander Graf wrote:
>> On 16.01.2013, at 18:23, Marcelo Tosatti wrote:
>> > What register is that and why it cannot be synced normally? When is it
>> > necessary to sync it?
>> We need to sync it on the above 2 occasions.
>> Thinking about this a bit more, we're trying to keep the synchronization window short to not get into conflicts with the kernel timer kicking in in between. Imagine this race:
>>  * user space reads TSR
>>  * kernel timer expires, sets bit in TSR
>>  * user space writes TSR
>> That's why we don't want this to be synced every time. We would only set TSR when we reset the counter. At that point in time it doesn't hurt to lose the kernel timer set, because we cleared the bit anyways.
>> But maybe the better solution would be a special "write to clear" ONE_REG register to clear specific bits and a big hammer "set" ONE_REG (which we have already) for reset only.
>> That would make things easier, right? Scott, any ideas on this?
> 
> Yes, we should have a ONE_REG write-to-clear TSR.  Maybe a write-to-set version as well, in case userspace wants to inject something.  Likewise for MCSR.

Bharat, could you please quickly write up a patch to do this?


Thanks!

Alex
Marcelo Tosatti - Jan. 24, 2013, 8:44 p.m.
On Thu, Jan 24, 2013 at 01:40:49PM +0100, Alexander Graf wrote:
> > read_reg(x)
> > 	if x not cached
> > 		arch_get_regs(RUNTIME_STATE) (*)
> > 
> > write_reg(x, val)
> > 	read_reg(x)
> > 	cpustate->x = val;
> > 	mark_dirty(x)
> > 
> > Which is basically the pattern used in KVM x86 (but instead of
> > ioctl(KVM_RUN) there is VMENTRY).
> 
> But that would mean that any code in QEMU that accesses registers can't access env-> ,but instead needs to go through an accessor function. That's a lot of potential for subtile error, no?

I do not see why. It has the potential to catch users of
env->reg which do not call cpu_synchronize_state().

> I think for now the best choice for get_regs() would be to ignore the FULL/RESET bits and always keep the syncing as it happens today under the RUNTIME umbrella only. So all of get_regs() only checks for RUNTIME.

Well the interface "kvm_arch_get_regs" is supposed to synchronize the
entire state ATM. So for example, "info registers" has

- cpu_synchronize_state()
- proceed assuming env-> is an uptodate copy of VCPU registers.

> Whenever get_xxx() happens, a bit gets set for set_xxx(). Up to this point, only the RUNTIME bit is ever set, because that's what cpu_synchronize_registers() sets.

There is no parameter to cpu_synchronize_registers().

> Then s390 can add special separate bits for "sync GPRs" and "sync CRs". SYNC_RUNTIME would include those bits. The kvm hypercall exit calls a new synchronize_registers() function with a parameter telling it to only sync GPRs. This marks GPRs dirty, but not RUNTIME. The set_registers() function in s390 specific code could handle this particular case specially.
> 
> That way everything's solved and scalable, no?

Yes, creating a new subset GPR which is part of RUNTIME is valid. 

S/390 not synchronizing the env-> copy of the FULL register set is still
a bug, though (because the FULL set is what "cpu_synchronize_state" with
no parameter implies).
Marcelo Tosatti - Jan. 25, 2013, 12:01 a.m.
On Thu, Jan 24, 2013 at 06:44:50PM -0200, Marcelo Tosatti wrote:
> On Thu, Jan 24, 2013 at 01:40:49PM +0100, Alexander Graf wrote:
> > > read_reg(x)
> > > 	if x not cached
> > > 		arch_get_regs(RUNTIME_STATE) (*)
> > > 
> > > write_reg(x, val)
> > > 	read_reg(x)
> > > 	cpustate->x = val;
> > > 	mark_dirty(x)
> > > 
> > > Which is basically the pattern used in KVM x86 (but instead of
> > > ioctl(KVM_RUN) there is VMENTRY).
> > 
> > But that would mean that any code in QEMU that accesses registers can't access env-> ,but instead needs to go through an accessor function. That's a lot of potential for subtile error, no?
> 
> I do not see why. It has the potential to catch users of
> env->reg which do not call cpu_synchronize_state().
> 
> > I think for now the best choice for get_regs() would be to ignore the FULL/RESET bits and always keep the syncing as it happens today under the RUNTIME umbrella only. So all of get_regs() only checks for RUNTIME.
> 
> Well the interface "kvm_arch_get_regs" is supposed to synchronize the
> entire state ATM. So for example, "info registers" has
> 
> - cpu_synchronize_state()
> - proceed assuming env-> is an uptodate copy of VCPU registers.
> 
> > Whenever get_xxx() happens, a bit gets set for set_xxx(). Up to this point, only the RUNTIME bit is ever set, because that's what cpu_synchronize_registers() sets.
> 
> There is no parameter to cpu_synchronize_registers().
> 
> > Then s390 can add special separate bits for "sync GPRs" and "sync CRs". SYNC_RUNTIME would include those bits. The kvm hypercall exit calls a new synchronize_registers() function with a parameter telling it to only sync GPRs. This marks GPRs dirty, but not RUNTIME. The set_registers() function in s390 specific code could handle this particular case specially.
> > 
> > That way everything's solved and scalable, no?
> 
> Yes, creating a new subset GPR which is part of RUNTIME is valid. 

Accessors though are more organized, the information of which code
accesses CPUState->env is then explicit. Whether to synchronize a given
register using eg. GET_SREGS (say GPR set) or GET_ONE_REG (individual
register) is up to architecture.

What 'subtle errors' are you thinking of?

It should be easy to convert as its greppable.

> S/390 not synchronizing the env-> copy of the FULL register set is still
> a bug, though (because the FULL set is what "cpu_synchronize_state" with
> no parameter implies).
Alexander Graf - Jan. 25, 2013, 12:26 a.m.
On 25.01.2013, at 01:01, Marcelo Tosatti wrote:

> On Thu, Jan 24, 2013 at 06:44:50PM -0200, Marcelo Tosatti wrote:
>> On Thu, Jan 24, 2013 at 01:40:49PM +0100, Alexander Graf wrote:
>>>> read_reg(x)
>>>> 	if x not cached
>>>> 		arch_get_regs(RUNTIME_STATE) (*)
>>>> 
>>>> write_reg(x, val)
>>>> 	read_reg(x)
>>>> 	cpustate->x = val;
>>>> 	mark_dirty(x)
>>>> 
>>>> Which is basically the pattern used in KVM x86 (but instead of
>>>> ioctl(KVM_RUN) there is VMENTRY).
>>> 
>>> But that would mean that any code in QEMU that accesses registers can't access env-> ,but instead needs to go through an accessor function. That's a lot of potential for subtile error, no?
>> 
>> I do not see why. It has the potential to catch users of
>> env->reg which do not call cpu_synchronize_state().
>> 
>>> I think for now the best choice for get_regs() would be to ignore the FULL/RESET bits and always keep the syncing as it happens today under the RUNTIME umbrella only. So all of get_regs() only checks for RUNTIME.
>> 
>> Well the interface "kvm_arch_get_regs" is supposed to synchronize the
>> entire state ATM. So for example, "info registers" has
>> 
>> - cpu_synchronize_state()
>> - proceed assuming env-> is an uptodate copy of VCPU registers.
>> 
>>> Whenever get_xxx() happens, a bit gets set for set_xxx(). Up to this point, only the RUNTIME bit is ever set, because that's what cpu_synchronize_registers() sets.
>> 
>> There is no parameter to cpu_synchronize_registers().
>> 
>>> Then s390 can add special separate bits for "sync GPRs" and "sync CRs". SYNC_RUNTIME would include those bits. The kvm hypercall exit calls a new synchronize_registers() function with a parameter telling it to only sync GPRs. This marks GPRs dirty, but not RUNTIME. The set_registers() function in s390 specific code could handle this particular case specially.
>>> 
>>> That way everything's solved and scalable, no?
>> 
>> Yes, creating a new subset GPR which is part of RUNTIME is valid. 
> 
> Accessors though are more organized, the information of which code
> accesses CPUState->env is then explicit. Whether to synchronize a given
> register using eg. GET_SREGS (say GPR set) or GET_ONE_REG (individual
> register) is up to architecture.
> 
> What 'subtle errors' are you thinking of?

I'm thinking of vmport for example, which lives in hw/ but still would need conversion to an accessor.

> It should be easy to convert as its greppable.

Only if the code in question makes it greppable. It could be using fancy env->spr[spr] = xxx; semantics instead.

But accessor functions are the second step to this. First we need infrastructure that lets us actually define which bits need to be synchronized. Then the next step could be to create accessor functions to remove use of cpu_synchronize_registers() or cpu_synchronize(REGS_GPRS).


Alex
Jason J. Herne - Jan. 28, 2013, 5:49 p.m.
On 01/24/2013 07:01 PM, Marcelo Tosatti wrote:
> On Thu, Jan 24, 2013 at 06:44:50PM -0200, Marcelo Tosatti wrote:
>
> What 'subtle errors' are you thinking of?
>
> It should be easy to convert as its greppable.
>
>> S/390 not synchronizing the env-> copy of the FULL register set is still
>> a bug, though (because the FULL set is what "cpu_synchronize_state" with
>> no parameter implies).
>
>
>

There seems to be a lot of good discussion going on regarding these 
changes.  Have we reached some kind of consensus on how we want to proceed?

If I understand the arguments being made... this patch set, as 
submitted, is wrong because KVM_ARCH_GET_REGS should always retrieve 
runtime regs only?  Or must it sync everything?  If there is not a clear 
answer here we'll need to decide this before going anywhere with this 
patch, yes?

Alex wrote:
"""
I think for now the best choice for get_regs() would be to ignore the 
FULL/RESET bits and always keep the syncing as it happens today under 
the RUNTIME umbrella only. So all of get_regs() only checks for RUNTIME.

Whenever get_xxx() happens, a bit gets set for set_xxx(). Up to this 
point, only the RUNTIME bit is ever set, because that's what 
cpu_synchronize_registers() sets.

Then s390 can add special separate bits for "sync GPRs" and "sync CRs". 
SYNC_RUNTIME would include those bits. The kvm hypercall exit calls a 
new synchronize_registers() function with a parameter telling it to only 
sync GPRs. This marks GPRs dirty, but not RUNTIME. The set_registers() 
function in s390 specific code could handle this particular case specially.

That way everything's solved and scalable, no?
"""

This is the most comprehensive suggestion I've been able to pick out of 
the discussion.  However, Marcelo has expressed concern over S390 not 
updating everything in env:

"""
S/390 not synchronizing the env-> copy of the FULL register set is still
a bug, though (because the FULL set is what "cpu_synchronize_state" with
no parameter implies).
"""

Also Alex, what did you mean by get_xxx() and set_xxx()?
Marcelo Tosatti - Jan. 29, 2013, 10:56 p.m.
On Mon, Jan 28, 2013 at 12:49:26PM -0500, Jason J. Herne wrote:
> On 01/24/2013 07:01 PM, Marcelo Tosatti wrote:
> >On Thu, Jan 24, 2013 at 06:44:50PM -0200, Marcelo Tosatti wrote:
> >
> >What 'subtle errors' are you thinking of?
> >
> >It should be easy to convert as its greppable.
> >
> >>S/390 not synchronizing the env-> copy of the FULL register set is still
> >>a bug, though (because the FULL set is what "cpu_synchronize_state" with
> >>no parameter implies).
> >
> >
> >
> 
> There seems to be a lot of good discussion going on regarding these
> changes.  Have we reached some kind of consensus on how we want to
> proceed?
> If I understand the arguments being made... this patch set, as
> submitted, is wrong because KVM_ARCH_GET_REGS should always retrieve
> runtime regs only?  

Because only registers on the runtime set should be written
(userspace->kernel direction) during runtime.

All registers must be read (kernel->userspace direction) on
cpu_synchronize_state(void).

> Or must it sync everything?  If there is not a
> clear answer here we'll need to decide this before going anywhere
> with this patch, yes?
> 
> Alex wrote:
> """
> I think for now the best choice for get_regs() would be to ignore
> the FULL/RESET bits and always keep the syncing as it happens today
> under the RUNTIME umbrella only. So all of get_regs() only checks
> for RUNTIME.

This makes this 

1. cpu_synchronize_state()
2. read env->env which is part of RESET or FULL set but not RUNTIME set

sequence invalid (such as 'info regs'). Perhaps on S/390 its not a
problem.

> Whenever get_xxx() happens, a bit gets set for set_xxx(). Up to this
> point, only the RUNTIME bit is ever set, because that's what
> cpu_synchronize_registers() sets.
> 
> Then s390 can add special separate bits for "sync GPRs" and "sync
> CRs". SYNC_RUNTIME would include those bits. The kvm hypercall exit
> calls a new synchronize_registers() function with a parameter
> telling it to only sync GPRs. This marks GPRs dirty, but not
> RUNTIME. The set_registers() function in s390 specific code could
> handle this particular case specially.
> 
> That way everything's solved and scalable, no?
> """
> 
> This is the most comprehensive suggestion I've been able to pick out
> of the discussion. 

This is fine.

> However, Marcelo has expressed concern over S390
> not updating everything in env:
> 
> """
> S/390 not synchronizing the env-> copy of the FULL register set is still
> a bug, though (because the FULL set is what "cpu_synchronize_state" with
> no parameter implies).
> """

Since cpu_synchronize_state() does not take a parameter, it is
expected to synchronize all registers (FULL/RESET/RUNTIME sets).

> 
> Also Alex, what did you mean by get_xxx() and set_xxx()?
> 
> -- 
> -- Jason J. Herne (jjherne@linux.vnet.ibm.com)
Jason J. Herne - Feb. 1, 2013, 3:47 p.m.
On 01/24/2013 07:40 AM, Alexander Graf wrote:
> I think for now the best choice for get_regs() would be to ignore the FULL/RESET bits and always keep the syncing as it happens today under the RUNTIME umbrella only. So all of get_regs() only checks for RUNTIME.
>
> Whenever get_xxx() happens, a bit gets set for set_xxx(). Up to this point, only the RUNTIME bit is ever set, because that's what cpu_synchronize_registers() sets.
>
> Then s390 can add special separate bits for "sync GPRs" and "sync CRs". SYNC_RUNTIME would include those bits. The kvm hypercall exit calls a new synchronize_registers() function with a parameter telling it to only sync GPRs. This marks GPRs dirty, but not RUNTIME. The set_registers() function in s390 specific code could handle this particular case specially.
>
> That way everything's solved and scalable, no?
>
> Alex
>

Ok, based on the discussions we've had I think I have a plan of attack 
based on Alex's above suggestion.  I believe it also satisfies the 
concerns Marcelo pointed out.  Please correct me if I'm wrong.

kvm_arch_get_registers() stays exactly as is for all architectures 
(reads RUNTIME state only). No new parameters.

Each architecture defines arch specific bits for runtime/reset/full states:
     #define KVM_REGSYNC_I386_RUNTIME_BIT  (1 << 1)
     #define KVM_REGSYNC_I386_RESET_BIT    (1 << 2)
     #define KVM_REGSYNC_I386_FULL_BIT     (1 << 3)

Each architecture defines generic bits (for use in platform agnostic 
code: kvm-all.c) for runtime/reset/full states:
     #define KVM_REGSYNC_RUNTIME_STATE    KVM_REGSYNC_I386_RUNTIME_BIT
     #define KVM_REGSYNC_RESET_STATE      KVM_REGSYNC_I386_RESET_BIT
     #define KVM_REGSYNC_FULL_STATE       KVM_REGSYNC_I386_FULL_BIT

S390: replace KVM_REGSYNC_S390_RUNTIME_BIT with two new bits so the S390 
arch specific bits look like this:
     #define KVM_REGSYNC_S390_RUNTIME_SOME_BIT  (1 << 1)
     #define KVM_REGSYNC_S390_RUNTIME_REST_BIT  (1 << 2)
     #define KVM_REGSYNC_S390_RESET_BIT         (1 << 3)
     #define KVM_REGSYNC_S390_FULL_BIT          (1 << 4)
The idea being that SOME represents the set of RUNTIME registers we 
always want to read when we exit from KVM. And REST represents the set 
of RUNTIME registers we want to read for migration/dump and potentially 
other special cases.  My understanding is that SOME and REST should be 
mutually exclusive.  I think they need better names as well :).

S390 defines it's generic bits like this:
     #define KVM_REGSYNC_RUNTIME_STATE
         (KVM_REGSYNC_S390_RUNTIME_SOME_BIT |
             KVM_REGSYNC_S390_RUNTIME_REST_BIT)
     #define KVM_REGSYNC_RESET_STATE      KVM_REGSYNC_S390_RESET_BIT
     #define KVM_REGSYNC_FULL_STATE        KVM_REGSYNC_S390_FULL_BIT

S390: A new function is created: s390_sync_partial_runtime_registers(int 
bitmap).  The bitmap argument indicates which of the SOME/REST register 
sets to read.  Either this new function or perhaps the caller will 
update the cpu->kvm_vcpu_dirty bitmap to indicate which regs are now dirty.

S390: On the hot paths we call 
s390_sync_partial_runtime_registers(KVM_REGSYNC_S390_RUNTIME_SOME_BIT) 
instead of cpu_synchronize_state() to read only the set of runtime 
registers we need on the hot path.  If at some later point 
cpu_synchronize_state() happens to be called then the S390 version of 
kvm_arch_get_registers() needs to be smart enough to avoid data loss. 
So we make it write back all dirty registers (env->kvm_vcpu_dirty) 
before getting anything.

I think this works.  Comments please and thank you!! :)
Jason J. Herne - Feb. 11, 2013, 3:12 p.m.
On 02/01/2013 10:47 AM, Jason J. Herne wrote:
> On 01/24/2013 07:40 AM, Alexander Graf wrote:
>> I think for now the best choice for get_regs() would be to ignore the
>> FULL/RESET bits and always keep the syncing as it happens today under
>> the RUNTIME umbrella only. So all of get_regs() only checks for RUNTIME.
>>
>> Whenever get_xxx() happens, a bit gets set for set_xxx(). Up to this
>> point, only the RUNTIME bit is ever set, because that's what
>> cpu_synchronize_registers() sets.
>>
>> Then s390 can add special separate bits for "sync GPRs" and "sync
>> CRs". SYNC_RUNTIME would include those bits. The kvm hypercall exit
>> calls a new synchronize_registers() function with a parameter telling
>> it to only sync GPRs. This marks GPRs dirty, but not RUNTIME. The
>> set_registers() function in s390 specific code could handle this
>> particular case specially.
>>
>> That way everything's solved and scalable, no?
>>
>> Alex
>>
>
> Ok, based on the discussions we've had I think I have a plan of attack
> based on Alex's above suggestion.  I believe it also satisfies the
> concerns Marcelo pointed out.  Please correct me if I'm wrong.
>
> kvm_arch_get_registers() stays exactly as is for all architectures
> (reads RUNTIME state only). No new parameters.
>
> Each architecture defines arch specific bits for runtime/reset/full states:
>      #define KVM_REGSYNC_I386_RUNTIME_BIT  (1 << 1)
>      #define KVM_REGSYNC_I386_RESET_BIT    (1 << 2)
>      #define KVM_REGSYNC_I386_FULL_BIT     (1 << 3)
>
> Each architecture defines generic bits (for use in platform agnostic
> code: kvm-all.c) for runtime/reset/full states:
>      #define KVM_REGSYNC_RUNTIME_STATE    KVM_REGSYNC_I386_RUNTIME_BIT
>      #define KVM_REGSYNC_RESET_STATE      KVM_REGSYNC_I386_RESET_BIT
>      #define KVM_REGSYNC_FULL_STATE       KVM_REGSYNC_I386_FULL_BIT
>
> S390: replace KVM_REGSYNC_S390_RUNTIME_BIT with two new bits so the S390
> arch specific bits look like this:
>      #define KVM_REGSYNC_S390_RUNTIME_SOME_BIT  (1 << 1)
>      #define KVM_REGSYNC_S390_RUNTIME_REST_BIT  (1 << 2)
>      #define KVM_REGSYNC_S390_RESET_BIT         (1 << 3)
>      #define KVM_REGSYNC_S390_FULL_BIT          (1 << 4)
> The idea being that SOME represents the set of RUNTIME registers we
> always want to read when we exit from KVM. And REST represents the set
> of RUNTIME registers we want to read for migration/dump and potentially
> other special cases.  My understanding is that SOME and REST should be
> mutually exclusive.  I think they need better names as well :).
>
> S390 defines it's generic bits like this:
>      #define KVM_REGSYNC_RUNTIME_STATE
>          (KVM_REGSYNC_S390_RUNTIME_SOME_BIT |
>              KVM_REGSYNC_S390_RUNTIME_REST_BIT)
>      #define KVM_REGSYNC_RESET_STATE      KVM_REGSYNC_S390_RESET_BIT
>      #define KVM_REGSYNC_FULL_STATE        KVM_REGSYNC_S390_FULL_BIT
>
> S390: A new function is created: s390_sync_partial_runtime_registers(int
> bitmap).  The bitmap argument indicates which of the SOME/REST register
> sets to read.  Either this new function or perhaps the caller will
> update the cpu->kvm_vcpu_dirty bitmap to indicate which regs are now dirty.
>
> S390: On the hot paths we call
> s390_sync_partial_runtime_registers(KVM_REGSYNC_S390_RUNTIME_SOME_BIT)
> instead of cpu_synchronize_state() to read only the set of runtime
> registers we need on the hot path.  If at some later point
> cpu_synchronize_state() happens to be called then the S390 version of
> kvm_arch_get_registers() needs to be smart enough to avoid data loss. So
> we make it write back all dirty registers (env->kvm_vcpu_dirty) before
> getting anything.
>
> I think this works.  Comments please and thank you!! :)
>

Ping. :)  Looking for comments on this idea.  Thanks for your time.
Marcelo Tosatti - Feb. 11, 2013, 10:49 p.m.
On Fri, Feb 01, 2013 at 10:47:37AM -0500, Jason J. Herne wrote:
> On 01/24/2013 07:40 AM, Alexander Graf wrote:
> >I think for now the best choice for get_regs() would be to ignore the FULL/RESET bits and always keep the syncing as it happens today under the RUNTIME umbrella only. So all of get_regs() only checks for RUNTIME.
> >
> >Whenever get_xxx() happens, a bit gets set for set_xxx(). Up to this point, only the RUNTIME bit is ever set, because that's what cpu_synchronize_registers() sets.
> >
> >Then s390 can add special separate bits for "sync GPRs" and "sync CRs". SYNC_RUNTIME would include those bits. The kvm hypercall exit calls a new synchronize_registers() function with a parameter telling it to only sync GPRs. This marks GPRs dirty, but not RUNTIME. The set_registers() function in s390 specific code could handle this particular case specially.
> >
> >That way everything's solved and scalable, no?
> >
> >Alex
> >
> 
> Ok, based on the discussions we've had I think I have a plan of
> attack based on Alex's above suggestion.  I believe it also
> satisfies the concerns Marcelo pointed out.  Please correct me if
> I'm wrong.
> 
> kvm_arch_get_registers() stays exactly as is for all architectures
> (reads RUNTIME state only). No new parameters.

kvm_arch_get_registers reads the entire state, ie. FULL state. 

> Each architecture defines arch specific bits for runtime/reset/full states:
>     #define KVM_REGSYNC_I386_RUNTIME_BIT  (1 << 1)
>     #define KVM_REGSYNC_I386_RESET_BIT    (1 << 2)
>     #define KVM_REGSYNC_I386_FULL_BIT     (1 << 3)
> 
> Each architecture defines generic bits (for use in platform agnostic
> code: kvm-all.c) for runtime/reset/full states:
>     #define KVM_REGSYNC_RUNTIME_STATE    KVM_REGSYNC_I386_RUNTIME_BIT
>     #define KVM_REGSYNC_RESET_STATE      KVM_REGSYNC_I386_RESET_BIT
>     #define KVM_REGSYNC_FULL_STATE       KVM_REGSYNC_I386_FULL_BIT
> 
> S390: replace KVM_REGSYNC_S390_RUNTIME_BIT with two new bits so the
> S390 arch specific bits look like this:
>     #define KVM_REGSYNC_S390_RUNTIME_SOME_BIT  (1 << 1)
>     #define KVM_REGSYNC_S390_RUNTIME_REST_BIT  (1 << 2)
>     #define KVM_REGSYNC_S390_RESET_BIT         (1 << 3)
>     #define KVM_REGSYNC_S390_FULL_BIT          (1 << 4)
> The idea being that SOME represents the set of RUNTIME registers we
> always want to read when we exit from KVM.
>
> And REST represents the
> set of RUNTIME registers we want to read for migration/dump and
> potentially other special cases.  My understanding is that SOME and
> REST should be mutually exclusive.  I think they need better names
> as well :).
> 
> S390 defines it's generic bits like this:
>     #define KVM_REGSYNC_RUNTIME_STATE
>         (KVM_REGSYNC_S390_RUNTIME_SOME_BIT |
>             KVM_REGSYNC_S390_RUNTIME_REST_BIT)
>     #define KVM_REGSYNC_RESET_STATE      KVM_REGSYNC_S390_RESET_BIT
>     #define KVM_REGSYNC_FULL_STATE        KVM_REGSYNC_S390_FULL_BIT
> 
> S390: A new function is created:
> s390_sync_partial_runtime_registers(int bitmap).  The bitmap
> argument indicates which of the SOME/REST register sets to read.
> Either this new function or perhaps the caller will update the
> cpu->kvm_vcpu_dirty bitmap to indicate which regs are now dirty.
> 
> S390: On the hot paths we call s390_sync_partial_runtime_registers(KVM_REGSYNC_S390_RUNTIME_SOME_BIT)
> instead of cpu_synchronize_state() to read only the set of runtime
> registers we need on the hot path.  If at some later point
> cpu_synchronize_state() happens to be called then the S390 version
> of kvm_arch_get_registers() needs to be smart enough to avoid data
> loss. So we make it write back all dirty registers
> (env->kvm_vcpu_dirty) before getting anything.
> 
> I think this works.  Comments please and thank you!! :)

The idea behind s390_sync_partial_runtime_registers was that no generic
modifications have to be done. This (containment of the modifications 
to S/390) is possible if: 

1) The hot paths in question are vcpu local. That is, only executed in
VCPU context. Which seems to be the case because these hot paths are hot 
because they are VM-exits handled in userspace.

Can you confirm this?

Because frankly, i dislike splitting the register sets without adding accessors
(so thinking is, either go all the way and have an interface which is
difficult to make mistakes with or contain the register splitting 
changes to S/390).

But, the original author of this interface is Jan Kiskza, so he
should be consulted.
Jan Kiszka - Feb. 13, 2013, 11:39 a.m.
On 2013-02-11 23:49, Marcelo Tosatti wrote:
> On Fri, Feb 01, 2013 at 10:47:37AM -0500, Jason J. Herne wrote:
>> On 01/24/2013 07:40 AM, Alexander Graf wrote:
>>> I think for now the best choice for get_regs() would be to ignore the FULL/RESET bits and always keep the syncing as it happens today under the RUNTIME umbrella only. So all of get_regs() only checks for RUNTIME.
>>>
>>> Whenever get_xxx() happens, a bit gets set for set_xxx(). Up to this point, only the RUNTIME bit is ever set, because that's what cpu_synchronize_registers() sets.
>>>
>>> Then s390 can add special separate bits for "sync GPRs" and "sync CRs". SYNC_RUNTIME would include those bits. The kvm hypercall exit calls a new synchronize_registers() function with a parameter telling it to only sync GPRs. This marks GPRs dirty, but not RUNTIME. The set_registers() function in s390 specific code could handle this particular case specially.
>>>
>>> That way everything's solved and scalable, no?
>>>
>>> Alex
>>>
>>
>> Ok, based on the discussions we've had I think I have a plan of
>> attack based on Alex's above suggestion.  I believe it also
>> satisfies the concerns Marcelo pointed out.  Please correct me if
>> I'm wrong.
>>
>> kvm_arch_get_registers() stays exactly as is for all architectures
>> (reads RUNTIME state only). No new parameters.
> 
> kvm_arch_get_registers reads the entire state, ie. FULL state. 
> 
>> Each architecture defines arch specific bits for runtime/reset/full states:
>>     #define KVM_REGSYNC_I386_RUNTIME_BIT  (1 << 1)
>>     #define KVM_REGSYNC_I386_RESET_BIT    (1 << 2)
>>     #define KVM_REGSYNC_I386_FULL_BIT     (1 << 3)


These states remain levels, i.e. are cumulative: RESET implies RUNTIME,
FULL implies RESET+RUNTIME. The encoding shall express this.

>>
>> Each architecture defines generic bits (for use in platform agnostic
>> code: kvm-all.c) for runtime/reset/full states:
>>     #define KVM_REGSYNC_RUNTIME_STATE    KVM_REGSYNC_I386_RUNTIME_BIT
>>     #define KVM_REGSYNC_RESET_STATE      KVM_REGSYNC_I386_RESET_BIT
>>     #define KVM_REGSYNC_FULL_STATE       KVM_REGSYNC_I386_FULL_BIT
>>
>> S390: replace KVM_REGSYNC_S390_RUNTIME_BIT with two new bits so the
>> S390 arch specific bits look like this:
>>     #define KVM_REGSYNC_S390_RUNTIME_SOME_BIT  (1 << 1)
>>     #define KVM_REGSYNC_S390_RUNTIME_REST_BIT  (1 << 2)
>>     #define KVM_REGSYNC_S390_RESET_BIT         (1 << 3)
>>     #define KVM_REGSYNC_S390_FULL_BIT          (1 << 4)
>> The idea being that SOME represents the set of RUNTIME registers we
>> always want to read when we exit from KVM.

...and only do s390-specific stuff with the registers. At the point
generic code starts to access the state, you must read the FULL state as
that is what generic bits can assume today.

>>
>> And REST represents the
>> set of RUNTIME registers we want to read for migration/dump and
>> potentially other special cases.  My understanding is that SOME and
>> REST should be mutually exclusive.  I think they need better names
>> as well :).
>>
>> S390 defines it's generic bits like this:
>>     #define KVM_REGSYNC_RUNTIME_STATE
>>         (KVM_REGSYNC_S390_RUNTIME_SOME_BIT |
>>             KVM_REGSYNC_S390_RUNTIME_REST_BIT)
>>     #define KVM_REGSYNC_RESET_STATE      KVM_REGSYNC_S390_RESET_BIT
>>     #define KVM_REGSYNC_FULL_STATE        KVM_REGSYNC_S390_FULL_BIT
>>
>> S390: A new function is created:
>> s390_sync_partial_runtime_registers(int bitmap).  The bitmap
>> argument indicates which of the SOME/REST register sets to read.
>> Either this new function or perhaps the caller will update the
>> cpu->kvm_vcpu_dirty bitmap to indicate which regs are now dirty.
>>
>> S390: On the hot paths we call s390_sync_partial_runtime_registers(KVM_REGSYNC_S390_RUNTIME_SOME_BIT)
>> instead of cpu_synchronize_state() to read only the set of runtime
>> registers we need on the hot path.  If at some later point
>> cpu_synchronize_state() happens to be called then the S390 version
>> of kvm_arch_get_registers() needs to be smart enough to avoid data
>> loss. So we make it write back all dirty registers
>> (env->kvm_vcpu_dirty) before getting anything.
>>
>> I think this works.  Comments please and thank you!! :)
> 
> The idea behind s390_sync_partial_runtime_registers was that no generic
> modifications have to be done. This (containment of the modifications 
> to S/390) is possible if: 
> 
> 1) The hot paths in question are vcpu local. That is, only executed in
> VCPU context. Which seems to be the case because these hot paths are hot 
> because they are VM-exits handled in userspace.
> 
> Can you confirm this?
> 
> Because frankly, i dislike splitting the register sets without adding accessors
> (so thinking is, either go all the way and have an interface which is
> difficult to make mistakes with or contain the register splitting 
> changes to S/390).
> 
> But, the original author of this interface is Jan Kiskza, so he
> should be consulted.

Yes, until we have a generic and clean solution for all archs, such a
read split-up should remain s390-local.

Jan

Patch

diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index e0738ba..193d1f4 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -223,6 +223,12 @@  int kvm_check_extension(KVMState *s, unsigned int extension);
 
 uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function,
                                       uint32_t index, int reg);
+
+struct kvm_cpu_syncstate_args {
+    CPUState *cpu;
+    int regmap;
+};
+
 void kvm_cpu_synchronize_state(CPUArchState *env);
 void kvm_cpu_synchronize_post_reset(CPUArchState *env);
 void kvm_cpu_synchronize_post_init(CPUArchState *env);
diff --git a/kvm-all.c b/kvm-all.c
index 1aa61bb..77ab72a 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -231,7 +231,7 @@  int kvm_init_vcpu(CPUArchState *env)
 
     cpu->kvm_fd = ret;
     cpu->kvm_state = s;
-    cpu->kvm_vcpu_dirty = true;
+    cpu->kvm_vcpu_dirty = KVM_REGSYNC_FULL_STATE;
 
     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
     if (mmap_size < 0) {
@@ -1491,20 +1491,27 @@  void kvm_flush_coalesced_mmio_buffer(void)
 
 static void do_kvm_cpu_synchronize_state(void *arg)
 {
-    CPUState *cpu = arg;
+    struct kvm_cpu_syncstate_args *args = arg;
 
-    if (!cpu->kvm_vcpu_dirty) {
-        kvm_arch_get_registers(cpu, KVM_REGSYNC_FULL_STATE);
-        cpu->kvm_vcpu_dirty = true;
+    /* Do not sync regs that are already dirty */
+    int regs_to_get = args->regmap & ~args->cpu->kvm_vcpu_dirty;
+
+    if (regs_to_get) {
+        kvm_arch_get_registers(args->cpu, regs_to_get);
+        args->cpu->kvm_vcpu_dirty |= regs_to_get;
     }
 }
 
 void kvm_cpu_synchronize_state(CPUArchState *env)
 {
     CPUState *cpu = ENV_GET_CPU(env);
+    struct kvm_cpu_syncstate_args args;
+
+    args.cpu = cpu;
+    args.regmap = KVM_REGSYNC_FULL_STATE;
 
-    if (!cpu->kvm_vcpu_dirty) {
-        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
+    if (args.regmap & ~cpu->kvm_vcpu_dirty) {
+        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, &args);
     }
 }
 
@@ -1513,7 +1520,7 @@  void kvm_cpu_synchronize_post_reset(CPUArchState *env)
     CPUState *cpu = ENV_GET_CPU(env);
 
     kvm_arch_put_registers(cpu, KVM_REGSYNC_RESET_STATE);
-    cpu->kvm_vcpu_dirty = false;
+    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RESET_STATE;
 }
 
 void kvm_cpu_synchronize_post_init(CPUArchState *env)
@@ -1521,7 +1528,7 @@  void kvm_cpu_synchronize_post_init(CPUArchState *env)
     CPUState *cpu = ENV_GET_CPU(env);
 
     kvm_arch_put_registers(cpu, KVM_REGSYNC_FULL_STATE);
-    cpu->kvm_vcpu_dirty = false;
+    cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_FULL_STATE;
 }
 
 int kvm_cpu_exec(CPUArchState *env)
@@ -1540,7 +1547,7 @@  int kvm_cpu_exec(CPUArchState *env)
     do {
         if (cpu->kvm_vcpu_dirty) {
             kvm_arch_put_registers(cpu, KVM_REGSYNC_RUNTIME_STATE);
-            cpu->kvm_vcpu_dirty = false;
+            cpu->kvm_vcpu_dirty &= ~KVM_REGSYNC_RUNTIME_STATE;
         }
 
         kvm_arch_pre_run(cpu, run);