Patchwork [26/35] kvm: Eliminate KVMState arguments

login
register
mail settings
Submitter Marcelo Tosatti
Date Jan. 6, 2011, 5:56 p.m.
Message ID <ac450b882064664c79ae02e095155675ba560e88.1294336601.git.mtosatti@redhat.com>
Download mbox | patch
Permalink /patch/77786/
State New
Headers show

Comments

Marcelo Tosatti - Jan. 6, 2011, 5:56 p.m.
From: Jan Kiszka <jan.kiszka@siemens.com>

QEMU supports only one VM, so there is only one kvm_state per process,
and we gain nothing passing a reference to it around. Eliminate any need
to refer to it outside of kvm-all.c.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
CC: Alexander Graf <agraf@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 cpu-defs.h            |    2 -
 kvm-all.c             |  232 +++++++++++++++++++++----------------------------
 kvm-stub.c            |    2 +-
 kvm.h                 |   15 +--
 target-i386/cpuid.c   |    9 +-
 target-i386/kvm.c     |   77 ++++++++--------
 target-i386/kvm_x86.h |    3 +
 target-ppc/kvm.c      |   12 ++--
 target-s390x/kvm.c    |    8 +--
 9 files changed, 160 insertions(+), 200 deletions(-)
Anthony Liguori - Jan. 6, 2011, 7:24 p.m.
On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
> From: Jan Kiszka<jan.kiszka@siemens.com>
>
> QEMU supports only one VM, so there is only one kvm_state per process,
> and we gain nothing passing a reference to it around. Eliminate any need
> to refer to it outside of kvm-all.c.
>
> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
> CC: Alexander Graf<agraf@suse.de>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>    

I think this is a big mistake.

Having to manage kvm_state keeps the abstraction lines well defined.  
Otherwise, it's far too easy for portions of code to call into KVM 
functions that really shouldn't.

Regards,

Anthony Liguori

> ---
>   cpu-defs.h            |    2 -
>   kvm-all.c             |  232 +++++++++++++++++++++----------------------------
>   kvm-stub.c            |    2 +-
>   kvm.h                 |   15 +--
>   target-i386/cpuid.c   |    9 +-
>   target-i386/kvm.c     |   77 ++++++++--------
>   target-i386/kvm_x86.h |    3 +
>   target-ppc/kvm.c      |   12 ++--
>   target-s390x/kvm.c    |    8 +--
>   9 files changed, 160 insertions(+), 200 deletions(-)
>
> diff --git a/cpu-defs.h b/cpu-defs.h
> index 8d4bf86..0e04239 100644
> --- a/cpu-defs.h
> +++ b/cpu-defs.h
> @@ -131,7 +131,6 @@ typedef struct icount_decr_u16 {
>   #endif
>
>   struct kvm_run;
> -struct KVMState;
>   struct qemu_work_item;
>
>   typedef struct CPUBreakpoint {
> @@ -207,7 +206,6 @@ typedef struct CPUWatchpoint {
>       struct QemuCond *halt_cond;                                         \
>       struct qemu_work_item *queued_work_first, *queued_work_last;        \
>       const char *cpu_model_str;                                          \
> -    struct KVMState *kvm_state;                                         \
>       struct kvm_run *kvm_run;                                            \
>       int kvm_fd;                                                         \
>       int kvm_vcpu_dirty;
> diff --git a/kvm-all.c b/kvm-all.c
> index ef2ca3b..d8820c7 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -52,8 +52,7 @@ typedef struct KVMSlot
>
>   typedef struct kvm_dirty_log KVMDirtyLog;
>
> -struct KVMState
> -{
> +static struct KVMState {
>       KVMSlot slots[32];
>       int fd;
>       int vmfd;
> @@ -72,21 +71,19 @@ struct KVMState
>       int irqchip_in_kernel;
>       int pit_in_kernel;
>       int xsave, xcrs;
> -};
> -
> -static KVMState *kvm_state;
> +} kvm_state;
>
> -static KVMSlot *kvm_alloc_slot(KVMState *s)
> +static KVMSlot *kvm_alloc_slot(void)
>   {
>       int i;
>
> -    for (i = 0; i<  ARRAY_SIZE(s->slots); i++) {
> +    for (i = 0; i<  ARRAY_SIZE(kvm_state.slots); i++) {
>           /* KVM private memory slots */
>           if (i>= 8&&  i<  12) {
>               continue;
>           }
> -        if (s->slots[i].memory_size == 0) {
> -            return&s->slots[i];
> +        if (kvm_state.slots[i].memory_size == 0) {
> +            return&kvm_state.slots[i];
>           }
>       }
>
> @@ -94,14 +91,13 @@ static KVMSlot *kvm_alloc_slot(KVMState *s)
>       abort();
>   }
>
> -static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
> -                                         target_phys_addr_t start_addr,
> +static KVMSlot *kvm_lookup_matching_slot(target_phys_addr_t start_addr,
>                                            target_phys_addr_t end_addr)
>   {
>       int i;
>
> -    for (i = 0; i<  ARRAY_SIZE(s->slots); i++) {
> -        KVMSlot *mem =&s->slots[i];
> +    for (i = 0; i<  ARRAY_SIZE(kvm_state.slots); i++) {
> +        KVMSlot *mem =&kvm_state.slots[i];
>
>           if (start_addr == mem->start_addr&&
>               end_addr == mem->start_addr + mem->memory_size) {
> @@ -115,15 +111,14 @@ static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
>   /*
>    * Find overlapping slot with lowest start address
>    */
> -static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
> -                                            target_phys_addr_t start_addr,
> +static KVMSlot *kvm_lookup_overlapping_slot(target_phys_addr_t start_addr,
>                                               target_phys_addr_t end_addr)
>   {
>       KVMSlot *found = NULL;
>       int i;
>
> -    for (i = 0; i<  ARRAY_SIZE(s->slots); i++) {
> -        KVMSlot *mem =&s->slots[i];
> +    for (i = 0; i<  ARRAY_SIZE(kvm_state.slots); i++) {
> +        KVMSlot *mem =&kvm_state.slots[i];
>
>           if (mem->memory_size == 0 ||
>               (found&&  found->start_addr<  mem->start_addr)) {
> @@ -139,13 +134,13 @@ static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
>       return found;
>   }
>
> -int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
> +int kvm_physical_memory_addr_from_ram(ram_addr_t ram_addr,
>                                         target_phys_addr_t *phys_addr)
>   {
>       int i;
>
> -    for (i = 0; i<  ARRAY_SIZE(s->slots); i++) {
> -        KVMSlot *mem =&s->slots[i];
> +    for (i = 0; i<  ARRAY_SIZE(kvm_state.slots); i++) {
> +        KVMSlot *mem =&kvm_state.slots[i];
>
>           if (ram_addr>= mem->phys_offset&&
>               ram_addr<  mem->phys_offset + mem->memory_size) {
> @@ -157,7 +152,7 @@ int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
>       return 0;
>   }
>
> -static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
> +static int kvm_set_user_memory_region(KVMSlot *slot)
>   {
>       struct kvm_userspace_memory_region mem;
>
> @@ -166,10 +161,10 @@ static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
>       mem.memory_size = slot->memory_size;
>       mem.userspace_addr = (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);
>       mem.flags = slot->flags;
> -    if (s->migration_log) {
> +    if (kvm_state.migration_log) {
>           mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
>       }
> -    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION,&mem);
> +    return kvm_vm_ioctl(KVM_SET_USER_MEMORY_REGION,&mem);
>   }
>
>   static void kvm_reset_vcpu(void *opaque)
> @@ -181,33 +176,31 @@ static void kvm_reset_vcpu(void *opaque)
>
>   int kvm_irqchip_in_kernel(void)
>   {
> -    return kvm_state->irqchip_in_kernel;
> +    return kvm_state.irqchip_in_kernel;
>   }
>
>   int kvm_pit_in_kernel(void)
>   {
> -    return kvm_state->pit_in_kernel;
> +    return kvm_state.pit_in_kernel;
>   }
>
>
>   int kvm_init_vcpu(CPUState *env)
>   {
> -    KVMState *s = kvm_state;
>       long mmap_size;
>       int ret;
>
>       DPRINTF("kvm_init_vcpu\n");
>
> -    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
> +    ret = kvm_vm_ioctl(KVM_CREATE_VCPU, env->cpu_index);
>       if (ret<  0) {
>           DPRINTF("kvm_create_vcpu failed\n");
>           goto err;
>       }
>
>       env->kvm_fd = ret;
> -    env->kvm_state = s;
>
> -    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
> +    mmap_size = kvm_ioctl(KVM_GET_VCPU_MMAP_SIZE, 0);
>       if (mmap_size<  0) {
>           DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
>           goto err;
> @@ -222,9 +215,9 @@ int kvm_init_vcpu(CPUState *env)
>       }
>
>   #ifdef KVM_CAP_COALESCED_MMIO
> -    if (s->coalesced_mmio&&  !s->coalesced_mmio_ring) {
> -        s->coalesced_mmio_ring =
> -            (void *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE;
> +    if (kvm_state.coalesced_mmio&&  !kvm_state.coalesced_mmio_ring) {
> +        kvm_state.coalesced_mmio_ring =
> +            (void *)env->kvm_run + kvm_state.coalesced_mmio * PAGE_SIZE;
>       }
>   #endif
>
> @@ -243,8 +236,7 @@ err:
>   static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
>                                         ram_addr_t size, int flags, int mask)
>   {
> -    KVMState *s = kvm_state;
> -    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
> +    KVMSlot *mem = kvm_lookup_matching_slot(phys_addr, phys_addr + size);
>       int old_flags;
>
>       if (mem == NULL)  {
> @@ -260,14 +252,14 @@ static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
>       mem->flags = flags;
>
>       /* If nothing changed effectively, no need to issue ioctl */
> -    if (s->migration_log) {
> +    if (kvm_state.migration_log) {
>           flags |= KVM_MEM_LOG_DIRTY_PAGES;
>       }
>       if (flags == old_flags) {
>               return 0;
>       }
>
> -    return kvm_set_user_memory_region(s, mem);
> +    return kvm_set_user_memory_region(mem);
>   }
>
>   int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
> @@ -284,14 +276,13 @@ int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
>
>   static int kvm_set_migration_log(int enable)
>   {
> -    KVMState *s = kvm_state;
>       KVMSlot *mem;
>       int i, err;
>
> -    s->migration_log = enable;
> +    kvm_state.migration_log = enable;
>
> -    for (i = 0; i<  ARRAY_SIZE(s->slots); i++) {
> -        mem =&s->slots[i];
> +    for (i = 0; i<  ARRAY_SIZE(kvm_state.slots); i++) {
> +        mem =&kvm_state.slots[i];
>
>           if (!mem->memory_size) {
>               continue;
> @@ -299,7 +290,7 @@ static int kvm_set_migration_log(int enable)
>           if (!!(mem->flags&  KVM_MEM_LOG_DIRTY_PAGES) == enable) {
>               continue;
>           }
> -        err = kvm_set_user_memory_region(s, mem);
> +        err = kvm_set_user_memory_region(mem);
>           if (err) {
>               return err;
>           }
> @@ -353,7 +344,6 @@ static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
>   static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
>                                             target_phys_addr_t end_addr)
>   {
> -    KVMState *s = kvm_state;
>       unsigned long size, allocated_size = 0;
>       KVMDirtyLog d;
>       KVMSlot *mem;
> @@ -361,7 +351,7 @@ static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
>
>       d.dirty_bitmap = NULL;
>       while (start_addr<  end_addr) {
> -        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
> +        mem = kvm_lookup_overlapping_slot(start_addr, end_addr);
>           if (mem == NULL) {
>               break;
>           }
> @@ -377,7 +367,7 @@ static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
>
>           d.slot = mem->slot;
>
> -        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG,&d) == -1) {
> +        if (kvm_vm_ioctl(KVM_GET_DIRTY_LOG,&d) == -1) {
>               DPRINTF("ioctl failed %d\n", errno);
>               ret = -1;
>               break;
> @@ -395,16 +385,15 @@ static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
>   int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
>   {
>       int ret = -ENOSYS;
> -#ifdef KVM_CAP_COALESCED_MMIO
> -    KVMState *s = kvm_state;
>
> -    if (s->coalesced_mmio) {
> +#ifdef KVM_CAP_COALESCED_MMIO
> +    if (kvm_state.coalesced_mmio) {
>           struct kvm_coalesced_mmio_zone zone;
>
>           zone.addr = start;
>           zone.size = size;
>
> -        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO,&zone);
> +        ret = kvm_vm_ioctl(KVM_REGISTER_COALESCED_MMIO,&zone);
>       }
>   #endif
>
> @@ -414,27 +403,26 @@ int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
>   int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
>   {
>       int ret = -ENOSYS;
> -#ifdef KVM_CAP_COALESCED_MMIO
> -    KVMState *s = kvm_state;
>
> -    if (s->coalesced_mmio) {
> +#ifdef KVM_CAP_COALESCED_MMIO
> +    if (kvm_state.coalesced_mmio) {
>           struct kvm_coalesced_mmio_zone zone;
>
>           zone.addr = start;
>           zone.size = size;
>
> -        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO,&zone);
> +        ret = kvm_vm_ioctl(KVM_UNREGISTER_COALESCED_MMIO,&zone);
>       }
>   #endif
>
>       return ret;
>   }
>
> -int kvm_check_extension(KVMState *s, unsigned int extension)
> +int kvm_check_extension(unsigned int extension)
>   {
>       int ret;
>
> -    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
> +    ret = kvm_ioctl(KVM_CHECK_EXTENSION, extension);
>       if (ret<  0) {
>           ret = 0;
>       }
> @@ -445,7 +433,6 @@ int kvm_check_extension(KVMState *s, unsigned int extension)
>   static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
>                                ram_addr_t phys_offset)
>   {
> -    KVMState *s = kvm_state;
>       ram_addr_t flags = phys_offset&  ~TARGET_PAGE_MASK;
>       KVMSlot *mem, old;
>       int err;
> @@ -459,7 +446,7 @@ static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
>       phys_offset&= ~IO_MEM_ROM;
>
>       while (1) {
> -        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
> +        mem = kvm_lookup_overlapping_slot(start_addr, start_addr + size);
>           if (!mem) {
>               break;
>           }
> @@ -476,7 +463,7 @@ static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
>
>           /* unregister the overlapping slot */
>           mem->memory_size = 0;
> -        err = kvm_set_user_memory_region(s, mem);
> +        err = kvm_set_user_memory_region(mem);
>           if (err) {
>               fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
>                       __func__, strerror(-err));
> @@ -491,16 +478,16 @@ static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
>            * address as the first existing one. If not or if some overlapping
>            * slot comes around later, we will fail (not seen in practice so far)
>            * - and actually require a recent KVM version. */
> -        if (s->broken_set_mem_region&&
> +        if (kvm_state.broken_set_mem_region&&
>               old.start_addr == start_addr&&  old.memory_size<  size&&
>               flags<  IO_MEM_UNASSIGNED) {
> -            mem = kvm_alloc_slot(s);
> +            mem = kvm_alloc_slot();
>               mem->memory_size = old.memory_size;
>               mem->start_addr = old.start_addr;
>               mem->phys_offset = old.phys_offset;
>               mem->flags = 0;
>
> -            err = kvm_set_user_memory_region(s, mem);
> +            err = kvm_set_user_memory_region(mem);
>               if (err) {
>                   fprintf(stderr, "%s: error updating slot: %s\n", __func__,
>                           strerror(-err));
> @@ -515,13 +502,13 @@ static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
>
>           /* register prefix slot */
>           if (old.start_addr<  start_addr) {
> -            mem = kvm_alloc_slot(s);
> +            mem = kvm_alloc_slot();
>               mem->memory_size = start_addr - old.start_addr;
>               mem->start_addr = old.start_addr;
>               mem->phys_offset = old.phys_offset;
>               mem->flags = 0;
>
> -            err = kvm_set_user_memory_region(s, mem);
> +            err = kvm_set_user_memory_region(mem);
>               if (err) {
>                   fprintf(stderr, "%s: error registering prefix slot: %s\n",
>                           __func__, strerror(-err));
> @@ -533,14 +520,14 @@ static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
>           if (old.start_addr + old.memory_size>  start_addr + size) {
>               ram_addr_t size_delta;
>
> -            mem = kvm_alloc_slot(s);
> +            mem = kvm_alloc_slot();
>               mem->start_addr = start_addr + size;
>               size_delta = mem->start_addr - old.start_addr;
>               mem->memory_size = old.memory_size - size_delta;
>               mem->phys_offset = old.phys_offset + size_delta;
>               mem->flags = 0;
>
> -            err = kvm_set_user_memory_region(s, mem);
> +            err = kvm_set_user_memory_region(mem);
>               if (err) {
>                   fprintf(stderr, "%s: error registering suffix slot: %s\n",
>                           __func__, strerror(-err));
> @@ -557,13 +544,13 @@ static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
>       if (flags>= IO_MEM_UNASSIGNED) {
>           return;
>       }
> -    mem = kvm_alloc_slot(s);
> +    mem = kvm_alloc_slot();
>       mem->memory_size = size;
>       mem->start_addr = start_addr;
>       mem->phys_offset = phys_offset;
>       mem->flags = 0;
>
> -    err = kvm_set_user_memory_region(s, mem);
> +    err = kvm_set_user_memory_region(mem);
>       if (err) {
>           fprintf(stderr, "%s: error registering slot: %s\n", __func__,
>                   strerror(-err));
> @@ -602,27 +589,24 @@ int kvm_init(int smp_cpus)
>       static const char upgrade_note[] =
>           "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
>           "(see http://sourceforge.net/projects/kvm).\n";
> -    KVMState *s;
>       int ret;
>       int i;
>
> -    s = qemu_mallocz(sizeof(KVMState));
> -
>   #ifdef KVM_CAP_SET_GUEST_DEBUG
> -    QTAILQ_INIT(&s->kvm_sw_breakpoints);
> +    QTAILQ_INIT(&kvm_state.kvm_sw_breakpoints);
>   #endif
> -    for (i = 0; i<  ARRAY_SIZE(s->slots); i++) {
> -        s->slots[i].slot = i;
> +    for (i = 0; i<  ARRAY_SIZE(kvm_state.slots); i++) {
> +        kvm_state.slots[i].slot = i;
>       }
> -    s->vmfd = -1;
> -    s->fd = qemu_open("/dev/kvm", O_RDWR);
> -    if (s->fd == -1) {
> +    kvm_state.vmfd = -1;
> +    kvm_state.fd = qemu_open("/dev/kvm", O_RDWR);
> +    if (kvm_state.fd == -1) {
>           fprintf(stderr, "Could not access KVM kernel module: %m\n");
>           ret = -errno;
>           goto err;
>       }
>
> -    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
> +    ret = kvm_ioctl(KVM_GET_API_VERSION, 0);
>       if (ret<  KVM_API_VERSION) {
>           if (ret>  0) {
>               ret = -EINVAL;
> @@ -637,8 +621,8 @@ int kvm_init(int smp_cpus)
>           goto err;
>       }
>
> -    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
> -    if (s->vmfd<  0) {
> +    kvm_state.vmfd = kvm_ioctl(KVM_CREATE_VM, 0);
> +    if (kvm_state.vmfd<  0) {
>   #ifdef TARGET_S390X
>           fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
>                           "your host kernel command line\n");
> @@ -651,7 +635,7 @@ int kvm_init(int smp_cpus)
>        * just use a user allocated buffer so we can use regular pages
>        * unmodified.  Make sure we have a sufficiently modern version of KVM.
>        */
> -    if (!kvm_check_extension(s, KVM_CAP_USER_MEMORY)) {
> +    if (!kvm_check_extension(KVM_CAP_USER_MEMORY)) {
>           ret = -EINVAL;
>           fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n%s",
>                   upgrade_note);
> @@ -661,7 +645,7 @@ int kvm_init(int smp_cpus)
>       /* There was a nasty bug in<  kvm-80 that prevents memory slots from being
>        * destroyed properly.  Since we rely on this capability, refuse to work
>        * with any kernel without this capability. */
> -    if (!kvm_check_extension(s, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
> +    if (!kvm_check_extension(KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
>           ret = -EINVAL;
>
>           fprintf(stderr,
> @@ -670,66 +654,55 @@ int kvm_init(int smp_cpus)
>           goto err;
>       }
>
> -    s->coalesced_mmio = 0;
>   #ifdef KVM_CAP_COALESCED_MMIO
> -    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
> -    s->coalesced_mmio_ring = NULL;
> +    kvm_state.coalesced_mmio = kvm_check_extension(KVM_CAP_COALESCED_MMIO);
>   #endif
>
> -    s->broken_set_mem_region = 1;
> +    kvm_state.broken_set_mem_region = 1;
>   #ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
> -    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
> +    ret = kvm_check_extension(KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
>       if (ret>  0) {
> -        s->broken_set_mem_region = 0;
> +        kvm_state.broken_set_mem_region = 0;
>       }
>   #endif
>
> -    s->vcpu_events = 0;
>   #ifdef KVM_CAP_VCPU_EVENTS
> -    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
> +    kvm_state.vcpu_events = kvm_check_extension(KVM_CAP_VCPU_EVENTS);
>   #endif
>
> -    s->robust_singlestep = 0;
>   #ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
> -    s->robust_singlestep =
> -        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
> +    kvm_state.robust_singlestep =
> +        kvm_check_extension(KVM_CAP_X86_ROBUST_SINGLESTEP);
>   #endif
>
> -    s->debugregs = 0;
>   #ifdef KVM_CAP_DEBUGREGS
> -    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
> +    kvm_state.debugregs = kvm_check_extension(KVM_CAP_DEBUGREGS);
>   #endif
>
> -    s->xsave = 0;
>   #ifdef KVM_CAP_XSAVE
> -    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
> +    kvm_state.xsave = kvm_check_extension(KVM_CAP_XSAVE);
>   #endif
>
> -    s->xcrs = 0;
>   #ifdef KVM_CAP_XCRS
> -    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
> +    kvm_state.xcrs = kvm_check_extension(KVM_CAP_XCRS);
>   #endif
>
> -    ret = kvm_arch_init(s, smp_cpus);
> +    ret = kvm_arch_init(smp_cpus);
>       if (ret<  0) {
>           goto err;
>       }
>
> -    kvm_state = s;
>       cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
>
>       return 0;
>
>   err:
> -    if (s) {
> -        if (s->vmfd != -1) {
> -            close(s->vmfd);
> -        }
> -        if (s->fd != -1) {
> -            close(s->fd);
> -        }
> +    if (kvm_state.vmfd != -1) {
> +        close(kvm_state.vmfd);
> +    }
> +    if (kvm_state.fd != -1) {
> +        close(kvm_state.fd);
>       }
> -    qemu_free(s);
>
>       return ret;
>   }
> @@ -777,7 +750,7 @@ static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
>   static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
>   {
>       fprintf(stderr, "KVM internal error.");
> -    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
> +    if (kvm_check_extension(KVM_CAP_INTERNAL_ERROR_DATA)) {
>           int i;
>
>           fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
> @@ -805,9 +778,8 @@ static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
>   void kvm_flush_coalesced_mmio_buffer(void)
>   {
>   #ifdef KVM_CAP_COALESCED_MMIO
> -    KVMState *s = kvm_state;
> -    if (s->coalesced_mmio_ring) {
> -        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
> +    if (kvm_state.coalesced_mmio_ring) {
> +        struct kvm_coalesced_mmio_ring *ring = kvm_state.coalesced_mmio_ring;
>           while (ring->first != ring->last) {
>               struct kvm_coalesced_mmio *ent;
>
> @@ -963,7 +935,7 @@ void kvm_cpu_exec(CPUState *env)
>       }
>   }
>
> -int kvm_ioctl(KVMState *s, int type, ...)
> +int kvm_ioctl(int type, ...)
>   {
>       int ret;
>       void *arg;
> @@ -973,14 +945,14 @@ int kvm_ioctl(KVMState *s, int type, ...)
>       arg = va_arg(ap, void *);
>       va_end(ap);
>
> -    ret = ioctl(s->fd, type, arg);
> +    ret = ioctl(kvm_state.fd, type, arg);
>       if (ret == -1) {
>           ret = -errno;
>       }
>       return ret;
>   }
>
> -int kvm_vm_ioctl(KVMState *s, int type, ...)
> +int kvm_vm_ioctl(int type, ...)
>   {
>       int ret;
>       void *arg;
> @@ -990,7 +962,7 @@ int kvm_vm_ioctl(KVMState *s, int type, ...)
>       arg = va_arg(ap, void *);
>       va_end(ap);
>
> -    ret = ioctl(s->vmfd, type, arg);
> +    ret = ioctl(kvm_state.vmfd, type, arg);
>       if (ret == -1) {
>           ret = -errno;
>       }
> @@ -1017,9 +989,7 @@ int kvm_vcpu_ioctl(CPUState *env, int type, ...)
>   int kvm_has_sync_mmu(void)
>   {
>   #ifdef KVM_CAP_SYNC_MMU
> -    KVMState *s = kvm_state;
> -
> -    return kvm_check_extension(s, KVM_CAP_SYNC_MMU);
> +    return kvm_check_extension(KVM_CAP_SYNC_MMU);
>   #else
>       return 0;
>   #endif
> @@ -1027,27 +997,27 @@ int kvm_has_sync_mmu(void)
>
>   int kvm_has_vcpu_events(void)
>   {
> -    return kvm_state->vcpu_events;
> +    return kvm_state.vcpu_events;
>   }
>
>   int kvm_has_robust_singlestep(void)
>   {
> -    return kvm_state->robust_singlestep;
> +    return kvm_state.robust_singlestep;
>   }
>
>   int kvm_has_debugregs(void)
>   {
> -    return kvm_state->debugregs;
> +    return kvm_state.debugregs;
>   }
>
>   int kvm_has_xsave(void)
>   {
> -    return kvm_state->xsave;
> +    return kvm_state.xsave;
>   }
>
>   int kvm_has_xcrs(void)
>   {
> -    return kvm_state->xcrs;
> +    return kvm_state.xcrs;
>   }
>
>   void kvm_setup_guest_memory(void *start, size_t size)
> @@ -1070,7 +1040,7 @@ struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
>   {
>       struct kvm_sw_breakpoint *bp;
>
> -    QTAILQ_FOREACH(bp,&env->kvm_state->kvm_sw_breakpoints, entry) {
> +    QTAILQ_FOREACH(bp,&kvm_state.kvm_sw_breakpoints, entry) {
>           if (bp->pc == pc) {
>               return bp;
>           }
> @@ -1080,7 +1050,7 @@ struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
>
>   int kvm_sw_breakpoints_active(CPUState *env)
>   {
> -    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
> +    return !QTAILQ_EMPTY(&kvm_state.kvm_sw_breakpoints);
>   }
>
>   struct kvm_set_guest_debug_data {
> @@ -1140,8 +1110,7 @@ int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
>               return err;
>           }
>
> -        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
> -                          bp, entry);
> +        QTAILQ_INSERT_HEAD(&kvm_state.kvm_sw_breakpoints, bp, entry);
>       } else {
>           err = kvm_arch_insert_hw_breakpoint(addr, len, type);
>           if (err) {
> @@ -1181,7 +1150,7 @@ int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
>               return err;
>           }
>
> -        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
> +        QTAILQ_REMOVE(&kvm_state.kvm_sw_breakpoints, bp, entry);
>           qemu_free(bp);
>       } else {
>           err = kvm_arch_remove_hw_breakpoint(addr, len, type);
> @@ -1202,10 +1171,9 @@ int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
>   void kvm_remove_all_breakpoints(CPUState *current_env)
>   {
>       struct kvm_sw_breakpoint *bp, *next;
> -    KVMState *s = current_env->kvm_state;
>       CPUState *env;
>
> -    QTAILQ_FOREACH_SAFE(bp,&s->kvm_sw_breakpoints, entry, next) {
> +    QTAILQ_FOREACH_SAFE(bp,&kvm_state.kvm_sw_breakpoints, entry, next) {
>           if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
>               /* Try harder to find a CPU that currently sees the breakpoint. */
>               for (env = first_cpu; env != NULL; env = env->next_cpu) {
> @@ -1285,7 +1253,7 @@ int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign
>           iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
>       }
>
> -    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD,&iofd);
> +    ret = kvm_vm_ioctl(KVM_IOEVENTFD,&iofd);
>
>       if (ret<  0) {
>           return -errno;
> @@ -1314,7 +1282,7 @@ int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
>       if (!assign) {
>           kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
>       }
> -    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD,&kick);
> +    r = kvm_vm_ioctl(KVM_IOEVENTFD,&kick);
>       if (r<  0) {
>           return r;
>       }
> diff --git a/kvm-stub.c b/kvm-stub.c
> index 352c6a6..3a058ad 100644
> --- a/kvm-stub.c
> +++ b/kvm-stub.c
> @@ -53,7 +53,7 @@ int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
>       return -ENOSYS;
>   }
>
> -int kvm_check_extension(KVMState *s, unsigned int extension)
> +int kvm_check_extension(unsigned int extension)
>   {
>       return 0;
>   }
> diff --git a/kvm.h b/kvm.h
> index 51ad56f..26ca8c1 100644
> --- a/kvm.h
> +++ b/kvm.h
> @@ -74,12 +74,9 @@ int kvm_irqchip_in_kernel(void);
>
>   /* internal API */
>
> -struct KVMState;
> -typedef struct KVMState KVMState;
> +int kvm_ioctl(int type, ...);
>
> -int kvm_ioctl(KVMState *s, int type, ...);
> -
> -int kvm_vm_ioctl(KVMState *s, int type, ...);
> +int kvm_vm_ioctl(int type, ...);
>
>   int kvm_vcpu_ioctl(CPUState *env, int type, ...);
>
> @@ -104,7 +101,7 @@ int kvm_arch_get_registers(CPUState *env);
>
>   int kvm_arch_put_registers(CPUState *env, int level);
>
> -int kvm_arch_init(KVMState *s, int smp_cpus);
> +int kvm_arch_init(int smp_cpus);
>
>   int kvm_arch_init_vcpu(CPUState *env);
>
> @@ -146,10 +143,8 @@ void kvm_arch_update_guest_debug(CPUState *env, struct kvm_guest_debug *dbg);
>
>   bool kvm_arch_stop_on_emulation_error(CPUState *env);
>
> -int kvm_check_extension(KVMState *s, unsigned int extension);
> +int kvm_check_extension(unsigned int extension);
>
> -uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
> -                                      uint32_t index, int reg);
>   void kvm_cpu_synchronize_state(CPUState *env);
>   void kvm_cpu_synchronize_post_reset(CPUState *env);
>   void kvm_cpu_synchronize_post_init(CPUState *env);
> @@ -179,7 +174,7 @@ static inline void cpu_synchronize_post_init(CPUState *env)
>
>
>   #if !defined(CONFIG_USER_ONLY)
> -int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
> +int kvm_physical_memory_addr_from_ram(ram_addr_t ram_addr,
>                                         target_phys_addr_t *phys_addr);
>   #endif
>
> diff --git a/target-i386/cpuid.c b/target-i386/cpuid.c
> index 5382a28..17ab619 100644
> --- a/target-i386/cpuid.c
> +++ b/target-i386/cpuid.c
> @@ -23,6 +23,7 @@
>
>   #include "cpu.h"
>   #include "kvm.h"
> +#include "kvm_x86.h"
>
>   #include "qemu-option.h"
>   #include "qemu-config.h"
> @@ -1138,10 +1139,10 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
>               break;
>           }
>           if (kvm_enabled()) {
> -            *eax = kvm_arch_get_supported_cpuid(env, 0xd, count, R_EAX);
> -            *ebx = kvm_arch_get_supported_cpuid(env, 0xd, count, R_EBX);
> -            *ecx = kvm_arch_get_supported_cpuid(env, 0xd, count, R_ECX);
> -            *edx = kvm_arch_get_supported_cpuid(env, 0xd, count, R_EDX);
> +            *eax = kvm_x86_get_supported_cpuid(0xd, count, R_EAX);
> +            *ebx = kvm_x86_get_supported_cpuid(0xd, count, R_EBX);
> +            *ecx = kvm_x86_get_supported_cpuid(0xd, count, R_ECX);
> +            *edx = kvm_x86_get_supported_cpuid(0xd, count, R_EDX);
>           } else {
>               *eax = 0;
>               *ebx = 0;
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 1789bff..cb6883f 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -60,7 +60,7 @@ static int lm_capable_kernel;
>
>   #ifdef KVM_CAP_EXT_CPUID
>
> -static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
> +static struct kvm_cpuid2 *try_get_cpuid(int max)
>   {
>       struct kvm_cpuid2 *cpuid;
>       int r, size;
> @@ -68,7 +68,7 @@ static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
>       size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
>       cpuid = (struct kvm_cpuid2 *)qemu_mallocz(size);
>       cpuid->nent = max;
> -    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
> +    r = kvm_ioctl(KVM_GET_SUPPORTED_CPUID, cpuid);
>       if (r == 0&&  cpuid->nent>= max) {
>           r = -E2BIG;
>       }
> @@ -85,20 +85,20 @@ static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
>       return cpuid;
>   }
>
> -uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
> -                                      uint32_t index, int reg)
> +uint32_t kvm_x86_get_supported_cpuid(uint32_t function, uint32_t index,
> +                                     int reg)
>   {
>       struct kvm_cpuid2 *cpuid;
>       int i, max;
>       uint32_t ret = 0;
>       uint32_t cpuid_1_edx;
>
> -    if (!kvm_check_extension(env->kvm_state, KVM_CAP_EXT_CPUID)) {
> +    if (!kvm_check_extension(KVM_CAP_EXT_CPUID)) {
>           return -1U;
>       }
>
>       max = 1;
> -    while ((cpuid = try_get_cpuid(env->kvm_state, max)) == NULL) {
> +    while ((cpuid = try_get_cpuid(max)) == NULL) {
>           max *= 2;
>       }
>
> @@ -126,7 +126,7 @@ uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
>                       /* On Intel, kvm returns cpuid according to the Intel spec,
>                        * so add missing bits according to the AMD spec:
>                        */
> -                    cpuid_1_edx = kvm_arch_get_supported_cpuid(env, 1, 0, R_EDX);
> +                    cpuid_1_edx = kvm_x86_get_supported_cpuid(1, 0, R_EDX);
>                       ret |= cpuid_1_edx&  0x183f7ff;
>                       break;
>                   }
> @@ -142,8 +142,8 @@ uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
>
>   #else
>
> -uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
> -                                      uint32_t index, int reg)
> +uint32_t kvm_x86_get_supported_cpuid(uint32_t function, uint32_t index,
> +                                     int reg)
>   {
>       return -1U;
>   }
> @@ -170,12 +170,12 @@ struct kvm_para_features {
>       { -1, -1 }
>   };
>
> -static int get_para_features(CPUState *env)
> +static int get_para_features(void)
>   {
>       int i, features = 0;
>
>       for (i = 0; i<  ARRAY_SIZE(para_features) - 1; i++) {
> -        if (kvm_check_extension(env->kvm_state, para_features[i].cap)) {
> +        if (kvm_check_extension(para_features[i].cap)) {
>               features |= (1<<  para_features[i].feature);
>           }
>       }
> @@ -184,15 +184,14 @@ static int get_para_features(CPUState *env)
>   #endif
>
>   #ifdef KVM_CAP_MCE
> -static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
> -                                     int *max_banks)
> +static int kvm_get_mce_cap_supported(uint64_t *mce_cap, int *max_banks)
>   {
>       int r;
>
> -    r = kvm_check_extension(s, KVM_CAP_MCE);
> +    r = kvm_check_extension(KVM_CAP_MCE);
>       if (r>  0) {
>           *max_banks = r;
> -        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
> +        return kvm_ioctl(KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
>       }
>       return -ENOSYS;
>   }
> @@ -323,18 +322,18 @@ int kvm_arch_init_vcpu(CPUState *env)
>       uint32_t signature[3];
>   #endif
>
> -    env->cpuid_features&= kvm_arch_get_supported_cpuid(env, 1, 0, R_EDX);
> +    env->cpuid_features&= kvm_x86_get_supported_cpuid(1, 0, R_EDX);
>
>       i = env->cpuid_ext_features&  CPUID_EXT_HYPERVISOR;
> -    env->cpuid_ext_features&= kvm_arch_get_supported_cpuid(env, 1, 0, R_ECX);
> +    env->cpuid_ext_features&= kvm_x86_get_supported_cpuid(1, 0, R_ECX);
>       env->cpuid_ext_features |= i;
>
> -    env->cpuid_ext2_features&= kvm_arch_get_supported_cpuid(env, 0x80000001,
> -                                                             0, R_EDX);
> -    env->cpuid_ext3_features&= kvm_arch_get_supported_cpuid(env, 0x80000001,
> -                                                             0, R_ECX);
> -    env->cpuid_svm_features&= kvm_arch_get_supported_cpuid(env, 0x8000000A,
> -                                                             0, R_EDX);
> +    env->cpuid_ext2_features&= kvm_x86_get_supported_cpuid(0x80000001,
> +                                                            0, R_EDX);
> +    env->cpuid_ext3_features&= kvm_x86_get_supported_cpuid(0x80000001,
> +                                                            0, R_ECX);
> +    env->cpuid_svm_features&= kvm_x86_get_supported_cpuid(0x8000000A,
> +                                                            0, R_EDX);
>
>
>       cpuid_i = 0;
> @@ -353,7 +352,7 @@ int kvm_arch_init_vcpu(CPUState *env)
>       c =&cpuid_data.entries[cpuid_i++];
>       memset(c, 0, sizeof(*c));
>       c->function = KVM_CPUID_FEATURES;
> -    c->eax = env->cpuid_kvm_features&  get_para_features(env);
> +    c->eax = env->cpuid_kvm_features&  get_para_features();
>   #endif
>
>       cpu_x86_cpuid(env, 0, 0,&limit,&unused,&unused,&unused);
> @@ -423,11 +422,11 @@ int kvm_arch_init_vcpu(CPUState *env)
>   #ifdef KVM_CAP_MCE
>       if (((env->cpuid_version>>  8)&0xF)>= 6
>           &&  (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
> -&&  kvm_check_extension(env->kvm_state, KVM_CAP_MCE)>  0) {
> +&&  kvm_check_extension(KVM_CAP_MCE)>  0) {
>           uint64_t mcg_cap;
>           int banks;
>
> -        if (kvm_get_mce_cap_supported(env->kvm_state,&mcg_cap,&banks)) {
> +        if (kvm_get_mce_cap_supported(&mcg_cap,&banks)) {
>               perror("kvm_get_mce_cap_supported FAILED");
>           } else {
>               if (banks>  MCE_BANKS_DEF)
> @@ -461,7 +460,7 @@ void kvm_arch_reset_vcpu(CPUState *env)
>       }
>   }
>
> -static int kvm_get_supported_msrs(KVMState *s)
> +static int kvm_get_supported_msrs(void)
>   {
>       static int kvm_supported_msrs;
>       int ret = 0;
> @@ -475,7 +474,7 @@ static int kvm_get_supported_msrs(KVMState *s)
>           /* Obtain MSR list from KVM.  These are the MSRs that we must
>            * save/restore */
>           msr_list.nmsrs = 0;
> -        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST,&msr_list);
> +        ret = kvm_ioctl(KVM_GET_MSR_INDEX_LIST,&msr_list);
>           if (ret<  0&&  ret != -E2BIG) {
>               return ret;
>           }
> @@ -486,7 +485,7 @@ static int kvm_get_supported_msrs(KVMState *s)
>                                                 sizeof(msr_list.indices[0])));
>
>           kvm_msr_list->nmsrs = msr_list.nmsrs;
> -        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
> +        ret = kvm_ioctl(KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
>           if (ret>= 0) {
>               int i;
>
> @@ -508,17 +507,17 @@ static int kvm_get_supported_msrs(KVMState *s)
>       return ret;
>   }
>
> -static int kvm_init_identity_map_page(KVMState *s)
> +static int kvm_init_identity_map_page(void)
>   {
>   #ifdef KVM_CAP_SET_IDENTITY_MAP_ADDR
>       int ret;
>       uint64_t addr = 0xfffbc000;
>
> -    if (!kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
> +    if (!kvm_check_extension(KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
>           return 0;
>       }
>
> -    ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR,&addr);
> +    ret = kvm_vm_ioctl(KVM_SET_IDENTITY_MAP_ADDR,&addr);
>       if (ret<  0) {
>           fprintf(stderr, "kvm_set_identity_map_addr: %s\n", strerror(ret));
>           return ret;
> @@ -527,12 +526,12 @@ static int kvm_init_identity_map_page(KVMState *s)
>       return 0;
>   }
>
> -int kvm_arch_init(KVMState *s, int smp_cpus)
> +int kvm_arch_init(int smp_cpus)
>   {
>       int ret;
>       struct utsname utsname;
>
> -    ret = kvm_get_supported_msrs(s);
> +    ret = kvm_get_supported_msrs();
>       if (ret<  0) {
>           return ret;
>       }
> @@ -546,7 +545,7 @@ int kvm_arch_init(KVMState *s, int smp_cpus)
>        * versions of KVM just assumed that it would be at the end of physical
>        * memory but that doesn't work with more than 4GB of memory.  We simply
>        * refuse to work with those older versions of KVM. */
> -    ret = kvm_check_extension(s, KVM_CAP_SET_TSS_ADDR);
> +    ret = kvm_check_extension(KVM_CAP_SET_TSS_ADDR);
>       if (ret<= 0) {
>           fprintf(stderr, "kvm does not support KVM_CAP_SET_TSS_ADDR\n");
>           return ret;
> @@ -563,12 +562,12 @@ int kvm_arch_init(KVMState *s, int smp_cpus)
>           perror("e820_add_entry() table is full");
>           exit(1);
>       }
> -    ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, 0xfffbd000);
> +    ret = kvm_vm_ioctl(KVM_SET_TSS_ADDR, 0xfffbd000);
>       if (ret<  0) {
>           return ret;
>       }
>
> -    return kvm_init_identity_map_page(s);
> +    return kvm_init_identity_map_page();
>   }
>
>   static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
> @@ -1861,7 +1860,7 @@ int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
>               || code == BUS_MCEERR_AO)) {
>           vaddr = (void *)addr;
>           if (qemu_ram_addr_from_host(vaddr,&ram_addr) ||
> -            !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr,&paddr)) {
> +            !kvm_physical_memory_addr_from_ram(ram_addr,&paddr)) {
>               fprintf(stderr, "Hardware memory error for memory used by "
>                       "QEMU itself instead of guest system!\n");
>               /* Hope we are lucky for AO MCE */
> @@ -1910,7 +1909,7 @@ int kvm_on_sigbus(int code, void *addr)
>           /* Hope we are lucky for AO MCE */
>           vaddr = addr;
>           if (qemu_ram_addr_from_host(vaddr,&ram_addr) ||
> -            !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr,&paddr)) {
> +            !kvm_physical_memory_addr_from_ram(ram_addr,&paddr)) {
>               fprintf(stderr, "Hardware memory error for memory used by "
>                       "QEMU itself instead of guest system!: %p\n", addr);
>               return 0;
> diff --git a/target-i386/kvm_x86.h b/target-i386/kvm_x86.h
> index 9d7b584..304d0cb 100644
> --- a/target-i386/kvm_x86.h
> +++ b/target-i386/kvm_x86.h
> @@ -22,4 +22,7 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
>                           uint64_t mcg_status, uint64_t addr, uint64_t misc,
>                           int flag);
>
> +uint32_t kvm_x86_get_supported_cpuid(uint32_t function, uint32_t index,
> +                                     int reg);
> +
>   #endif
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 849b404..56d30cc 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -56,13 +56,13 @@ static void kvm_kick_env(void *env)
>       qemu_cpu_kick(env);
>   }
>
> -int kvm_arch_init(KVMState *s, int smp_cpus)
> +int kvm_arch_init(int smp_cpus)
>   {
>   #ifdef KVM_CAP_PPC_UNSET_IRQ
> -    cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
> +    cap_interrupt_unset = kvm_check_extension(KVM_CAP_PPC_UNSET_IRQ);
>   #endif
>   #ifdef KVM_CAP_PPC_IRQ_LEVEL
> -    cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL);
> +    cap_interrupt_level = kvm_check_extension(KVM_CAP_PPC_IRQ_LEVEL);
>   #endif
>
>       if (!cap_interrupt_level) {
> @@ -164,7 +164,7 @@ int kvm_arch_get_registers(CPUState *env)
>           env->gpr[i] = regs.gpr[i];
>
>   #ifdef KVM_CAP_PPC_SEGSTATE
> -    if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_SEGSTATE)) {
> +    if (kvm_check_extension(KVM_CAP_PPC_SEGSTATE)) {
>           env->sdr1 = sregs.u.s.sdr1;
>
>           /* Sync SLB */
> @@ -371,8 +371,8 @@ int kvmppc_get_hypercall(CPUState *env, uint8_t *buf, int buf_len)
>   #ifdef KVM_CAP_PPC_GET_PVINFO
>       struct kvm_ppc_pvinfo pvinfo;
>
> -    if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_PVINFO)&&
> -        !kvm_vm_ioctl(env->kvm_state, KVM_PPC_GET_PVINFO,&pvinfo)) {
> +    if (kvm_check_extension(KVM_CAP_PPC_GET_PVINFO)&&
> +        !kvm_vm_ioctl(KVM_PPC_GET_PVINFO,&pvinfo)) {
>           memcpy(buf, pvinfo.hcall, buf_len);
>
>           return 0;
> diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
> index adf4a9e..927a37e 100644
> --- a/target-s390x/kvm.c
> +++ b/target-s390x/kvm.c
> @@ -70,7 +70,7 @@
>   #define SCLP_CMDW_READ_SCP_INFO         0x00020001
>   #define SCLP_CMDW_READ_SCP_INFO_FORCED  0x00120001
>
> -int kvm_arch_init(KVMState *s, int smp_cpus)
> +int kvm_arch_init(int smp_cpus)
>   {
>       return 0;
>   }
> @@ -186,10 +186,6 @@ static void kvm_s390_interrupt_internal(CPUState *env, int type, uint32_t parm,
>       struct kvm_s390_interrupt kvmint;
>       int r;
>
> -    if (!env->kvm_state) {
> -        return;
> -    }
> -
>       env->halted = 0;
>       env->exception_index = -1;
>
> @@ -198,7 +194,7 @@ static void kvm_s390_interrupt_internal(CPUState *env, int type, uint32_t parm,
>       kvmint.parm64 = parm64;
>
>       if (vm) {
> -        r = kvm_vm_ioctl(env->kvm_state, KVM_S390_INTERRUPT,&kvmint);
> +        r = kvm_vm_ioctl(KVM_S390_INTERRUPT,&kvmint);
>       } else {
>           r = kvm_vcpu_ioctl(env, KVM_S390_INTERRUPT,&kvmint);
>       }
>
Jan Kiszka - Jan. 7, 2011, 9:03 a.m.
Am 06.01.2011 20:24, Anthony Liguori wrote:
> On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
>> From: Jan Kiszka<jan.kiszka@siemens.com>
>>
>> QEMU supports only one VM, so there is only one kvm_state per process,
>> and we gain nothing passing a reference to it around. Eliminate any need
>> to refer to it outside of kvm-all.c.
>>
>> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
>> CC: Alexander Graf<agraf@suse.de>
>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>>    
> 
> I think this is a big mistake.

Obviously, I don't share your concerns. :)

> 
> Having to manage kvm_state keeps the abstraction lines well defined. 

How does it help?

> Otherwise, it's far too easy for portions of code to call into KVM
> functions that really shouldn't.

I can't imagine we gain anything from requiring kvm_check_extension
callers to hold a kvm_state "capability". Yes, it's now much easier to
call kvm_[vm_]ioctl, but that's the key point of this change:

So far we primarily complicated the internal interface between generic
and arch-dependent kvm parts by requiring kvm_state joggling. But
external users already find interfaces without this restriction
(kvm_log_*, kvm_ioeventfd_*, ...). That's because it's at least
complicated to _cleanly_ pass kvm_state references to all users that
need it - e.g. sysbus devices like kvmclock or upcoming in-kernel irqchips.

Let's just stop this artificial abstraction that has no practical use
and focus on detecting layering violations via code review. That's more
reliable IMHO.

Jan
Anthony Liguori - Jan. 7, 2011, 11:27 p.m.
On 01/07/2011 03:03 AM, Jan Kiszka wrote:
> Am 06.01.2011 20:24, Anthony Liguori wrote:
>    
>> On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
>>      
>>> From: Jan Kiszka<jan.kiszka@siemens.com>
>>>
>>> QEMU supports only one VM, so there is only one kvm_state per process,
>>> and we gain nothing passing a reference to it around. Eliminate any need
>>> to refer to it outside of kvm-all.c.
>>>
>>> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
>>> CC: Alexander Graf<agraf@suse.de>
>>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>>>
>>>        
>> I think this is a big mistake.
>>      
> Obviously, I don't share your concerns. :)
>
>    
>> Having to manage kvm_state keeps the abstraction lines well defined.
>>      
> How does it help?
>
>    
>> Otherwise, it's far too easy for portions of code to call into KVM
>> functions that really shouldn't.
>>      
> I can't imagine we gain anything from requiring kvm_check_extension
> callers to hold a kvm_state "capability". Yes, it's now much easier to
> call kvm_[vm_]ioctl, but that's the key point of this change:
>
> So far we primarily complicated the internal interface between generic
> and arch-dependent kvm parts by requiring kvm_state joggling. But
> external users already find interfaces without this restriction
> (kvm_log_*, kvm_ioeventfd_*, ...). That's because it's at least
> complicated to _cleanly_ pass kvm_state references to all users that
> need it - e.g. sysbus devices like kvmclock or upcoming in-kernel irqchips.
>    

I think you're basically making my point for me.

ioeventfd is a broken interface.  It shouldn't be a VM ioctl but rather 
a VCPU ioctl because PIO events are dispatched on a per-VCPU basis.

kvm_state is available as part of CPU state so it's quite easy to get at 
if these interfaces just took a CPUState argument (and they should).

> Let's just stop this artificial abstraction that has no practical use
> and focus on detecting layering violations via code review. That's more
> reliable IMHO.
>    

Documenting relationships between devices and the CPU is a very 
important task.  Being able to grep for cpu_single_env to see what 
devices models are interacting with the CPU is a very good thing.

When you've got these interactions hidden in a spaghetti of function 
calls, things become impossible to understand.

Regards,

Anthony Liguori

> Jan
>
>
Jan Kiszka - Jan. 8, 2011, 8:47 a.m.
Am 08.01.2011 00:27, Anthony Liguori wrote:
> On 01/07/2011 03:03 AM, Jan Kiszka wrote:
>> Am 06.01.2011 20:24, Anthony Liguori wrote:
>>   
>>> On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
>>>     
>>>> From: Jan Kiszka<jan.kiszka@siemens.com>
>>>>
>>>> QEMU supports only one VM, so there is only one kvm_state per process,
>>>> and we gain nothing passing a reference to it around. Eliminate any
>>>> need
>>>> to refer to it outside of kvm-all.c.
>>>>
>>>> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
>>>> CC: Alexander Graf<agraf@suse.de>
>>>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>>>>
>>>>        
>>> I think this is a big mistake.
>>>      
>> Obviously, I don't share your concerns. :)
>>
>>   
>>> Having to manage kvm_state keeps the abstraction lines well defined.
>>>      
>> How does it help?
>>
>>   
>>> Otherwise, it's far too easy for portions of code to call into KVM
>>> functions that really shouldn't.
>>>      
>> I can't imagine we gain anything from requiring kvm_check_extension
>> callers to hold a kvm_state "capability". Yes, it's now much easier to
>> call kvm_[vm_]ioctl, but that's the key point of this change:
>>
>> So far we primarily complicated the internal interface between generic
>> and arch-dependent kvm parts by requiring kvm_state joggling. But
>> external users already find interfaces without this restriction
>> (kvm_log_*, kvm_ioeventfd_*, ...). That's because it's at least
>> complicated to _cleanly_ pass kvm_state references to all users that
>> need it - e.g. sysbus devices like kvmclock or upcoming in-kernel
>> irqchips.
>>    
> 
> I think you're basically making my point for me.
> 
> ioeventfd is a broken interface.  It shouldn't be a VM ioctl but rather
> a VCPU ioctl because PIO events are dispatched on a per-VCPU basis.

OK, but I don't want to argue about the ioeventfd API. So let's put this
case aside. :)

> 
> kvm_state is available as part of CPU state so it's quite easy to get at
> if these interfaces just took a CPUState argument (and they should).

My point is definitely NOT about cpu-bound devices. That case is clear
and is not touched at all by this patch.

My point is about devices that have clear system scope like kvmclock,
ioapic, pit, pic, whatever-the-future-will-bring. And about KVM services
that have global scope like capability checks and other feature
explorations or VM configurations done by the KVM arch code. You still
didn't explain what we gain in these concrete scenarios by handing the
technically redundant abstraction kvm_state around, especially _inside_
the KVM core.

Jan
Anthony Liguori - Jan. 10, 2011, 7:59 p.m.
On 01/08/2011 02:47 AM, Jan Kiszka wrote:
> Am 08.01.2011 00:27, Anthony Liguori wrote:
>    
>> On 01/07/2011 03:03 AM, Jan Kiszka wrote:
>>      
>>> Am 06.01.2011 20:24, Anthony Liguori wrote:
>>>
>>>        
>>>> On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
>>>>
>>>>          
>>>>> From: Jan Kiszka<jan.kiszka@siemens.com>
>>>>>
>>>>> QEMU supports only one VM, so there is only one kvm_state per process,
>>>>> and we gain nothing passing a reference to it around. Eliminate any
>>>>> need
>>>>> to refer to it outside of kvm-all.c.
>>>>>
>>>>> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
>>>>> CC: Alexander Graf<agraf@suse.de>
>>>>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>>>>>
>>>>>
>>>>>            
>>>> I think this is a big mistake.
>>>>
>>>>          
>>> Obviously, I don't share your concerns. :)
>>>
>>>
>>>        
>>>> Having to manage kvm_state keeps the abstraction lines well defined.
>>>>
>>>>          
>>> How does it help?
>>>
>>>
>>>        
>>>> Otherwise, it's far too easy for portions of code to call into KVM
>>>> functions that really shouldn't.
>>>>
>>>>          
>>> I can't imagine we gain anything from requiring kvm_check_extension
>>> callers to hold a kvm_state "capability". Yes, it's now much easier to
>>> call kvm_[vm_]ioctl, but that's the key point of this change:
>>>
>>> So far we primarily complicated the internal interface between generic
>>> and arch-dependent kvm parts by requiring kvm_state joggling. But
>>> external users already find interfaces without this restriction
>>> (kvm_log_*, kvm_ioeventfd_*, ...). That's because it's at least
>>> complicated to _cleanly_ pass kvm_state references to all users that
>>> need it - e.g. sysbus devices like kvmclock or upcoming in-kernel
>>> irqchips.
>>>
>>>        
>> I think you're basically making my point for me.
>>
>> ioeventfd is a broken interface.  It shouldn't be a VM ioctl but rather
>> a VCPU ioctl because PIO events are dispatched on a per-VCPU basis.
>>      
> OK, but I don't want to argue about the ioeventfd API. So let's put this
> case aside. :)
>
>    
>> kvm_state is available as part of CPU state so it's quite easy to get at
>> if these interfaces just took a CPUState argument (and they should).
>>      
> My point is definitely NOT about cpu-bound devices. That case is clear
> and is not touched at all by this patch.
>
> My point is about devices that have clear system scope like kvmclock,
> ioapic, pit, pic,

I don't see how ioapic, pit, or pic have a system scope.

I don't know enough about kvmclock.

>   whatever-the-future-will-bring. And about KVM services
> that have global scope like capability checks and other feature
> explorations or VM configurations done by the KVM arch code. You still
> didn't explain what we gain in these concrete scenarios by handing the
> technically redundant abstraction kvm_state around, especially _inside_
> the KVM core.
>    

If you have to pass around a KVMState pointer, you establish an explicit 
relationship and communication between subsystems.  Any place where the 
global KVMState is used is a red flag that something is wrong.

I don't see what the advantage to making all of the KVMState global and 
implicit.  It seems like a big step backwards to me.  Can you give a 
very concrete example of where you think it results in easier to 
understand code as I don't see how making relationships implicit ever 
makes code easier to understand?

Regards,

Anthony Liguori

> Jan
>
>
Anthony Liguori - Jan. 10, 2011, 8:11 p.m.
On 01/08/2011 02:47 AM, Jan Kiszka wrote:
> OK, but I don't want to argue about the ioeventfd API. So let's put this
> case aside. :)
>    

I often reply too quickly without explaining myself.  Let me use 
ioeventfd as an example to highlight why KVMState is a good thing.

In real life, PIO and MMIO are never directly communicated to the device 
from the processor.  Instead, they go through a series of other 
devices.  In the case of something like an ISA device, a PIO first goes 
to the chipset into the PCI complex, it will then go through a 
PCI-to-ISA bridge via subtractive decoding, and then forward over the 
ISA device where it will be interpreted by some device.

The path to the chipset may be shared among different processors but it 
may also be unique.  The APIC is the best example as there are historic 
APICs that hung directly off of the CPUs such that the same MMIO access 
across different CPUs did not go to the same device.  This is why the 
APIC emulation in QEMU is so weird because we don't model this behavior 
correctly.

This means that a PIO operation needs to flow from a CPUState to a 
DeviceState.  It can then flow through to another DeviceState until it's 
finally handled.

The first problem with ioeventfd is that it's a per-VM operation.  It 
should be per VCPU.

But even if this were the case, the path that a PIO operation takes 
should not be impacted by ioeventfd.  IOW, a device shouldn't be 
allocating an eventfd() and handing it to a magical KVM call.  Instead, 
a device should register a callback for a particular port in the same 
way it always does.  *As an optimization*, we should have another 
interface that says that these values are only valid for this IO port.  
That would let us create eventfds and register things behind the scenes.

That means we can handle TCG, older KVM kernels, and newer KVM kernels 
without any special support in the device model.  It also means that the 
device models never have to worry about KVMState because there's an 
entirely different piece of code that's consulting the set of special 
ports and then deciding how to handle it.  The result is better, more 
portable code that doesn't have KVM-isms.

If passing state around seems to be ugly, it's probably because we're 
not abstracting things correctly.  Removing the state and making it 
implicit is the wrong solution.  Fixing the abstraction is the right 
solution (or living with the ugliness until someone else is motivated to 
fix it properly).

Regards,

Anthony Liguori
Jan Kiszka - Jan. 10, 2011, 8:12 p.m.
Am 10.01.2011 20:59, Anthony Liguori wrote:
> On 01/08/2011 02:47 AM, Jan Kiszka wrote:
>> Am 08.01.2011 00:27, Anthony Liguori wrote:
>>   
>>> On 01/07/2011 03:03 AM, Jan Kiszka wrote:
>>>     
>>>> Am 06.01.2011 20:24, Anthony Liguori wrote:
>>>>
>>>>       
>>>>> On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
>>>>>
>>>>>         
>>>>>> From: Jan Kiszka<jan.kiszka@siemens.com>
>>>>>>
>>>>>> QEMU supports only one VM, so there is only one kvm_state per
>>>>>> process,
>>>>>> and we gain nothing passing a reference to it around. Eliminate any
>>>>>> need
>>>>>> to refer to it outside of kvm-all.c.
>>>>>>
>>>>>> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
>>>>>> CC: Alexander Graf<agraf@suse.de>
>>>>>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>>>>>>
>>>>>>
>>>>>>            
>>>>> I think this is a big mistake.
>>>>>
>>>>>          
>>>> Obviously, I don't share your concerns. :)
>>>>
>>>>
>>>>       
>>>>> Having to manage kvm_state keeps the abstraction lines well defined.
>>>>>
>>>>>          
>>>> How does it help?
>>>>
>>>>
>>>>       
>>>>> Otherwise, it's far too easy for portions of code to call into KVM
>>>>> functions that really shouldn't.
>>>>>
>>>>>          
>>>> I can't imagine we gain anything from requiring kvm_check_extension
>>>> callers to hold a kvm_state "capability". Yes, it's now much easier to
>>>> call kvm_[vm_]ioctl, but that's the key point of this change:
>>>>
>>>> So far we primarily complicated the internal interface between generic
>>>> and arch-dependent kvm parts by requiring kvm_state joggling. But
>>>> external users already find interfaces without this restriction
>>>> (kvm_log_*, kvm_ioeventfd_*, ...). That's because it's at least
>>>> complicated to _cleanly_ pass kvm_state references to all users that
>>>> need it - e.g. sysbus devices like kvmclock or upcoming in-kernel
>>>> irqchips.
>>>>
>>>>        
>>> I think you're basically making my point for me.
>>>
>>> ioeventfd is a broken interface.  It shouldn't be a VM ioctl but rather
>>> a VCPU ioctl because PIO events are dispatched on a per-VCPU basis.
>>>      
>> OK, but I don't want to argue about the ioeventfd API. So let's put this
>> case aside. :)
>>
>>   
>>> kvm_state is available as part of CPU state so it's quite easy to get at
>>> if these interfaces just took a CPUState argument (and they should).
>>>      
>> My point is definitely NOT about cpu-bound devices. That case is clear
>> and is not touched at all by this patch.
>>
>> My point is about devices that have clear system scope like kvmclock,
>> ioapic, pit, pic,
> 
> I don't see how ioapic, pit, or pic have a system scope.

They are not bound to any CPU like the APIC which you may have in mind.

> 
> I don't know enough about kvmclock.

It's just the same.

> 
>>   whatever-the-future-will-bring. And about KVM services
>> that have global scope like capability checks and other feature
>> explorations or VM configurations done by the KVM arch code. You still
>> didn't explain what we gain in these concrete scenarios by handing the
>> technically redundant abstraction kvm_state around, especially _inside_
>> the KVM core.
>>    
> 
> If you have to pass around a KVMState pointer, you establish an explicit
> relationship and communication between subsystems.  Any place where the
> global KVMState is used is a red flag that something is wrong.

It is and will be _only_ used inside kvm-all.c. Again: What is the
benefit of restricting access to kvm_check_extension this way?

> 
> I don't see what the advantage to making all of the KVMState global and
> implicit.  It seems like a big step backwards to me.  Can you give a
> very concrete example of where you think it results in easier to
> understand code as I don't see how making relationships implicit ever
> makes code easier to understand?

The best example does not yet exist (fortunately): Just look at patch 28
and then try to pass some kvm_state reference to the kvmclock device. Is
this handle worth changing the sysbus API?

Jan
Jan Kiszka - Jan. 10, 2011, 8:15 p.m.
Am 10.01.2011 21:11, Anthony Liguori wrote:
> On 01/08/2011 02:47 AM, Jan Kiszka wrote:
>> OK, but I don't want to argue about the ioeventfd API. So let's put this
>> case aside. :)
>>    
> 
> I often reply too quickly without explaining myself.  Let me use
> ioeventfd as an example to highlight why KVMState is a good thing.
> 
> In real life, PIO and MMIO are never directly communicated to the device
> from the processor.  Instead, they go through a series of other
> devices.  In the case of something like an ISA device, a PIO first goes
> to the chipset into the PCI complex, it will then go through a
> PCI-to-ISA bridge via subtractive decoding, and then forward over the
> ISA device where it will be interpreted by some device.
> 
> The path to the chipset may be shared among different processors but it
> may also be unique.  The APIC is the best example as there are historic
> APICs that hung directly off of the CPUs such that the same MMIO access
> across different CPUs did not go to the same device.  This is why the
> APIC emulation in QEMU is so weird because we don't model this behavior
> correctly.
> 
> This means that a PIO operation needs to flow from a CPUState to a
> DeviceState.  It can then flow through to another DeviceState until it's
> finally handled.
> 
> The first problem with ioeventfd is that it's a per-VM operation.  It
> should be per VCPU.
> 
> But even if this were the case, the path that a PIO operation takes
> should not be impacted by ioeventfd.  IOW, a device shouldn't be
> allocating an eventfd() and handing it to a magical KVM call.  Instead,
> a device should register a callback for a particular port in the same
> way it always does.  *As an optimization*, we should have another
> interface that says that these values are only valid for this IO port. 
> That would let us create eventfds and register things behind the scenes.
> 
> That means we can handle TCG, older KVM kernels, and newer KVM kernels
> without any special support in the device model.  It also means that the
> device models never have to worry about KVMState because there's an
> entirely different piece of code that's consulting the set of special
> ports and then deciding how to handle it.  The result is better, more
> portable code that doesn't have KVM-isms.
> 
> If passing state around seems to be ugly, it's probably because we're
> not abstracting things correctly.  Removing the state and making it
> implicit is the wrong solution.  Fixing the abstraction is the right
> solution (or living with the ugliness until someone else is motivated to
> fix it properly).

Look at my other reply, it should answer this. ioeventfd is the wrong
example IMHO as one may argue about its relation to VCPUS.

Jan
Anthony Liguori - Jan. 10, 2011, 8:23 p.m.
On 01/10/2011 02:12 PM, Jan Kiszka wrote:
> Am 10.01.2011 20:59, Anthony Liguori wrote:
>    
>> On 01/08/2011 02:47 AM, Jan Kiszka wrote:
>>      
>>> Am 08.01.2011 00:27, Anthony Liguori wrote:
>>>
>>>        
>>>> On 01/07/2011 03:03 AM, Jan Kiszka wrote:
>>>>
>>>>          
>>>>> Am 06.01.2011 20:24, Anthony Liguori wrote:
>>>>>
>>>>>
>>>>>            
>>>>>> On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
>>>>>>
>>>>>>
>>>>>>              
>>>>>>> From: Jan Kiszka<jan.kiszka@siemens.com>
>>>>>>>
>>>>>>> QEMU supports only one VM, so there is only one kvm_state per
>>>>>>> process,
>>>>>>> and we gain nothing passing a reference to it around. Eliminate any
>>>>>>> need
>>>>>>> to refer to it outside of kvm-all.c.
>>>>>>>
>>>>>>> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
>>>>>>> CC: Alexander Graf<agraf@suse.de>
>>>>>>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>                
>>>>>> I think this is a big mistake.
>>>>>>
>>>>>>
>>>>>>              
>>>>> Obviously, I don't share your concerns. :)
>>>>>
>>>>>
>>>>>
>>>>>            
>>>>>> Having to manage kvm_state keeps the abstraction lines well defined.
>>>>>>
>>>>>>
>>>>>>              
>>>>> How does it help?
>>>>>
>>>>>
>>>>>
>>>>>            
>>>>>> Otherwise, it's far too easy for portions of code to call into KVM
>>>>>> functions that really shouldn't.
>>>>>>
>>>>>>
>>>>>>              
>>>>> I can't imagine we gain anything from requiring kvm_check_extension
>>>>> callers to hold a kvm_state "capability". Yes, it's now much easier to
>>>>> call kvm_[vm_]ioctl, but that's the key point of this change:
>>>>>
>>>>> So far we primarily complicated the internal interface between generic
>>>>> and arch-dependent kvm parts by requiring kvm_state joggling. But
>>>>> external users already find interfaces without this restriction
>>>>> (kvm_log_*, kvm_ioeventfd_*, ...). That's because it's at least
>>>>> complicated to _cleanly_ pass kvm_state references to all users that
>>>>> need it - e.g. sysbus devices like kvmclock or upcoming in-kernel
>>>>> irqchips.
>>>>>
>>>>>
>>>>>            
>>>> I think you're basically making my point for me.
>>>>
>>>> ioeventfd is a broken interface.  It shouldn't be a VM ioctl but rather
>>>> a VCPU ioctl because PIO events are dispatched on a per-VCPU basis.
>>>>
>>>>          
>>> OK, but I don't want to argue about the ioeventfd API. So let's put this
>>> case aside. :)
>>>
>>>
>>>        
>>>> kvm_state is available as part of CPU state so it's quite easy to get at
>>>> if these interfaces just took a CPUState argument (and they should).
>>>>
>>>>          
>>> My point is definitely NOT about cpu-bound devices. That case is clear
>>> and is not touched at all by this patch.
>>>
>>> My point is about devices that have clear system scope like kvmclock,
>>> ioapic, pit, pic,
>>>        
>> I don't see how ioapic, pit, or pic have a system scope.
>>      
> They are not bound to any CPU like the APIC which you may have in mind.
>    

And none of the above interact with KVM.

They may be replaced by KVM but if you look at the PIT, this is done by 
having two distinct devices.  The KVM specific device can (and should) 
be instantiated with kvm_state.

The way the IOAPIC/APIC/PIC is handled in qemu-kvm is nasty.  The kernel 
devices are separate devices and that should be reflected in the device 
tree.

>> I don't know enough about kvmclock.
>>      
> It's just the same.
>
>    
>>      
>>>    whatever-the-future-will-bring. And about KVM services
>>> that have global scope like capability checks and other feature
>>> explorations or VM configurations done by the KVM arch code. You still
>>> didn't explain what we gain in these concrete scenarios by handing the
>>> technically redundant abstraction kvm_state around, especially _inside_
>>> the KVM core.
>>>
>>>        
>> If you have to pass around a KVMState pointer, you establish an explicit
>> relationship and communication between subsystems.  Any place where the
>> global KVMState is used is a red flag that something is wrong.
>>      
> It is and will be _only_ used inside kvm-all.c. Again: What is the
> benefit of restricting access to kvm_check_extension this way?
>    

The more places that need to deal with KVM compatibility code, the worse 
we will be because it's more opportunities to get it wrong.

>> I don't see what the advantage to making all of the KVMState global and
>> implicit.  It seems like a big step backwards to me.  Can you give a
>> very concrete example of where you think it results in easier to
>> understand code as I don't see how making relationships implicit ever
>> makes code easier to understand?
>>      
> The best example does not yet exist (fortunately): Just look at patch 28
> and then try to pass some kvm_state reference to the kvmclock device. Is
> this handle worth changing the sysbus API?
>    

Let me look at that patch and reply there.

Regards,

Anthony Liguori

> Jan
>
>
Jan Kiszka - Jan. 10, 2011, 8:34 p.m.
Am 10.01.2011 21:23, Anthony Liguori wrote:
> On 01/10/2011 02:12 PM, Jan Kiszka wrote:
>> Am 10.01.2011 20:59, Anthony Liguori wrote:
>>   
>>> On 01/08/2011 02:47 AM, Jan Kiszka wrote:
>>>     
>>>> Am 08.01.2011 00:27, Anthony Liguori wrote:
>>>>
>>>>       
>>>>> On 01/07/2011 03:03 AM, Jan Kiszka wrote:
>>>>>
>>>>>         
>>>>>> Am 06.01.2011 20:24, Anthony Liguori wrote:
>>>>>>
>>>>>>
>>>>>>           
>>>>>>> On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
>>>>>>>
>>>>>>>
>>>>>>>             
>>>>>>>> From: Jan Kiszka<jan.kiszka@siemens.com>
>>>>>>>>
>>>>>>>> QEMU supports only one VM, so there is only one kvm_state per
>>>>>>>> process,
>>>>>>>> and we gain nothing passing a reference to it around. Eliminate any
>>>>>>>> need
>>>>>>>> to refer to it outside of kvm-all.c.
>>>>>>>>
>>>>>>>> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
>>>>>>>> CC: Alexander Graf<agraf@suse.de>
>>>>>>>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>                
>>>>>>> I think this is a big mistake.
>>>>>>>
>>>>>>>
>>>>>>>              
>>>>>> Obviously, I don't share your concerns. :)
>>>>>>
>>>>>>
>>>>>>
>>>>>>           
>>>>>>> Having to manage kvm_state keeps the abstraction lines well defined.
>>>>>>>
>>>>>>>
>>>>>>>              
>>>>>> How does it help?
>>>>>>
>>>>>>
>>>>>>
>>>>>>           
>>>>>>> Otherwise, it's far too easy for portions of code to call into KVM
>>>>>>> functions that really shouldn't.
>>>>>>>
>>>>>>>
>>>>>>>              
>>>>>> I can't imagine we gain anything from requiring kvm_check_extension
>>>>>> callers to hold a kvm_state "capability". Yes, it's now much
>>>>>> easier to
>>>>>> call kvm_[vm_]ioctl, but that's the key point of this change:
>>>>>>
>>>>>> So far we primarily complicated the internal interface between
>>>>>> generic
>>>>>> and arch-dependent kvm parts by requiring kvm_state joggling. But
>>>>>> external users already find interfaces without this restriction
>>>>>> (kvm_log_*, kvm_ioeventfd_*, ...). That's because it's at least
>>>>>> complicated to _cleanly_ pass kvm_state references to all users that
>>>>>> need it - e.g. sysbus devices like kvmclock or upcoming in-kernel
>>>>>> irqchips.
>>>>>>
>>>>>>
>>>>>>            
>>>>> I think you're basically making my point for me.
>>>>>
>>>>> ioeventfd is a broken interface.  It shouldn't be a VM ioctl but
>>>>> rather
>>>>> a VCPU ioctl because PIO events are dispatched on a per-VCPU basis.
>>>>>
>>>>>          
>>>> OK, but I don't want to argue about the ioeventfd API. So let's put
>>>> this
>>>> case aside. :)
>>>>
>>>>
>>>>       
>>>>> kvm_state is available as part of CPU state so it's quite easy to
>>>>> get at
>>>>> if these interfaces just took a CPUState argument (and they should).
>>>>>
>>>>>          
>>>> My point is definitely NOT about cpu-bound devices. That case is clear
>>>> and is not touched at all by this patch.
>>>>
>>>> My point is about devices that have clear system scope like kvmclock,
>>>> ioapic, pit, pic,
>>>>        
>>> I don't see how ioapic, pit, or pic have a system scope.
>>>      
>> They are not bound to any CPU like the APIC which you may have in mind.
>>    
> 
> And none of the above interact with KVM.
> 
> They may be replaced by KVM but if you look at the PIT, this is done by
> having two distinct devices.  The KVM specific device can (and should)
> be instantiated with kvm_state.
> 
> The way the IOAPIC/APIC/PIC is handled in qemu-kvm is nasty.  The kernel
> devices are separate devices and that should be reflected in the device
> tree.

If separate device or hack to existing one - both need to sync their
user space state with the kernel when QEMU asks them to. That's how they
have to interact with KVM all the time. Same for kvmclock if you want to
look at a really trivial example.

> 
>>> I don't know enough about kvmclock.
>>>      
>> It's just the same.
>>
>>   
>>>     
>>>>    whatever-the-future-will-bring. And about KVM services
>>>> that have global scope like capability checks and other feature
>>>> explorations or VM configurations done by the KVM arch code. You still
>>>> didn't explain what we gain in these concrete scenarios by handing the
>>>> technically redundant abstraction kvm_state around, especially _inside_
>>>> the KVM core.
>>>>
>>>>        
>>> If you have to pass around a KVMState pointer, you establish an explicit
>>> relationship and communication between subsystems.  Any place where the
>>> global KVMState is used is a red flag that something is wrong.
>>>      
>> It is and will be _only_ used inside kvm-all.c. Again: What is the
>> benefit of restricting access to kvm_check_extension this way?
>>    
> 
> The more places that need to deal with KVM compatibility code, the worse
> we will be because it's more opportunities to get it wrong.

That code belongs where the related logic is. IMHO, it would be a
needless abstraction to push in-kernel access services and workaround
definitions in the KVM core instead of the KVM device model code -
provided there is only one user.

But this discussion is a bit abstract right now as we do not yet have
anything more complex than kvmclock on the table for QEMU.

> 
>>> I don't see what the advantage to making all of the KVMState global and
>>> implicit.  It seems like a big step backwards to me.  Can you give a
>>> very concrete example of where you think it results in easier to
>>> understand code as I don't see how making relationships implicit ever
>>> makes code easier to understand?
>>>      
>> The best example does not yet exist (fortunately): Just look at patch 28
>> and then try to pass some kvm_state reference to the kvmclock device. Is
>> this handle worth changing the sysbus API?
>>    
> 
> Let me look at that patch and reply there.
> 

OK, great.

Jan
Avi Kivity - Jan. 11, 2011, 9:01 a.m.
On 01/10/2011 10:23 PM, Anthony Liguori wrote:
>>> I don't see how ioapic, pit, or pic have a system scope.
>> They are not bound to any CPU like the APIC which you may have in mind.
>
> And none of the above interact with KVM.

They're implemented by kvm.  What deeper interaction do you have in mind?

>
> They may be replaced by KVM but if you look at the PIT, this is done 
> by having two distinct devices.  The KVM specific device can (and 
> should) be instantiated with kvm_state.
>
> The way the IOAPIC/APIC/PIC is handled in qemu-kvm is nasty.  The 
> kernel devices are separate devices and that should be reflected in 
> the device tree.

I don't see why.  Those are just two different implementations for the 
same guest visible device.  It's like saying IDE should be seen 
differently if it's backed by qcow2 or qed.

The device tree is about the guest view of devices.
Avi Kivity - Jan. 11, 2011, 9:17 a.m.
On 01/10/2011 10:11 PM, Anthony Liguori wrote:
> On 01/08/2011 02:47 AM, Jan Kiszka wrote:
>> OK, but I don't want to argue about the ioeventfd API. So let's put this
>> case aside. :)
>
> I often reply too quickly without explaining myself.  Let me use 
> ioeventfd as an example to highlight why KVMState is a good thing.
>
> In real life, PIO and MMIO are never directly communicated to the 
> device from the processor.  Instead, they go through a series of other 
> devices.  In the case of something like an ISA device, a PIO first 
> goes to the chipset into the PCI complex, it will then go through a 
> PCI-to-ISA bridge via subtractive decoding, and then forward over the 
> ISA device where it will be interpreted by some device.
>
> The path to the chipset may be shared among different processors but 
> it may also be unique.  The APIC is the best example as there are 
> historic APICs that hung directly off of the CPUs such that the same 
> MMIO access across different CPUs did not go to the same device.  This 
> is why the APIC emulation in QEMU is so weird because we don't model 
> this behavior correctly.
>
> This means that a PIO operation needs to flow from a CPUState to a 
> DeviceState.  It can then flow through to another DeviceState until 
> it's finally handled.
>
> The first problem with ioeventfd is that it's a per-VM operation.  It 
> should be per VCPU.

Just consider ioeventfd as something that hooks the system bus, not the 
processor-chipset link, and the problem goes away.  In practice, any 
per-cpu io port (for SMM or power management) would need synchronous 
handling; an eventfd isn't a suitable way to communicate it.

>
> But even if this were the case, the path that a PIO operation takes 
> should not be impacted by ioeventfd.  IOW, a device shouldn't be 
> allocating an eventfd() and handing it to a magical KVM call.  
> Instead, a device should register a callback for a particular port in 
> the same way it always does.  *As an optimization*, we should have 
> another interface that says that these values are only valid for this 
> IO port.  That would let us create eventfds and register things behind 
> the scenes.

The semantics are different.  The normal callbacks are synchronous; the 
vcpu is halted until the callback is serviced.  For most callbacks, this 
is critical, not just per-cpu things like vmport (example: cirrus back 
switching).

I agree it shouldn't be done by a kvm-specific kvm call, but instead by 
an API, but that API needs to be explicitly asynchronous.  When we 
further thread qemu, we'd also need to specify which thread is to 
execute the callback (and the implementation would add the eventfd to 
the thread's fd poll list).

>
> That means we can handle TCG, older KVM kernels, and newer KVM kernels 
> without any special support in the device model.  It also means that 
> the device models never have to worry about KVMState because there's 
> an entirely different piece of code that's consulting the set of 
> special ports and then deciding how to handle it.  The result is 
> better, more portable code that doesn't have KVM-isms.

Yes.

>
> If passing state around seems to be ugly, it's probably because we're 
> not abstracting things correctly.  Removing the state and making it 
> implicit is the wrong solution. 

I agree with the general sentiment that utilizing the fact that a 
variable is global to make it implicit is bad from a software 
engineering point of view.  By restricting access to variables and 
functions, you can enforce modularity on the code.  Much like the 
private: specifier in C++ and other languages.

> Fixing the abstraction is the right solution (or living with the 
> ugliness until someone else is motivated to fix it properly).

And with that too, especially the parenthesized statement.  We have 
qemu-kvm that is overly pragmatic and trie[sd] not to avoid modifying 
qemu as much as possible.  We have the qemu.git kvm implementation that 
tries a perfectionist approach that failed because most of the users 
need the featured and tested pragmatic approach.  The two mix like oil 
and water.
Anthony Liguori - Jan. 11, 2011, 2 p.m.
On 01/11/2011 03:01 AM, Avi Kivity wrote:
> On 01/10/2011 10:23 PM, Anthony Liguori wrote:
>>>> I don't see how ioapic, pit, or pic have a system scope.
>>> They are not bound to any CPU like the APIC which you may have in mind.
>>
>> And none of the above interact with KVM.
>
> They're implemented by kvm.  What deeper interaction do you have in mind?

The emulated ioapic/pit/pic do not interact with KVM at all.

The KVM versions should be completely separate devices.

>
>>
>> They may be replaced by KVM but if you look at the PIT, this is done 
>> by having two distinct devices.  The KVM specific device can (and 
>> should) be instantiated with kvm_state.
>>
>> The way the IOAPIC/APIC/PIC is handled in qemu-kvm is nasty.  The 
>> kernel devices are separate devices and that should be reflected in 
>> the device tree.
>
> I don't see why.  Those are just two different implementations for the 
> same guest visible device.

Right, they should appear the same to the guest but the fact that 
they're two different implementations should be reflected in the device 
tree.

>   It's like saying IDE should be seen differently if it's backed by 
> qcow2 or qed.

No, it's not at all.

Advantages of separating KVM devices:

1) it becomes very clear what functionality is handled in the kernel 
verses in userspace (you can actually look at the code and tell)

2) a user can explicitly create either the emulated version of the 
device or the in-kernel version of the device (no need for -no-kvm-irqchip)

3) a user can pass parameters directly to the in-kernel version of the 
device that are different from the userspace version (like selecting 
different interrupt catch-up methods)

Regards,

Anthony Liguori

> The device tree is about the guest view of devices.
>
Alexander Graf - Jan. 11, 2011, 2:06 p.m.
On 11.01.2011, at 15:00, Anthony Liguori wrote:

> On 01/11/2011 03:01 AM, Avi Kivity wrote:
>> On 01/10/2011 10:23 PM, Anthony Liguori wrote:
>>>>> I don't see how ioapic, pit, or pic have a system scope.
>>>> They are not bound to any CPU like the APIC which you may have in mind.
>>> 
>>> And none of the above interact with KVM.
>> 
>> They're implemented by kvm.  What deeper interaction do you have in mind?
> 
> The emulated ioapic/pit/pic do not interact with KVM at all.
> 
> The KVM versions should be completely separate devices.
> 
>> 
>>> 
>>> They may be replaced by KVM but if you look at the PIT, this is done by having two distinct devices.  The KVM specific device can (and should) be instantiated with kvm_state.
>>> 
>>> The way the IOAPIC/APIC/PIC is handled in qemu-kvm is nasty.  The kernel devices are separate devices and that should be reflected in the device tree.
>> 
>> I don't see why.  Those are just two different implementations for the same guest visible device.
> 
> Right, they should appear the same to the guest but the fact that they're two different implementations should be reflected in the device tree.
> 
>>  It's like saying IDE should be seen differently if it's backed by qcow2 or qed.
> 
> No, it's not at all.
> 
> Advantages of separating KVM devices:
> 
> 1) it becomes very clear what functionality is handled in the kernel verses in userspace (you can actually look at the code and tell)
> 
> 2) a user can explicitly create either the emulated version of the device or the in-kernel version of the device (no need for -no-kvm-irqchip)
> 
> 3) a user can pass parameters directly to the in-kernel version of the device that are different from the userspace version (like selecting different interrupt catch-up methods)

Disadvantages:

1) you lose migration / savevm between KVM and non-KVM VMs

I'm not saying this is unsolvable, but it's certainly something that bothers me :). Some sort of meta-device for KVM implemented devices and emulated devices would be nice. That device would then be the one state gets saved/restored from.


Alex
Anthony Liguori - Jan. 11, 2011, 2:09 p.m.
On 01/11/2011 08:06 AM, Alexander Graf wrote:
> On 11.01.2011, at 15:00, Anthony Liguori wrote:
>
>    
>> On 01/11/2011 03:01 AM, Avi Kivity wrote:
>>      
>>> On 01/10/2011 10:23 PM, Anthony Liguori wrote:
>>>        
>>>>>> I don't see how ioapic, pit, or pic have a system scope.
>>>>>>              
>>>>> They are not bound to any CPU like the APIC which you may have in mind.
>>>>>            
>>>> And none of the above interact with KVM.
>>>>          
>>> They're implemented by kvm.  What deeper interaction do you have in mind?
>>>        
>> The emulated ioapic/pit/pic do not interact with KVM at all.
>>
>> The KVM versions should be completely separate devices.
>>
>>      
>>>        
>>>> They may be replaced by KVM but if you look at the PIT, this is done by having two distinct devices.  The KVM specific device can (and should) be instantiated with kvm_state.
>>>>
>>>> The way the IOAPIC/APIC/PIC is handled in qemu-kvm is nasty.  The kernel devices are separate devices and that should be reflected in the device tree.
>>>>          
>>> I don't see why.  Those are just two different implementations for the same guest visible device.
>>>        
>> Right, they should appear the same to the guest but the fact that they're two different implementations should be reflected in the device tree.
>>
>>      
>>>   It's like saying IDE should be seen differently if it's backed by qcow2 or qed.
>>>        
>> No, it's not at all.
>>
>> Advantages of separating KVM devices:
>>
>> 1) it becomes very clear what functionality is handled in the kernel verses in userspace (you can actually look at the code and tell)
>>
>> 2) a user can explicitly create either the emulated version of the device or the in-kernel version of the device (no need for -no-kvm-irqchip)
>>
>> 3) a user can pass parameters directly to the in-kernel version of the device that are different from the userspace version (like selecting different interrupt catch-up methods)
>>      
> Disadvantages:
>
> 1) you lose migration / savevm between KVM and non-KVM VMs
>    

This doesn't work today and it's never worked.  KVM exposes things that 
TCG cannot emulate (like pvclock).

Even as two devices, nothing prevents it from working.  Both devices 
just have to support each other's savevm format.  If they use the same 
code, it makes it very easy.  Take a look at how the KVM PIT is 
implemented for an example of this.

Regards,

Anthony Liguori

> I'm not saying this is unsolvable, but it's certainly something that bothers me :). Some sort of meta-device for KVM implemented devices and emulated devices would be nice. That device would then be the one state gets saved/restored from.
>
>
> Alex
>
>
Avi Kivity - Jan. 11, 2011, 2:18 p.m.
On 01/11/2011 04:00 PM, Anthony Liguori wrote:
> On 01/11/2011 03:01 AM, Avi Kivity wrote:
>> On 01/10/2011 10:23 PM, Anthony Liguori wrote:
>>>>> I don't see how ioapic, pit, or pic have a system scope.
>>>> They are not bound to any CPU like the APIC which you may have in 
>>>> mind.
>>>
>>> And none of the above interact with KVM.
>>
>> They're implemented by kvm.  What deeper interaction do you have in 
>> mind?
>
> The emulated ioapic/pit/pic do not interact with KVM at all.

How can they "not interact" with kvm if they're implemented by kvm?

I really don't follow here.

>
> The KVM versions should be completely separate devices.
>

Why?

>> I don't see why.  Those are just two different implementations for 
>> the same guest visible device.
>
> Right, they should appear the same to the guest but the fact that 
> they're two different implementations should be reflected in the 
> device tree.

Why?

To move beyond single-word questions, what is the purpose of the device 
tree?  In my mind, it reflects the virtual hardware.  What's important 
is that we have a PIC, virtio network adapter, and IDE disk.  Not that 
they're backed by kvm, vhost-net, and qcow2.

>
>>   It's like saying IDE should be seen differently if it's backed by 
>> qcow2 or qed.
>
> No, it's not at all.
>
> Advantages of separating KVM devices:
>
> 1) it becomes very clear what functionality is handled in the kernel 
> verses in userspace (you can actually look at the code and tell)

How something is implemented is not important, certainly not important 
enough to expose to the user as an monitor or live migration ABI.

>
> 2) a user can explicitly create either the emulated version of the 
> device or the in-kernel version of the device (no need for 
> -no-kvm-irqchip)

-device ioapic,model=kernel vs. -device kvm-ioapic?

Is it really important to do that? 110% of the time we want the kernel 
irqchips.  The remaining -10% are only used for testing.

>
> 3) a user can pass parameters directly to the in-kernel version of the 
> device that are different from the userspace version (like selecting 
> different interrupt catch-up methods)

-device pit,model=qemu,catchup=slew

error: catchup=slew not supported in this model

I'm not overly concerned about the implementation part.  Though I think 
it's better to have a single implementation with kvm acting as an 
accelerator, having it the other way is no big deal.  What I am worried 
about is exposing it as a monitor and migration ABI.  IMO the only 
important thing is the spec that the device implements, not what piece 
of code implements it.
Avi Kivity - Jan. 11, 2011, 2:22 p.m.
On 01/11/2011 04:09 PM, Anthony Liguori wrote:
>> Disadvantages:
>>
>> 1) you lose migration / savevm between KVM and non-KVM VMs
>
> This doesn't work today and it's never worked.  KVM exposes things 
> that TCG cannot emulate (like pvclock).

If you run kvm without pvclock, or implement pvclock in qemu, it works 
fine.  It should work fine for the PIT, PIC, and IOAPIC (never tried it 
myself).

If we decide to have a kernel hpet implementation, for example, it would 
be good to be able to live migrate from a version without kernel hpet, 
to a version with kernel hpet, and have the kernel hpet enabled.

>
> Even as two devices, nothing prevents it from working.  Both devices 
> just have to support each other's savevm format.  If they use the same 
> code, it makes it very easy.  Take a look at how the KVM PIT is 
> implemented for an example of this.

They need to use the same device id then.  And if they share code, that 
indicates that they need to be the same device even more.
Alexander Graf - Jan. 11, 2011, 2:24 p.m.
On 11.01.2011, at 15:09, Anthony Liguori wrote:

> On 01/11/2011 08:06 AM, Alexander Graf wrote:
>> On 11.01.2011, at 15:00, Anthony Liguori wrote:
>> 
>>   
>>> On 01/11/2011 03:01 AM, Avi Kivity wrote:
>>>     
>>>> On 01/10/2011 10:23 PM, Anthony Liguori wrote:
>>>>       
>>>>>>> I don't see how ioapic, pit, or pic have a system scope.
>>>>>>>             
>>>>>> They are not bound to any CPU like the APIC which you may have in mind.
>>>>>>           
>>>>> And none of the above interact with KVM.
>>>>>         
>>>> They're implemented by kvm.  What deeper interaction do you have in mind?
>>>>       
>>> The emulated ioapic/pit/pic do not interact with KVM at all.
>>> 
>>> The KVM versions should be completely separate devices.
>>> 
>>>     
>>>>       
>>>>> They may be replaced by KVM but if you look at the PIT, this is done by having two distinct devices.  The KVM specific device can (and should) be instantiated with kvm_state.
>>>>> 
>>>>> The way the IOAPIC/APIC/PIC is handled in qemu-kvm is nasty.  The kernel devices are separate devices and that should be reflected in the device tree.
>>>>>         
>>>> I don't see why.  Those are just two different implementations for the same guest visible device.
>>>>       
>>> Right, they should appear the same to the guest but the fact that they're two different implementations should be reflected in the device tree.
>>> 
>>>     
>>>>  It's like saying IDE should be seen differently if it's backed by qcow2 or qed.
>>>>       
>>> No, it's not at all.
>>> 
>>> Advantages of separating KVM devices:
>>> 
>>> 1) it becomes very clear what functionality is handled in the kernel verses in userspace (you can actually look at the code and tell)
>>> 
>>> 2) a user can explicitly create either the emulated version of the device or the in-kernel version of the device (no need for -no-kvm-irqchip)
>>> 
>>> 3) a user can pass parameters directly to the in-kernel version of the device that are different from the userspace version (like selecting different interrupt catch-up methods)
>>>     
>> Disadvantages:
>> 
>> 1) you lose migration / savevm between KVM and non-KVM VMs
>>   
> 
> This doesn't work today and it's never worked.  KVM exposes things that TCG cannot emulate (like pvclock).

Those cases simply shouldn't exist and hurt us (or at least me). I had exactly the pvclock issue with xenner. Xenner can't do proper timekeeping in emulation mode. So implementing an emulated pvclock implementation is (pretty low) on my todo list.

> Even as two devices, nothing prevents it from working.  Both devices just have to support each other's savevm format.  If they use the same code, it makes it very easy.  Take a look at how the KVM PIT is implemented for an example of this.

If that's all it takes, fine. It makes it pretty hard to enforce, but I guess we can get away with that :).

Making devices separate basically hurts abstraction. I don't see any use case where we should have a KVM device without emulation equivalent. For the CPU we also think of KVM as an accelerator instead of a separate device, no? :)


Alex
Anthony Liguori - Jan. 11, 2011, 2:28 p.m.
On 01/11/2011 08:18 AM, Avi Kivity wrote:
> On 01/11/2011 04:00 PM, Anthony Liguori wrote:
>> On 01/11/2011 03:01 AM, Avi Kivity wrote:
>>> On 01/10/2011 10:23 PM, Anthony Liguori wrote:
>>>>>> I don't see how ioapic, pit, or pic have a system scope.
>>>>> They are not bound to any CPU like the APIC which you may have in 
>>>>> mind.
>>>>
>>>> And none of the above interact with KVM.
>>>
>>> They're implemented by kvm.  What deeper interaction do you have in 
>>> mind?
>>
>> The emulated ioapic/pit/pic do not interact with KVM at all.
>
> How can they "not interact" with kvm if they're implemented by kvm?
>
> I really don't follow here.

"emulated ioapic/pit/pic" == versions implemented in QEMU.  That's what 
I'm trying to say.  When not using the KVM versions of the devices, 
there are no interactions with KVM.

>>
>> The KVM versions should be completely separate devices.
>>
>
> Why?

Because the KVM versions are replacements.

>>> I don't see why.  Those are just two different implementations for 
>>> the same guest visible device.
>>
>> Right, they should appear the same to the guest but the fact that 
>> they're two different implementations should be reflected in the 
>> device tree.
>
> Why?
>
> To move beyond single-word questions, what is the purpose of the 
> device tree?  In my mind, it reflects the virtual hardware.  What's 
> important is that we have a PIC, virtio network adapter, and IDE 
> disk.  Not that they're backed by kvm, vhost-net, and qcow2.

Let me give a very concrete example to illustrate my point.

One thing I have on my TODO is to implement catch-up support for the 
emulated devices.  I want to implement three modes of catch-up support: 
drop, fast, and gradual.  Gradual is the best policy IMHO but fast is 
necessary on older kernels without highres timers.  Drop is necessary to 
maintain compatibility with what we have today.

The kernel PIT only implements one mode and even if the other two were 
added, even the newest version of QEMU needs to deal with the fact that 
there's old kernels out there with PIT's that only do fast.

So how does this get exposed to management tools?  Do you check for 
drift-mode=fast and transparently enable the KVM pit?  Do you fail if 
anything but drift-mode=fast is specified?

We need to have the following mechanisms:

1) the ability to select an in-kernel PIT vs. a userspace PIT

2) an independent mechanism to configure the userspace PIT

3) an independent mechanism to configure the in-kernel PIT.

The best way to do this is to make the in-kernel PIT a separate device.  
Then we get all of this for free.

>>
>> 2) a user can explicitly create either the emulated version of the 
>> device or the in-kernel version of the device (no need for 
>> -no-kvm-irqchip)
>
> -device ioapic,model=kernel vs. -device kvm-ioapic?
>
> Is it really important to do that? 110% of the time we want the kernel 
> irqchips.  The remaining -10% are only used for testing.

If model=kernel makes the support options different, then you end up 
introduce another layer of option validation.  By using the later form, 
you get to leverage the option validation of qdev plus it makes it much 
clearer to users what options are supported in what model because now 
the documentation is explicit about it.

>>
>> 3) a user can pass parameters directly to the in-kernel version of 
>> the device that are different from the userspace version (like 
>> selecting different interrupt catch-up methods)
>
> -device pit,model=qemu,catchup=slew
>
> error: catchup=slew not supported in this model
>
> I'm not overly concerned about the implementation part.  Though I 
> think it's better to have a single implementation with kvm acting as 
> an accelerator, having it the other way is no big deal.  What I am 
> worried about is exposing it as a monitor and migration ABI.  IMO the 
> only important thing is the spec that the device implements, not what 
> piece of code implements it.

Just as we do in the PIT, there's nothing wrong with making the device's 
migration compatible.  I'm not entirely sure what your concerns about 
the monitor are but there's simply no way to hide the fact that a device 
is implemented in KVM at the monitor level.  But really, is this 
something that management tools want?  I doubt it.  I think they want to 
have ultimate control over what gets created with us providing a 
recommended set of defaults.

Regards,

Anthony Liguori
Anthony Liguori - Jan. 11, 2011, 2:36 p.m.
On 01/11/2011 08:22 AM, Avi Kivity wrote:
> On 01/11/2011 04:09 PM, Anthony Liguori wrote:
>>> Disadvantages:
>>>
>>> 1) you lose migration / savevm between KVM and non-KVM VMs
>>
>> This doesn't work today and it's never worked.  KVM exposes things 
>> that TCG cannot emulate (like pvclock).
>
> If you run kvm without pvclock, or implement pvclock in qemu, it works 
> fine.  It should work fine for the PIT, PIC, and IOAPIC (never tried 
> it myself).
>
> If we decide to have a kernel hpet implementation, for example, it 
> would be good to be able to live migrate from a version without kernel 
> hpet, to a version with kernel hpet, and have the kernel hpet enabled.
>
>>
>> Even as two devices, nothing prevents it from working.  Both devices 
>> just have to support each other's savevm format.  If they use the 
>> same code, it makes it very easy.  Take a look at how the KVM PIT is 
>> implemented for an example of this.
>
> They need to use the same device id then.  And if they share code, 
> that indicates that they need to be the same device even more.

No, it really doesn't :-)  Cirrus VGA and std VGA share a lot of code.  
But that doesn't mean that we treat them as one device.

And BTW, there are guest visible differences between the KVM 
IOAPIC/PIC/PIT than the QEMU versions.  The only reason PIT live 
migration works today is because usually delivers all interrupts 
quickly.  But it actually does maintain state in the work queue that 
isn't saved.  If PIT tried to implement gradual catchup, there would be 
no way not to expose that state to userspace.



Regards,

Anthony Liguori
Avi Kivity - Jan. 11, 2011, 2:52 p.m.
On 01/11/2011 04:28 PM, Anthony Liguori wrote:
> On 01/11/2011 08:18 AM, Avi Kivity wrote:
>> On 01/11/2011 04:00 PM, Anthony Liguori wrote:
>>> On 01/11/2011 03:01 AM, Avi Kivity wrote:
>>>> On 01/10/2011 10:23 PM, Anthony Liguori wrote:
>>>>>>> I don't see how ioapic, pit, or pic have a system scope.
>>>>>> They are not bound to any CPU like the APIC which you may have in 
>>>>>> mind.
>>>>>
>>>>> And none of the above interact with KVM.
>>>>
>>>> They're implemented by kvm.  What deeper interaction do you have in 
>>>> mind?
>>>
>>> The emulated ioapic/pit/pic do not interact with KVM at all.
>>
>> How can they "not interact" with kvm if they're implemented by kvm?
>>
>> I really don't follow here.
>
> "emulated ioapic/pit/pic" == versions implemented in QEMU.  That's 
> what I'm trying to say.  When not using the KVM versions of the 
> devices, there are no interactions with KVM.

Okay.  Isn't that the same for the cpu?  Yet we use the same CPUState 
and are live-migration compatible (as long as cpuids match).

>
>>>
>>> The KVM versions should be completely separate devices.
>>>
>>
>> Why?
>
> Because the KVM versions are replacements.

Only the implementation.  The guest doesn't see the replacement.  They 
have exactly the same state.

>
>>>> I don't see why.  Those are just two different implementations for 
>>>> the same guest visible device.
>>>
>>> Right, they should appear the same to the guest but the fact that 
>>> they're two different implementations should be reflected in the 
>>> device tree.
>>
>> Why?
>>
>> To move beyond single-word questions, what is the purpose of the 
>> device tree?  In my mind, it reflects the virtual hardware.  What's 
>> important is that we have a PIC, virtio network adapter, and IDE 
>> disk.  Not that they're backed by kvm, vhost-net, and qcow2.
>
> Let me give a very concrete example to illustrate my point.
>
> One thing I have on my TODO is to implement catch-up support for the 
> emulated devices.  I want to implement three modes of catch-up 
> support: drop, fast, and gradual.  Gradual is the best policy IMHO but 
> fast is necessary on older kernels without highres timers.  Drop is 
> necessary to maintain compatibility with what we have today.
>
> The kernel PIT only implements one mode and even if the other two were 
> added, even the newest version of QEMU needs to deal with the fact 
> that there's old kernels out there with PIT's that only do fast.
>
> So how does this get exposed to management tools?  Do you check for 
> drift-mode=fast and transparently enable the KVM pit?  Do you fail if 
> anything but drift-mode=fast is specified?
>
> We need to have the following mechanisms:
>
> 1) the ability to select an in-kernel PIT vs. a userspace PIT
>
> 2) an independent mechanism to configure the userspace PIT
>
> 3) an independent mechanism to configure the in-kernel PIT.
>
> The best way to do this is to make the in-kernel PIT a separate 
> device.  Then we get all of this for free.

And it buys us live migration and ABI issues for the same price.

Really, can't we do

     class i8254 {
         ...
         virtual void set_catchup_policy(std::string policy) = 0;
         ...
     }

to deal with the differences?

>
>>>
>>> 2) a user can explicitly create either the emulated version of the 
>>> device or the in-kernel version of the device (no need for 
>>> -no-kvm-irqchip)
>>
>> -device ioapic,model=kernel vs. -device kvm-ioapic?
>>
>> Is it really important to do that? 110% of the time we want the 
>> kernel irqchips.  The remaining -10% are only used for testing.
>
> If model=kernel makes the support options different, then you end up 
> introduce another layer of option validation.  By using the later 
> form, you get to leverage the option validation of qdev plus it makes 
> it much clearer to users what options are supported in what model 
> because now the documentation is explicit about it.

Option validation = internals.  ABI = ABI.  We can deal with the former 
in any number of ways, but exposing it to the ABI is forever.

>
>>>
>>> 3) a user can pass parameters directly to the in-kernel version of 
>>> the device that are different from the userspace version (like 
>>> selecting different interrupt catch-up methods)
>>
>> -device pit,model=qemu,catchup=slew
>>
>> error: catchup=slew not supported in this model
>>
>> I'm not overly concerned about the implementation part.  Though I 
>> think it's better to have a single implementation with kvm acting as 
>> an accelerator, having it the other way is no big deal.  What I am 
>> worried about is exposing it as a monitor and migration ABI.  IMO the 
>> only important thing is the spec that the device implements, not what 
>> piece of code implements it.
>
> Just as we do in the PIT, there's nothing wrong with making the 
> device's migration compatible. 

Then the two devices have the same migration section id?  That's my 
biggest worry.  Not really worried about PIT and PIC (no one uses the 
user PIT now), but more about future devices moving into the kernel, if 
we have to do that.

> I'm not entirely sure what your concerns about the monitor are but 
> there's simply no way to hide the fact that a device is implemented in 
> KVM at the monitor level. 

Why is that?  a PIT is a PIT.  Why does the monitor care where the state 
is managed?

> But really, is this something that management tools want?  I doubt 
> it.  I think they want to have ultimate control over what gets created 
> with us providing a recommended set of defaults.

They also want a forward migration path.  Splitting into two separate 
devices (at the ABI level, ignoring the source level for now) denies 
them that.
Avi Kivity - Jan. 11, 2011, 2:56 p.m.
On 01/11/2011 04:36 PM, Anthony Liguori wrote:
>> They need to use the same device id then.  And if they share code, 
>> that indicates that they need to be the same device even more.
>
>
> No, it really doesn't :-)  Cirrus VGA and std VGA share a lot of 
> code.  But that doesn't mean that we treat them as one device.

Cirrus and VGA really are separate devices.  They share code because on 
evolved from the other, and is backwards compatible with the other.  
i8254 and i8254-kvm did not evolve from each other, both are 
implementations of the i8254 spec, and both are 100% compatible with 
each other (modulu bugs).

>
> And BTW, there are guest visible differences between the KVM 
> IOAPIC/PIC/PIT than the QEMU versions.  The only reason PIT live 
> migration works today is because usually delivers all interrupts 
> quickly.  But it actually does maintain state in the work queue that 
> isn't saved.  If PIT tried to implement gradual catchup, there would 
> be no way not to expose that state to userspace.

Why not?  Whatever state the kernel keeps, we expose to userspace and 
allow sending it over the wire.
Anthony Liguori - Jan. 11, 2011, 3:12 p.m.
On 01/11/2011 08:56 AM, Avi Kivity wrote:
> On 01/11/2011 04:36 PM, Anthony Liguori wrote:
>>> They need to use the same device id then.  And if they share code, 
>>> that indicates that they need to be the same device even more.
>>
>>
>> No, it really doesn't :-)  Cirrus VGA and std VGA share a lot of 
>> code.  But that doesn't mean that we treat them as one device.
>
> Cirrus and VGA really are separate devices.  They share code because 
> on evolved from the other, and is backwards compatible with the 
> other.  i8254 and i8254-kvm did not evolve from each other,

Actually, they did, but that's besides the point.

> both are implementations of the i8254 spec, and both are 100% 
> compatible with each other (modulu bugs).
>
>>
>> And BTW, there are guest visible differences between the KVM 
>> IOAPIC/PIC/PIT than the QEMU versions.  The only reason PIT live 
>> migration works today is because usually delivers all interrupts 
>> quickly.  But it actually does maintain state in the work queue that 
>> isn't saved.  If PIT tried to implement gradual catchup, there would 
>> be no way not to expose that state to userspace.
>
> Why not?  Whatever state the kernel keeps, we expose to userspace and 
> allow sending it over the wire.

What exactly is the scenario you're concerned about?

Migration between userspace HPET and in-kernel HPET?

One thing I've been considering is essentially migration filters.  It 
would be a set of rules that essentially were "hpet-kvm.* = hpet.*" 
which would allow migration from hpet to hpet-kvm given a translation of 
state.  I think this sort of higher level ruleset would make it easier 
to support migration between versions of the device model.

Of course, that only gives you a forward path.  It doesn't give you a 
backwards path.

Regards,

Anthony Liguori
Alexander Graf - Jan. 11, 2011, 3:17 p.m.
On 11.01.2011, at 16:12, Anthony Liguori wrote:

> On 01/11/2011 08:56 AM, Avi Kivity wrote:
>> On 01/11/2011 04:36 PM, Anthony Liguori wrote:
>>>> They need to use the same device id then.  And if they share code, that indicates that they need to be the same device even more.
>>> 
>>> 
>>> No, it really doesn't :-)  Cirrus VGA and std VGA share a lot of code.  But that doesn't mean that we treat them as one device.
>> 
>> Cirrus and VGA really are separate devices.  They share code because on evolved from the other, and is backwards compatible with the other.  i8254 and i8254-kvm did not evolve from each other,
> 
> Actually, they did, but that's besides the point.
> 
>> both are implementations of the i8254 spec, and both are 100% compatible with each other (modulu bugs).
>> 
>>> 
>>> And BTW, there are guest visible differences between the KVM IOAPIC/PIC/PIT than the QEMU versions.  The only reason PIT live migration works today is because usually delivers all interrupts quickly.  But it actually does maintain state in the work queue that isn't saved.  If PIT tried to implement gradual catchup, there would be no way not to expose that state to userspace.
>> 
>> Why not?  Whatever state the kernel keeps, we expose to userspace and allow sending it over the wire.
> 
> What exactly is the scenario you're concerned about?
> 
> Migration between userspace HPET and in-kernel HPET?
> 
> One thing I've been considering is essentially migration filters.  It would be a set of rules that essentially were "hpet-kvm.* = hpet.*" which would allow migration from hpet to hpet-kvm given a translation of state.  I think this sort of higher level ruleset would make it easier to support migration between versions of the device model.
> 
> Of course, that only gives you a forward path.  It doesn't give you a backwards path.

Why not? Just include the version in the rule set and define a backwards rule if it's easy to do. If not, migration isn't possible.


Alex
Avi Kivity - Jan. 11, 2011, 3:37 p.m.
On 01/11/2011 05:12 PM, Anthony Liguori wrote:
>>> No, it really doesn't :-)  Cirrus VGA and std VGA share a lot of 
>>> code.  But that doesn't mean that we treat them as one device.
>>
>> Cirrus and VGA really are separate devices.  They share code because 
>> on evolved from the other, and is backwards compatible with the 
>> other.  i8254 and i8254-kvm did not evolve from each other,
>
>
> Actually, they did, but that's besides the point.

The code did, the devices did not.

>> Why not?  Whatever state the kernel keeps, we expose to userspace and 
>> allow sending it over the wire.
>
> What exactly is the scenario you're concerned about?
>
> Migration between userspace HPET and in-kernel HPET?

Yes.  To a lesser extent, a client doing 'info hpet' or similar and 
failing for kernel hpet.

>
> One thing I've been considering is essentially migration filters.  It 
> would be a set of rules that essentially were "hpet-kvm.* = hpet.*" 
> which would allow migration from hpet to hpet-kvm given a translation 
> of state.  I think this sort of higher level ruleset would make it 
> easier to support migration between versions of the device model.
>
> Of course, that only gives you a forward path.  It doesn't give you a 
> backwards path.
>

It would be easier to have them use the same device id in the first place.

If it looks like an i8254, quacks like an i8254, and live migrates like 
an i8254, it's probably an i8254.
Anthony Liguori - Jan. 11, 2011, 3:55 p.m.
On 01/11/2011 09:37 AM, Avi Kivity wrote:
>>> Why not?  Whatever state the kernel keeps, we expose to userspace 
>>> and allow sending it over the wire.
>>
>> What exactly is the scenario you're concerned about?
>>
>> Migration between userspace HPET and in-kernel HPET?
>
> Yes.  To a lesser extent, a client doing 'info hpet' or similar and 
> failing for kernel hpet.

That's pretty easy to address.

>>
>> One thing I've been considering is essentially migration filters.  It 
>> would be a set of rules that essentially were "hpet-kvm.* = hpet.*" 
>> which would allow migration from hpet to hpet-kvm given a translation 
>> of state.  I think this sort of higher level ruleset would make it 
>> easier to support migration between versions of the device model.
>>
>> Of course, that only gives you a forward path.  It doesn't give you a 
>> backwards path.
>>
>
> It would be easier to have them use the same device id in the first 
> place.
>
> If it looks like an i8254, quacks like an i8254, and live migrates 
> like an i8254, it's probably an i8254.

And that's fine.  I'm not suggesting you call it i8253.  But it's two 
separate implementations.  We should make that visible, not try to hide 
it.  It's an important detail.

Imagine getting a sosreport that includes a dump of the device tree.  
You really want to see something in there that tells you it's an 
in-kernel PIT and not the userspace one.

Regards,

Anthony Liguori
Avi Kivity - Jan. 11, 2011, 4:03 p.m.
On 01/11/2011 05:55 PM, Anthony Liguori wrote:
>
>>>
>>> One thing I've been considering is essentially migration filters.  
>>> It would be a set of rules that essentially were "hpet-kvm.* = 
>>> hpet.*" which would allow migration from hpet to hpet-kvm given a 
>>> translation of state.  I think this sort of higher level ruleset 
>>> would make it easier to support migration between versions of the 
>>> device model.
>>>
>>> Of course, that only gives you a forward path.  It doesn't give you 
>>> a backwards path.
>>>
>>
>> It would be easier to have them use the same device id in the first 
>> place.
>>
>> If it looks like an i8254, quacks like an i8254, and live migrates 
>> like an i8254, it's probably an i8254.
>
> And that's fine.  I'm not suggesting you call it i8253.  But it's two 
> separate implementations.  We should make that visible, not try to 
> hide it.  It's an important detail.
>

Visible, yes, but not in live migration, or in 'info i8254', or 
similar.  We can live migrate between qcow2 and qed (using block 
migration), we should be able to do the same for the two i8254 
implementations.

I'm not happy about separate implementations, but that's a minor 
details.  We can change it 2n+1 times without anybody noticing.  Not so 
about ABI stuff.

> Imagine getting a sosreport that includes a dump of the device tree.  
> You really want to see something in there that tells you it's an 
> in-kernel PIT and not the userspace one.

Sure.  Not the device tree though.  The command line would give all the 
information?

Or 'info i8254' can say something about the implementation.  I don't 
want to have the user say 'info i8254-kvm'.
Anthony Liguori - Jan. 11, 2011, 4:26 p.m.
> Visible, yes, but not in live migration, or in 'info i8254', or 
> similar.  We can live migrate between qcow2 and qed (using block 
> migration), we should be able to do the same for the two i8254 
> implementations.
>
> I'm not happy about separate implementations, but that's a minor 
> details.  We can change it 2n+1 times without anybody noticing.  Not 
> so about ABI stuff.
>
>> Imagine getting a sosreport that includes a dump of the device tree.  
>> You really want to see something in there that tells you it's an 
>> in-kernel PIT and not the userspace one.
>
> Sure.  Not the device tree though.  The command line would give all 
> the information?

Then it's a one off option.  We really want as much info as possible 
stored in the device tree.

>
> Or 'info i8254' can say something about the implementation.  I don't 
> want to have the user say 'info i8254-kvm'.

info doesn't take a qdev device so yes, it can show whatever we want it 
to show.

Regards,

Anthony Liguori
Avi Kivity - Jan. 11, 2011, 5:05 p.m.
On 01/11/2011 06:26 PM, Anthony Liguori wrote:
>
>> Visible, yes, but not in live migration, or in 'info i8254', or 
>> similar.  We can live migrate between qcow2 and qed (using block 
>> migration), we should be able to do the same for the two i8254 
>> implementations.
>>
>> I'm not happy about separate implementations, but that's a minor 
>> details.  We can change it 2n+1 times without anybody noticing.  Not 
>> so about ABI stuff.
>>
>>> Imagine getting a sosreport that includes a dump of the device 
>>> tree.  You really want to see something in there that tells you it's 
>>> an in-kernel PIT and not the userspace one.
>>
>> Sure.  Not the device tree though.  The command line would give all 
>> the information?
>
> Then it's a one off option.  We really want as much info as possible 
> stored in the device tree.
>
>>
>> Or 'info i8254' can say something about the implementation.  I don't 
>> want to have the user say 'info i8254-kvm'.
>
> info doesn't take a qdev device so yes, it can show whatever we want 
> it to show.
>

It may be a qdev read-only attribute (and thus not migrated?) if we have 
to have it in qdev for some reason.

Patch

diff --git a/cpu-defs.h b/cpu-defs.h
index 8d4bf86..0e04239 100644
--- a/cpu-defs.h
+++ b/cpu-defs.h
@@ -131,7 +131,6 @@  typedef struct icount_decr_u16 {
 #endif
 
 struct kvm_run;
-struct KVMState;
 struct qemu_work_item;
 
 typedef struct CPUBreakpoint {
@@ -207,7 +206,6 @@  typedef struct CPUWatchpoint {
     struct QemuCond *halt_cond;                                         \
     struct qemu_work_item *queued_work_first, *queued_work_last;        \
     const char *cpu_model_str;                                          \
-    struct KVMState *kvm_state;                                         \
     struct kvm_run *kvm_run;                                            \
     int kvm_fd;                                                         \
     int kvm_vcpu_dirty;
diff --git a/kvm-all.c b/kvm-all.c
index ef2ca3b..d8820c7 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -52,8 +52,7 @@  typedef struct KVMSlot
 
 typedef struct kvm_dirty_log KVMDirtyLog;
 
-struct KVMState
-{
+static struct KVMState {
     KVMSlot slots[32];
     int fd;
     int vmfd;
@@ -72,21 +71,19 @@  struct KVMState
     int irqchip_in_kernel;
     int pit_in_kernel;
     int xsave, xcrs;
-};
-
-static KVMState *kvm_state;
+} kvm_state;
 
-static KVMSlot *kvm_alloc_slot(KVMState *s)
+static KVMSlot *kvm_alloc_slot(void)
 {
     int i;
 
-    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
+    for (i = 0; i < ARRAY_SIZE(kvm_state.slots); i++) {
         /* KVM private memory slots */
         if (i >= 8 && i < 12) {
             continue;
         }
-        if (s->slots[i].memory_size == 0) {
-            return &s->slots[i];
+        if (kvm_state.slots[i].memory_size == 0) {
+            return &kvm_state.slots[i];
         }
     }
 
@@ -94,14 +91,13 @@  static KVMSlot *kvm_alloc_slot(KVMState *s)
     abort();
 }
 
-static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
-                                         target_phys_addr_t start_addr,
+static KVMSlot *kvm_lookup_matching_slot(target_phys_addr_t start_addr,
                                          target_phys_addr_t end_addr)
 {
     int i;
 
-    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
-        KVMSlot *mem = &s->slots[i];
+    for (i = 0; i < ARRAY_SIZE(kvm_state.slots); i++) {
+        KVMSlot *mem = &kvm_state.slots[i];
 
         if (start_addr == mem->start_addr &&
             end_addr == mem->start_addr + mem->memory_size) {
@@ -115,15 +111,14 @@  static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
 /*
  * Find overlapping slot with lowest start address
  */
-static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
-                                            target_phys_addr_t start_addr,
+static KVMSlot *kvm_lookup_overlapping_slot(target_phys_addr_t start_addr,
                                             target_phys_addr_t end_addr)
 {
     KVMSlot *found = NULL;
     int i;
 
-    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
-        KVMSlot *mem = &s->slots[i];
+    for (i = 0; i < ARRAY_SIZE(kvm_state.slots); i++) {
+        KVMSlot *mem = &kvm_state.slots[i];
 
         if (mem->memory_size == 0 ||
             (found && found->start_addr < mem->start_addr)) {
@@ -139,13 +134,13 @@  static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
     return found;
 }
 
-int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+int kvm_physical_memory_addr_from_ram(ram_addr_t ram_addr,
                                       target_phys_addr_t *phys_addr)
 {
     int i;
 
-    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
-        KVMSlot *mem = &s->slots[i];
+    for (i = 0; i < ARRAY_SIZE(kvm_state.slots); i++) {
+        KVMSlot *mem = &kvm_state.slots[i];
 
         if (ram_addr >= mem->phys_offset &&
             ram_addr < mem->phys_offset + mem->memory_size) {
@@ -157,7 +152,7 @@  int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
     return 0;
 }
 
-static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
+static int kvm_set_user_memory_region(KVMSlot *slot)
 {
     struct kvm_userspace_memory_region mem;
 
@@ -166,10 +161,10 @@  static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
     mem.memory_size = slot->memory_size;
     mem.userspace_addr = (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);
     mem.flags = slot->flags;
-    if (s->migration_log) {
+    if (kvm_state.migration_log) {
         mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
     }
-    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
+    return kvm_vm_ioctl(KVM_SET_USER_MEMORY_REGION, &mem);
 }
 
 static void kvm_reset_vcpu(void *opaque)
@@ -181,33 +176,31 @@  static void kvm_reset_vcpu(void *opaque)
 
 int kvm_irqchip_in_kernel(void)
 {
-    return kvm_state->irqchip_in_kernel;
+    return kvm_state.irqchip_in_kernel;
 }
 
 int kvm_pit_in_kernel(void)
 {
-    return kvm_state->pit_in_kernel;
+    return kvm_state.pit_in_kernel;
 }
 
 
 int kvm_init_vcpu(CPUState *env)
 {
-    KVMState *s = kvm_state;
     long mmap_size;
     int ret;
 
     DPRINTF("kvm_init_vcpu\n");
 
-    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
+    ret = kvm_vm_ioctl(KVM_CREATE_VCPU, env->cpu_index);
     if (ret < 0) {
         DPRINTF("kvm_create_vcpu failed\n");
         goto err;
     }
 
     env->kvm_fd = ret;
-    env->kvm_state = s;
 
-    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
+    mmap_size = kvm_ioctl(KVM_GET_VCPU_MMAP_SIZE, 0);
     if (mmap_size < 0) {
         DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
         goto err;
@@ -222,9 +215,9 @@  int kvm_init_vcpu(CPUState *env)
     }
 
 #ifdef KVM_CAP_COALESCED_MMIO
-    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
-        s->coalesced_mmio_ring =
-            (void *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE;
+    if (kvm_state.coalesced_mmio && !kvm_state.coalesced_mmio_ring) {
+        kvm_state.coalesced_mmio_ring =
+            (void *)env->kvm_run + kvm_state.coalesced_mmio * PAGE_SIZE;
     }
 #endif
 
@@ -243,8 +236,7 @@  err:
 static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
                                       ram_addr_t size, int flags, int mask)
 {
-    KVMState *s = kvm_state;
-    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
+    KVMSlot *mem = kvm_lookup_matching_slot(phys_addr, phys_addr + size);
     int old_flags;
 
     if (mem == NULL)  {
@@ -260,14 +252,14 @@  static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
     mem->flags = flags;
 
     /* If nothing changed effectively, no need to issue ioctl */
-    if (s->migration_log) {
+    if (kvm_state.migration_log) {
         flags |= KVM_MEM_LOG_DIRTY_PAGES;
     }
     if (flags == old_flags) {
             return 0;
     }
 
-    return kvm_set_user_memory_region(s, mem);
+    return kvm_set_user_memory_region(mem);
 }
 
 int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
@@ -284,14 +276,13 @@  int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
 
 static int kvm_set_migration_log(int enable)
 {
-    KVMState *s = kvm_state;
     KVMSlot *mem;
     int i, err;
 
-    s->migration_log = enable;
+    kvm_state.migration_log = enable;
 
-    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
-        mem = &s->slots[i];
+    for (i = 0; i < ARRAY_SIZE(kvm_state.slots); i++) {
+        mem = &kvm_state.slots[i];
 
         if (!mem->memory_size) {
             continue;
@@ -299,7 +290,7 @@  static int kvm_set_migration_log(int enable)
         if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
             continue;
         }
-        err = kvm_set_user_memory_region(s, mem);
+        err = kvm_set_user_memory_region(mem);
         if (err) {
             return err;
         }
@@ -353,7 +344,6 @@  static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
 static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                           target_phys_addr_t end_addr)
 {
-    KVMState *s = kvm_state;
     unsigned long size, allocated_size = 0;
     KVMDirtyLog d;
     KVMSlot *mem;
@@ -361,7 +351,7 @@  static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
 
     d.dirty_bitmap = NULL;
     while (start_addr < end_addr) {
-        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
+        mem = kvm_lookup_overlapping_slot(start_addr, end_addr);
         if (mem == NULL) {
             break;
         }
@@ -377,7 +367,7 @@  static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
 
         d.slot = mem->slot;
 
-        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
+        if (kvm_vm_ioctl(KVM_GET_DIRTY_LOG, &d) == -1) {
             DPRINTF("ioctl failed %d\n", errno);
             ret = -1;
             break;
@@ -395,16 +385,15 @@  static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
 int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
 {
     int ret = -ENOSYS;
-#ifdef KVM_CAP_COALESCED_MMIO
-    KVMState *s = kvm_state;
 
-    if (s->coalesced_mmio) {
+#ifdef KVM_CAP_COALESCED_MMIO
+    if (kvm_state.coalesced_mmio) {
         struct kvm_coalesced_mmio_zone zone;
 
         zone.addr = start;
         zone.size = size;
 
-        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
+        ret = kvm_vm_ioctl(KVM_REGISTER_COALESCED_MMIO, &zone);
     }
 #endif
 
@@ -414,27 +403,26 @@  int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
 int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
 {
     int ret = -ENOSYS;
-#ifdef KVM_CAP_COALESCED_MMIO
-    KVMState *s = kvm_state;
 
-    if (s->coalesced_mmio) {
+#ifdef KVM_CAP_COALESCED_MMIO
+    if (kvm_state.coalesced_mmio) {
         struct kvm_coalesced_mmio_zone zone;
 
         zone.addr = start;
         zone.size = size;
 
-        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+        ret = kvm_vm_ioctl(KVM_UNREGISTER_COALESCED_MMIO, &zone);
     }
 #endif
 
     return ret;
 }
 
-int kvm_check_extension(KVMState *s, unsigned int extension)
+int kvm_check_extension(unsigned int extension)
 {
     int ret;
 
-    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
+    ret = kvm_ioctl(KVM_CHECK_EXTENSION, extension);
     if (ret < 0) {
         ret = 0;
     }
@@ -445,7 +433,6 @@  int kvm_check_extension(KVMState *s, unsigned int extension)
 static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                              ram_addr_t phys_offset)
 {
-    KVMState *s = kvm_state;
     ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
     KVMSlot *mem, old;
     int err;
@@ -459,7 +446,7 @@  static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
     phys_offset &= ~IO_MEM_ROM;
 
     while (1) {
-        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
+        mem = kvm_lookup_overlapping_slot(start_addr, start_addr + size);
         if (!mem) {
             break;
         }
@@ -476,7 +463,7 @@  static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
 
         /* unregister the overlapping slot */
         mem->memory_size = 0;
-        err = kvm_set_user_memory_region(s, mem);
+        err = kvm_set_user_memory_region(mem);
         if (err) {
             fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                     __func__, strerror(-err));
@@ -491,16 +478,16 @@  static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
          * address as the first existing one. If not or if some overlapping
          * slot comes around later, we will fail (not seen in practice so far)
          * - and actually require a recent KVM version. */
-        if (s->broken_set_mem_region &&
+        if (kvm_state.broken_set_mem_region &&
             old.start_addr == start_addr && old.memory_size < size &&
             flags < IO_MEM_UNASSIGNED) {
-            mem = kvm_alloc_slot(s);
+            mem = kvm_alloc_slot();
             mem->memory_size = old.memory_size;
             mem->start_addr = old.start_addr;
             mem->phys_offset = old.phys_offset;
             mem->flags = 0;
 
-            err = kvm_set_user_memory_region(s, mem);
+            err = kvm_set_user_memory_region(mem);
             if (err) {
                 fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                         strerror(-err));
@@ -515,13 +502,13 @@  static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
 
         /* register prefix slot */
         if (old.start_addr < start_addr) {
-            mem = kvm_alloc_slot(s);
+            mem = kvm_alloc_slot();
             mem->memory_size = start_addr - old.start_addr;
             mem->start_addr = old.start_addr;
             mem->phys_offset = old.phys_offset;
             mem->flags = 0;
 
-            err = kvm_set_user_memory_region(s, mem);
+            err = kvm_set_user_memory_region(mem);
             if (err) {
                 fprintf(stderr, "%s: error registering prefix slot: %s\n",
                         __func__, strerror(-err));
@@ -533,14 +520,14 @@  static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
         if (old.start_addr + old.memory_size > start_addr + size) {
             ram_addr_t size_delta;
 
-            mem = kvm_alloc_slot(s);
+            mem = kvm_alloc_slot();
             mem->start_addr = start_addr + size;
             size_delta = mem->start_addr - old.start_addr;
             mem->memory_size = old.memory_size - size_delta;
             mem->phys_offset = old.phys_offset + size_delta;
             mem->flags = 0;
 
-            err = kvm_set_user_memory_region(s, mem);
+            err = kvm_set_user_memory_region(mem);
             if (err) {
                 fprintf(stderr, "%s: error registering suffix slot: %s\n",
                         __func__, strerror(-err));
@@ -557,13 +544,13 @@  static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
     if (flags >= IO_MEM_UNASSIGNED) {
         return;
     }
-    mem = kvm_alloc_slot(s);
+    mem = kvm_alloc_slot();
     mem->memory_size = size;
     mem->start_addr = start_addr;
     mem->phys_offset = phys_offset;
     mem->flags = 0;
 
-    err = kvm_set_user_memory_region(s, mem);
+    err = kvm_set_user_memory_region(mem);
     if (err) {
         fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                 strerror(-err));
@@ -602,27 +589,24 @@  int kvm_init(int smp_cpus)
     static const char upgrade_note[] =
         "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
         "(see http://sourceforge.net/projects/kvm).\n";
-    KVMState *s;
     int ret;
     int i;
 
-    s = qemu_mallocz(sizeof(KVMState));
-
 #ifdef KVM_CAP_SET_GUEST_DEBUG
-    QTAILQ_INIT(&s->kvm_sw_breakpoints);
+    QTAILQ_INIT(&kvm_state.kvm_sw_breakpoints);
 #endif
-    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
-        s->slots[i].slot = i;
+    for (i = 0; i < ARRAY_SIZE(kvm_state.slots); i++) {
+        kvm_state.slots[i].slot = i;
     }
-    s->vmfd = -1;
-    s->fd = qemu_open("/dev/kvm", O_RDWR);
-    if (s->fd == -1) {
+    kvm_state.vmfd = -1;
+    kvm_state.fd = qemu_open("/dev/kvm", O_RDWR);
+    if (kvm_state.fd == -1) {
         fprintf(stderr, "Could not access KVM kernel module: %m\n");
         ret = -errno;
         goto err;
     }
 
-    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
+    ret = kvm_ioctl(KVM_GET_API_VERSION, 0);
     if (ret < KVM_API_VERSION) {
         if (ret > 0) {
             ret = -EINVAL;
@@ -637,8 +621,8 @@  int kvm_init(int smp_cpus)
         goto err;
     }
 
-    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
-    if (s->vmfd < 0) {
+    kvm_state.vmfd = kvm_ioctl(KVM_CREATE_VM, 0);
+    if (kvm_state.vmfd < 0) {
 #ifdef TARGET_S390X
         fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                         "your host kernel command line\n");
@@ -651,7 +635,7 @@  int kvm_init(int smp_cpus)
      * just use a user allocated buffer so we can use regular pages
      * unmodified.  Make sure we have a sufficiently modern version of KVM.
      */
-    if (!kvm_check_extension(s, KVM_CAP_USER_MEMORY)) {
+    if (!kvm_check_extension(KVM_CAP_USER_MEMORY)) {
         ret = -EINVAL;
         fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n%s",
                 upgrade_note);
@@ -661,7 +645,7 @@  int kvm_init(int smp_cpus)
     /* There was a nasty bug in < kvm-80 that prevents memory slots from being
      * destroyed properly.  Since we rely on this capability, refuse to work
      * with any kernel without this capability. */
-    if (!kvm_check_extension(s, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
+    if (!kvm_check_extension(KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
         ret = -EINVAL;
 
         fprintf(stderr,
@@ -670,66 +654,55 @@  int kvm_init(int smp_cpus)
         goto err;
     }
 
-    s->coalesced_mmio = 0;
 #ifdef KVM_CAP_COALESCED_MMIO
-    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
-    s->coalesced_mmio_ring = NULL;
+    kvm_state.coalesced_mmio = kvm_check_extension(KVM_CAP_COALESCED_MMIO);
 #endif
 
-    s->broken_set_mem_region = 1;
+    kvm_state.broken_set_mem_region = 1;
 #ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
-    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
+    ret = kvm_check_extension(KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
     if (ret > 0) {
-        s->broken_set_mem_region = 0;
+        kvm_state.broken_set_mem_region = 0;
     }
 #endif
 
-    s->vcpu_events = 0;
 #ifdef KVM_CAP_VCPU_EVENTS
-    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
+    kvm_state.vcpu_events = kvm_check_extension(KVM_CAP_VCPU_EVENTS);
 #endif
 
-    s->robust_singlestep = 0;
 #ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
-    s->robust_singlestep =
-        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
+    kvm_state.robust_singlestep =
+        kvm_check_extension(KVM_CAP_X86_ROBUST_SINGLESTEP);
 #endif
 
-    s->debugregs = 0;
 #ifdef KVM_CAP_DEBUGREGS
-    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
+    kvm_state.debugregs = kvm_check_extension(KVM_CAP_DEBUGREGS);
 #endif
 
-    s->xsave = 0;
 #ifdef KVM_CAP_XSAVE
-    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
+    kvm_state.xsave = kvm_check_extension(KVM_CAP_XSAVE);
 #endif
 
-    s->xcrs = 0;
 #ifdef KVM_CAP_XCRS
-    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
+    kvm_state.xcrs = kvm_check_extension(KVM_CAP_XCRS);
 #endif
 
-    ret = kvm_arch_init(s, smp_cpus);
+    ret = kvm_arch_init(smp_cpus);
     if (ret < 0) {
         goto err;
     }
 
-    kvm_state = s;
     cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
 
     return 0;
 
 err:
-    if (s) {
-        if (s->vmfd != -1) {
-            close(s->vmfd);
-        }
-        if (s->fd != -1) {
-            close(s->fd);
-        }
+    if (kvm_state.vmfd != -1) {
+        close(kvm_state.vmfd);
+    }
+    if (kvm_state.fd != -1) {
+        close(kvm_state.fd);
     }
-    qemu_free(s);
 
     return ret;
 }
@@ -777,7 +750,7 @@  static int kvm_handle_io(uint16_t port, void *data, int direction, int size,
 static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
 {
     fprintf(stderr, "KVM internal error.");
-    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
+    if (kvm_check_extension(KVM_CAP_INTERNAL_ERROR_DATA)) {
         int i;
 
         fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
@@ -805,9 +778,8 @@  static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
 void kvm_flush_coalesced_mmio_buffer(void)
 {
 #ifdef KVM_CAP_COALESCED_MMIO
-    KVMState *s = kvm_state;
-    if (s->coalesced_mmio_ring) {
-        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
+    if (kvm_state.coalesced_mmio_ring) {
+        struct kvm_coalesced_mmio_ring *ring = kvm_state.coalesced_mmio_ring;
         while (ring->first != ring->last) {
             struct kvm_coalesced_mmio *ent;
 
@@ -963,7 +935,7 @@  void kvm_cpu_exec(CPUState *env)
     }
 }
 
-int kvm_ioctl(KVMState *s, int type, ...)
+int kvm_ioctl(int type, ...)
 {
     int ret;
     void *arg;
@@ -973,14 +945,14 @@  int kvm_ioctl(KVMState *s, int type, ...)
     arg = va_arg(ap, void *);
     va_end(ap);
 
-    ret = ioctl(s->fd, type, arg);
+    ret = ioctl(kvm_state.fd, type, arg);
     if (ret == -1) {
         ret = -errno;
     }
     return ret;
 }
 
-int kvm_vm_ioctl(KVMState *s, int type, ...)
+int kvm_vm_ioctl(int type, ...)
 {
     int ret;
     void *arg;
@@ -990,7 +962,7 @@  int kvm_vm_ioctl(KVMState *s, int type, ...)
     arg = va_arg(ap, void *);
     va_end(ap);
 
-    ret = ioctl(s->vmfd, type, arg);
+    ret = ioctl(kvm_state.vmfd, type, arg);
     if (ret == -1) {
         ret = -errno;
     }
@@ -1017,9 +989,7 @@  int kvm_vcpu_ioctl(CPUState *env, int type, ...)
 int kvm_has_sync_mmu(void)
 {
 #ifdef KVM_CAP_SYNC_MMU
-    KVMState *s = kvm_state;
-
-    return kvm_check_extension(s, KVM_CAP_SYNC_MMU);
+    return kvm_check_extension(KVM_CAP_SYNC_MMU);
 #else
     return 0;
 #endif
@@ -1027,27 +997,27 @@  int kvm_has_sync_mmu(void)
 
 int kvm_has_vcpu_events(void)
 {
-    return kvm_state->vcpu_events;
+    return kvm_state.vcpu_events;
 }
 
 int kvm_has_robust_singlestep(void)
 {
-    return kvm_state->robust_singlestep;
+    return kvm_state.robust_singlestep;
 }
 
 int kvm_has_debugregs(void)
 {
-    return kvm_state->debugregs;
+    return kvm_state.debugregs;
 }
 
 int kvm_has_xsave(void)
 {
-    return kvm_state->xsave;
+    return kvm_state.xsave;
 }
 
 int kvm_has_xcrs(void)
 {
-    return kvm_state->xcrs;
+    return kvm_state.xcrs;
 }
 
 void kvm_setup_guest_memory(void *start, size_t size)
@@ -1070,7 +1040,7 @@  struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
 {
     struct kvm_sw_breakpoint *bp;
 
-    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
+    QTAILQ_FOREACH(bp, &kvm_state.kvm_sw_breakpoints, entry) {
         if (bp->pc == pc) {
             return bp;
         }
@@ -1080,7 +1050,7 @@  struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
 
 int kvm_sw_breakpoints_active(CPUState *env)
 {
-    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
+    return !QTAILQ_EMPTY(&kvm_state.kvm_sw_breakpoints);
 }
 
 struct kvm_set_guest_debug_data {
@@ -1140,8 +1110,7 @@  int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
             return err;
         }
 
-        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
-                          bp, entry);
+        QTAILQ_INSERT_HEAD(&kvm_state.kvm_sw_breakpoints, bp, entry);
     } else {
         err = kvm_arch_insert_hw_breakpoint(addr, len, type);
         if (err) {
@@ -1181,7 +1150,7 @@  int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
             return err;
         }
 
-        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
+        QTAILQ_REMOVE(&kvm_state.kvm_sw_breakpoints, bp, entry);
         qemu_free(bp);
     } else {
         err = kvm_arch_remove_hw_breakpoint(addr, len, type);
@@ -1202,10 +1171,9 @@  int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
 void kvm_remove_all_breakpoints(CPUState *current_env)
 {
     struct kvm_sw_breakpoint *bp, *next;
-    KVMState *s = current_env->kvm_state;
     CPUState *env;
 
-    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
+    QTAILQ_FOREACH_SAFE(bp, &kvm_state.kvm_sw_breakpoints, entry, next) {
         if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
             /* Try harder to find a CPU that currently sees the breakpoint. */
             for (env = first_cpu; env != NULL; env = env->next_cpu) {
@@ -1285,7 +1253,7 @@  int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign
         iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
     }
 
-    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
+    ret = kvm_vm_ioctl(KVM_IOEVENTFD, &iofd);
 
     if (ret < 0) {
         return -errno;
@@ -1314,7 +1282,7 @@  int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
     if (!assign) {
         kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
     }
-    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
+    r = kvm_vm_ioctl(KVM_IOEVENTFD, &kick);
     if (r < 0) {
         return r;
     }
diff --git a/kvm-stub.c b/kvm-stub.c
index 352c6a6..3a058ad 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -53,7 +53,7 @@  int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
     return -ENOSYS;
 }
 
-int kvm_check_extension(KVMState *s, unsigned int extension)
+int kvm_check_extension(unsigned int extension)
 {
     return 0;
 }
diff --git a/kvm.h b/kvm.h
index 51ad56f..26ca8c1 100644
--- a/kvm.h
+++ b/kvm.h
@@ -74,12 +74,9 @@  int kvm_irqchip_in_kernel(void);
 
 /* internal API */
 
-struct KVMState;
-typedef struct KVMState KVMState;
+int kvm_ioctl(int type, ...);
 
-int kvm_ioctl(KVMState *s, int type, ...);
-
-int kvm_vm_ioctl(KVMState *s, int type, ...);
+int kvm_vm_ioctl(int type, ...);
 
 int kvm_vcpu_ioctl(CPUState *env, int type, ...);
 
@@ -104,7 +101,7 @@  int kvm_arch_get_registers(CPUState *env);
 
 int kvm_arch_put_registers(CPUState *env, int level);
 
-int kvm_arch_init(KVMState *s, int smp_cpus);
+int kvm_arch_init(int smp_cpus);
 
 int kvm_arch_init_vcpu(CPUState *env);
 
@@ -146,10 +143,8 @@  void kvm_arch_update_guest_debug(CPUState *env, struct kvm_guest_debug *dbg);
 
 bool kvm_arch_stop_on_emulation_error(CPUState *env);
 
-int kvm_check_extension(KVMState *s, unsigned int extension);
+int kvm_check_extension(unsigned int extension);
 
-uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
-                                      uint32_t index, int reg);
 void kvm_cpu_synchronize_state(CPUState *env);
 void kvm_cpu_synchronize_post_reset(CPUState *env);
 void kvm_cpu_synchronize_post_init(CPUState *env);
@@ -179,7 +174,7 @@  static inline void cpu_synchronize_post_init(CPUState *env)
 
 
 #if !defined(CONFIG_USER_ONLY)
-int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+int kvm_physical_memory_addr_from_ram(ram_addr_t ram_addr,
                                       target_phys_addr_t *phys_addr);
 #endif
 
diff --git a/target-i386/cpuid.c b/target-i386/cpuid.c
index 5382a28..17ab619 100644
--- a/target-i386/cpuid.c
+++ b/target-i386/cpuid.c
@@ -23,6 +23,7 @@ 
 
 #include "cpu.h"
 #include "kvm.h"
+#include "kvm_x86.h"
 
 #include "qemu-option.h"
 #include "qemu-config.h"
@@ -1138,10 +1139,10 @@  void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
             break;
         }
         if (kvm_enabled()) {
-            *eax = kvm_arch_get_supported_cpuid(env, 0xd, count, R_EAX);
-            *ebx = kvm_arch_get_supported_cpuid(env, 0xd, count, R_EBX);
-            *ecx = kvm_arch_get_supported_cpuid(env, 0xd, count, R_ECX);
-            *edx = kvm_arch_get_supported_cpuid(env, 0xd, count, R_EDX);
+            *eax = kvm_x86_get_supported_cpuid(0xd, count, R_EAX);
+            *ebx = kvm_x86_get_supported_cpuid(0xd, count, R_EBX);
+            *ecx = kvm_x86_get_supported_cpuid(0xd, count, R_ECX);
+            *edx = kvm_x86_get_supported_cpuid(0xd, count, R_EDX);
         } else {
             *eax = 0;
             *ebx = 0;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 1789bff..cb6883f 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -60,7 +60,7 @@  static int lm_capable_kernel;
 
 #ifdef KVM_CAP_EXT_CPUID
 
-static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
+static struct kvm_cpuid2 *try_get_cpuid(int max)
 {
     struct kvm_cpuid2 *cpuid;
     int r, size;
@@ -68,7 +68,7 @@  static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
     cpuid = (struct kvm_cpuid2 *)qemu_mallocz(size);
     cpuid->nent = max;
-    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
+    r = kvm_ioctl(KVM_GET_SUPPORTED_CPUID, cpuid);
     if (r == 0 && cpuid->nent >= max) {
         r = -E2BIG;
     }
@@ -85,20 +85,20 @@  static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
     return cpuid;
 }
 
-uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
-                                      uint32_t index, int reg)
+uint32_t kvm_x86_get_supported_cpuid(uint32_t function, uint32_t index,
+                                     int reg)
 {
     struct kvm_cpuid2 *cpuid;
     int i, max;
     uint32_t ret = 0;
     uint32_t cpuid_1_edx;
 
-    if (!kvm_check_extension(env->kvm_state, KVM_CAP_EXT_CPUID)) {
+    if (!kvm_check_extension(KVM_CAP_EXT_CPUID)) {
         return -1U;
     }
 
     max = 1;
-    while ((cpuid = try_get_cpuid(env->kvm_state, max)) == NULL) {
+    while ((cpuid = try_get_cpuid(max)) == NULL) {
         max *= 2;
     }
 
@@ -126,7 +126,7 @@  uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
                     /* On Intel, kvm returns cpuid according to the Intel spec,
                      * so add missing bits according to the AMD spec:
                      */
-                    cpuid_1_edx = kvm_arch_get_supported_cpuid(env, 1, 0, R_EDX);
+                    cpuid_1_edx = kvm_x86_get_supported_cpuid(1, 0, R_EDX);
                     ret |= cpuid_1_edx & 0x183f7ff;
                     break;
                 }
@@ -142,8 +142,8 @@  uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
 
 #else
 
-uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
-                                      uint32_t index, int reg)
+uint32_t kvm_x86_get_supported_cpuid(uint32_t function, uint32_t index,
+                                     int reg)
 {
     return -1U;
 }
@@ -170,12 +170,12 @@  struct kvm_para_features {
     { -1, -1 }
 };
 
-static int get_para_features(CPUState *env)
+static int get_para_features(void)
 {
     int i, features = 0;
 
     for (i = 0; i < ARRAY_SIZE(para_features) - 1; i++) {
-        if (kvm_check_extension(env->kvm_state, para_features[i].cap)) {
+        if (kvm_check_extension(para_features[i].cap)) {
             features |= (1 << para_features[i].feature);
         }
     }
@@ -184,15 +184,14 @@  static int get_para_features(CPUState *env)
 #endif
 
 #ifdef KVM_CAP_MCE
-static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
-                                     int *max_banks)
+static int kvm_get_mce_cap_supported(uint64_t *mce_cap, int *max_banks)
 {
     int r;
 
-    r = kvm_check_extension(s, KVM_CAP_MCE);
+    r = kvm_check_extension(KVM_CAP_MCE);
     if (r > 0) {
         *max_banks = r;
-        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
+        return kvm_ioctl(KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
     }
     return -ENOSYS;
 }
@@ -323,18 +322,18 @@  int kvm_arch_init_vcpu(CPUState *env)
     uint32_t signature[3];
 #endif
 
-    env->cpuid_features &= kvm_arch_get_supported_cpuid(env, 1, 0, R_EDX);
+    env->cpuid_features &= kvm_x86_get_supported_cpuid(1, 0, R_EDX);
 
     i = env->cpuid_ext_features & CPUID_EXT_HYPERVISOR;
-    env->cpuid_ext_features &= kvm_arch_get_supported_cpuid(env, 1, 0, R_ECX);
+    env->cpuid_ext_features &= kvm_x86_get_supported_cpuid(1, 0, R_ECX);
     env->cpuid_ext_features |= i;
 
-    env->cpuid_ext2_features &= kvm_arch_get_supported_cpuid(env, 0x80000001,
-                                                             0, R_EDX);
-    env->cpuid_ext3_features &= kvm_arch_get_supported_cpuid(env, 0x80000001,
-                                                             0, R_ECX);
-    env->cpuid_svm_features  &= kvm_arch_get_supported_cpuid(env, 0x8000000A,
-                                                             0, R_EDX);
+    env->cpuid_ext2_features &= kvm_x86_get_supported_cpuid(0x80000001,
+                                                            0, R_EDX);
+    env->cpuid_ext3_features &= kvm_x86_get_supported_cpuid(0x80000001,
+                                                            0, R_ECX);
+    env->cpuid_svm_features  &= kvm_x86_get_supported_cpuid(0x8000000A,
+                                                            0, R_EDX);
 
 
     cpuid_i = 0;
@@ -353,7 +352,7 @@  int kvm_arch_init_vcpu(CPUState *env)
     c = &cpuid_data.entries[cpuid_i++];
     memset(c, 0, sizeof(*c));
     c->function = KVM_CPUID_FEATURES;
-    c->eax = env->cpuid_kvm_features & get_para_features(env);
+    c->eax = env->cpuid_kvm_features & get_para_features();
 #endif
 
     cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
@@ -423,11 +422,11 @@  int kvm_arch_init_vcpu(CPUState *env)
 #ifdef KVM_CAP_MCE
     if (((env->cpuid_version >> 8)&0xF) >= 6
         && (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
-        && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) {
+        && kvm_check_extension(KVM_CAP_MCE) > 0) {
         uint64_t mcg_cap;
         int banks;
 
-        if (kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks)) {
+        if (kvm_get_mce_cap_supported(&mcg_cap, &banks)) {
             perror("kvm_get_mce_cap_supported FAILED");
         } else {
             if (banks > MCE_BANKS_DEF)
@@ -461,7 +460,7 @@  void kvm_arch_reset_vcpu(CPUState *env)
     }
 }
 
-static int kvm_get_supported_msrs(KVMState *s)
+static int kvm_get_supported_msrs(void)
 {
     static int kvm_supported_msrs;
     int ret = 0;
@@ -475,7 +474,7 @@  static int kvm_get_supported_msrs(KVMState *s)
         /* Obtain MSR list from KVM.  These are the MSRs that we must
          * save/restore */
         msr_list.nmsrs = 0;
-        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
+        ret = kvm_ioctl(KVM_GET_MSR_INDEX_LIST, &msr_list);
         if (ret < 0 && ret != -E2BIG) {
             return ret;
         }
@@ -486,7 +485,7 @@  static int kvm_get_supported_msrs(KVMState *s)
                                               sizeof(msr_list.indices[0])));
 
         kvm_msr_list->nmsrs = msr_list.nmsrs;
-        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
+        ret = kvm_ioctl(KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
         if (ret >= 0) {
             int i;
 
@@ -508,17 +507,17 @@  static int kvm_get_supported_msrs(KVMState *s)
     return ret;
 }
 
-static int kvm_init_identity_map_page(KVMState *s)
+static int kvm_init_identity_map_page(void)
 {
 #ifdef KVM_CAP_SET_IDENTITY_MAP_ADDR
     int ret;
     uint64_t addr = 0xfffbc000;
 
-    if (!kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
+    if (!kvm_check_extension(KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
         return 0;
     }
 
-    ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &addr);
+    ret = kvm_vm_ioctl(KVM_SET_IDENTITY_MAP_ADDR, &addr);
     if (ret < 0) {
         fprintf(stderr, "kvm_set_identity_map_addr: %s\n", strerror(ret));
         return ret;
@@ -527,12 +526,12 @@  static int kvm_init_identity_map_page(KVMState *s)
     return 0;
 }
 
-int kvm_arch_init(KVMState *s, int smp_cpus)
+int kvm_arch_init(int smp_cpus)
 {
     int ret;
     struct utsname utsname;
 
-    ret = kvm_get_supported_msrs(s);
+    ret = kvm_get_supported_msrs();
     if (ret < 0) {
         return ret;
     }
@@ -546,7 +545,7 @@  int kvm_arch_init(KVMState *s, int smp_cpus)
      * versions of KVM just assumed that it would be at the end of physical
      * memory but that doesn't work with more than 4GB of memory.  We simply
      * refuse to work with those older versions of KVM. */
-    ret = kvm_check_extension(s, KVM_CAP_SET_TSS_ADDR);
+    ret = kvm_check_extension(KVM_CAP_SET_TSS_ADDR);
     if (ret <= 0) {
         fprintf(stderr, "kvm does not support KVM_CAP_SET_TSS_ADDR\n");
         return ret;
@@ -563,12 +562,12 @@  int kvm_arch_init(KVMState *s, int smp_cpus)
         perror("e820_add_entry() table is full");
         exit(1);
     }
-    ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, 0xfffbd000);
+    ret = kvm_vm_ioctl(KVM_SET_TSS_ADDR, 0xfffbd000);
     if (ret < 0) {
         return ret;
     }
 
-    return kvm_init_identity_map_page(s);
+    return kvm_init_identity_map_page();
 }
 
 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
@@ -1861,7 +1860,7 @@  int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
             || code == BUS_MCEERR_AO)) {
         vaddr = (void *)addr;
         if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
-            !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) {
+            !kvm_physical_memory_addr_from_ram(ram_addr, &paddr)) {
             fprintf(stderr, "Hardware memory error for memory used by "
                     "QEMU itself instead of guest system!\n");
             /* Hope we are lucky for AO MCE */
@@ -1910,7 +1909,7 @@  int kvm_on_sigbus(int code, void *addr)
         /* Hope we are lucky for AO MCE */
         vaddr = addr;
         if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
-            !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) {
+            !kvm_physical_memory_addr_from_ram(ram_addr, &paddr)) {
             fprintf(stderr, "Hardware memory error for memory used by "
                     "QEMU itself instead of guest system!: %p\n", addr);
             return 0;
diff --git a/target-i386/kvm_x86.h b/target-i386/kvm_x86.h
index 9d7b584..304d0cb 100644
--- a/target-i386/kvm_x86.h
+++ b/target-i386/kvm_x86.h
@@ -22,4 +22,7 @@  void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
                         uint64_t mcg_status, uint64_t addr, uint64_t misc,
                         int flag);
 
+uint32_t kvm_x86_get_supported_cpuid(uint32_t function, uint32_t index,
+                                     int reg);
+
 #endif
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 849b404..56d30cc 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -56,13 +56,13 @@  static void kvm_kick_env(void *env)
     qemu_cpu_kick(env);
 }
 
-int kvm_arch_init(KVMState *s, int smp_cpus)
+int kvm_arch_init(int smp_cpus)
 {
 #ifdef KVM_CAP_PPC_UNSET_IRQ
-    cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
+    cap_interrupt_unset = kvm_check_extension(KVM_CAP_PPC_UNSET_IRQ);
 #endif
 #ifdef KVM_CAP_PPC_IRQ_LEVEL
-    cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL);
+    cap_interrupt_level = kvm_check_extension(KVM_CAP_PPC_IRQ_LEVEL);
 #endif
 
     if (!cap_interrupt_level) {
@@ -164,7 +164,7 @@  int kvm_arch_get_registers(CPUState *env)
         env->gpr[i] = regs.gpr[i];
 
 #ifdef KVM_CAP_PPC_SEGSTATE
-    if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_SEGSTATE)) {
+    if (kvm_check_extension(KVM_CAP_PPC_SEGSTATE)) {
         env->sdr1 = sregs.u.s.sdr1;
 
         /* Sync SLB */
@@ -371,8 +371,8 @@  int kvmppc_get_hypercall(CPUState *env, uint8_t *buf, int buf_len)
 #ifdef KVM_CAP_PPC_GET_PVINFO
     struct kvm_ppc_pvinfo pvinfo;
 
-    if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_PVINFO) &&
-        !kvm_vm_ioctl(env->kvm_state, KVM_PPC_GET_PVINFO, &pvinfo)) {
+    if (kvm_check_extension(KVM_CAP_PPC_GET_PVINFO) &&
+        !kvm_vm_ioctl(KVM_PPC_GET_PVINFO, &pvinfo)) {
         memcpy(buf, pvinfo.hcall, buf_len);
 
         return 0;
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index adf4a9e..927a37e 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -70,7 +70,7 @@ 
 #define SCLP_CMDW_READ_SCP_INFO         0x00020001
 #define SCLP_CMDW_READ_SCP_INFO_FORCED  0x00120001
 
-int kvm_arch_init(KVMState *s, int smp_cpus)
+int kvm_arch_init(int smp_cpus)
 {
     return 0;
 }
@@ -186,10 +186,6 @@  static void kvm_s390_interrupt_internal(CPUState *env, int type, uint32_t parm,
     struct kvm_s390_interrupt kvmint;
     int r;
 
-    if (!env->kvm_state) {
-        return;
-    }
-
     env->halted = 0;
     env->exception_index = -1;
 
@@ -198,7 +194,7 @@  static void kvm_s390_interrupt_internal(CPUState *env, int type, uint32_t parm,
     kvmint.parm64 = parm64;
 
     if (vm) {
-        r = kvm_vm_ioctl(env->kvm_state, KVM_S390_INTERRUPT, &kvmint);
+        r = kvm_vm_ioctl(KVM_S390_INTERRUPT, &kvmint);
     } else {
         r = kvm_vcpu_ioctl(env, KVM_S390_INTERRUPT, &kvmint);
     }