diff mbox

[28/35] kvm: x86: Introduce kvmclock device to save/restore its state

Message ID db8685e3925e7a5593e3f96ae703e1d19ef2fb0b.1294336601.git.mtosatti@redhat.com
State New
Headers show

Commit Message

Marcelo Tosatti Jan. 6, 2011, 5:56 p.m. UTC
From: Jan Kiszka <jan.kiszka@siemens.com>

If kvmclock is used, which implies the kernel supports it, register a
kvmclock device with the sysbus. Its main purpose is to save and restore
the kernel state on migration, but this will also allow to visualize it
one day.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
CC: Glauber Costa <glommer@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 target-i386/kvm.c |   92 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 91 insertions(+), 1 deletions(-)

Comments

Anthony Liguori Jan. 10, 2011, 8:31 p.m. UTC | #1
On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
> From: Jan Kiszka<jan.kiszka@siemens.com>
>
> If kvmclock is used, which implies the kernel supports it, register a
> kvmclock device with the sysbus. Its main purpose is to save and restore
> the kernel state on migration, but this will also allow to visualize it
> one day.
>
> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
> CC: Glauber Costa<glommer@redhat.com>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
> ---
>   target-i386/kvm.c |   92 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>   1 files changed, 91 insertions(+), 1 deletions(-)
>
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 69b8234..47cb22b 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -29,6 +29,7 @@
>   #include "hw/apic.h"
>   #include "ioport.h"
>   #include "kvm_x86.h"
> +#include "hw/sysbus.h"
>
>   #ifdef CONFIG_KVM_PARA
>   #include<linux/kvm_para.h>
> @@ -309,6 +310,85 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
>   #endif
>   }
>
> +#if defined(CONFIG_KVM_PARA)&&  defined(KVM_CAP_ADJUST_CLOCK)
> +typedef struct KVMClockState {
> +    SysBusDevice busdev;
> +    uint64_t clock;
> +    bool clock_valid;
> +} KVMClockState;
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +    struct kvm_clock_data data;
> +    int ret;
> +
> +    if (s->clock_valid) {
> +        return;
> +    }
> +    ret = kvm_vm_ioctl(KVM_GET_CLOCK,&data);
> +    if (ret<  0) {
> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> +        data.clock = 0;
> +    }
> +    s->clock = data.clock;
> +    /*
> +     * If the VM is stopped, declare the clock state valid to avoid re-reading
> +     * it on next vmsave (which would return a different value). Will be reset
> +     * when the VM is continued.
> +     */
> +    s->clock_valid = !vm_running;
> +}
> +
> +static int kvmclock_post_load(void *opaque, int version_id)
> +{
> +    KVMClockState *s = opaque;
> +    struct kvm_clock_data data;
> +
> +    data.clock = s->clock;
> +    data.flags = 0;
> +    return kvm_vm_ioctl(KVM_SET_CLOCK,&data);
> +}
> +
> +static void kvmclock_vm_state_change(void *opaque, int running, int reason)
> +{
> +    KVMClockState *s = opaque;
> +
> +    if (running) {
> +        s->clock_valid = false;
> +    }
> +}
> +
> +static int kvmclock_init(SysBusDevice *dev)
> +{
> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
> +
> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> +    return 0;
> +}
> +
> +static const VMStateDescription kvmclock_vmsd= {
> +    .name = "kvmclock",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .pre_save = kvmclock_pre_save,
> +    .post_load = kvmclock_post_load,
> +    .fields = (VMStateField []) {
> +        VMSTATE_UINT64(clock, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static SysBusDeviceInfo kvmclock_info = {
> +    .qdev.name = "kvmclock",
> +    .qdev.size = sizeof(KVMClockState),
> +    .qdev.vmsd =&kvmclock_vmsd,
> +    .qdev.no_user = 1,
> +    .init = kvmclock_init,
> +};
> +#endif /* CONFIG_KVM_PARA&&  KVM_CAP_ADJUST_CLOCK */
> +
>   int kvm_arch_init_vcpu(CPUState *env)
>   {
>       struct {
> @@ -335,7 +415,6 @@ int kvm_arch_init_vcpu(CPUState *env)
>       env->cpuid_svm_features&= kvm_x86_get_supported_cpuid(0x8000000A,
>                                                               0, R_EDX);
>
> -
>       cpuid_i = 0;
>
>   #ifdef CONFIG_KVM_PARA
> @@ -442,6 +521,13 @@ int kvm_arch_init_vcpu(CPUState *env)
>       }
>   #endif
>
> +#if defined(CONFIG_KVM_PARA)&&  defined(KVM_CAP_ADJUST_CLOCK)
> +    if (cpu_is_bsp(env)&&
> +        (env->cpuid_kvm_features&  (1ULL<<  KVM_FEATURE_CLOCKSOURCE))) {
> +        sysbus_create_simple("kvmclock", -1, NULL);
> +    }
> +#endif
> +
>       return kvm_vcpu_ioctl(env, KVM_SET_CPUID2,&cpuid_data);
>   }
>
> @@ -531,6 +617,10 @@ int kvm_arch_init(int smp_cpus)
>       int ret;
>       struct utsname utsname;
>
> +#if defined(CONFIG_KVM_PARA)&&  defined(KVM_CAP_ADJUST_CLOCK)
> +    sysbus_register_withprop(&kvmclock_info);
> +#endif
> +
>       ret = kvm_get_supported_msrs();
>       if (ret<  0) {
>           return ret;
>    

There are a couple things wrong with this patch.  It breaks 
compatibility because it does not allow kvmclock to be created or 
initiated in machines.  Older machines didn't expose kvmclock but now 
they do.  It also makes it impossible to pass parameters to kvmclock in 
the future because the device creation is hidden deep in other code 
paths.  Calling any qdev creation function in anything but pc.c (or the 
equivalent) should be a big red flag.

The solution is simple, introduce a kvm_has_clocksource().  Within the 
machine init, create the kvm clock device after CPU creation wrapped 
in an if (kvm_has_clocksource()) call.  kvmclock should be created with 
kvm_state as a parameter and kvm_vm_ioctl() is passed the stored 
reference.   Taking a global reference to kvm_state in machine_init is 
not a bad thing, obviously the machine initialization function needs 
access to the kvm_state.

Regards,

Anthony Liguori
Jan Kiszka Jan. 10, 2011, 9:06 p.m. UTC | #2
Am 10.01.2011 21:31, Anthony Liguori wrote:
> On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
>> From: Jan Kiszka<jan.kiszka@siemens.com>
>>
>> If kvmclock is used, which implies the kernel supports it, register a
>> kvmclock device with the sysbus. Its main purpose is to save and restore
>> the kernel state on migration, but this will also allow to visualize it
>> one day.
>>
>> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
>> CC: Glauber Costa<glommer@redhat.com>
>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>> ---
>>   target-i386/kvm.c |   92
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 files changed, 91 insertions(+), 1 deletions(-)
>>
>> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
>> index 69b8234..47cb22b 100644
>> --- a/target-i386/kvm.c
>> +++ b/target-i386/kvm.c
>> @@ -29,6 +29,7 @@
>>   #include "hw/apic.h"
>>   #include "ioport.h"
>>   #include "kvm_x86.h"
>> +#include "hw/sysbus.h"
>>
>>   #ifdef CONFIG_KVM_PARA
>>   #include<linux/kvm_para.h>
>> @@ -309,6 +310,85 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank,
>> uint64_t status,
>>   #endif
>>   }
>>
>> +#if defined(CONFIG_KVM_PARA)&&  defined(KVM_CAP_ADJUST_CLOCK)
>> +typedef struct KVMClockState {
>> +    SysBusDevice busdev;
>> +    uint64_t clock;
>> +    bool clock_valid;
>> +} KVMClockState;
>> +
>> +static void kvmclock_pre_save(void *opaque)
>> +{
>> +    KVMClockState *s = opaque;
>> +    struct kvm_clock_data data;
>> +    int ret;
>> +
>> +    if (s->clock_valid) {
>> +        return;
>> +    }
>> +    ret = kvm_vm_ioctl(KVM_GET_CLOCK,&data);
>> +    if (ret<  0) {
>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>> +        data.clock = 0;
>> +    }
>> +    s->clock = data.clock;
>> +    /*
>> +     * If the VM is stopped, declare the clock state valid to avoid
>> re-reading
>> +     * it on next vmsave (which would return a different value). Will
>> be reset
>> +     * when the VM is continued.
>> +     */
>> +    s->clock_valid = !vm_running;
>> +}
>> +
>> +static int kvmclock_post_load(void *opaque, int version_id)
>> +{
>> +    KVMClockState *s = opaque;
>> +    struct kvm_clock_data data;
>> +
>> +    data.clock = s->clock;
>> +    data.flags = 0;
>> +    return kvm_vm_ioctl(KVM_SET_CLOCK,&data);
>> +}
>> +
>> +static void kvmclock_vm_state_change(void *opaque, int running, int
>> reason)
>> +{
>> +    KVMClockState *s = opaque;
>> +
>> +    if (running) {
>> +        s->clock_valid = false;
>> +    }
>> +}
>> +
>> +static int kvmclock_init(SysBusDevice *dev)
>> +{
>> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
>> +
>> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>> +    return 0;
>> +}
>> +
>> +static const VMStateDescription kvmclock_vmsd= {
>> +    .name = "kvmclock",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .minimum_version_id_old = 1,
>> +    .pre_save = kvmclock_pre_save,
>> +    .post_load = kvmclock_post_load,
>> +    .fields = (VMStateField []) {
>> +        VMSTATE_UINT64(clock, KVMClockState),
>> +        VMSTATE_END_OF_LIST()
>> +    }
>> +};
>> +
>> +static SysBusDeviceInfo kvmclock_info = {
>> +    .qdev.name = "kvmclock",
>> +    .qdev.size = sizeof(KVMClockState),
>> +    .qdev.vmsd =&kvmclock_vmsd,
>> +    .qdev.no_user = 1,
>> +    .init = kvmclock_init,
>> +};
>> +#endif /* CONFIG_KVM_PARA&&  KVM_CAP_ADJUST_CLOCK */
>> +
>>   int kvm_arch_init_vcpu(CPUState *env)
>>   {
>>       struct {
>> @@ -335,7 +415,6 @@ int kvm_arch_init_vcpu(CPUState *env)
>>       env->cpuid_svm_features&= kvm_x86_get_supported_cpuid(0x8000000A,
>>                                                               0, R_EDX);
>>
>> -
>>       cpuid_i = 0;
>>
>>   #ifdef CONFIG_KVM_PARA
>> @@ -442,6 +521,13 @@ int kvm_arch_init_vcpu(CPUState *env)
>>       }
>>   #endif
>>
>> +#if defined(CONFIG_KVM_PARA)&&  defined(KVM_CAP_ADJUST_CLOCK)
>> +    if (cpu_is_bsp(env)&&
>> +        (env->cpuid_kvm_features&  (1ULL<<  KVM_FEATURE_CLOCKSOURCE))) {
>> +        sysbus_create_simple("kvmclock", -1, NULL);
>> +    }
>> +#endif
>> +
>>       return kvm_vcpu_ioctl(env, KVM_SET_CPUID2,&cpuid_data);
>>   }
>>
>> @@ -531,6 +617,10 @@ int kvm_arch_init(int smp_cpus)
>>       int ret;
>>       struct utsname utsname;
>>
>> +#if defined(CONFIG_KVM_PARA)&&  defined(KVM_CAP_ADJUST_CLOCK)
>> +    sysbus_register_withprop(&kvmclock_info);
>> +#endif
>> +
>>       ret = kvm_get_supported_msrs();
>>       if (ret<  0) {
>>           return ret;
>>    
> 
> There are a couple things wrong with this patch.  It breaks
> compatibility because it does not allow kvmclock to be created or
> initiated in machines.  Older machines didn't expose kvmclock but now
> they do.  It also makes it impossible to pass parameters to kvmclock in
> the future because the device creation is hidden deep in other code
> paths.

Device parameters should get passed as properties. Would already work
today if we had any.

>  Calling any qdev creation function in anything but pc.c (or the
> equivalent) should be a big red flag.
> 
> The solution is simple, introduce a kvm_has_clocksource().  Within the
> machine init, create the kvm clock device after CPU creation wrapped
> in an if (kvm_has_clocksource()) call.

No problem with moving sysbus_create_simple to machine initialization,
though.

>  kvmclock should be created with
> kvm_state as a parameter and kvm_vm_ioctl() is passed the stored
> reference.   Taking a global reference to kvm_state in machine_init is
> not a bad thing, obviously the machine initialization function needs
> access to the kvm_state.

This would also require changing sysbus interfaces for the sake of KVM's
"abstraction". If this is the only way forward, I could look into this.

Still, I do not see any benefit for the affected code. You then either
need to "steal" a kvm_state reference from the first cpu or introduce a
marvelous interface like kvm_get_state() to make this work from outside
of the KVM core.

Jan
Jan Kiszka Jan. 10, 2011, 10:21 p.m. UTC | #3
Am 10.01.2011 22:06, Jan Kiszka wrote:
>>  kvmclock should be created with
>> kvm_state as a parameter and kvm_vm_ioctl() is passed the stored
>> reference.   Taking a global reference to kvm_state in machine_init is
>> not a bad thing, obviously the machine initialization function needs
>> access to the kvm_state.
> 
> This would also require changing sysbus interfaces for the sake of KVM's
> "abstraction". If this is the only way forward, I could look into this.

Actually, there is already a channel to pass pointers to qdev devices:
the pointer property hack. I'm not sure we should contribute to its user
base or take the chance for a cleanup, but we are not alone with this
requirement. Point below remains valid, though.

> 
> Still, I do not see any benefit for the affected code. You then either
> need to "steal" a kvm_state reference from the first cpu or introduce a
> marvelous interface like kvm_get_state() to make this work from outside
> of the KVM core.
> 

Jan
Anthony Liguori Jan. 10, 2011, 11:02 p.m. UTC | #4
On 01/10/2011 04:21 PM, Jan Kiszka wrote:
> Am 10.01.2011 22:06, Jan Kiszka wrote:
>    
>>>   kvmclock should be created with
>>> kvm_state as a parameter and kvm_vm_ioctl() is passed the stored
>>> reference.   Taking a global reference to kvm_state in machine_init is
>>> not a bad thing, obviously the machine initialization function needs
>>> access to the kvm_state.
>>>        
>> This would also require changing sysbus interfaces for the sake of KVM's
>> "abstraction". If this is the only way forward, I could look into this.
>>      
> Actually, there is already a channel to pass pointers to qdev devices:
> the pointer property hack. I'm not sure we should contribute to its user
> base or take the chance for a cleanup, but we are not alone with this
> requirement. Point below remains valid, though.
>    

It probably makes sense to have a KVMBus and not pass it as a property 
but rather have the device access it from the KVMBusState.

Regards,

Anthony Liguori

>    
>> Still, I do not see any benefit for the affected code. You then either
>> need to "steal" a kvm_state reference from the first cpu or introduce a
>> marvelous interface like kvm_get_state() to make this work from outside
>> of the KVM core.
>>
>>      
> Jan
>
Anthony Liguori Jan. 10, 2011, 11:04 p.m. UTC | #5
On 01/10/2011 03:06 PM, Jan Kiszka wrote:
> Am 10.01.2011 21:31, Anthony Liguori wrote:
>    
>> On 01/06/2011 11:56 AM, Marcelo Tosatti wrote:
>>      
>>> From: Jan Kiszka<jan.kiszka@siemens.com>
>>>
>>> If kvmclock is used, which implies the kernel supports it, register a
>>> kvmclock device with the sysbus. Its main purpose is to save and restore
>>> the kernel state on migration, but this will also allow to visualize it
>>> one day.
>>>
>>> Signed-off-by: Jan Kiszka<jan.kiszka@siemens.com>
>>> CC: Glauber Costa<glommer@redhat.com>
>>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>>> ---
>>>    target-i386/kvm.c |   92
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>    1 files changed, 91 insertions(+), 1 deletions(-)
>>>
>>> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
>>> index 69b8234..47cb22b 100644
>>> --- a/target-i386/kvm.c
>>> +++ b/target-i386/kvm.c
>>> @@ -29,6 +29,7 @@
>>>    #include "hw/apic.h"
>>>    #include "ioport.h"
>>>    #include "kvm_x86.h"
>>> +#include "hw/sysbus.h"
>>>
>>>    #ifdef CONFIG_KVM_PARA
>>>    #include<linux/kvm_para.h>
>>> @@ -309,6 +310,85 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank,
>>> uint64_t status,
>>>    #endif
>>>    }
>>>
>>> +#if defined(CONFIG_KVM_PARA)&&   defined(KVM_CAP_ADJUST_CLOCK)
>>> +typedef struct KVMClockState {
>>> +    SysBusDevice busdev;
>>> +    uint64_t clock;
>>> +    bool clock_valid;
>>> +} KVMClockState;
>>> +
>>> +static void kvmclock_pre_save(void *opaque)
>>> +{
>>> +    KVMClockState *s = opaque;
>>> +    struct kvm_clock_data data;
>>> +    int ret;
>>> +
>>> +    if (s->clock_valid) {
>>> +        return;
>>> +    }
>>> +    ret = kvm_vm_ioctl(KVM_GET_CLOCK,&data);
>>> +    if (ret<   0) {
>>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>>> +        data.clock = 0;
>>> +    }
>>> +    s->clock = data.clock;
>>> +    /*
>>> +     * If the VM is stopped, declare the clock state valid to avoid
>>> re-reading
>>> +     * it on next vmsave (which would return a different value). Will
>>> be reset
>>> +     * when the VM is continued.
>>> +     */
>>> +    s->clock_valid = !vm_running;
>>> +}
>>> +
>>> +static int kvmclock_post_load(void *opaque, int version_id)
>>> +{
>>> +    KVMClockState *s = opaque;
>>> +    struct kvm_clock_data data;
>>> +
>>> +    data.clock = s->clock;
>>> +    data.flags = 0;
>>> +    return kvm_vm_ioctl(KVM_SET_CLOCK,&data);
>>> +}
>>> +
>>> +static void kvmclock_vm_state_change(void *opaque, int running, int
>>> reason)
>>> +{
>>> +    KVMClockState *s = opaque;
>>> +
>>> +    if (running) {
>>> +        s->clock_valid = false;
>>> +    }
>>> +}
>>> +
>>> +static int kvmclock_init(SysBusDevice *dev)
>>> +{
>>> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
>>> +
>>> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>>> +    return 0;
>>> +}
>>> +
>>> +static const VMStateDescription kvmclock_vmsd= {
>>> +    .name = "kvmclock",
>>> +    .version_id = 1,
>>> +    .minimum_version_id = 1,
>>> +    .minimum_version_id_old = 1,
>>> +    .pre_save = kvmclock_pre_save,
>>> +    .post_load = kvmclock_post_load,
>>> +    .fields = (VMStateField []) {
>>> +        VMSTATE_UINT64(clock, KVMClockState),
>>> +        VMSTATE_END_OF_LIST()
>>> +    }
>>> +};
>>> +
>>> +static SysBusDeviceInfo kvmclock_info = {
>>> +    .qdev.name = "kvmclock",
>>> +    .qdev.size = sizeof(KVMClockState),
>>> +    .qdev.vmsd =&kvmclock_vmsd,
>>> +    .qdev.no_user = 1,
>>> +    .init = kvmclock_init,
>>> +};
>>> +#endif /* CONFIG_KVM_PARA&&   KVM_CAP_ADJUST_CLOCK */
>>> +
>>>    int kvm_arch_init_vcpu(CPUState *env)
>>>    {
>>>        struct {
>>> @@ -335,7 +415,6 @@ int kvm_arch_init_vcpu(CPUState *env)
>>>        env->cpuid_svm_features&= kvm_x86_get_supported_cpuid(0x8000000A,
>>>                                                                0, R_EDX);
>>>
>>> -
>>>        cpuid_i = 0;
>>>
>>>    #ifdef CONFIG_KVM_PARA
>>> @@ -442,6 +521,13 @@ int kvm_arch_init_vcpu(CPUState *env)
>>>        }
>>>    #endif
>>>
>>> +#if defined(CONFIG_KVM_PARA)&&   defined(KVM_CAP_ADJUST_CLOCK)
>>> +    if (cpu_is_bsp(env)&&
>>> +        (env->cpuid_kvm_features&   (1ULL<<   KVM_FEATURE_CLOCKSOURCE))) {
>>> +        sysbus_create_simple("kvmclock", -1, NULL);
>>> +    }
>>> +#endif
>>> +
>>>        return kvm_vcpu_ioctl(env, KVM_SET_CPUID2,&cpuid_data);
>>>    }
>>>
>>> @@ -531,6 +617,10 @@ int kvm_arch_init(int smp_cpus)
>>>        int ret;
>>>        struct utsname utsname;
>>>
>>> +#if defined(CONFIG_KVM_PARA)&&   defined(KVM_CAP_ADJUST_CLOCK)
>>> +    sysbus_register_withprop(&kvmclock_info);
>>> +#endif
>>> +
>>>        ret = kvm_get_supported_msrs();
>>>        if (ret<   0) {
>>>            return ret;
>>>
>>>        
>> There are a couple things wrong with this patch.  It breaks
>> compatibility because it does not allow kvmclock to be created or
>> initiated in machines.  Older machines didn't expose kvmclock but now
>> they do.  It also makes it impossible to pass parameters to kvmclock in
>> the future because the device creation is hidden deep in other code
>> paths.
>>      
> Device parameters should get passed as properties. Would already work
> today if we had any.
>
>    
>>   Calling any qdev creation function in anything but pc.c (or the
>> equivalent) should be a big red flag.
>>
>> The solution is simple, introduce a kvm_has_clocksource().  Within the
>> machine init, create the kvm clock device after CPU creation wrapped
>> in an if (kvm_has_clocksource()) call.
>>      
> No problem with moving sysbus_create_simple to machine initialization,
> though.
>
>    
>>   kvmclock should be created with
>> kvm_state as a parameter and kvm_vm_ioctl() is passed the stored
>> reference.   Taking a global reference to kvm_state in machine_init is
>> not a bad thing, obviously the machine initialization function needs
>> access to the kvm_state.
>>      
> This would also require changing sysbus interfaces for the sake of KVM's
> "abstraction". If this is the only way forward, I could look into this.
>
> Still, I do not see any benefit for the affected code. You then either
> need to "steal" a kvm_state reference from the first cpu or introduce a
> marvelous interface like kvm_get_state() to make this work from outside
> of the KVM core.
>    

Or move kvm_init() to pc_init() and then pc_init() has the kvm_state 
reference.

Regards,

Anthony Liguori

> Jan
>
>
Jan Kiszka Jan. 11, 2011, 5:54 a.m. UTC | #6
Am 11.01.2011 00:02, Anthony Liguori wrote:
> On 01/10/2011 04:21 PM, Jan Kiszka wrote:
>> Am 10.01.2011 22:06, Jan Kiszka wrote:
>>   
>>>>   kvmclock should be created with
>>>> kvm_state as a parameter and kvm_vm_ioctl() is passed the stored
>>>> reference.   Taking a global reference to kvm_state in machine_init is
>>>> not a bad thing, obviously the machine initialization function needs
>>>> access to the kvm_state.
>>>>        
>>> This would also require changing sysbus interfaces for the sake of KVM's
>>> "abstraction". If this is the only way forward, I could look into this.
>>>      
>> Actually, there is already a channel to pass pointers to qdev devices:
>> the pointer property hack. I'm not sure we should contribute to its user
>> base or take the chance for a cleanup, but we are not alone with this
>> requirement. Point below remains valid, though.
>>    
> 
> It probably makes sense to have a KVMBus and not pass it as a property
> but rather have it access it from the KvmBusState.

KVM is orthogonal to the qtree. Some devices like vga (kvm_coalesce_*,
kvm_log_start/stop*) would actually have to be attached to multiple
buses in this model.

Jan
Jan Kiszka Jan. 11, 2011, 5:55 a.m. UTC | #7
Am 11.01.2011 00:04, Anthony Liguori wrote:
>>>   kvmclock should be created with
>>> kvm_state as a parameter and kvm_vm_ioctl() is passed the stored
>>> reference.   Taking a global reference to kvm_state in machine_init is
>>> not a bad thing, obviously the machine initialization function needs
>>> access to the kvm_state.
>>>      
>> This would also require changing sysbus interfaces for the sake of KVM's
>> "abstraction". If this is the only way forward, I could look into this.
>>
>> Still, I do not see any benefit for the affected code. You then either
>> need to "steal" a kvm_state reference from the first cpu or introduce a
>> marvelous interface like kvm_get_state() to make this work from outside
>> of the KVM core.
>>    
> 
> Or move kvm_init() to pc_init() and then pc_init() has the kvm_state
> reference.

Or pass the reference to the machine_init service to avoid duplicating
kvm_init logic for every KVM arch.

That alone would still be consistent. But as long as we do not pass a
kvm_state to each and every memory registration (for
kvm_setup_guest_memory), this all is like putting a fence around half of
your yard and only declaring it closed.

Jan
Paolo Bonzini Jan. 11, 2011, 8 a.m. UTC | #8
On 01/10/2011 11:21 PM, Jan Kiszka wrote:
> Am 10.01.2011 22:06, Jan Kiszka wrote:
>>>   kvmclock should be created with
>>> kvm_state as a parameter and kvm_vm_ioctl() is passed the stored
>>> reference.   Taking a global reference to kvm_state in machine_init is
>>> not a bad thing, obviously the machine initialization function needs
>>> access to the kvm_state.
>>
>> This would also require changing sysbus interfaces for the sake of KVM's
>> "abstraction". If this is the only way forward, I could look into this.
>
> Actually, there is already a channel to pass pointers to qdev devices:
> the pointer property hack. I'm not sure we should contribute to its user
> base or take the chance for a cleanup, but we are not alone with this
> requirement. Point below remains valid, though.

The pointer property uses, at least last time I checked, were all:

1) for a CPU (apic, etrax interrupt controller)

2) for a device (sparc IIRC)

3) useless (smbus_eeprom.c)

So adding a fourth kind is not really a good idea, I think.

Paolo
Gerd Hoffmann Jan. 11, 2011, 8:53 a.m. UTC | #9
Hi,

> Actually, there is already a channel to pass pointers to qdev devices:
> the pointer property hack. I'm not sure we should contribute to its user
> base or take the chance for a cleanup, but we are not alone with this
> requirement. Point below remains valid, though.

It is considered bad/hackish style as you can't create that kind of 
devices using the -device command line switch (or from a machine 
description config file some day in the future).  So we should not add 
more uses of this, especially not in patches which are supposed to 
cleanup things ;)

cheers,
   Gerd
Markus Armbruster Jan. 11, 2011, 9:31 a.m. UTC | #10
Jan Kiszka <jan.kiszka@web.de> writes:

> Am 10.01.2011 22:06, Jan Kiszka wrote:
>>>  kvmclock should be created with
>>> kvm_state as a parameter and kvm_vm_ioctl() is passed the stored
>>> reference.   Taking a global reference to kvm_state in machine_init is
>>> not a bad thing, obviously the machine initialization function needs
>>> access to the kvm_state.
>> 
>> This would also require changing sysbus interfaces for the sake of KVM's
>> "abstraction". If this is the only way forward, I could look into this.
>
> Actually, there is already a channel to pass pointers to qdev devices:
> the pointer property hack. I'm not sure we should contribute to its user
> base

We shouldn't.

>      or take the chance for a cleanup, but we are not alone with this
> requirement. Point below remains valid, though.
>
>> 
>> Still, I do not see any benefit for the affected code. You then either
>> need to "steal" a kvm_state reference from the first cpu or introduce a
>> marvelous interface like kvm_get_state() to make this work from outside
>> of the KVM core.
Anthony Liguori Jan. 11, 2011, 1:54 p.m. UTC | #11
On 01/11/2011 03:31 AM, Markus Armbruster wrote:
> Jan Kiszka<jan.kiszka@web.de>  writes:
>
>    
>> Am 10.01.2011 22:06, Jan Kiszka wrote:
>>      
>>>>   kvmclock should be created with
>>>> kvm_state as a parameter and kvm_vm_ioctl() is passed the stored
>>>> reference.   Taking a global reference to kvm_state in machine_init is
>>>> not a bad thing, obviously the machine initialization function needs
>>>> access to the kvm_state.
>>>>          
>>> This would also require changing sysbus interfaces for the sake of KVM's
>>> "abstraction". If this is the only way forward, I could look into this.
>>>        
>> Actually, there is already a channel to pass pointers to qdev devices:
>> the pointer property hack. I'm not sure we should contribute to its user
>> base
>>      
> We shouldn't.
>    

Right, we should introduce a KVMBus that KVM devices are created on.  
The devices can get at KVMState through the BusState.

Regards,

Anthony Liguori

>>       or take the chance for a cleanup, but we are not alone with this
>> requirement. Point below remains valid, though.
>>
>>      
>>> Still, I do not see any benefit for the affected code. You then either
>>> need to "steal" a kvm_state reference from the first cpu or introduce a
>>> marvelous interface like kvm_get_state() to make this work from outside
>>> of the KVM core.
>>>
Jan Kiszka Jan. 11, 2011, 5:13 p.m. UTC | #12
Am 11.01.2011 09:53, Gerd Hoffmann wrote:
>   Hi,
> 
>> Actually, there is already a channel to pass pointers to qdev devices:
>> the pointer property hack. I'm not sure we should contribute to its user
>> base or take the chance for a cleanup, but we are not alone with this
>> requirement. Point below remains valid, though.
> 
> It is considered bad/hackish style as you can't create that kind of
> devices using the -device command line switch (or from a machine
> description config file some day in the future).

That kind of instantiation wouldn't be possible for device models that
require someone actively passing kvm_state to them...

>  So we should not add
> more uses of this, especially not in patches which are supposed to
> cleanup things ;)

You won't see me disagree.

Jan
Avi Kivity Jan. 12, 2011, 10:22 a.m. UTC | #13
On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>
> Right, we should introduce a KVMBus that KVM devices are created on.  
> The devices can get at KVMState through the BusState.

There is no kvm bus in a PC (I looked).  We're bending the device model 
here because a device is implemented in the kernel and not in 
userspace.  An implementation detail is magnified beyond all proportions.

An ioapic that is implemented by kvm lives in exactly the same place 
that the qemu ioapic lives in.  An assigned pci device lives on the PCI 
bus, not a KVMBus.  If we need a pointer to KVMState, then we must find 
it elsewhere, not through creating imaginary buses that don't exist.
Jan Kiszka Jan. 12, 2011, 10:31 a.m. UTC | #14
Am 12.01.2011 11:22, Avi Kivity wrote:
> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>
>> Right, we should introduce a KVMBus that KVM devices are created on. 
>> The devices can get at KVMState through the BusState.
> 
> There is no kvm bus in a PC (I looked).  We're bending the device model
> here because a device is implemented in the kernel and not in
> userspace.  An implementation detail is magnified beyond all proportions.
> 
> An ioapic that is implemented by kvm lives in exactly the same place
> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
> it elsewhere, not through creating imaginary buses that don't exist.
> 

Exactly.

So we can either "infect" the whole device tree with kvm (or maybe a
more generic accelerator structure that also deals with Xen) or we need
to pull the reference inside the device's init function from some global
service (kvm_get_state).

Jan

PS: I started refreshing my whole patch series with the two
controversial patches removed. Will send out later so that we can
proceed with merging the critical bits, specifically all the bug fixes.
Markus Armbruster Jan. 12, 2011, 12:04 p.m. UTC | #15
Avi Kivity <avi@redhat.com> writes:

> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>
>> Right, we should introduce a KVMBus that KVM devices are created on.
>> The devices can get at KVMState through the BusState.
>
> There is no kvm bus in a PC (I looked).  We're bending the device
> model here because a device is implemented in the kernel and not in
> userspace.  An implementation detail is magnified beyond all
> proportions.
>
> An ioapic that is implemented by kvm lives in exactly the same place
> that the qemu ioapic lives in.

Exactly.  And that place is a bus.

What if the device interfaces in bus-specific ways with its parent bus?
Then we can't simply replace the parent bus by a KVM bus.  We'd need
*two* parent buses, as Jan pointed out upthread.

>                                 An assigned pci device lives on the
> PCI bus, not a KVMBus.  If we need a pointer to KVMState, then we must
> find it elsewhere, not through creating imaginary buses that don't
> exist.
Jan Kiszka Jan. 18, 2011, 2:28 p.m. UTC | #16
On 2011-01-12 11:31, Jan Kiszka wrote:
> Am 12.01.2011 11:22, Avi Kivity wrote:
>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>>
>>> Right, we should introduce a KVMBus that KVM devices are created on. 
>>> The devices can get at KVMState through the BusState.
>>
>> There is no kvm bus in a PC (I looked).  We're bending the device model
>> here because a device is implemented in the kernel and not in
>> userspace.  An implementation detail is magnified beyond all proportions.
>>
>> An ioapic that is implemented by kvm lives in exactly the same place
>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
>> it elsewhere, not through creating imaginary buses that don't exist.
>>
> 
> Exactly.
> 
> So we can either "infect" the whole device tree with kvm (or maybe a
> more generic accelerator structure that also deals with Xen) or we need
> to pull the reference inside the device's init function from some global
> service (kvm_get_state).

Note that this topic is still waiting for good suggestions, specifically
from those who believe in kvm_state references :). This is not only
blocking kvmstate merge but will affect KVM irqchips as well.

It boils down to how we reasonably pass a kvm_state reference from
machine init code to a sysbus device. I'm probably biased, but I don't
see any way that does not work against the idea of confining access to
kvm_state or breaks device instantiation from the command line or a
config file.

Jan
Anthony Liguori Jan. 18, 2011, 3:04 p.m. UTC | #17
On 01/18/2011 08:28 AM, Jan Kiszka wrote:
> On 2011-01-12 11:31, Jan Kiszka wrote:
>    
>> Am 12.01.2011 11:22, Avi Kivity wrote:
>>      
>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>>        
>>>> Right, we should introduce a KVMBus that KVM devices are created on.
>>>> The devices can get at KVMState through the BusState.
>>>>          
>>> There is no kvm bus in a PC (I looked).  We're bending the device model
>>> here because a device is implemented in the kernel and not in
>>> userspace.  An implementation detail is magnified beyond all proportions.
>>>
>>> An ioapic that is implemented by kvm lives in exactly the same place
>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
>>> it elsewhere, not through creating imaginary buses that don't exist.
>>>
>>>        
>> Exactly.
>>
>> So we can either "infect" the whole device tree with kvm (or maybe a
>> more generic accelerator structure that also deals with Xen) or we need
>> to pull the reference inside the device's init function from some global
>> service (kvm_get_state).
>>      
> Note that this topic is still waiting for good suggestions, specifically
> from those who believe in kvm_state references :). This is not only
> blocking kvmstate merge but will affect KVM irqchips as well.
>
> It boils down to how we reasonably pass a kvm_state reference from
> machine init code to a sysbus device. I'm probably biased, but I don't
> see any way that does not work against the idea of confining access to
> kvm_state or breaks device instantiation from the command line or a
> config file.
>    

A KVM device should sit on a KVM specific bus that hangs off of sysbus.  
It can get to kvm_state through that bus.

That bus doesn't get instantiated through qdev so requiring a pointer 
argument should not be an issue.

Regards,

Anthony Liguori

> Jan
>
>
Jan Kiszka Jan. 18, 2011, 3:43 p.m. UTC | #18
On 2011-01-18 16:04, Anthony Liguori wrote:
> On 01/18/2011 08:28 AM, Jan Kiszka wrote:
>> On 2011-01-12 11:31, Jan Kiszka wrote:
>>    
>>> Am 12.01.2011 11:22, Avi Kivity wrote:
>>>      
>>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>>>        
>>>>> Right, we should introduce a KVMBus that KVM devices are created on.
>>>>> The devices can get at KVMState through the BusState.
>>>>>          
>>>> There is no kvm bus in a PC (I looked).  We're bending the device model
>>>> here because a device is implemented in the kernel and not in
>>>> userspace.  An implementation detail is magnified beyond all proportions.
>>>>
>>>> An ioapic that is implemented by kvm lives in exactly the same place
>>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
>>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
>>>> it elsewhere, not through creating imaginary buses that don't exist.
>>>>
>>>>        
>>> Exactly.
>>>
>>> So we can either "infect" the whole device tree with kvm (or maybe a
>>> more generic accelerator structure that also deals with Xen) or we need
>>> to pull the reference inside the device's init function from some global
>>> service (kvm_get_state).
>>>      
>> Note that this topic is still waiting for good suggestions, specifically
>> from those who believe in kvm_state references :). This is not only
>> blocking kvmstate merge but will affect KVM irqchips as well.
>>
>> It boils down to how we reasonably pass a kvm_state reference from
>> machine init code to a sysbus device. I'm probably biased, but I don't
>> see any way that does not work against the idea of confining access to
>> kvm_state or breaks device instantiation from the command line or a
>> config file.
>>    
> 
> A KVM device should sit on a KVM specific bus that hangs off of sysbus.  
> It can get to kvm_state through that bus.
> 
> That bus doesn't get instantiated through qdev so requiring a pointer 
> argument should not be an issue.
> 

This design is in conflict with the requirement to attach KVM-assisted
devices also to their home bus, e.g. an assigned PCI device to the PCI
bus. We don't support multi-homed qdev devices.

Jan
Anthony Liguori Jan. 18, 2011, 3:48 p.m. UTC | #19
On 01/18/2011 09:43 AM, Jan Kiszka wrote:
> On 2011-01-18 16:04, Anthony Liguori wrote:
>    
>> On 01/18/2011 08:28 AM, Jan Kiszka wrote:
>>      
>>> On 2011-01-12 11:31, Jan Kiszka wrote:
>>>
>>>        
>>>> Am 12.01.2011 11:22, Avi Kivity wrote:
>>>>
>>>>          
>>>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>>>>
>>>>>            
>>>>>> Right, we should introduce a KVMBus that KVM devices are created on.
>>>>>> The devices can get at KVMState through the BusState.
>>>>>>
>>>>>>              
>>>>> There is no kvm bus in a PC (I looked).  We're bending the device model
>>>>> here because a device is implemented in the kernel and not in
>>>>> userspace.  An implementation detail is magnified beyond all proportions.
>>>>>
>>>>> An ioapic that is implemented by kvm lives in exactly the same place
>>>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
>>>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
>>>>> it elsewhere, not through creating imaginary buses that don't exist.
>>>>>
>>>>>
>>>>>            
>>>> Exactly.
>>>>
>>>> So we can either "infect" the whole device tree with kvm (or maybe a
>>>> more generic accelerator structure that also deals with Xen) or we need
>>>> to pull the reference inside the device's init function from some global
>>>> service (kvm_get_state).
>>>>
>>>>          
>>> Note that this topic is still waiting for good suggestions, specifically
>>> from those who believe in kvm_state references :). This is not only
>>> blocking kvmstate merge but will affect KVM irqchips as well.
>>>
>>> It boils down to how we reasonably pass a kvm_state reference from
>>> machine init code to a sysbus device. I'm probably biased, but I don't
>>> see any way that does not work against the idea of confining access to
>>> kvm_state or breaks device instantiation from the command line or a
>>> config file.
>>>
>>>        
>> A KVM device should sit on a KVM specific bus that hangs off of sysbus.
>> It can get to kvm_state through that bus.
>>
>> That bus doesn't get instantiated through qdev so requiring a pointer
>> argument should not be an issue.
>>
>>      
> This design is in conflict with the requirement to attach KVM-assisted
> devices also to their home bus, e.g. an assigned PCI device to the PCI
> bus. We don't support multi-homed qdev devices.
>    

With vfio, would an assigned PCI device even need kvm_state?

Regards,

Anthony Liguori

> Jan
>
>
Anthony Liguori Jan. 18, 2011, 3:50 p.m. UTC | #20
On 01/18/2011 09:43 AM, Jan Kiszka wrote:
> On 2011-01-18 16:04, Anthony Liguori wrote:
>    
>> On 01/18/2011 08:28 AM, Jan Kiszka wrote:
>>      
>>> On 2011-01-12 11:31, Jan Kiszka wrote:
>>>
>>>        
>>>> Am 12.01.2011 11:22, Avi Kivity wrote:
>>>>
>>>>          
>>>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>>>>
>>>>>            
>>>>>> Right, we should introduce a KVMBus that KVM devices are created on.
>>>>>> The devices can get at KVMState through the BusState.
>>>>>>
>>>>>>              
>>>>> There is no kvm bus in a PC (I looked).  We're bending the device model
>>>>> here because a device is implemented in the kernel and not in
>>>>> userspace.  An implementation detail is magnified beyond all proportions.
>>>>>
>>>>> An ioapic that is implemented by kvm lives in exactly the same place
>>>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
>>>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
>>>>> it elsewhere, not through creating imaginary buses that don't exist.
>>>>>
>>>>>
>>>>>            
>>>> Exactly.
>>>>
>>>> So we can either "infect" the whole device tree with kvm (or maybe a
>>>> more generic accelerator structure that also deals with Xen) or we need
>>>> to pull the reference inside the device's init function from some global
>>>> service (kvm_get_state).
>>>>
>>>>          
>>> Note that this topic is still waiting for good suggestions, specifically
>>> from those who believe in kvm_state references :). This is not only
>>> blocking kvmstate merge but will affect KVM irqchips as well.
>>>
>>> It boils down to how we reasonably pass a kvm_state reference from
>>> machine init code to a sysbus device. I'm probably biased, but I don't
>>> see any way that does not work against the idea of confining access to
>>> kvm_state or breaks device instantiation from the command line or a
>>> config file.
>>>
>>>        
>> A KVM device should sit on a KVM specific bus that hangs off of sysbus.
>> It can get to kvm_state through that bus.
>>
>> That bus doesn't get instantiated through qdev so requiring a pointer
>> argument should not be an issue.
>>
>>      
> This design is in conflict with the requirement to attach KVM-assisted
> devices also to their home bus, e.g. an assigned PCI device to the PCI
> bus. We don't support multi-homed qdev devices.
>    

The bus topology reflects how I/O flows in and out of a device.  We do 
not model a perfect PC bus architecture and I don't think we ever intend 
to.  Instead, we model a functional architecture.

I/O from an assigned device does not flow through the emulated PCI bus.  
Therefore, it does not belong on the emulated PCI bus.

Assigned devices need to interact with the emulated PCI bus, but they 
shouldn't be children of it.

Regards,

Anthony Liguori

> Jan
>
>
Jan Kiszka Jan. 18, 2011, 3:54 p.m. UTC | #21
On 2011-01-18 16:48, Anthony Liguori wrote:
> On 01/18/2011 09:43 AM, Jan Kiszka wrote:
>> On 2011-01-18 16:04, Anthony Liguori wrote:
>>    
>>> On 01/18/2011 08:28 AM, Jan Kiszka wrote:
>>>      
>>>> On 2011-01-12 11:31, Jan Kiszka wrote:
>>>>
>>>>        
>>>>> Am 12.01.2011 11:22, Avi Kivity wrote:
>>>>>
>>>>>          
>>>>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>>>>>
>>>>>>            
>>>>>>> Right, we should introduce a KVMBus that KVM devices are created on.
>>>>>>> The devices can get at KVMState through the BusState.
>>>>>>>
>>>>>>>              
>>>>>> There is no kvm bus in a PC (I looked).  We're bending the device model
>>>>>> here because a device is implemented in the kernel and not in
>>>>>> userspace.  An implementation detail is magnified beyond all proportions.
>>>>>>
>>>>>> An ioapic that is implemented by kvm lives in exactly the same place
>>>>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
>>>>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
>>>>>> it elsewhere, not through creating imaginary buses that don't exist.
>>>>>>
>>>>>>
>>>>>>            
>>>>> Exactly.
>>>>>
>>>>> So we can either "infect" the whole device tree with kvm (or maybe a
>>>>> more generic accelerator structure that also deals with Xen) or we need
>>>>> to pull the reference inside the device's init function from some global
>>>>> service (kvm_get_state).
>>>>>
>>>>>          
>>>> Note that this topic is still waiting for good suggestions, specifically
>>>> from those who believe in kvm_state references :). This is not only
>>>> blocking kvmstate merge but will affect KVM irqchips as well.
>>>>
>>>> It boils down to how we reasonably pass a kvm_state reference from
>>>> machine init code to a sysbus device. I'm probably biased, but I don't
>>>> see any way that does not work against the idea of confining access to
>>>> kvm_state or breaks device instantiation from the command line or a
>>>> config file.
>>>>
>>>>        
>>> A KVM device should sit on a KVM specific bus that hangs off of sysbus.
>>> It can get to kvm_state through that bus.
>>>
>>> That bus doesn't get instantiated through qdev so requiring a pointer
>>> argument should not be an issue.
>>>
>>>      
>> This design is in conflict with the requirement to attach KVM-assisted
>> devices also to their home bus, e.g. an assigned PCI device to the PCI
>> bus. We don't support multi-homed qdev devices.
>>    
> 
> With vfio, would an assigned PCI device even need kvm_state?

IIUC: Yes, for establishing the irqfd link.

Jan
Jan Kiszka Jan. 18, 2011, 4:01 p.m. UTC | #22
On 2011-01-18 16:50, Anthony Liguori wrote:
> On 01/18/2011 09:43 AM, Jan Kiszka wrote:
>> On 2011-01-18 16:04, Anthony Liguori wrote:
>>    
>>> On 01/18/2011 08:28 AM, Jan Kiszka wrote:
>>>      
>>>> On 2011-01-12 11:31, Jan Kiszka wrote:
>>>>
>>>>        
>>>>> Am 12.01.2011 11:22, Avi Kivity wrote:
>>>>>
>>>>>          
>>>>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>>>>>
>>>>>>            
>>>>>>> Right, we should introduce a KVMBus that KVM devices are created on.
>>>>>>> The devices can get at KVMState through the BusState.
>>>>>>>
>>>>>>>              
>>>>>> There is no kvm bus in a PC (I looked).  We're bending the device model
>>>>>> here because a device is implemented in the kernel and not in
>>>>>> userspace.  An implementation detail is magnified beyond all proportions.
>>>>>>
>>>>>> An ioapic that is implemented by kvm lives in exactly the same place
>>>>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
>>>>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
>>>>>> it elsewhere, not through creating imaginary buses that don't exist.
>>>>>>
>>>>>>
>>>>>>            
>>>>> Exactly.
>>>>>
>>>>> So we can either "infect" the whole device tree with kvm (or maybe a
>>>>> more generic accelerator structure that also deals with Xen) or we need
>>>>> to pull the reference inside the device's init function from some global
>>>>> service (kvm_get_state).
>>>>>
>>>>>          
>>>> Note that this topic is still waiting for good suggestions, specifically
>>>> from those who believe in kvm_state references :). This is not only
>>>> blocking kvmstate merge but will affect KVM irqchips as well.
>>>>
>>>> It boils down to how we reasonably pass a kvm_state reference from
>>>> machine init code to a sysbus device. I'm probably biased, but I don't
>>>> see any way that does not work against the idea of confining access to
>>>> kvm_state or breaks device instantiation from the command line or a
>>>> config file.
>>>>
>>>>        
>>> A KVM device should sit on a KVM specific bus that hangs off of sysbus.
>>> It can get to kvm_state through that bus.
>>>
>>> That bus doesn't get instantiated through qdev so requiring a pointer
>>> argument should not be an issue.
>>>
>>>      
>> This design is in conflict with the requirement to attach KVM-assisted
>> devices also to their home bus, e.g. an assigned PCI device to the PCI
>> bus. We don't support multi-homed qdev devices.
>>    
> 
> The bus topology reflects how I/O flows in and out of a device.  We do 
> not model a perfect PC bus architecture and I don't think we ever intend 
> to.  Instead, we model a functional architecture.
> 
> I/O from an assigned device does not flow through the emulated PCI bus.  
> Therefore, it does not belong on the emulated PCI bus.
> 
> Assigned devices need to interact with the emulated PCI bus, but they 
> shouldn't be children of it.

You should be able to find assigned devices on some PCI bus, so you
either have to hack up the existing bus to host devices that are, on the
other side, not part of it or branch off a pci-kvm sub-bus, just like
you would have to create a sysbus-kvm. I guess, if at all, we want the
latter.

Is that acceptable for everyone?

Jan
Anthony Liguori Jan. 18, 2011, 4:04 p.m. UTC | #23
On 01/18/2011 10:01 AM, Jan Kiszka wrote:
> On 2011-01-18 16:50, Anthony Liguori wrote:
>    
>> On 01/18/2011 09:43 AM, Jan Kiszka wrote:
>>      
>>> On 2011-01-18 16:04, Anthony Liguori wrote:
>>>
>>>        
>>>> On 01/18/2011 08:28 AM, Jan Kiszka wrote:
>>>>
>>>>          
>>>>> On 2011-01-12 11:31, Jan Kiszka wrote:
>>>>>
>>>>>
>>>>>            
>>>>>> Am 12.01.2011 11:22, Avi Kivity wrote:
>>>>>>
>>>>>>
>>>>>>              
>>>>>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>>>>>>
>>>>>>>
>>>>>>>                
>>>>>>>> Right, we should introduce a KVMBus that KVM devices are created on.
>>>>>>>> The devices can get at KVMState through the BusState.
>>>>>>>>
>>>>>>>>
>>>>>>>>                  
>>>>>>> There is no kvm bus in a PC (I looked).  We're bending the device model
>>>>>>> here because a device is implemented in the kernel and not in
>>>>>>> userspace.  An implementation detail is magnified beyond all proportions.
>>>>>>>
>>>>>>> An ioapic that is implemented by kvm lives in exactly the same place
>>>>>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
>>>>>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
>>>>>>> it elsewhere, not through creating imaginary buses that don't exist.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>                
>>>>>> Exactly.
>>>>>>
>>>>>> So we can either "infect" the whole device tree with kvm (or maybe a
>>>>>> more generic accelerator structure that also deals with Xen) or we need
>>>>>> to pull the reference inside the device's init function from some global
>>>>>> service (kvm_get_state).
>>>>>>
>>>>>>
>>>>>>              
>>>>> Note that this topic is still waiting for good suggestions, specifically
>>>>> from those who believe in kvm_state references :). This is not only
>>>>> blocking kvmstate merge but will affect KVM irqchips as well.
>>>>>
>>>>> It boils down to how we reasonably pass a kvm_state reference from
>>>>> machine init code to a sysbus device. I'm probably biased, but I don't
>>>>> see any way that does not work against the idea of confining access to
>>>>> kvm_state or breaks device instantiation from the command line or a
>>>>> config file.
>>>>>
>>>>>
>>>>>            
>>>> A KVM device should sit on a KVM specific bus that hangs off of sysbus.
>>>> It can get to kvm_state through that bus.
>>>>
>>>> That bus doesn't get instantiated through qdev so requiring a pointer
>>>> argument should not be an issue.
>>>>
>>>>
>>>>          
>>> This design is in conflict with the requirement to attach KVM-assisted
>>> devices also to their home bus, e.g. an assigned PCI device to the PCI
>>> bus. We don't support multi-homed qdev devices.
>>>
>>>        
>> The bus topology reflects how I/O flows in and out of a device.  We do
>> not model a perfect PC bus architecture and I don't think we ever intend
>> to.  Instead, we model a functional architecture.
>>
>> I/O from an assigned device does not flow through the emulated PCI bus.
>> Therefore, it does not belong on the emulated PCI bus.
>>
>> Assigned devices need to interact with the emulated PCI bus, but they
>> shouldn't be children of it.
>>      
> You should be able to find assigned devices on some PCI bus, so you
> either have to hack up the existing bus to host devices that are, on the
> other side, not part of it or branch off a pci-kvm sub-bus, just like
> you would have to create a sysbus-kvm.

Management tools should never traverse the device tree to find 
devices.  This is a recipe for disaster in the long term because the 
device tree will not remain stable.

So yes, a management tool should be able to enumerate assigned devices 
as they would enumerate any other PCI device but that has almost nothing 
to do with what the tree layout is.

Regards,

Anthony Liguori

>   I guess, if at all, we want the
> latter.
>
> Is that acceptable for everyone?
>
> Jan
>
>
Jan Kiszka Jan. 18, 2011, 4:17 p.m. UTC | #24
On 2011-01-18 17:04, Anthony Liguori wrote:
>>>>> A KVM device should sit on a KVM specific bus that hangs off of
>>>>> sysbus.
>>>>> It can get to kvm_state through that bus.
>>>>>
>>>>> That bus doesn't get instantiated through qdev so requiring a pointer
>>>>> argument should not be an issue.
>>>>>
>>>>>
>>>>>          
>>>> This design is in conflict with the requirement to attach KVM-assisted
>>>> devices also to their home bus, e.g. an assigned PCI device to the PCI
>>>> bus. We don't support multi-homed qdev devices.
>>>>
>>>>        
>>> The bus topology reflects how I/O flows in and out of a device.  We do
>>> not model a perfect PC bus architecture and I don't think we ever intend
>>> to.  Instead, we model a functional architecture.
>>>
>>> I/O from an assigned device does not flow through the emulated PCI bus.
>>> Therefore, it does not belong on the emulated PCI bus.
>>>
>>> Assigned devices need to interact with the emulated PCI bus, but they
>>> shouldn't be children of it.
>>>      
>> You should be able to find assigned devices on some PCI bus, so you
>> either have to hack up the existing bus to host devices that are, on the
>> other side, not part of it or branch off a pci-kvm sub-bus, just like
>> you would have to create a sysbus-kvm.
> 
> Management tools should never traverse the device tree to find
> devices.  This is a recipe for disaster in the long term because the
> device tree will not remain stable.
> 
> So yes, a management tool should be able to enumerate assigned devices
> as they would enumerate any other PCI device but that has almost nothing
> to do with what the tree layout is.

I'm probably misunderstanding you, but if the bus topology as the guest
sees it is not properly reflected in an object tree on the qemu side, we
are creating hacks again.

Management and analysis tools must be able to traverse the system buses
and find guest devices this way. If they create a device on bus X, it
must never end up on bus Y just because it happens to be KVM-assisted or
has some other property. On the other hand, trying to hide this
dependency will likely cause severe damage to the qdev design.

Jan
Anthony Liguori Jan. 18, 2011, 4:37 p.m. UTC | #25
On 01/18/2011 10:17 AM, Jan Kiszka wrote:
> On 2011-01-18 17:04, Anthony Liguori wrote:
>    
>>>>>> A KVM device should sit on a KVM specific bus that hangs off of
>>>>>> sysbus.
>>>>>> It can get to kvm_state through that bus.
>>>>>>
>>>>>> That bus doesn't get instantiated through qdev so requiring a pointer
>>>>>> argument should not be an issue.
>>>>>>
>>>>>>
>>>>>>
>>>>>>              
>>>>> This design is in conflict with the requirement to attach KVM-assisted
>>>>> devices also to their home bus, e.g. an assigned PCI device to the PCI
>>>>> bus. We don't support multi-homed qdev devices.
>>>>>
>>>>>
>>>>>            
>>>> The bus topology reflects how I/O flows in and out of a device.  We do
>>>> not model a perfect PC bus architecture and I don't think we ever intend
>>>> to.  Instead, we model a functional architecture.
>>>>
>>>> I/O from an assigned device does not flow through the emulated PCI bus.
>>>> Therefore, it does not belong on the emulated PCI bus.
>>>>
>>>> Assigned devices need to interact with the emulated PCI bus, but they
>>>> shouldn't be children of it.
>>>>
>>>>          
>>> You should be able to find assigned devices on some PCI bus, so you
>>> either have to hack up the existing bus to host devices that are, on the
>>> other side, not part of it or branch off a pci-kvm sub-bus, just like
>>> you would have to create a sysbus-kvm.
>>>        
>> Management tools should never traverse the device tree to find
>> devices.  This is a recipe for disaster in the long term because the
>> device tree will not remain stable.
>>
>> So yes, a management tool should be able to enumerate assigned devices
>> as they would enumerate any other PCI device but that has almost nothing
>> to do with what the tree layout is.
>>      
> I'm probably misunderstanding you, but if the bus topology as the guest
> sees it is not properly reflected in an object tree on the qemu side, we
> are creating hacks again.
>    

There is no such thing as the "bus topology as the guest sees it".

The guest just sees a bunch of devices.  The guest can only infer things 
like ISA busses.  The guest sees a bunch of devices: an i8254, i8259, 
RTC, etc.  Whether those devices are on an ISA bus, and LPC bus, or all 
in a SuperI/O chip that's part of the southbridge is all invisible to 
the guest.

The device model topology is 100% a hidden architectural detail.

> Management and analysis tools must be able to traverse the system buses
> and find guest devices this way.

We need to provide a compatible interface to the guest.  If you agree 
with my above statements, then you'll also agree that we can do this 
without keeping the device model topology stable.

But we also need to provide a compatible interface to management tools.  
Exposing the device model topology as a compatible interface 
artificially limits us.  It's far better to provide higher level 
supported interfaces to give us the flexibility to change the device 
model as we need to.

>   If they create a device on bus X, it
> must never end up on bus Y just because it happens to be KVM-assisted or
> has some other property.

Nope.  This is exactly what should happen.

90% of the devices in the device model are not created by management 
tools.  They're part of a chipset.  The chipset has well defined 
extension points and we provide management interfaces to create devices 
on those extension points.  That is, interfaces to create PCI devices.

Regards,

Anthony Liguori

>   On the other hand, trying to hide this
> dependency will likely cause severe damage to the qdev design.
>
> Jan
>
>
Jan Kiszka Jan. 18, 2011, 4:56 p.m. UTC | #26
On 2011-01-18 17:37, Anthony Liguori wrote:
> On 01/18/2011 10:17 AM, Jan Kiszka wrote:
>> On 2011-01-18 17:04, Anthony Liguori wrote:
>>    
>>>>>>> A KVM device should sit on a KVM specific bus that hangs off of
>>>>>>> sysbus.
>>>>>>> It can get to kvm_state through that bus.
>>>>>>>
>>>>>>> That bus doesn't get instantiated through qdev so requiring a pointer
>>>>>>> argument should not be an issue.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>              
>>>>>> This design is in conflict with the requirement to attach KVM-assisted
>>>>>> devices also to their home bus, e.g. an assigned PCI device to the PCI
>>>>>> bus. We don't support multi-homed qdev devices.
>>>>>>
>>>>>>
>>>>>>            
>>>>> The bus topology reflects how I/O flows in and out of a device.  We do
>>>>> not model a perfect PC bus architecture and I don't think we ever intend
>>>>> to.  Instead, we model a functional architecture.
>>>>>
>>>>> I/O from an assigned device does not flow through the emulated PCI bus.
>>>>> Therefore, it does not belong on the emulated PCI bus.
>>>>>
>>>>> Assigned devices need to interact with the emulated PCI bus, but they
>>>>> shouldn't be children of it.
>>>>>
>>>>>          
>>>> You should be able to find assigned devices on some PCI bus, so you
>>>> either have to hack up the existing bus to host devices that are, on the
>>>> other side, not part of it or branch off a pci-kvm sub-bus, just like
>>>> you would have to create a sysbus-kvm.
>>>>        
>>> Management tools should never traverse the device tree to find
>>> devices.  This is a recipe for disaster in the long term because the
>>> device tree will not remain stable.
>>>
>>> So yes, a management tool should be able to enumerate assigned devices
>>> as they would enumerate any other PCI device but that has almost nothing
>>> to do with what the tree layout is.
>>>      
>> I'm probably misunderstanding you, but if the bus topology as the guest
>> sees it is not properly reflected in an object tree on the qemu side, we
>> are creating hacks again.
>>    
> 
> There is no such thing as the "bus topology as the guest sees it".
> 
> The guest just sees a bunch of devices.  The guest can only infer things 
> like ISA busses.  The guest sees a bunch of devices: an i8254, i8259, 
> RTC, etc.  Whether those devices are on an ISA bus, and LPC bus, or all 
> in a SuperI/O chip that's part of the southbridge is all invisible to 
> the guest.
> 
> The device model topology is 100% a hidden architectural detail.

This is true for the sysbus, it is obviously not the case for PCI and
similarly discoverable buses. There we have a guest-explorable topology
that is currently equivalent to the qdev layout.

> 
>> Management and analysis tools must be able to traverse the system buses
>> and find guest devices this way.
> 
> We need to provide a compatible interface to the guest.  If you agree 
> with my above statements, then you'll also agree that we can do this 
> without keeping the device model topology stable.
> 
> But we also need to provide a compatible interface to management tools.  
> Exposing the device model topology as a compatible interface 
> artificially limits us.  It's far better to provide higher level 
> supported interfaces to give us the flexibility to change the device 
> model as we need to.

How do you want to change qdev to keep the guest and management tool
view stable while branching off kvm sub-buses? Please propose such
extensions so that they can be discussed. IIUC, that would be a second
relation between qdev and qbus instances besides the physical topology.
What further use cases (besides passing kvm_state around) do you have in
mind?

> 
>>   If they create a device on bus X, it
>> must never end up on bus Y just because it happens to be KVM-assisted or
>> has some other property.
> 
> Nope.  This is exactly what should happen.
> 
> 90% of the devices in the device model are not created by management 
> tools.  They're part of a chipset.  The chipset has well defined 
> extension points and we provide management interfaces to create devices 
> on those extension points.  That is, interfaces to create PCI devices.
> 

Creating kvm irqchips via the management tool would be one simple way
(not the only one, though) to enable/disable kvm assistance for those
devices.

Jan
Alex Williamson Jan. 18, 2011, 5:02 p.m. UTC | #27
On Tue, 2011-01-18 at 16:54 +0100, Jan Kiszka wrote:
> On 2011-01-18 16:48, Anthony Liguori wrote:
> > On 01/18/2011 09:43 AM, Jan Kiszka wrote:
> >> On 2011-01-18 16:04, Anthony Liguori wrote:
> >>    
> >>> On 01/18/2011 08:28 AM, Jan Kiszka wrote:
> >>>      
> >>>> On 2011-01-12 11:31, Jan Kiszka wrote:
> >>>>
> >>>>        
> >>>>> Am 12.01.2011 11:22, Avi Kivity wrote:
> >>>>>
> >>>>>          
> >>>>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
> >>>>>>
> >>>>>>            
> >>>>>>> Right, we should introduce a KVMBus that KVM devices are created on.
> >>>>>>> The devices can get at KVMState through the BusState.
> >>>>>>>
> >>>>>>>              
> >>>>>> There is no kvm bus in a PC (I looked).  We're bending the device model
> >>>>>> here because a device is implemented in the kernel and not in
> >>>>>> userspace.  An implementation detail is magnified beyond all proportions.
> >>>>>>
> >>>>>> An ioapic that is implemented by kvm lives in exactly the same place
> >>>>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
> >>>>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
> >>>>>> it elsewhere, not through creating imaginary buses that don't exist.
> >>>>>>
> >>>>>>
> >>>>>>            
> >>>>> Exactly.
> >>>>>
> >>>>> So we can either "infect" the whole device tree with kvm (or maybe a
> >>>>> more generic accelerator structure that also deals with Xen) or we need
> >>>>> to pull the reference inside the device's init function from some global
> >>>>> service (kvm_get_state).
> >>>>>
> >>>>>          
> >>>> Note that this topic is still waiting for good suggestions, specifically
> >>>> from those who believe in kvm_state references :). This is not only
> >>>> blocking kvmstate merge but will affect KVM irqchips as well.
> >>>>
> >>>> It boils down to how we reasonably pass a kvm_state reference from
> >>>> machine init code to a sysbus device. I'm probably biased, but I don't
> >>>> see any way that does not work against the idea of confining access to
> >>>> kvm_state or breaks device instantiation from the command line or a
> >>>> config file.
> >>>>
> >>>>        
> >>> A KVM device should sit on a KVM specific bus that hangs off of sysbus.
> >>> It can get to kvm_state through that bus.
> >>>
> >>> That bus doesn't get instantiated through qdev so requiring a pointer
> >>> argument should not be an issue.
> >>>
> >>>      
> >> This design is in conflict with the requirement to attach KVM-assisted
> >> devices also to their home bus, e.g. an assigned PCI device to the PCI
> >> bus. We don't support multi-homed qdev devices.
> >>    
> > 
> > With vfio, would an assigned PCI device even need kvm_state?
> 
> IIUC: Yes, for establishing the irqfd link.

We abstract this through the msi/msix layer though, so the vfio driver
doesn't directly know anything about kvm_state.

Alex
Jan Kiszka Jan. 18, 2011, 5:08 p.m. UTC | #28
On 2011-01-18 18:02, Alex Williamson wrote:
> On Tue, 2011-01-18 at 16:54 +0100, Jan Kiszka wrote:
>> On 2011-01-18 16:48, Anthony Liguori wrote:
>>> On 01/18/2011 09:43 AM, Jan Kiszka wrote:
>>>> On 2011-01-18 16:04, Anthony Liguori wrote:
>>>>    
>>>>> On 01/18/2011 08:28 AM, Jan Kiszka wrote:
>>>>>      
>>>>>> On 2011-01-12 11:31, Jan Kiszka wrote:
>>>>>>
>>>>>>        
>>>>>>> Am 12.01.2011 11:22, Avi Kivity wrote:
>>>>>>>
>>>>>>>          
>>>>>>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>>>>>>>
>>>>>>>>            
>>>>>>>>> Right, we should introduce a KVMBus that KVM devices are created on.
>>>>>>>>> The devices can get at KVMState through the BusState.
>>>>>>>>>
>>>>>>>>>              
>>>>>>>> There is no kvm bus in a PC (I looked).  We're bending the device model
>>>>>>>> here because a device is implemented in the kernel and not in
>>>>>>>> userspace.  An implementation detail is magnified beyond all proportions.
>>>>>>>>
>>>>>>>> An ioapic that is implemented by kvm lives in exactly the same place
>>>>>>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
>>>>>>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
>>>>>>>> it elsewhere, not through creating imaginary buses that don't exist.
>>>>>>>>
>>>>>>>>
>>>>>>>>            
>>>>>>> Exactly.
>>>>>>>
>>>>>>> So we can either "infect" the whole device tree with kvm (or maybe a
>>>>>>> more generic accelerator structure that also deals with Xen) or we need
>>>>>>> to pull the reference inside the device's init function from some global
>>>>>>> service (kvm_get_state).
>>>>>>>
>>>>>>>          
>>>>>> Note that this topic is still waiting for good suggestions, specifically
>>>>>> from those who believe in kvm_state references :). This is not only
>>>>>> blocking kvmstate merge but will affect KVM irqchips as well.
>>>>>>
>>>>>> It boils down to how we reasonably pass a kvm_state reference from
>>>>>> machine init code to a sysbus device. I'm probably biased, but I don't
>>>>>> see any way that does not work against the idea of confining access to
>>>>>> kvm_state or breaks device instantiation from the command line or a
>>>>>> config file.
>>>>>>
>>>>>>        
>>>>> A KVM device should sit on a KVM specific bus that hangs off of sysbus.
>>>>> It can get to kvm_state through that bus.
>>>>>
>>>>> That bus doesn't get instantiated through qdev so requiring a pointer
>>>>> argument should not be an issue.
>>>>>
>>>>>      
>>>> This design is in conflict with the requirement to attach KVM-assisted
>>>> devices also to their home bus, e.g. an assigned PCI device to the PCI
>>>> bus. We don't support multi-homed qdev devices.
>>>>    
>>>
>>> With vfio, would an assigned PCI device even need kvm_state?
>>
>> IIUC: Yes, for establishing the irqfd link.
> 
> We abstract this through the msi/msix layer though, so the vfio driver
> doesn't directly know anything about kvm_state.

Which version/tree are you referring to? It wasn't the case in the last
version I found on the list.

Does the msi layer use irqfd for every source in kvm mode then? Of
course, the key question will be how that layer will once obtain kvm_state.

Jan
Anthony Liguori Jan. 18, 2011, 5:09 p.m. UTC | #29
On 01/18/2011 10:56 AM, Jan Kiszka wrote:
>
>> The device model topology is 100% a hidden architectural detail.
>>      
> This is true for the sysbus, it is obviously not the case for PCI and
> similarly discoverable buses. There we have a guest-explorable topology
> that is currently equivalent to the qdev layout.
>    

But we also don't do PCI passthrough so we really haven't even explored 
how that maps in qdev.  I don't know if qemu-kvm has attempted to 
qdev-ify it.

>>> Management and analysis tools must be able to traverse the system buses
>>> and find guest devices this way.
>>>        
>> We need to provide a compatible interface to the guest.  If you agree
>> with my above statements, then you'll also agree that we can do this
>> without keeping the device model topology stable.
>>
>> But we also need to provide a compatible interface to management tools.
>> Exposing the device model topology as a compatible interface
>> artificially limits us.  It's far better to provide higher level
>> supported interfaces to give us the flexibility to change the device
>> model as we need to.
>>      
> How do you want to change qdev to keep the guest and management tool
> view stable while branching off kvm sub-buses?

The qdev device model is not a stable interface.  I think that's been 
clear from the very beginning.

>   Please propose such
> extensions so that they can be discussed. IIUC, that would be second
> relation between qdev and qbus instances besides the physical topology.
> What further use cases (besides passing kvm_state around) do you have in
> mind?
>    

The -device interface is a stable interface.  Right now, you don't 
specify any type of identifier of the pci bus when you create a PCI 
device.  It's implied in the interface.

>    
>>      
>>>    If they create a device on bus X, it
>>> must never end up on bus Y just because it happens to be KVM-assisted or
>>> has some other property.
>>>        
>> Nope.  This is exactly what should happen.
>>
>> 90% of the devices in the device model are not created by management
>> tools.  They're part of a chipset.  The chipset has well defined
>> extension points and we provide management interfaces to create devices
>> on those extension points.  That is, interfaces to create PCI devices.
>>
>>      
> Creating kvm irqchips via the management tool would be one simple way
> (not the only one, though) to enable/disable kvm assistance for those
> devices.
>    

It's automatically created as part of the CPUs or as part of the 
chipset.  How to enable/disable kvm assistance is a property of the CPU 
and/or chipset.

Regards,

Anthony Liguori


> Jan
>
>
Jan Kiszka Jan. 18, 2011, 5:20 p.m. UTC | #30
On 2011-01-18 18:09, Anthony Liguori wrote:
> On 01/18/2011 10:56 AM, Jan Kiszka wrote:
>>
>>> The device model topology is 100% a hidden architectural detail.
>>>      
>> This is true for the sysbus, it is obviously not the case for PCI and
>> similarly discoverable buses. There we have a guest-explorable topology
>> that is currently equivalent to the qdev layout.
>>    
> 
> But we also don't do PCI passthrough so we really haven't even explored 
> how that maps in qdev.  I don't know if qemu-kvm has attempted to 
> qdev-ify it.

It is. And even if it weren't or the current version in qemu-kvm was not
perfect, we need to consider those use cases now as we are about to
define a generic model for kvm device integration. That's the point of
this discussion.

> 
>>>> Management and analysis tools must be able to traverse the system buses
>>>> and find guest devices this way.
>>>>        
>>> We need to provide a compatible interface to the guest.  If you agree
>>> with my above statements, then you'll also agree that we can do this
>>> without keeping the device model topology stable.
>>>
>>> But we also need to provide a compatible interface to management tools.
>>> Exposing the device model topology as a compatible interface
>>> artificially limits us.  It's far better to provide higher level
>>> supported interfaces to give us the flexibility to change the device
>>> model as we need to.
>>>      
>> How do you want to change qdev to keep the guest and management tool
>> view stable while branching off kvm sub-buses?
> 
> The qdev device model is not a stable interface.  I think that's been 
> clear from the very beginning.

Internals aren't stable, but they should only be changed for a good
reason, specifically when the change may impact the whole set of device
models.

> 
>>   Please propose such
>> extensions so that they can be discussed. IIUC, that would be second
>> relation between qdev and qbus instances besides the physical topology.
>> What further use cases (besides passing kvm_state around) do you have in
>> mind?
>>    
> 
> The -device interface is a stable interface.  Right now, you don't 
> specify any type of identifier of the pci bus when you create a PCI 
> device.  It's implied in the interface.

Which only works as along as we expose a single bus. You don't need to
be an oracle to predict that this is not a stable interface.

> 
>>    
>>>      
>>>>    If they create a device on bus X, it
>>>> must never end up on bus Y just because it happens to be KVM-assisted or
>>>> has some other property.
>>>>        
>>> Nope.  This is exactly what should happen.
>>>
>>> 90% of the devices in the device model are not created by management
>>> tools.  They're part of a chipset.  The chipset has well defined
>>> extension points and we provide management interfaces to create devices
>>> on those extension points.  That is, interfaces to create PCI devices.
>>>
>>>      
>> Creating kvm irqchips via the management tool would be one simple way
>> (not the only one, though) to enable/disable kvm assistance for those
>> devices.
>>    
> 
> It's automatically created as part of the CPUs or as part of the 
> chipset.  How to enable/disable kvm assistance is a property of the CPU 
> and/or chipset.

If we exclude creation via command line / config files, we could also
pass the kvm_state directly from the machine or chipset setup code and
save us at least the kvm system buses.

Jan
Anthony Liguori Jan. 18, 2011, 5:31 p.m. UTC | #31
On 01/18/2011 11:20 AM, Jan Kiszka wrote:
>
> Which only works as along as we expose a single bus. You don't need to
> be an oracle to predict that this is not a stable interface.
>    

Today we only have a very low level factory interface--device creation 
and deletion.

I think we should move to higher level bus factory interfaces.  An 
interface to create a PCI device and to delete PCI devices.  This is the 
only sane way to do hot plug.

This also makes supporting multiple busses a lot more reasonable since 
this factory interface could be a method of the controller.

>> It's automatically created as part of the CPUs or as part of the
>> chipset.  How to enable/disable kvm assistance is a property of the CPU
>> and/or chipset.
>>      
> If we exclude creation via command line / config files, we could also
> pass the kvm_state directly from the machine or chipset setup code and
> save us at least the kvm system buses.
>    

Which is fine in the short term.  This is exactly why we don't want the 
device model to be an ABI.   It gives us the ability to make changes as 
they make sense instead of trying to be perfect from the start (which we 
never will be).

Regards,

Anthony Liguori

> Jan
>
>
Alex Williamson Jan. 18, 2011, 5:39 p.m. UTC | #32
On Tue, 2011-01-18 at 18:08 +0100, Jan Kiszka wrote:
> On 2011-01-18 18:02, Alex Williamson wrote:
> > On Tue, 2011-01-18 at 16:54 +0100, Jan Kiszka wrote:
> >> On 2011-01-18 16:48, Anthony Liguori wrote:
> >>> On 01/18/2011 09:43 AM, Jan Kiszka wrote:
> >>>> On 2011-01-18 16:04, Anthony Liguori wrote:
> >>>>    
> >>>>> On 01/18/2011 08:28 AM, Jan Kiszka wrote:
> >>>>>      
> >>>>>> On 2011-01-12 11:31, Jan Kiszka wrote:
> >>>>>>
> >>>>>>        
> >>>>>>> Am 12.01.2011 11:22, Avi Kivity wrote:
> >>>>>>>
> >>>>>>>          
> >>>>>>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
> >>>>>>>>
> >>>>>>>>            
> >>>>>>>>> Right, we should introduce a KVMBus that KVM devices are created on.
> >>>>>>>>> The devices can get at KVMState through the BusState.
> >>>>>>>>>
> >>>>>>>>>              
> >>>>>>>> There is no kvm bus in a PC (I looked).  We're bending the device model
> >>>>>>>> here because a device is implemented in the kernel and not in
> >>>>>>>> userspace.  An implementation detail is magnified beyond all proportions.
> >>>>>>>>
> >>>>>>>> An ioapic that is implemented by kvm lives in exactly the same place
> >>>>>>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
> >>>>>>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
> >>>>>>>> it elsewhere, not through creating imaginary buses that don't exist.
> >>>>>>>>
> >>>>>>>>
> >>>>>>>>            
> >>>>>>> Exactly.
> >>>>>>>
> >>>>>>> So we can either "infect" the whole device tree with kvm (or maybe a
> >>>>>>> more generic accelerator structure that also deals with Xen) or we need
> >>>>>>> to pull the reference inside the device's init function from some global
> >>>>>>> service (kvm_get_state).
> >>>>>>>
> >>>>>>>          
> >>>>>> Note that this topic is still waiting for good suggestions, specifically
> >>>>>> from those who believe in kvm_state references :). This is not only
> >>>>>> blocking kvmstate merge but will affect KVM irqchips as well.
> >>>>>>
> >>>>>> It boils down to how we reasonably pass a kvm_state reference from
> >>>>>> machine init code to a sysbus device. I'm probably biased, but I don't
> >>>>>> see any way that does not work against the idea of confining access to
> >>>>>> kvm_state or breaks device instantiation from the command line or a
> >>>>>> config file.
> >>>>>>
> >>>>>>        
> >>>>> A KVM device should sit on a KVM specific bus that hangs off of sysbus.
> >>>>> It can get to kvm_state through that bus.
> >>>>>
> >>>>> That bus doesn't get instantiated through qdev so requiring a pointer
> >>>>> argument should not be an issue.
> >>>>>
> >>>>>      
> >>>> This design is in conflict with the requirement to attach KVM-assisted
> >>>> devices also to their home bus, e.g. an assigned PCI device to the PCI
> >>>> bus. We don't support multi-homed qdev devices.
> >>>>    
> >>>
> >>> With vfio, would an assigned PCI device even need kvm_state?
> >>
> >> IIUC: Yes, for establishing the irqfd link.
> > 
> > We abstract this through the msi/msix layer though, so the vfio driver
> > doesn't directly know anything about kvm_state.
> 
> Which version/tree are you referring to? It wasn't the case in the last
> version I found on the list.
> 
> Does the msi layer use irqfd for every source in kvm mode then? Of
> course, the key question will be how that layer will once obtain kvm_state.

Looking at "[RFC PATCH v2] VFIO based device assignment" sent on Nov
5th, I guess we do call kvm_set_irqfd.  Maybe I'm just wishing that the
msi layer abstracted it better.  I'd like to be able to pass in a
userspace interrupt handler function pointer and an event notifier fd
and let the interrupt layers worry about how it's hooked up.

Alex
Jan Kiszka Jan. 18, 2011, 5:45 p.m. UTC | #33
On 2011-01-18 18:31, Anthony Liguori wrote:
>>> It's automatically created as part of the CPUs or as part of the
>>> chipset.  How to enable/disable kvm assistance is a property of the CPU
>>> and/or chipset.
>>>      
>> If we exclude creation via command line / config files, we could also
>> pass the kvm_state directly from the machine or chipset setup code and
>> save us at least the kvm system buses.
>>    
> 
> Which is fine in the short term.

Unless we want to abuse the pointer property for this, and there was
some resistance, we would have to change the sysbus init function
signature. I don't want to propose this for a short-term workaround, we
need a clearer vision and roadmap to avoid multiple invasive changes to
the device model.

>  This is exactly why we don't want the 
> device model to be an ABI.   It gives us the ability to make changes as 
> they make sense instead of trying to be perfect from the start (which we 
> never will be).

The device model will always consist of a stable part, the guest and
management visible topology. That beast needs to be modeled as well,
likely via some new bus objects. If that's the way to go, starting now
is probably the right time as we have an urgent use case, right?

Jan
Gerd Hoffmann Jan. 19, 2011, 9:48 a.m. UTC | #34
On 01/18/11 18:09, Anthony Liguori wrote:
> On 01/18/2011 10:56 AM, Jan Kiszka wrote:
>>
>>> The device model topology is 100% a hidden architectural detail.
>> This is true for the sysbus, it is obviously not the case for PCI and
>> similarly discoverable buses. There we have a guest-explorable topology
>> that is currently equivalent to the qdev layout.
>
> But we also don't do PCI passthrough so we really haven't even explored
> how that maps in qdev. I don't know if qemu-kvm has attempted to
> qdev-ify it.

It is qdev-ified.  It is a normal pci device from qdev's point of view.

BTW: is there any reason why (vfio-based) pci passthrough couldn't work 
with tcg?

> The -device interface is a stable interface. Right now, you don't
> specify any type of identifier of the pci bus when you create a PCI
> device. It's implied in the interface.

Wrong.  You can specify the bus you want attach the device to via 
bus=<name>.  This is true for *every* device, including all pci devices. 
  If unspecified qdev uses the first bus it finds.

As long as there is a single pci bus only there is simply no need to 
specify it, that's why nobody does that today.  Once q35 finally arrives 
this will change of course.

cheers,
   Gerd
Markus Armbruster Jan. 19, 2011, 1:09 p.m. UTC | #35
Anthony Liguori <aliguori@linux.vnet.ibm.com> writes:

> On 01/18/2011 10:56 AM, Jan Kiszka wrote:
>>
>>> The device model topology is 100% a hidden architectural detail.
>>>      
>> This is true for the sysbus, it is obviously not the case for PCI and
>> similarly discoverable buses. There we have a guest-explorable topology
>> that is currently equivalent to the qdev layout.
>>    
>
> But we also don't do PCI passthrough so we really haven't even
> explored how that maps in qdev.  I don't know if qemu-kvm has
> attempted to qdev-ify it.
>
>>>> Management and analysis tools must be able to traverse the system buses
>>>> and find guest devices this way.
>>>>        
>>> We need to provide a compatible interface to the guest.  If you agree
>>> with my above statements, then you'll also agree that we can do this
>>> without keeping the device model topology stable.
>>>
>>> But we also need to provide a compatible interface to management tools.
>>> Exposing the device model topology as a compatible interface
>>> artificially limits us.  It's far better to provide higher level
>>> supported interfaces to give us the flexibility to change the device
>>> model as we need to.
>>>      
>> How do you want to change qdev to keep the guest and management tool
>> view stable while branching off kvm sub-buses?
>
> The qdev device model is not a stable interface.  I think that's been
> clear from the very beginning.
>
>>   Please propose such
>> extensions so that they can be discussed. IIUC, that would be second
>> relation between qdev and qbus instances besides the physical topology.
>> What further use cases (besides passing kvm_state around) do you have in
>> mind?
>>    
>
> The -device interface is a stable interface.  Right now, you don't
> specify any type of identifier of the pci bus when you create a PCI
> device.  It's implied in the interface.

Now I'm confused.  Isn't "-device FOO,bus=pci.0" specifying the PCI bus?

[...]
Markus Armbruster Jan. 19, 2011, 1:11 p.m. UTC | #36
Gerd Hoffmann <kraxel@redhat.com> writes:

> On 01/18/11 18:09, Anthony Liguori wrote:
>> On 01/18/2011 10:56 AM, Jan Kiszka wrote:
>>>
>>>> The device model topology is 100% a hidden architectural detail.
>>> This is true for the sysbus, it is obviously not the case for PCI and
>>> similarly discoverable buses. There we have a guest-explorable topology
>>> that is currently equivalent to the qdev layout.
>>
>> But we also don't do PCI passthrough so we really haven't even explored
>> how that maps in qdev. I don't know if qemu-kvm has attempted to
>> qdev-ify it.
>
> It is qdev-ified.  It is a normal pci device from qdev's point of view.
>
> BTW: is there any reason why (vfio-based) pci passthrough couldn't
> work with tcg?
>
>> The -device interface is a stable interface. Right now, you don't
>> specify any type of identifier of the pci bus when you create a PCI
>> device. It's implied in the interface.
>
> Wrong.  You can specify the bus you want attach the device to via
> bus=<name>.  This is true for *every* device, including all pci
> devices. If unspecified qdev uses the first bus it finds.
>
> As long as there is a single pci bus only there is simply no need to
> specify it, thats why nobody does that today.  Once q35 finally
> arrives this will change of course.

As far as I know, libvirt does it already.
Markus Armbruster Jan. 19, 2011, 1:15 p.m. UTC | #37
Anthony Liguori <aliguori@linux.vnet.ibm.com> writes:

> On 01/18/2011 09:43 AM, Jan Kiszka wrote:
>> On 2011-01-18 16:04, Anthony Liguori wrote:
>>    
>>> On 01/18/2011 08:28 AM, Jan Kiszka wrote:
>>>      
>>>> On 2011-01-12 11:31, Jan Kiszka wrote:
>>>>
>>>>        
>>>>> Am 12.01.2011 11:22, Avi Kivity wrote:
>>>>>
>>>>>          
>>>>>> On 01/11/2011 03:54 PM, Anthony Liguori wrote:
>>>>>>
>>>>>>            
>>>>>>> Right, we should introduce a KVMBus that KVM devices are created on.
>>>>>>> The devices can get at KVMState through the BusState.
>>>>>>>
>>>>>>>              
>>>>>> There is no kvm bus in a PC (I looked).  We're bending the device model
>>>>>> here because a device is implemented in the kernel and not in
>>>>>> userspace.  An implementation detail is magnified beyond all proportions.
>>>>>>
>>>>>> An ioapic that is implemented by kvm lives in exactly the same place
>>>>>> that the qemu ioapic lives in.  An assigned pci device lives on the PCI
>>>>>> bus, not a KVMBus.  If we need a pointer to KVMState, then we must find
>>>>>> it elsewhere, not through creating imaginary buses that don't exist.
>>>>>>
>>>>>>
>>>>>>            
>>>>> Exactly.
>>>>>
>>>>> So we can either "infect" the whole device tree with kvm (or maybe a
>>>>> more generic accelerator structure that also deals with Xen) or we need
>>>>> to pull the reference inside the device's init function from some global
>>>>> service (kvm_get_state).
>>>>>
>>>>>          
>>>> Note that this topic is still waiting for good suggestions, specifically
>>>> from those who believe in kvm_state references :). This is not only
>>>> blocking kvmstate merge but will affect KVM irqchips as well.
>>>>
>>>> It boils down to how we reasonably pass a kvm_state reference from
>>>> machine init code to a sysbus device. I'm probably biased, but I don't
>>>> see any way that does not work against the idea of confining access to
>>>> kvm_state or breaks device instantiation from the command line or a
>>>> config file.
>>>>
>>>>        
>>> A KVM device should sit on a KVM specific bus that hangs off of sysbus.
>>> It can get to kvm_state through that bus.
>>>
>>> That bus doesn't get instantiated through qdev so requiring a pointer
>>> argument should not be an issue.
>>>
>>>      
>> This design is in conflict with the requirement to attach KVM-assisted
>> devices also to their home bus, e.g. an assigned PCI device to the PCI
>> bus. We don't support multi-homed qdev devices.
>>    
>
> The bus topology reflects how I/O flows in and out of a device.  We do
> not model a perfect PC bus architecture and I don't think we ever
> intend to.  Instead, we model a functional architecture.
>
> I/O from an assigned device does not flow through the emulated PCI
> bus.  Therefore, it does not belong on the emulated PCI bus.
>
> Assigned devices need to interact with the emulated PCI bus, but they
> shouldn't be children of it.

So they interact with KVM (need kvm_state), and they interact with the
emulated PCI bus.  Could you elaborate on the fundamental difference
between the two interactions that makes you choose the (hypothetical)
KVM bus over the PCI bus as device parent?
Anthony Liguori Jan. 19, 2011, 4:53 p.m. UTC | #38
On 01/19/2011 03:48 AM, Gerd Hoffmann wrote:
> On 01/18/11 18:09, Anthony Liguori wrote:
>> On 01/18/2011 10:56 AM, Jan Kiszka wrote:
>>>
>>>> The device model topology is 100% a hidden architectural detail.
>>> This is true for the sysbus, it is obviously not the case for PCI and
>>> similarly discoverable buses. There we have a guest-explorable topology
>>> that is currently equivalent to the qdev layout.
>>
>> But we also don't do PCI passthrough so we really haven't even explored
>> how that maps in qdev. I don't know if qemu-kvm has attempted to
>> qdev-ify it.
>
> It is qdev-ified.  It is a normal pci device from qdev's point of view.
>
> BTW: is there any reason why (vfio-based) pci passthrough couldn't 
> work with tcg?
>
>> The -device interface is a stable interface. Right now, you don't
>> specify any type of identifier of the pci bus when you create a PCI
>> device. It's implied in the interface.
>
> Wrong.  You can specify the bus you want attach the device to via 
> bus=<name>.  This is true for *every* device, including all pci 
> devices.  If unspecified qdev uses the first bus it finds.
>
> As long as there is a single pci bus only there is simply no need to 
> specify it, that's why nobody does that today.

Right.  In terms of specifying bus=, what are we promising re: 
compatibility?  Will there always be a pci.0?  If we add some PCI-to-PCI 
bridges in order to support more devices, is libvirt supposed to parse 
the hierarchy and figure out which bus to put the device on?

Regards,

Anthony Liguori

>   Once q35 finally arrives this will change of course.
>
> cheers,
>   Gerd
Anthony Liguori Jan. 19, 2011, 4:54 p.m. UTC | #39
On 01/19/2011 07:11 AM, Markus Armbruster wrote:
> Gerd Hoffmann<kraxel@redhat.com>  writes:
>
>    
>> On 01/18/11 18:09, Anthony Liguori wrote:
>>      
>>> On 01/18/2011 10:56 AM, Jan Kiszka wrote:
>>>        
>>>>          
>>>>> The device model topology is 100% a hidden architectural detail.
>>>>>            
>>>> This is true for the sysbus, it is obviously not the case for PCI and
>>>> similarly discoverable buses. There we have a guest-explorable topology
>>>> that is currently equivalent to the qdev layout.
>>>>          
>>> But we also don't do PCI passthrough so we really haven't even explored
>>> how that maps in qdev. I don't know if qemu-kvm has attempted to
>>> qdev-ify it.
>>>        
>> It is qdev-ified.  It is a normal pci device from qdev's point of view.
>>
>> BTW: is there any reason why (vfio-based) pci passthrough couldn't
>> work with tcg?
>>
>>      
>>> The -device interface is a stable interface. Right now, you don't
>>> specify any type of identifier of the pci bus when you create a PCI
>>> device. It's implied in the interface.
>>>        
>> Wrong.  You can specify the bus you want attach the device to via
>> bus=<name>.  This is true for *every* device, including all pci
>> devices. If unspecified qdev uses the first bus it finds.
>>
>> As long as there is a single pci bus only there is simply no need to
>> specify it, thats why nobody does that today.  Once q35 finally
>> arrives this will change of course.
>>      
> As far as I know, libvirt does it already.
>    

I think that's a bad idea from a forward compatibility perspective.

Regards,

Anthony Liguori
Anthony Liguori Jan. 19, 2011, 4:57 p.m. UTC | #40
On 01/19/2011 07:15 AM, Markus Armbruster wrote:
> So they interact with KVM (need kvm_state), and they interact with the
> emulated PCI bus.  Could you elaborate on the fundamental difference
> between the two interactions that makes you choose the (hypothetical)
> KVM bus over the PCI bus as device parent?
>    

It's almost arbitrary, but I would say it's the direction that I/Os flow.

But if the underlying observation is that the device tree is not really 
a tree, you're 100% correct.  This is part of why a factory interface 
that just takes a parent bus is too simplistic.

I think we ought to introduce a -pci-device option that is specifically 
for creating PCI devices that doesn't require a parent bus argument but 
provides a way to specify stable addressing (for instancing, using a 
linear index).

Regards,

Anthony Liguori
Daniel P. Berrangé Jan. 19, 2011, 5:01 p.m. UTC | #41
On Wed, Jan 19, 2011 at 10:53:30AM -0600, Anthony Liguori wrote:
> On 01/19/2011 03:48 AM, Gerd Hoffmann wrote:
> >On 01/18/11 18:09, Anthony Liguori wrote:
> >>On 01/18/2011 10:56 AM, Jan Kiszka wrote:
> >>>
> >>>>The device model topology is 100% a hidden architectural detail.
> >>>This is true for the sysbus, it is obviously not the case for PCI and
> >>>similarly discoverable buses. There we have a guest-explorable topology
> >>>that is currently equivalent to the qdev layout.
> >>
> >>But we also don't do PCI passthrough so we really haven't even explored
> >>how that maps in qdev. I don't know if qemu-kvm has attempted to
> >>qdev-ify it.
> >
> >It is qdev-ified.  It is a normal pci device from qdev's point of view.
> >
> >BTW: is there any reason why (vfio-based) pci passthrough couldn't
> >work with tcg?
> >
> >>The -device interface is a stable interface. Right now, you don't
> >>specify any type of identifier of the pci bus when you create a PCI
> >>device. It's implied in the interface.
> >
> >Wrong.  You can specify the bus you want attach the device to via
> >bus=<name>.  This is true for *every* device, including all pci
> >devices.  If unspecified qdev uses the first bus it finds.
> >
> >As long as there is a single pci bus only there is simply no need
> >to specify it, that's why nobody does that today.
> 
> Right.  In terms of specifying bus=, what are we promising re:
> compatibility?  Will there always be a pci.0?  If we add some
> PCI-to-PCI bridges in order to support more devices, is libvirt
> supposed to parse the hierarchy and figure out which bus to put the
> device on?

The reason we specify 'bus' is that we wanted to be flexible wrt
upgrades of libvirt, without needing restarts of QEMU instances
it manages. That way we can introduce new functionality into
libvirt that relies on it having previously set 'bus' on all
active QEMUs.

If QEMU adds PCI-to-PCI bridges, then I wouldn't expect QEMU to
be adding the extra bridges. I'd expect that QEMU provided just
the first bridge and then libvirt would specify how many more
bridges to create at boot or hotplug them later. So it wouldn't
ever need to parse topology.

Regards,
Daniel
Daniel P. Berrangé Jan. 19, 2011, 5:19 p.m. UTC | #42
On Wed, Jan 19, 2011 at 10:54:10AM -0600, Anthony Liguori wrote:
> On 01/19/2011 07:11 AM, Markus Armbruster wrote:
> >Gerd Hoffmann<kraxel@redhat.com>  writes:
> >
> >>On 01/18/11 18:09, Anthony Liguori wrote:
> >>>On 01/18/2011 10:56 AM, Jan Kiszka wrote:
> >>>>>The device model topology is 100% a hidden architectural detail.
> >>>>This is true for the sysbus, it is obviously not the case for PCI and
> >>>>similarly discoverable buses. There we have a guest-explorable topology
> >>>>that is currently equivalent to the qdev layout.
> >>>But we also don't do PCI passthrough so we really haven't even explored
> >>>how that maps in qdev. I don't know if qemu-kvm has attempted to
> >>>qdev-ify it.
> >>It is qdev-ified.  It is a normal pci device from qdev's point of view.
> >>
> >>BTW: is there any reason why (vfio-based) pci passthrough couldn't
> >>work with tcg?
> >>
> >>>The -device interface is a stable interface. Right now, you don't
> >>>specify any type of identifier of the pci bus when you create a PCI
> >>>device. It's implied in the interface.
> >>Wrong.  You can specify the bus you want attach the device to via
> >>bus=<name>.  This is true for *every* device, including all pci
> >>devices. If unspecified qdev uses the first bus it finds.
> >>
> >>As long as there is a single pci bus only there is simply no need to
> >>specify it, that's why nobody does that today.  Once q35 finally
> >>arrives this will change of course.
> >As far as I know, libvirt does it already.
> 
> I think that's a bad idea from a forward compatibility perspective.

In our past experience though, *not* specifying attributes like
these has also been pretty bad from a forward compatibility
perspective too. We're kind of damned either way, so on balance
we decided we'd specify every attribute in qdev that's related
to unique identification of devices & their inter-relationships.
By strictly locking down the topology we were defining, we ought
to have a more stable ABI in face of future changes. I accept
this might not always work out, so we may have to adjust things
over time still. Predicting the future is hard :-)

Daniel
Jan Kiszka Jan. 19, 2011, 5:25 p.m. UTC | #43
On 2011-01-19 17:57, Anthony Liguori wrote:
> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>> So they interact with KVM (need kvm_state), and they interact with the
>> emulated PCI bus.  Could you elaborate on the fundamental difference
>> between the two interactions that makes you choose the (hypothetical)
>> KVM bus over the PCI bus as device parent?
>>    
> 
> It's almost arbitrary, but I would say it's the direction that I/Os flow.

We need both if we want KVM buses. They are useless for enumerating the
device on that bus the guest sees it on.

Jan
Daniel P. Berrangé Jan. 19, 2011, 5:35 p.m. UTC | #44
On Wed, Jan 19, 2011 at 10:53:30AM -0600, Anthony Liguori wrote:
> On 01/19/2011 03:48 AM, Gerd Hoffmann wrote:
> >On 01/18/11 18:09, Anthony Liguori wrote:
> >>On 01/18/2011 10:56 AM, Jan Kiszka wrote:
> >>>
> >>>>The device model topology is 100% a hidden architectural detail.
> >>>This is true for the sysbus, it is obviously not the case for PCI and
> >>>similarly discoverable buses. There we have a guest-explorable topology
> >>>that is currently equivalent to the qdev layout.
> >>
> >>But we also don't do PCI passthrough so we really haven't even explored
> >>how that maps in qdev. I don't know if qemu-kvm has attempted to
> >>qdev-ify it.
> >
> >It is qdev-ified.  It is a normal pci device from qdev's point of view.
> >
> >BTW: is there any reason why (vfio-based) pci passthrough couldn't
> >work with tcg?
> >
> >>The -device interface is a stable interface. Right now, you don't
> >>specify any type of identifier of the pci bus when you create a PCI
> >>device. It's implied in the interface.
> >
> >Wrong.  You can specify the bus you want attach the device to via
> >bus=<name>.  This is true for *every* device, including all pci
> >devices.  If unspecified qdev uses the first bus it finds.
> >
> >As long as there is a single pci bus only there is simply no need
> >to specify it, that's why nobody does that today.
> 
> Right.  In terms of specifying bus=, what are we promising re:
> compatibility?  Will there always be a pci.0?  If we add some
> PCI-to-PCI bridges in order to support more devices, is libvirt
> >supposed to parse the hierarchy and figure out which bus to put the
> device on?

The answer to your questions probably differ depending on
whether '-nodefconfig' and '-nodefaults' are set on the
command line.  If they are set, then I'd expect to only
ever see one PCI bus with name pci.0 forever more, unless
i explicitly ask for more. If they are not set, then you
might expect to see multiple PCI buses by appear by magic

Daniel
Anthony Liguori Jan. 19, 2011, 5:42 p.m. UTC | #45
On 01/19/2011 11:35 AM, Daniel P. Berrange wrote:
> On Wed, Jan 19, 2011 at 10:53:30AM -0600, Anthony Liguori wrote:
>    
>> On 01/19/2011 03:48 AM, Gerd Hoffmann wrote:
>>      
>>> On 01/18/11 18:09, Anthony Liguori wrote:
>>>        
>>>> On 01/18/2011 10:56 AM, Jan Kiszka wrote:
>>>>          
>>>>>            
>>>>>> The device model topology is 100% a hidden architectural detail.
>>>>>>              
>>>>> This is true for the sysbus, it is obviously not the case for PCI and
>>>>> similarly discoverable buses. There we have a guest-explorable topology
>>>>> that is currently equivalent to the qdev layout.
>>>>>            
>>>> But we also don't do PCI passthrough so we really haven't even explored
>>>> how that maps in qdev. I don't know if qemu-kvm has attempted to
>>>> qdev-ify it.
>>>>          
>>> It is qdev-ified.  It is a normal pci device from qdev's point of view.
>>>
>>> BTW: is there any reason why (vfio-based) pci passthrough couldn't
>>> work with tcg?
>>>
>>>        
>>>> The -device interface is a stable interface. Right now, you don't
>>>> specify any type of identifier of the pci bus when you create a PCI
>>>> device. It's implied in the interface.
>>>>          
>>> Wrong.  You can specify the bus you want attach the device to via
>>> bus=<name>.  This is true for *every* device, including all pci
>>> devices.  If unspecified qdev uses the first bus it finds.
>>>
>>> As long as there is a single pci bus only there is simply no need
>>> to specify it, that's why nobody does that today.
>>>        
>> Right.  In terms of specifying bus=, what are we promising re:
>> compatibility?  Will there always be a pci.0?  If we add some
>> PCI-to-PCI bridges in order to support more devices, is libvirt
>> support to parse the hierarchy and figure out which bus to put the
>> device on?
>>      
> The answer to your questions probably differ depending on
> whether '-nodefconfig' and '-nodefaults' are set on the
> command line.  If they are set, then I'd expect to only
> ever see one PCI bus with name pci.0 forever more, unless
> i explicitly ask for more. If they are not set, then you
> might expect to see multiple PCI buses by appear by magic
>    

Yeah, we can't promise that.  If you use -M pc, you aren't guaranteed a 
stable PCI bus topology even with -nodefconfig/-nodefaults.

Regards,

Anthony Liguori

> Daniel
>
Anthony Liguori Jan. 19, 2011, 5:43 p.m. UTC | #46
On 01/19/2011 11:19 AM, Daniel P. Berrange wrote:
>
> In our past experience though, *not* specifying attributes like
> these has also been pretty bad from a forward compatibility
> perspective too. We're kind of damned either way, so on balance
> we decided we'd specify every attribute in qdev that's related
> to unique identification of devices&  their inter-relationships.
> By strictly locking down the topology we were defining, we ought
> to have a more stable ABI in face of future changes. I accept
> this might not always work out, so we may have to adjust things
> over time still. Predicting the future is hard :-)
>    

There are two distinct things here:

1) creating exactly the same virtual machine (like for migration) given 
a newer version of QEMU

2) creating a reasonably similar virtual machine given a newer version 
of QEMU

For (1), you cannot use -M pc.  You should use things like bus=X,addr=Y;
much better is for QEMU to dump a device file and to just reuse that
instead of guessing what you need.

For (2), you cannot use bus=X,addr=Y because it makes assumptions about 
the PCI topology which may change in newer -M pc's.

I think libvirt needs to treat this two scenarios differently to support 
forwards compatibility.

Regards,

Anthony Liguori

> Daniel
>
Anthony Liguori Jan. 19, 2011, 5:51 p.m. UTC | #47
On 01/19/2011 11:01 AM, Daniel P. Berrange wrote:
>
> The reason we specify 'bus' is that we wanted to be flexible wrt
> upgrades of libvirt, without needing restarts of QEMU instances
> it manages. That way we can introduce new functionality into
> libvirt that relies on it having previously set 'bus' on all
> active QEMUs.
>
> If QEMU adds PCI-to-PCI bridges, then I wouldn't expect QEMU to
> be adding the extra bridges. I'd expect that QEMU provided just
> the first bridge and then libvirt would specify how many more
> bridges to create at boot or hotplug them later. So it wouldn't
> ever need to parse topology.
>    

Yeah, but replacing the main chipset will certainly change the PCI 
topology such that if you're specifying bus=X and addr=X and then also 
using -M pc, unless you're parsing the default topology to come up with 
the addressing, it will break in the future.

That's why I think something simpler like a linear index that QEMU maps 
to a static location in the topology is probably the best future proof 
interface.

Regards,

Anthony Liguori

> Regards,
> Daniel
>
Daniel P. Berrangé Jan. 19, 2011, 6:52 p.m. UTC | #48
On Wed, Jan 19, 2011 at 11:51:58AM -0600, Anthony Liguori wrote:
> On 01/19/2011 11:01 AM, Daniel P. Berrange wrote:
> >
> >The reason we specify 'bus' is that we wanted to be flexible wrt
> >upgrades of libvirt, without needing restarts of QEMU instances
> >it manages. That way we can introduce new functionality into
> >libvirt that relies on it having previously set 'bus' on all
> >active QEMUs.
> >
> >If QEMU adds PCI-to-PCI bridges, then I wouldn't expect QEMU to
> >be adding the extra bridges. I'd expect that QEMU provided just
> >the first bridge and then libvirt would specify how many more
> >bridges to create at boot or hotplug them later. So it wouldn't
> >ever need to parse topology.
> 
> Yeah, but replacing the main chipset will certainly change the PCI
> topology such that if you're specifying bus=X and addr=X and then
> also using -M pc, unless you're parsing the default topology to come
> up with the addressing, it will break in the future.

We never use a bare '-M pc' though, we always canonicalize to
one of the versioned forms.  So if we run '-M pc-0.12', then
neither the main PCI chipset nor topology would have changed
in newer QEMU.  Of course if we deployed a new VM with
'-M pc-0.20' that might have new PCI chipset, so bus=pci.0
might have different meaning than it did when used with
'-M pc-0.12', but I don't think that's an immediate problem

Regards,
Daniel
Daniel P. Berrangé Jan. 19, 2011, 6:53 p.m. UTC | #49
On Wed, Jan 19, 2011 at 11:42:18AM -0600, Anthony Liguori wrote:
> On 01/19/2011 11:35 AM, Daniel P. Berrange wrote:
> >On Wed, Jan 19, 2011 at 10:53:30AM -0600, Anthony Liguori wrote:
> >>On 01/19/2011 03:48 AM, Gerd Hoffmann wrote:
> >>>On 01/18/11 18:09, Anthony Liguori wrote:
> >>>>On 01/18/2011 10:56 AM, Jan Kiszka wrote:
> >>>>>>The device model topology is 100% a hidden architectural detail.
> >>>>>This is true for the sysbus, it is obviously not the case for PCI and
> >>>>>similarly discoverable buses. There we have a guest-explorable topology
> >>>>>that is currently equivalent to the qdev layout.
> >>>>But we also don't do PCI passthrough so we really haven't even explored
> >>>>how that maps in qdev. I don't know if qemu-kvm has attempted to
> >>>>qdev-ify it.
> >>>It is qdev-ified.  It is a normal pci device from qdev's point of view.
> >>>
> >>>BTW: is there any reason why (vfio-based) pci passthrough couldn't
> >>>work with tcg?
> >>>
> >>>>The -device interface is a stable interface. Right now, you don't
> >>>>specify any type of identifier of the pci bus when you create a PCI
> >>>>device. It's implied in the interface.
> >>>Wrong.  You can specify the bus you want attach the device to via
> >>>bus=<name>.  This is true for *every* device, including all pci
> >>>devices.  If unspecified qdev uses the first bus it finds.
> >>>
> >>>As long as there is a single pci bus only there is simply no need
> >>>to specify it, that's why nobody does that today.
> >>Right.  In terms of specifying bus=, what are we promising re:
> >>compatibility?  Will there always be a pci.0?  If we add some
> >>PCI-to-PCI bridges in order to support more devices, is libvirt
> >>supposed to parse the hierarchy and figure out which bus to put the
> >>device on?
> >The answer to your questions probably differ depending on
> >whether '-nodefconfig' and '-nodefaults' are set on the
> >command line.  If they are set, then I'd expect to only
> >ever see one PCI bus with name pci.0 forever more, unless
> >i explicitly ask for more. If they are not set, then you
> >might expect to see multiple PCI buses by appear by magic
> 
> Yeah, we can't promise that.  If you use -M pc, you aren't
> guaranteed a stable PCI bus topology even with
> -nodefconfig/-nodefaults.

That's why we never use '-M pc' when actually invoking QEMU.
If the user specifies 'pc' in the XML, we canonicalize that
to the versioned alternative like 'pc-0.12' before invoking
QEMU. We also expose the list of versioned machines to apps
so they can do canonicalization themselves if desired.

Regards,
Daniel
Anthony Liguori Jan. 19, 2011, 6:58 p.m. UTC | #50
On 01/19/2011 12:52 PM, Daniel P. Berrange wrote:
> On Wed, Jan 19, 2011 at 11:51:58AM -0600, Anthony Liguori wrote:
>    
>> On 01/19/2011 11:01 AM, Daniel P. Berrange wrote:
>>      
>>> The reason we specify 'bus' is that we wanted to be flexible wrt
>>> upgrades of libvirt, without needing restarts of QEMU instances
>>> it manages. That way we can introduce new functionality into
>>> libvirt that relies on it having previously set 'bus' on all
>>> active QEMUs.
>>>
>>> If QEMU adds PCI-to-PCI bridges, then I wouldn't expect QEMU to
>>> be adding the extra bridges. I'd expect that QEMU provided just
>>> the first bridge and then libvirt would specify how many more
>>> bridges to create at boot or hotplug them later. So it wouldn't
>>> ever need to parse topology.
>>>        
>> Yeah, but replacing the main chipset will certainly change the PCI
>> topology such that if you're specifying bus=X and addr=X and then
>> also using -M pc, unless you're parsing the default topology to come
>> up with the addressing, it will break in the future.
>>      
> We never use a bare '-M pc' though, we always canonicalize to
> one of the versioned forms.  So if we run '-M pc-0.12', then
> neither the main PCI chipset nor topology would have changed
> in newer QEMU.  Of course if we deployed a new VM with
> '-M pc-0.20' that might have new PCI chipset, so bus=pci.0
> might have different meaning than it did when used with
> '-M pc-0.12', but I don't think that's an immediate problem
>    

Right, but you expose bus addressing via the XML, no?  That means that 
if a user specifies something like '1.0', and you translate that to 
bus='pci.0',addr='1.0', then when pc-0.50 comes out and slot 1.0 is used 
for the integrated 3D VGA graphics adapter, the guest creation will fail.

If you expose topological configuration to the user, the guest will not 
continue working down the road unless you come up with a scheme where 
you map addresses to a different address range for newer pcs.

Regards,

Anthony Liguori

> Regards,
> Daniel
>
Blue Swirl Jan. 19, 2011, 7:32 p.m. UTC | #51
On Wed, Jan 19, 2011 at 4:57 PM, Anthony Liguori
<aliguori@linux.vnet.ibm.com> wrote:
> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>>
>> So they interact with KVM (need kvm_state), and they interact with the
>> emulated PCI bus.  Could you elaborate on the fundamental difference
>> between the two interactions that makes you choose the (hypothetical)
>> KVM bus over the PCI bus as device parent?
>>
>
> It's almost arbitrary, but I would say it's the direction that I/Os flow.
>
> But if the underlying observation is that the device tree is not really a
> tree, you're 100% correct.  This is part of why a factory interface that
> just takes a parent bus is too simplistic.
>
> I think we ought to introduce a -pci-device option that is specifically for
> creating PCI devices that doesn't require a parent bus argument but provides
> a way to specify stable addressing (for instance, using a linear index).

I think kvm_state should not be a property of any device or bus. It
should be split to more logical pieces.

Some parts of it could remain in CPUState, because they are associated
with a VCPU.

Also, for example irqfd could be considered to be similar object to
char or block devices provided by QEMU to devices. Would it make sense
to introduce new host types for passing parts of kvm_state to devices?

I'd also make coalesced MMIO stuff part of memory object. We are not
passing any state references when using cpu_physical_memory_rw(), but
that could be changed.
Gerd Hoffmann Jan. 20, 2011, 8:44 a.m. UTC | #52
Hi,

> For (2), you cannot use bus=X,addr=Y because it makes assumptions about
> the PCI topology which may change in newer -M pc's.

Why should the PCI topology for 'pc' ever change?

We'll probably get q35 support some day, but when this lands I expect 
we'll see a new machine type 'q35', so '-m q35' will pick the ich9 
chipset (which will have a different pci topology of course) and '-m pc' 
will pick the existing piix chipset (which will continue to look like it 
looks today).

cheers,
   Gerd
Jan Kiszka Jan. 20, 2011, 9:33 a.m. UTC | #53
On 2011-01-19 20:32, Blue Swirl wrote:
> On Wed, Jan 19, 2011 at 4:57 PM, Anthony Liguori
> <aliguori@linux.vnet.ibm.com> wrote:
>> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>>>
>>> So they interact with KVM (need kvm_state), and they interact with the
>>> emulated PCI bus.  Could you elaborate on the fundamental difference
>>> between the two interactions that makes you choose the (hypothetical)
>>> KVM bus over the PCI bus as device parent?
>>>
>>
>> It's almost arbitrary, but I would say it's the direction that I/Os flow.
>>
>> But if the underlying observation is that the device tree is not really a
>> tree, you're 100% correct.  This is part of why a factory interface that
>> just takes a parent bus is too simplistic.
>>
>> I think we ought to introduce a -pci-device option that is specifically for
>> creating PCI devices that doesn't require a parent bus argument but provides
>>> a way to specify stable addressing (for instance, using a linear index).
> 
> I think kvm_state should not be a property of any device or bus. It
> should be split to more logical pieces.
> 
> Some parts of it could remain in CPUState, because they are associated
> with a VCPU.
> 
> Also, for example irqfd could be considered to be similar object to
> char or block devices provided by QEMU to devices. Would it make sense
> to introduce new host types for passing parts of kvm_state to devices?
> 
> I'd also make coalesced MMIO stuff part of memory object. We are not
> passing any state references when using cpu_physical_memory_rw(), but
> that could be changed.

There are currently no VCPU-specific bits remaining in kvm_state. It may
be a good idea to introduce an arch-specific kvm_state and move related
bits over. It may also once be feasible to carve out memory management
related fields if we have proper abstractions for that, but I'm not
completely sure here.

Anyway, all these things are secondary. The primary topic here is how to
deal with kvm_state and its fields that have VM-global scope.

Jan
Daniel P. Berrangé Jan. 20, 2011, 10:33 a.m. UTC | #54
On Thu, Jan 20, 2011 at 09:44:05AM +0100, Gerd Hoffmann wrote:
>   Hi,
> 
> >For (2), you cannot use bus=X,addr=Y because it makes assumptions about
> >the PCI topology which may change in newer -M pc's.
> 
> Why should the PCI topology for 'pc' ever change?
> 
> We'll probably get q35 support some day, but when this lands I
> expect we'll see a new machine type 'q35', so '-m q35' will pick the
> ich9 chipset (which will have a different pci topology of course)
> and '-m pc' will pick the existing piix chipset (which will continue
> to look like it looks today).

If the topology does ever change (eg in the kind of way anthony
suggests, first bus only has the graphics card), I think libvirt
is going to need a little work to adapt to the new topology,
regardless of whether we currently specify a bus= arg to -device
or not. I'm not sure there's anything we could do to future proof
us to that kind of change.

Regards,
Daniel
Blue Swirl Jan. 20, 2011, 7:27 p.m. UTC | #55
On Thu, Jan 20, 2011 at 9:33 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2011-01-19 20:32, Blue Swirl wrote:
>> On Wed, Jan 19, 2011 at 4:57 PM, Anthony Liguori
>> <aliguori@linux.vnet.ibm.com> wrote:
>>> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>>>>
>>>> So they interact with KVM (need kvm_state), and they interact with the
>>>> emulated PCI bus.  Could you elaborate on the fundamental difference
>>>> between the two interactions that makes you choose the (hypothetical)
>>>> KVM bus over the PCI bus as device parent?
>>>>
>>>
>>> It's almost arbitrary, but I would say it's the direction that I/Os flow.
>>>
>>> But if the underlying observation is that the device tree is not really a
>>> tree, you're 100% correct.  This is part of why a factory interface that
>>> just takes a parent bus is too simplistic.
>>>
>>> I think we ought to introduce a -pci-device option that is specifically for
>>> creating PCI devices that doesn't require a parent bus argument but provides
>>> a way to specify stable addressing (for instance, using a linear index).
>>
>> I think kvm_state should not be a property of any device or bus. It
>> should be split to more logical pieces.
>>
>> Some parts of it could remain in CPUState, because they are associated
>> with a VCPU.
>>
>> Also, for example irqfd could be considered to be similar object to
>> char or block devices provided by QEMU to devices. Would it make sense
>> to introduce new host types for passing parts of kvm_state to devices?
>>
>> I'd also make coalesced MMIO stuff part of memory object. We are not
>> passing any state references when using cpu_physical_memory_rw(), but
>> that could be changed.
>
> There are currently no VCPU-specific bits remaining in kvm_state.

I think fields vcpu_events, robust_singlestep, debugregs,
kvm_sw_breakpoints, xsave, xcrs belong to CPUX86State. They may be the
same for all VCPUs but still they are sort of CPU properties. I'm not
sure about fd field.

> It may
> be a good idea to introduce an arch-specific kvm_state and move related
> bits over.

This should probably contain only irqchip_in_kernel, pit_in_kernel and
many_ioeventfds, maybe fd.

> It may also once be feasible to carve out memory management
> related fields if we have proper abstractions for that, but I'm not
> completely sure here.

I'd put slots, vmfd, coalesced_mmio, broken_set_mem_region,
migration_log into the memory object.

> Anyway, all these things are secondary. The primary topic here is how to
> deal with kvm_state and its fields that have VM-global scope.

If it is an opaque blob which contains various unrelated stuff, no
clear place will be found.

By the way, we don't have a QEMUState but instead use globals. Perhaps
this should be reorganized as well. For fd field, maybe even using a
global variable could be justified, since it is used for direct access
to kernel, not unlike a system call.
Anthony Liguori Jan. 20, 2011, 7:37 p.m. UTC | #56
On 01/20/2011 03:33 AM, Jan Kiszka wrote:
> On 2011-01-19 20:32, Blue Swirl wrote:
>    
>> On Wed, Jan 19, 2011 at 4:57 PM, Anthony Liguori
>> <aliguori@linux.vnet.ibm.com>  wrote:
>>      
>>> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>>>        
>>>> So they interact with KVM (need kvm_state), and they interact with the
>>>> emulated PCI bus.  Could you elaborate on the fundamental difference
>>>> between the two interactions that makes you choose the (hypothetical)
>>>> KVM bus over the PCI bus as device parent?
>>>>
>>>>          
>>> It's almost arbitrary, but I would say it's the direction that I/Os flow.
>>>
>>> But if the underlying observation is that the device tree is not really a
>>> tree, you're 100% correct.  This is part of why a factory interface that
>>> just takes a parent bus is too simplistic.
>>>
>>> I think we ought to introduce a -pci-device option that is specifically for
>>> creating PCI devices that doesn't require a parent bus argument but provides
>>> a way to specify stable addressing (for instance, using a linear index).
>>>        
>> I think kvm_state should not be a property of any device or bus. It
>> should be split to more logical pieces.
>>
>> Some parts of it could remain in CPUState, because they are associated
>> with a VCPU.
>>
>> Also, for example irqfd could be considered to be similar object to
>> char or block devices provided by QEMU to devices. Would it make sense
>> to introduce new host types for passing parts of kvm_state to devices?
>>
>> I'd also make coalesced MMIO stuff part of memory object. We are not
>> passing any state references when using cpu_physical_memory_rw(), but
>> that could be changed.
>>      
> There are currently no VCPU-specific bits remaining in kvm_state. It may
> be a good idea to introduce an arch-specific kvm_state and move related
> bits over. It may also once be feasible to carve out memory management
> related fields if we have proper abstractions for that, but I'm not
> completely sure here.
>
> Anyway, all these things are secondary. The primary topic here is how to
> deal with kvm_state and its fields that have VM-global scope.
>    

The debate is really:

1) should we remove all passing of kvm_state and just assume it's static

2) deal with a couple places in the code where we need to figure out how 
to get at kvm_state

I think we've only identified 1 real instance of (2) and it's resulted 
in some good discussions about how to model KVM devices vs. emulated 
devices.  Honestly, (1) just stinks.  I see absolutely no advantage to 
it at all.   In the very worst case scenario, the thing we need to do is 
just reference an extern variable in a few places.  That completely 
avoids all of the modelling discussions for now (while leaving for 
placeholder FIXMEs so the problem can be tackled later).

I don't understand the resistance here.

Regards,

Anthony Liguori

> Jan
>
>
Anthony Liguori Jan. 20, 2011, 7:39 p.m. UTC | #57
On 01/20/2011 02:44 AM, Gerd Hoffmann wrote:
>   Hi,
>
>> For (2), you cannot use bus=X,addr=Y because it makes assumptions about
>> the PCI topology which may change in newer -M pc's.
>
> Why should the PCI topology for 'pc' ever change?
>
> We'll probably get q35 support some day, but when this lands I expect 
> we'll see a new machine type 'q35', so '-m q35' will pick the ich9 
> chipset (which will have a different pci topology of course) and '-m 
> pc' will pick the existing piix chipset (which will continue to look 
> like it looks today).

But then what's the default machine type?  When I say -M pc, I really 
mean the default machine.

At some point, "qemu-system-x86_64 -device virtio-net-pci,addr=2.0"

is not going to be a reliable way to invoke qemu because there's no way
we can guarantee that slot 2 isn't occupied by a chipset device or some 
other default device.

Regards,

Anthony Liguori

> cheers,
>   Gerd
Anthony Liguori Jan. 20, 2011, 7:42 p.m. UTC | #58
On 01/20/2011 04:33 AM, Daniel P. Berrange wrote:
> On Thu, Jan 20, 2011 at 09:44:05AM +0100, Gerd Hoffmann wrote:
>    
>>    Hi,
>>
>>      
>>> For (2), you cannot use bus=X,addr=Y because it makes assumptions about
>>> the PCI topology which may change in newer -M pc's.
>>>        
>> Why should the PCI topology for 'pc' ever change?
>>
>> We'll probably get q35 support some day, but when this lands I
>> expect we'll see a new machine type 'q35', so '-m q35' will pick the
>> ich9 chipset (which will have a different pci topology of course)
>> and '-m pc' will pick the existing piix chipset (which will continue
>> to look like it looks today).
>>      
> If the topology does ever change (eg in the kind of way anthony
> suggests, first bus only has the graphics card), I think libvirt
> is going to need a little work to adapt to the new topology,
> regardless of whether we currently specify a bus= arg to -device
> or not. I'm not sure there's anything we could do to future proof
> us to that kind of change.
>    

I assume that libvirt today assumes that it can use a set of PCI slots 
in bus 0, correct?  Probably in the range 3-31?  Such assumptions are 
very likely to break.

Regards,

Anthony Liguori

> Regards,
> Daniel
>
Blue Swirl Jan. 20, 2011, 8:02 p.m. UTC | #59
On Thu, Jan 20, 2011 at 7:37 PM, Anthony Liguori
<aliguori@linux.vnet.ibm.com> wrote:
> On 01/20/2011 03:33 AM, Jan Kiszka wrote:
>>
>> On 2011-01-19 20:32, Blue Swirl wrote:
>>
>>>
>>> On Wed, Jan 19, 2011 at 4:57 PM, Anthony Liguori
>>> <aliguori@linux.vnet.ibm.com>  wrote:
>>>
>>>>
>>>> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>>>>
>>>>>
>>>>> So they interact with KVM (need kvm_state), and they interact with the
>>>>> emulated PCI bus.  Could you elaborate on the fundamental difference
>>>>> between the two interactions that makes you choose the (hypothetical)
>>>>> KVM bus over the PCI bus as device parent?
>>>>>
>>>>>
>>>>
>>>> It's almost arbitrary, but I would say it's the direction that I/Os
>>>> flow.
>>>>
>>>> But if the underlying observation is that the device tree is not really
>>>> a
>>>> tree, you're 100% correct.  This is part of why a factory interface that
>>>> just takes a parent bus is too simplistic.
>>>>
>>>> I think we ought to introduce a -pci-device option that is specifically
>>>> for
>>>> creating PCI devices that doesn't require a parent bus argument but
>>>> provides
>>>> a way to specify stable addressing (for instance, using a linear
>>>> index).
>>>>
>>>
>>> I think kvm_state should not be a property of any device or bus. It
>>> should be split to more logical pieces.
>>>
>>> Some parts of it could remain in CPUState, because they are associated
>>> with a VCPU.
>>>
>>> Also, for example irqfd could be considered to be similar object to
>>> char or block devices provided by QEMU to devices. Would it make sense
>>> to introduce new host types for passing parts of kvm_state to devices?
>>>
>>> I'd also make coalesced MMIO stuff part of memory object. We are not
>>> passing any state references when using cpu_physical_memory_rw(), but
>>> that could be changed.
>>>
>>
>> There are currently no VCPU-specific bits remaining in kvm_state. It may
>> be a good idea to introduce an arch-specific kvm_state and move related
>> bits over. It may also once be feasible to carve out memory management
>> related fields if we have proper abstractions for that, but I'm not
>> completely sure here.
>>
>> Anyway, all these things are secondary. The primary topic here is how to
>> deal with kvm_state and its fields that have VM-global scope.
>>
>
> The debate is really:
>
> 1) should we remove all passing of kvm_state and just assume it's static
>
> 2) deal with a couple places in the code where we need to figure out how to
> get at kvm_state
>
> I think we've only identified 1 real instance of (2) and it's resulted in
> some good discussions about how to model KVM devices vs. emulated devices.
>  Honestly, (1) just stinks.  I see absolutely no advantage to it at all.

Fully agree.

> In the very worst case scenario, the thing we need to do is just reference
> an extern variable in a few places.  That completely avoids all of the
> modelling discussions for now (while leaving for placeholder FIXMEs so the
> problem can be tackled later).

I think KVMState was designed to match KVM ioctl interface: all stuff
that is needed for talking to KVM or received from KVM are there. But
I think this shouldn't be a design driver.

If the only pieces of kvm_state that are needed by the devices are
irqchip_in_kernel, pit_in_kernel and many_ioeventfds, the problem of
passing kvm_state to devices becomes very different. Each of these are
just single bits, affecting only a few devices. Perhaps they could be
device properties which the board level sets when KVM is used?
Jan Kiszka Jan. 20, 2011, 9:22 p.m. UTC | #60
On 2011-01-20 20:27, Blue Swirl wrote:
> On Thu, Jan 20, 2011 at 9:33 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2011-01-19 20:32, Blue Swirl wrote:
>>> On Wed, Jan 19, 2011 at 4:57 PM, Anthony Liguori
>>> <aliguori@linux.vnet.ibm.com> wrote:
>>>> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>>>>>
>>>>> So they interact with KVM (need kvm_state), and they interact with the
>>>>> emulated PCI bus.  Could you elaborate on the fundamental difference
>>>>> between the two interactions that makes you choose the (hypothetical)
>>>>> KVM bus over the PCI bus as device parent?
>>>>>
>>>>
>>>> It's almost arbitrary, but I would say it's the direction that I/Os flow.
>>>>
>>>> But if the underlying observation is that the device tree is not really a
>>>> tree, you're 100% correct.  This is part of why a factory interface that
>>>> just takes a parent bus is too simplistic.
>>>>
>>>> I think we ought to introduce a -pci-device option that is specifically for
>>>> creating PCI devices that doesn't require a parent bus argument but provides
>>>> a way to specify stable addressing (for instancing, using a linear index).
>>>
>>> I think kvm_state should not be a property of any device or bus. It
>>> should be split to more logical pieces.
>>>
>>> Some parts of it could remain in CPUState, because they are associated
>>> with a VCPU.
>>>
>>> Also, for example irqfd could be considered to be similar object to
>>> char or block devices provided by QEMU to devices. Would it make sense
>>> to introduce new host types for passing parts of kvm_state to devices?
>>>
>>> I'd also make coalesced MMIO stuff part of memory object. We are not
>>> passing any state references when using cpu_physical_memory_rw(), but
>>> that could be changed.
>>
>> There are currently no VCPU-specific bits remaining in kvm_state.
> 
> I think fields vcpu_events, robust_singlestep, debugregs,
> kvm_sw_breakpoints, xsave, xcrs belong to CPUX86State. They may be the
> same for all VCPUs but still they are sort of CPU properties. I'm not
> sure about fd field.

They are all properties of the currently loaded KVM subsystem in the
host kernel. They can't change while KVM's root fd is opened.
Replicating this static information into each and every VCPU state would
be crazy.

In fact, services like kvm_has_vcpu_events() already encode this: they
are static functions without any kvm_state reference that simply return
the content of those fields. Totally inconsistent with this, we force the
caller of kvm_check_extension to pass a handle. This is part of my
problem with the current situation and any halfhearted steps in this
context. Either we work towards eliminating "static KVMState *kvm_state"
in kvm-all.c or eliminating KVMState.

> 
>> It may
>> be a good idea to introduce an arch-specific kvm_state and move related
>> bits over.
> 
> This should probably contain only irqchip_in_kernel, pit_in_kernel and
> many_ioeventfds, maybe fd.

fd is that root file descriptor you need for a few KVM services that are
not bound to a specific VM - e.g. feature queries. It's not arch
specific. Arch specific are e.g. robust_singlestep or xsave feature states.

> 
>> It may also once be feasible to carve out memory management
>> related fields if we have proper abstractions for that, but I'm not
>> completely sure here.
> 
> I'd put slots, vmfd, coalesced_mmio, broken_set_mem_region,
> migration_log into the memory object.

vmfd is the VM-scope file descriptor you need at machine-level. The rest
logically belongs to a memory object, but I haven't looked at technical
details yet.

> 
>> Anyway, all these things are secondary. The primary topic here is how to
>> deal with kvm_state and its fields that have VM-global scope.
> 
> If it is an opaque blob which contains various unrelated stuff, no
> clear place will be found.

We aren't moving fields yet (and we shouldn't). We first of all need to
establish the handle distribution (which apparently requires quite some
work in areas beyond KVM).

> 
> By the way, we don't have a QEMUState but instead use globals. Perhaps
> this should be reorganized as well. For fd field, maybe even using a
> global variable could be justified, since it is used for direct access
> to kernel, not unlike a system call.

The fd field is part of this discussion. Making it global (but local to
the kvm subsystem) was an implicit part of my original suggestion.

I've no problem with something like a QEMUState, or better a
MachineState that would also include a few KVM-specific fields like the
vmfd - just like we already do for CPUstate (or should we better
introduce a KVM CPU bus... ;) ).

Jan
Jan Kiszka Jan. 20, 2011, 9:27 p.m. UTC | #61
On 2011-01-20 20:37, Anthony Liguori wrote:
> On 01/20/2011 03:33 AM, Jan Kiszka wrote:
>> On 2011-01-19 20:32, Blue Swirl wrote:
>>   
>>> On Wed, Jan 19, 2011 at 4:57 PM, Anthony Liguori
>>> <aliguori@linux.vnet.ibm.com>  wrote:
>>>     
>>>> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>>>>       
>>>>> So they interact with KVM (need kvm_state), and they interact with the
>>>>> emulated PCI bus.  Could you elaborate on the fundamental difference
>>>>> between the two interactions that makes you choose the (hypothetical)
>>>>> KVM bus over the PCI bus as device parent?
>>>>>
>>>>>          
>>>> It's almost arbitrary, but I would say it's the direction that I/Os
>>>> flow.
>>>>
>>>> But if the underlying observation is that the device tree is not
>>>> really a
>>>> tree, you're 100% correct.  This is part of why a factory interface
>>>> that
>>>> just takes a parent bus is too simplistic.
>>>>
>>>> I think we ought to introduce a -pci-device option that is
>>>> specifically for
>>>> creating PCI devices that doesn't require a parent bus argument but
>>>> provides
>>>> a way to specify stable addressing (for instancing, using a linear
>>>> index).
>>>>        
>>> I think kvm_state should not be a property of any device or bus. It
>>> should be split to more logical pieces.
>>>
>>> Some parts of it could remain in CPUState, because they are associated
>>> with a VCPU.
>>>
>>> Also, for example irqfd could be considered to be similar object to
>>> char or block devices provided by QEMU to devices. Would it make sense
>>> to introduce new host types for passing parts of kvm_state to devices?
>>>
>>> I'd also make coalesced MMIO stuff part of memory object. We are not
>>> passing any state references when using cpu_physical_memory_rw(), but
>>> that could be changed.
>>>      
>> There are currently no VCPU-specific bits remaining in kvm_state. It may
>> be a good idea to introduce an arch-specific kvm_state and move related
>> bits over. It may also once be feasible to carve out memory management
>> related fields if we have proper abstractions for that, but I'm not
>> completely sure here.
>>
>> Anyway, all these things are secondary. The primary topic here is how to
>> deal with kvm_state and its fields that have VM-global scope.
>>    
> 
> The debate is really:
> 
> 1) should we remove all passing of kvm_state and just assume it's static
> 
> 2) deal with a couple places in the code where we need to figure out how
> to get at kvm_state
> 
> I think we've only identified 1 real instance of (2) and it's resulted
> in some good discussions about how to model KVM devices vs. emulated
> devices.  Honestly, (1) just stinks.  I see absolutely no advantage to
> it at all.   In the very worst case scenario, the thing we need to do is
> just reference an extern variable in a few places.  That completely
> avoids all of the modelling discussions for now (while leaving for
> placeholder FIXMEs so the problem can be tackled later).

The PCI bus discussion is surely an interesting outcome, but now almost
completely off-topic to the original, way less critical issue (as we
were discussing internals).

> 
> I don't understand the resistance here.

IMHO, most suggestions on the table are still over-designed (like a
KVMBus that only passes a kvm_state - or do you have more features for
it in mind?). The idea I love most so far is establishing a machine
state that also carries those few KVM bits which correspond to the KVM
extension of CPUState.

But in the end I want an implementable consensus that helps moving
forward with main topic: the overdue KVM upstream merge. I just do not
have a clear picture yet.

Jan
Blue Swirl Jan. 20, 2011, 9:40 p.m. UTC | #62
On Thu, Jan 20, 2011 at 9:22 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2011-01-20 20:27, Blue Swirl wrote:
>> On Thu, Jan 20, 2011 at 9:33 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>> On 2011-01-19 20:32, Blue Swirl wrote:
>>>> On Wed, Jan 19, 2011 at 4:57 PM, Anthony Liguori
>>>> <aliguori@linux.vnet.ibm.com> wrote:
>>>>> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>>>>>>
>>>>>> So they interact with KVM (need kvm_state), and they interact with the
>>>>>> emulated PCI bus.  Could you elaborate on the fundamental difference
>>>>>> between the two interactions that makes you choose the (hypothetical)
>>>>>> KVM bus over the PCI bus as device parent?
>>>>>>
>>>>>
>>>>> It's almost arbitrary, but I would say it's the direction that I/Os flow.
>>>>>
>>>>> But if the underlying observation is that the device tree is not really a
>>>>> tree, you're 100% correct.  This is part of why a factory interface that
>>>>> just takes a parent bus is too simplistic.
>>>>>
>>>>> I think we ought to introduce a -pci-device option that is specifically for
>>>>> creating PCI devices that doesn't require a parent bus argument but provides
>>>>> a way to specify stable addressing (for instancing, using a linear index).
>>>>
>>>> I think kvm_state should not be a property of any device or bus. It
>>>> should be split to more logical pieces.
>>>>
>>>> Some parts of it could remain in CPUState, because they are associated
>>>> with a VCPU.
>>>>
>>>> Also, for example irqfd could be considered to be similar object to
>>>> char or block devices provided by QEMU to devices. Would it make sense
>>>> to introduce new host types for passing parts of kvm_state to devices?
>>>>
>>>> I'd also make coalesced MMIO stuff part of memory object. We are not
>>>> passing any state references when using cpu_physical_memory_rw(), but
>>>> that could be changed.
>>>
>>> There are currently no VCPU-specific bits remaining in kvm_state.
>>
>> I think fields vcpu_events, robust_singlestep, debugregs,
>> kvm_sw_breakpoints, xsave, xcrs belong to CPUX86State. They may be the
>> same for all VCPUs but still they are sort of CPU properties. I'm not
>> sure about fd field.
>
> They are all properties of the currently loaded KVM subsystem in the
> host kernel. They can't change while KVM's root fd is opened.
> Replicating this static information into each and every VCPU state would
> be crazy.

Then each CPUX86State could have a pointer to a common structure.

> In fact, services like kvm_has_vcpu_events() already encode this: they
> are static functions without any kvm_state reference that simply return
> the content of those fields. Totally inconsistent to this, we force the
> caller of kvm_check_extension to pass a handle. This is part of my
> problem with the current situation and any halfhearted steps in this
> context. Either we work towards eliminating "static KVMState *kvm_state"
> in kvm-all.c or eliminating KVMState.

If the CPU related fields are accessible through CPUState, the handle
should be available.

>>> It may
>>> be a good idea to introduce an arch-specific kvm_state and move related
>>> bits over.
>>
>> This should probably contain only irqchip_in_kernel, pit_in_kernel and
>> many_ioeventfds, maybe fd.
>
> fd is that root file descriptor you need for a few KVM services that are
> not bound to a specific VM - e.g. feature queries. It's not arch
> specific. Arch specific are e.g. robust_singlestep or xsave feature states.

By arch you mean guest CPU architecture? They are not machine features.

>>
>>> It may also once be feasible to carve out memory management
>>> related fields if we have proper abstractions for that, but I'm not
>>> completely sure here.
>>
>> I'd put slots, vmfd, coalesced_mmio, broken_set_mem_region,
>> migration_log into the memory object.
>
> vmfd is the VM-scope file descriptor you need at machine-level. The rest
> logically belongs to a memory object, but I haven't looked at technical
> details yet.
>
>>
>>> Anyway, all these things are secondary. The primary topic here is how to
>>> deal with kvm_state and its fields that have VM-global scope.
>>
>> If it is an opaque blob which contains various unrelated stuff, no
>> clear place will be found.
>
> We aren't moving fields yet (and we shouldn't). We first of all need to
> establish the handle distribution (which apparently requires quite some
> work in areas beyond KVM).

But I think this is exactly the problem. If the handle is for the
current KVMState, you'll indeed need it in various places and passing
it around will be cumbersome. By moving the fields around, the
information should be available more naturally.

>> By the way, we don't have a QEMUState but instead use globals. Perhaps
>> this should be reorganized as well. For fd field, maybe even using a
>> global variable could be justified, since it is used for direct access
>> to kernel, not unlike a system call.
>
> The fd field is part of this discussion. Making it global (but local to
> the kvm subsystem) was an implicit part of my original suggestion.
>
> I've no problem with something like a QEMUState, or better a
> MachineState that would also include a few KVM-specific fields like the
> vmfd - just like we already do for CPUstate (or should we better
> introduce a KVM CPU bus... ;) ).
>
> Jan
>
>
Jan Kiszka Jan. 20, 2011, 9:42 p.m. UTC | #63
On 2011-01-20 21:02, Blue Swirl wrote:
> I think KVMState was designed to match KVM ioctl interface: all stuff
> that is needed for talking to KVM or received from KVM are there. But
> I think this shouldn't be a design driver.

Agreed. The nice cleanup would probably be the complete assimilation of
KVMState by something bigger of comparable scope.

If a machine was brought up with KVM support, every device that refers
to this machine (as it is supposed to become part of it) should be able
to use KVM services in order to accelerate its model.

> 
> If the only pieces of kvm_state that are needed by the devices are
> irqchip_in_kernel, pit_in_kernel and many_ioeventfds, the problem of
> passing kvm_state to devices becomes very different. Each of these are
> just single bits, affecting only a few devices. Perhaps they could be
> device properties which the board level sets when KVM is used?

Forget about the static capabilities for now. The core of kvm_state are
handles that enable you to use KVM services and maybe state fields that
have machine scope (unless we find more local homes like a memory
object). Those need to be accessible by the kvm layer when servicing
requests of components that are related to that very same machine.

Jan
Jan Kiszka Jan. 20, 2011, 9:53 p.m. UTC | #64
On 2011-01-20 22:40, Blue Swirl wrote:
> On Thu, Jan 20, 2011 at 9:22 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>> On 2011-01-20 20:27, Blue Swirl wrote:
>>> On Thu, Jan 20, 2011 at 9:33 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>>> On 2011-01-19 20:32, Blue Swirl wrote:
>>>>> On Wed, Jan 19, 2011 at 4:57 PM, Anthony Liguori
>>>>> <aliguori@linux.vnet.ibm.com> wrote:
>>>>>> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>>>>>>>
>>>>>>> So they interact with KVM (need kvm_state), and they interact with the
>>>>>>> emulated PCI bus.  Could you elaborate on the fundamental difference
>>>>>>> between the two interactions that makes you choose the (hypothetical)
>>>>>>> KVM bus over the PCI bus as device parent?
>>>>>>>
>>>>>>
>>>>>> It's almost arbitrary, but I would say it's the direction that I/Os flow.
>>>>>>
>>>>>> But if the underlying observation is that the device tree is not really a
>>>>>> tree, you're 100% correct.  This is part of why a factory interface that
>>>>>> just takes a parent bus is too simplistic.
>>>>>>
>>>>>> I think we ought to introduce a -pci-device option that is specifically for
>>>>>> creating PCI devices that doesn't require a parent bus argument but provides
>>>>>> a way to specify stable addressing (for instancing, using a linear index).
>>>>>
>>>>> I think kvm_state should not be a property of any device or bus. It
>>>>> should be split to more logical pieces.
>>>>>
>>>>> Some parts of it could remain in CPUState, because they are associated
>>>>> with a VCPU.
>>>>>
>>>>> Also, for example irqfd could be considered to be similar object to
>>>>> char or block devices provided by QEMU to devices. Would it make sense
>>>>> to introduce new host types for passing parts of kvm_state to devices?
>>>>>
>>>>> I'd also make coalesced MMIO stuff part of memory object. We are not
>>>>> passing any state references when using cpu_physical_memory_rw(), but
>>>>> that could be changed.
>>>>
>>>> There are currently no VCPU-specific bits remaining in kvm_state.
>>>
>>> I think fields vcpu_events, robust_singlestep, debugregs,
>>> kvm_sw_breakpoints, xsave, xcrs belong to CPUX86State. They may be the
>>> same for all VCPUs but still they are sort of CPU properties. I'm not
>>> sure about fd field.
>>
>> They are all properties of the currently loaded KVM subsystem in the
>> host kernel. They can't change while KVM's root fd is opened.
>> Replicating this static information into each and every VCPU state would
>> be crazy.
> 
> Then each CPUX86State could have a pointer to common structure.

That already exists.

> 
>> In fact, services like kvm_has_vcpu_events() already encode this: they
>> are static functions without any kvm_state reference that simply return
>> the content of those fields. Totally inconsistent to this, we force the
>> caller of kvm_check_extension to pass a handle. This is part of my
>> problem with the current situation and any halfhearted steps in this
>> context. Either we work towards eliminating "static KVMState *kvm_state"
>> in kvm-all.c or eliminating KVMState.
> 
> If the CPU related fields are accessible through CPUState, the handle
> should be available.
> 
>>>> It may
>>>> be a good idea to introduce an arch-specific kvm_state and move related
>>>> bits over.
>>>
>>> This should probably contain only irqchip_in_kernel, pit_in_kernel and
>>> many_ioeventfds, maybe fd.
>>
>> fd is that root file descriptor you need for a few KVM services that are
>> not bound to a specific VM - e.g. feature queries. It's not arch
>> specific. Arch specific are e.g. robust_singlestep or xsave feature states.
> 
> By arch you mean guest CPU architecture? They are not machine features.

No, they are practically static host features.

> 
>>>
>>>> It may also once be feasible to carve out memory management
>>>> related fields if we have proper abstractions for that, but I'm not
>>>> completely sure here.
>>>
>>> I'd put slots, vmfd, coalesced_mmio, broken_set_mem_region,
>>> migration_log into the memory object.
>>
>> vmfd is the VM-scope file descriptor you need at machine-level. The rest
>> logically belongs to a memory object, but I haven't looked at technical
>> details yet.
>>
>>>
>>>> Anyway, all these things are secondary. The primary topic here is how to
>>>> deal with kvm_state and its fields that have VM-global scope.
>>>
>>> If it is an opaque blob which contains various unrelated stuff, no
>>> clear place will be found.
>>
>> We aren't moving fields yet (and we shouldn't). We first of all need to
>> establish the handle distribution (which apparently requires quite some
>> work in areas beyond KVM).
> 
> But I think this is exactly  the problem. If the handle is for the
> current KVMState, you'll indeed need it in various places and passing
> it around will be cumbersome. By moving the fields around, the
> information should  be available more naturally.

Yeah, if we had a MachineState or if we could agree on introducing it,
I'm with you again. Improving the currently cumbersome KVM API
interaction was the main motivation for my original patch.

Jan
Gerd Hoffmann Jan. 21, 2011, 8:35 a.m. UTC | #65
On 01/20/11 20:39, Anthony Liguori wrote:
> On 01/20/2011 02:44 AM, Gerd Hoffmann wrote:
>> Hi,
>>
>>> For (2), you cannot use bus=X,addr=Y because it makes assumptions about
>>> the PCI topology which may change in newer -M pc's.
>>
>> Why should the PCI topology for 'pc' ever change?
>>
>> We'll probably get q35 support some day, but when this lands I expect
>> we'll see a new machine type 'q35', so '-m q35' will pick the ich9
>> chipset (which will have a different pci topology of course) and '-m
>> pc' will pick the existing piix chipset (which will continue to look
>> like it looks today).
>
> But then what's the default machine type? When I say -M pc, I really
> mean the default machine.

I'd tend to leave pc as default for a release cycle or two so we can 
hash out issues with q35, then flip the default once it got broader 
testing and runs stable.

> At some point, "qemu-system-x86_64 -device virtio-net-pci,addr=2.0"
>
> Is not going to be a reliable way to invoke qemu because there's no way
> we can guarantee that slot 2 isn't occupied by a chipset device or some
> other default device.

Indeed.  But qemu -M pc should continue to work though.  'pc' would be
better named 'piix3', but renaming it now is probably not worth the trouble.

cheers,
   Gerd
Gerd Hoffmann Jan. 21, 2011, 8:46 a.m. UTC | #66
Hi,

> By the way, we don't have a QEMUState but instead use globals.

/me wants to underline this.

IMO it is absolutely pointless to worry about ways to pass around 
kvm_state.  There never ever will be a serious need for that.

We can stick with the current model of keeping global state in global 
variables.  And just do the same with kvm_state.

Or we can move to have all state in a QEMUState struct which we'll pass 
around basically everywhere.  Then we can simply embed or reference 
kvm_state there.

I'd tend to stick with the global variables as I don't see the point in 
having a QEMUstate.  I doubt we'll ever see two virtual machines driven 
by a single qemu process.  YMMV.

cheers,
   Gerd
Markus Armbruster Jan. 21, 2011, 10:03 a.m. UTC | #67
Gerd Hoffmann <kraxel@redhat.com> writes:

> On 01/20/11 20:39, Anthony Liguori wrote:
>> On 01/20/2011 02:44 AM, Gerd Hoffmann wrote:
>>> Hi,
>>>
>>>> For (2), you cannot use bus=X,addr=Y because it makes assumptions about
>>>> the PCI topology which may change in newer -M pc's.
>>>
>>> Why should the PCI topology for 'pc' ever change?
>>>
>>> We'll probably get q35 support some day, but when this lands I expect
>>> we'll see a new machine type 'q35', so '-m q35' will pick the ich9
>>> chipset (which will have a different pci topology of course) and '-m
>>> pc' will pick the existing piix chipset (which will continue to look
>>> like it looks today).
>>
>> But then what's the default machine type? When I say -M pc, I really
>> mean the default machine.
>
> I'd tend to leave pc as default for a release cycle or two so we can
> hash out issues with q35, then flip the default once it got broader
> testing and runs stable.
>
>> At some point, "qemu-system-x86_64 -device virtio-net-pci,addr=2.0"
>>
>> Is not going to be a reliable way to invoke qemu because there's no way
>> we can guarantee that slot 2 isn't occupied by a chipset device or some
>> other default device.
>
> Indeed.  But qemu -M pc should continue to work though.  'pc' would
> better named 'piix3', but renaming it now is probably not worth the
> trouble.

We mustn't change pc-0.14 & friends.  We routinely change pc, but
whether an upgrade to q35 qualifies as routine change is debatable.

If you don't want PCI topology (and more) to change across QEMU updates,
consider using the versioned machine types.
Markus Armbruster Jan. 21, 2011, 10:05 a.m. UTC | #68
Gerd Hoffmann <kraxel@redhat.com> writes:

>   Hi,
>
>> By the way, we don't have a QEMUState but instead use globals.
>
> /me wants to underline this.
>
> IMO it is absolutely pointless to worry about ways to pass around
> kvm_state.  There never ever will be a serious need for that.
>
> We can stick with the current model of keeping global state in global
> variables.  And just do the same with kvm_state.
>
> Or we can move to have all state in a QEMUState struct which we'll
> pass around basically everywhere.  Then we can simply embed or
> reference kvm_state there.
>
> I'd tend to stick with the global variables as I don't see the point
> in having a QEMUstate.  I doubt we'll ever see two virtual machines
> driven by a single qemu process.  YMMV.

/me grabs the fat magic marker and underlines some more.
Blue Swirl Jan. 21, 2011, 4:37 p.m. UTC | #69
On Fri, Jan 21, 2011 at 8:46 AM, Gerd Hoffmann <kraxel@redhat.com> wrote:
>  Hi,
>
>> By the way, we don't have a QEMUState but instead use globals.
>
> /me wants to underline this.
>
> IMO it is absolutely pointless to worry about ways to pass around kvm_state.
>  There never ever will be a serious need for that.
>
> We can stick with the current model of keeping global state in global
> variables.  And just do the same with kvm_state.
>
> Or we can move to have all state in a QEMUState struct which we'll pass
> around basically everywhere.  Then we can simply embed or reference
> kvm_state there.
>
> I'd tend to stick with the global variables as I don't see the point in
> having a QEMUstate.  I doubt we'll ever see two virtual machines driven by a
> single qemu process.  YMMV.

Global variables are signs of a poor design. QEMUState would not help
that, instead more specific structures should be designed, much like
what I've proposed for KVMState. Some of these new structures should
be even passed around when it makes sense.

But I'd not start kvm_state redesign around global variables or QEMUState.
Jan Kiszka Jan. 21, 2011, 5:21 p.m. UTC | #70
On 2011-01-21 17:37, Blue Swirl wrote:
> On Fri, Jan 21, 2011 at 8:46 AM, Gerd Hoffmann <kraxel@redhat.com> wrote:
>>  Hi,
>>
>>> By the way, we don't have a QEMUState but instead use globals.
>>
>> /me wants to underline this.
>>
>> IMO it is absolutely pointless to worry about ways to pass around kvm_state.
>>  There never ever will be a serious need for that.
>>
>> We can stick with the current model of keeping global state in global
>> variables.  And just do the same with kvm_state.
>>
>> Or we can move to have all state in a QEMUState struct which we'll pass
>> around basically everywhere.  Then we can simply embed or reference
>> kvm_state there.
>>
>> I'd tend to stick with the global variables as I don't see the point in
>> having a QEMUstate.  I doubt we'll ever see two virtual machines driven by a
>> single qemu process.  YMMV.
> 
> Global variables are signs of a poor design.

s/are/can be/.

> QEMUState would not help
> that, instead more specific structures should be designed, much like
> what I've proposed for KVMState. Some of these new structures should
> be even passed around when it makes sense.
> 
> But I'd not start kvm_state redesign around global variables or QEMUState.

We do not need to move individual fields yet, but we need to define
classes of fields and strategies how to deal with them long-term. Then
we can move forward, and that already in the right direction.

Obvious classes are
 - static host capabilities and means for the KVM core to query them
 - per-VM fields
 - fields related to memory management

And we now need at least a plan for the second class to proceed with the
actual job.

Jan
Blue Swirl Jan. 21, 2011, 6:04 p.m. UTC | #71
On Fri, Jan 21, 2011 at 5:21 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2011-01-21 17:37, Blue Swirl wrote:
>> On Fri, Jan 21, 2011 at 8:46 AM, Gerd Hoffmann <kraxel@redhat.com> wrote:
>>>  Hi,
>>>
>>>> By the way, we don't have a QEMUState but instead use globals.
>>>
>>> /me wants to underline this.
>>>
>>> IMO it is absolutely pointless to worry about ways to pass around kvm_state.
>>>  There never ever will be a serious need for that.
>>>
>>> We can stick with the current model of keeping global state in global
>>> variables.  And just do the same with kvm_state.
>>>
>>> Or we can move to have all state in a QEMUState struct which we'll pass
>>> around basically everywhere.  Then we can simply embed or reference
>>> kvm_state there.
>>>
>>> I'd tend to stick with the global variables as I don't see the point in
>>> having a QEMUstate.  I doubt we'll ever see two virtual machines driven by a
>>> single qemu process.  YMMV.
>>
>> Global variables are signs of a poor design.
>
> s/are/can be/.
>
>> QEMUState would not help
>> that, instead more specific structures should be designed, much like
>> what I've proposed for KVMState. Some of these new structures should
>> be even passed around when it makes sense.
>>
>> But I'd not start kvm_state redesign around global variables or QEMUState.
>
> We do not need to move individual fields yet, but we need to define
> classes of fields and strategies how to deal with them long-term. Then
> we can move forward, and that already in the right direction.

Excellent plan.

> Obvious classes are
>  - static host capabilities and means for the KVM core to query them

OK. There could be other host capabilities here in the future too,
like Xen. I don't think there are any Xen capabilities ATM, but
IIRC some recently sent patches had something like those.

>  - per-VM fields

What is per-VM which is not machine or CPU architecture specific?

>  - fields related to memory management

OK.

I'd add fourth possible class:
 - device, CPU and machine configuration, like nographic,
win2k_install_hack, no_hpet, smp_cpus etc. Maybe also
irqchip_in_kernel could fit here, though it obviously depends on a
host capability too.
Jan Kiszka Jan. 21, 2011, 6:17 p.m. UTC | #72
On 2011-01-21 19:04, Blue Swirl wrote:
> On Fri, Jan 21, 2011 at 5:21 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2011-01-21 17:37, Blue Swirl wrote:
>>> On Fri, Jan 21, 2011 at 8:46 AM, Gerd Hoffmann <kraxel@redhat.com> wrote:
>>>>  Hi,
>>>>
>>>>> By the way, we don't have a QEMUState but instead use globals.
>>>>
>>>> /me wants to underline this.
>>>>
>>>> IMO it is absolutely pointless to worry about ways to pass around kvm_state.
>>>>  There never ever will be a serious need for that.
>>>>
>>>> We can stick with the current model of keeping global state in global
>>>> variables.  And just do the same with kvm_state.
>>>>
>>>> Or we can move to have all state in a QEMUState struct which we'll pass
>>>> around basically everywhere.  Then we can simply embed or reference
>>>> kvm_state there.
>>>>
>>>> I'd tend to stick with the global variables as I don't see the point in
>>>> having a QEMUstate.  I doubt we'll ever see two virtual machines driven by a
>>>> single qemu process.  YMMV.
>>>
>>> Global variables are signs of a poor design.
>>
>> s/are/can be/.
>>
>>> QEMUState would not help
>>> that, instead more specific structures should be designed, much like
>>> what I've proposed for KVMState. Some of these new structures should
>>> be even passed around when it makes sense.
>>>
>>> But I'd not start kvm_state redesign around global variables or QEMUState.
>>
>> We do not need to move individual fields yet, but we need to define
>> classes of fields and strategies how to deal with them long-term. Then
>> we can move forward, and that already in the right direction.
> 
> Excellent plan.
> 
>> Obvious classes are
>>  - static host capabilities and means for the KVM core to query them
> 
> OK. There could be other host capabilities here in the future too,
> like Xen. I don't think there are any Xen capabilities ATM though but
> IIRC some recently sent patches had something like those.
> 
>>  - per-VM fields
> 
> What is per-VM which is not machine or CPU architecture specific?

I think it would suffice for a first step to consider all per-VM fields
as independent of CPU architecture or machine type.

> 
>>  - fields related to memory management
> 
> OK.
> 
> I'd add fourth possible class:
>  - device, CPU and machine configuration, like nographic,
> win2k_install_hack, no_hpet, smp_cpus etc. Maybe also
> irqchip_in_kernel could fit here, though it obviously depends on a
> host capability too.

I would count everything that cannot be assigned to a concrete device
upfront to the dynamic state of a machine, thus class 2. The point is,
(potentially) every device of that machine requires access to it, just
like (indirectly, via the KVM core services) to some KVM VM state bits.

Jan
Blue Swirl Jan. 21, 2011, 6:49 p.m. UTC | #73
On Fri, Jan 21, 2011 at 6:17 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2011-01-21 19:04, Blue Swirl wrote:
>> On Fri, Jan 21, 2011 at 5:21 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>> On 2011-01-21 17:37, Blue Swirl wrote:
>>>> On Fri, Jan 21, 2011 at 8:46 AM, Gerd Hoffmann <kraxel@redhat.com> wrote:
>>>>>  Hi,
>>>>>
>>>>>> By the way, we don't have a QEMUState but instead use globals.
>>>>>
>>>>> /me wants to underline this.
>>>>>
>>>>> IMO it is absolutely pointless to worry about ways to pass around kvm_state.
>>>>>  There never ever will be a serious need for that.
>>>>>
>>>>> We can stick with the current model of keeping global state in global
>>>>> variables.  And just do the same with kvm_state.
>>>>>
>>>>> Or we can move to have all state in a QEMUState struct which we'll pass
>>>>> around basically everywhere.  Then we can simply embed or reference
>>>>> kvm_state there.
>>>>>
>>>>> I'd tend to stick with the global variables as I don't see the point in
>>>>> having a QEMUstate.  I doubt we'll ever see two virtual machines driven by a
>>>>> single qemu process.  YMMV.
>>>>
>>>> Global variables are signs of a poor design.
>>>
>>> s/are/can be/.
>>>
>>>> QEMUState would not help
>>>> that, instead more specific structures should be designed, much like
>>>> what I've proposed for KVMState. Some of these new structures should
>>>> be even passed around when it makes sense.
>>>>
>>>> But I'd not start kvm_state redesign around global variables or QEMUState.
>>>
>>> We do not need to move individual fields yet, but we need to define
>>> classes of fields and strategies how to deal with them long-term. Then
>>> we can move forward, and that already in the right direction.
>>
>> Excellent plan.
>>
>>> Obvious classes are
>>>  - static host capabilities and means for the KVM core to query them
>>
>> OK. There could be other host capabilities here in the future too,
>> like Xen. I don't think there are any Xen capabilities ATM though but
>> IIRC some recently sent patches had something like those.
>>
>>>  - per-VM fields
>>
>> What is per-VM which is not machine or CPU architecture specific?
>
> I think it would suffice for a first step to consider all per-VM fields
> as independent of CPU architecture or machine type.

I'm afraid that would not be progress.

>>>  - fields related to memory management
>>
>> OK.
>>
>> I'd add fourth possible class:
>>  - device, CPU and machine configuration, like nographic,
>> win2k_install_hack, no_hpet, smp_cpus etc. Maybe also
>> irqchip_in_kernel could fit here, though it obviously depends on a
>> host capability too.
>
> I would count everything that cannot be assigned to a concrete device
> upfront to the dynamic state of a machine, thus class 2. The point is,
> (potentially) every device of that machine requires access to it, just
> like (indirectly, via the KVM core services) to some KVM VM state bits.

The machine class should not be a catch-all, it would be like
QEMUState or KVMState then. Perhaps each field or variable should be
listed and given more thought.
Gleb Natapov Jan. 24, 2011, 8:45 a.m. UTC | #74
On Tue, Jan 18, 2011 at 11:09:01AM -0600, Anthony Liguori wrote:
> >>But we also need to provide a compatible interface to management tools.
> >>Exposing the device model topology as a compatible interface
> >>artificially limits us.  It's far better to provide higher level
> >>supported interfaces to give us the flexibility to change the device
> >>model as we need to.
> >How do you want to change qdev to keep the guest and management tool
> >view stable while branching off kvm sub-buses?
> 
> The qdev device model is not a stable interface.  I think that's
> been clear from the very beginning.
> 

And what was the reason it was declared not stable? May be because we
were not sure we will do it right from the start, so change will be
needed later. But changes should bring qdev closer to reflect what guest
expect device topology to look like. This will bring us to a stable state
as close as possible.  We need this knowledge and stability in qdev for
device path creation.  Both kind of device paths: OF and the one we use
for migration. To create OF device path we need to know topology as seen
by a guest (and guest does not care how isa bus is implemented internally
inside south bridge), to create device path used for migration we need
stability, otherwise change in qdev topology will break migration. All
this artificial buses you propose to add move us in opposite direction
and make qdev useless for anything but .... well for anything.

--
			Gleb.
Jan Kiszka Jan. 24, 2011, 2:08 p.m. UTC | #75
On 2011-01-21 19:49, Blue Swirl wrote:
>>> I'd add fourth possible class:
>>>  - device, CPU and machine configuration, like nographic,
>>> win2k_install_hack, no_hpet, smp_cpus etc. Maybe also
>>> irqchip_in_kernel could fit here, though it obviously depends on a
>>> host capability too.
>>
>> I would count everything that cannot be assigned to a concrete device
>> upfront to the dynamic state of a machine, thus class 2. The point is,
>> (potentially) every device of that machine requires access to it, just
>> like (indirectly, via the KVM core services) to some KVM VM state bits.
> 
> The machine class should not be a catch-all, it would be like
> QEMUState or KVMState then. Perhaps each field or variable should be
> listed and given more thought.

Let's start with what is most urgent:

 - vmfd: file descriptor required for any KVM request that has VM scope
   (in-kernel device creation, device state synchronizations, IRQ
   routing etc.)
 - irqchip_in_kernel: VM uses in-kernel irqchip acceleration
   (some devices will have to adjust their behavior depending on this)

pit_in_kernel would be analogue to irqchip, but it's also conceptually
x86-only (irqchips is only used by x86, but not tied to it) and it's not
mandatory for a first round of KVM devices for upstream.

Jan
Blue Swirl Jan. 24, 2011, 9:35 p.m. UTC | #76
On Mon, Jan 24, 2011 at 2:08 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2011-01-21 19:49, Blue Swirl wrote:
>>>> I'd add fourth possible class:
>>>>  - device, CPU and machine configuration, like nographic,
>>>> win2k_install_hack, no_hpet, smp_cpus etc. Maybe also
>>>> irqchip_in_kernel could fit here, though it obviously depends on a
>>>> host capability too.
>>>
>>> I would count everything that cannot be assigned to a concrete device
>>> upfront to the dynamic state of a machine, thus class 2. The point is,
>>> (potentially) every device of that machine requires access to it, just
>>> like (indirectly, via the KVM core services) to some KVM VM state bits.
>>
>> The machine class should not be a catch-all, it would be like
>> QEMUState or KVMState then. Perhaps each field or variable should be
>> listed and given more thought.
>
> Let's start with what is most urgent:
>
>  - vmfd: file descriptor required for any KVM request that has VM scope
>   (in-kernel device creation, device state synchronizations, IRQ
>   routing etc.)

I'd say VM state.

>  - irqchip_in_kernel: VM uses in-kernel irqchip acceleration
>   (some devices will have to adjust their behavior depending on this)

Since QEMU version is useless, I peeked at qemu-kvm version.

There are a lot of lines like:
if (kvm_enabled() && !kvm_irqchip_in_kernel())
    kvm_just_do_it();

Perhaps these would be cleaner with stub functions.

The device cases are obvious: the devices need a flag, passed to them
by pc.c, which combines kvm_enabled && kvm_irqchip_in_kernel(). This
gets stored in device state.

But exec.c case, where kvm_update_interrupt_request() is called, is
more interesting. CPU init could set up function pointer to either
stub/NULL or kvm_update_interrupt_request().

I didn't look at kvm*.c, qemu-kvm*.c or stuff in kvm/.

So I'd eliminate kvm_irqchip_in_kernel() from outside of KVM and pc.c.
The information could be stored in a MachineState, where pc.c could
grab it for device and CPU setup.
Jan Kiszka Jan. 24, 2011, 9:57 p.m. UTC | #77
On 2011-01-24 22:35, Blue Swirl wrote:
> On Mon, Jan 24, 2011 at 2:08 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2011-01-21 19:49, Blue Swirl wrote:
>>>>> I'd add fourth possible class:
>>>>>  - device, CPU and machine configuration, like nographic,
>>>>> win2k_install_hack, no_hpet, smp_cpus etc. Maybe also
>>>>> irqchip_in_kernel could fit here, though it obviously depends on a
>>>>> host capability too.
>>>>
>>>> I would count everything that cannot be assigned to a concrete device
>>>> upfront to the dynamic state of a machine, thus class 2. The point is,
>>>> (potentially) every device of that machine requires access to it, just
>>>> like (indirectly, via the KVM core services) to some KVM VM state bits.
>>>
>>> The machine class should not be a catch-all, it would be like
>>> QEMUState or KVMState then. Perhaps each field or variable should be
>>> listed and given more thought.
>>
>> Let's start with what is most urgent:
>>
>>  - vmfd: file descriptor required for any KVM request that has VM scope
>>   (in-kernel device creation, device state synchronizations, IRQ
>>   routing etc.)
> 
> I'd say VM state.

Good. That's +1 for introducing and distributing it.

> 
>>  - irqchip_in_kernel: VM uses in-kernel irqchip acceleration
>>   (some devices will have to adjust their behavior depending on this)
> 
> Since QEMU version is useless, I peeked at qemu-kvm version.
> 
> There are a lot of lines like:
> if (kvm_enabled() && !kvm_irqchip_in_kernel())
>     kvm_just_do_it();
> 
> Perhaps these would be cleaner with stub functions.

Probably. I guess there is quite some room left for cleanups in this area.

> 
> The device cases are obvious: the devices need a flag, passed to them
> by pc.c, which combines kvm_enabled && kvm_irqchip_in_kernel(). This
> gets stored in device state.

Not all devices are only instantiated by the machine init code. Even if
we are lucky that all those we need on x86 are created that way, we
shouldn't rely on this for future use case, including other KVM archs.

> 
> But exec.c case, where kvm_update_interrupt_request() is called, is
> more interesting. CPU init could set up function pointer to either
> stub/NULL or kvm_update_interrupt_request().
> 

Yes, callbacks are the way to go long term. Here we could also define
one for VCPU interrupt handling and set it according to the VCPU mode.

> I didn't look at kvm*.c, qemu-kvm*.c or stuff in kvm/.
> 
> So I'd eliminate kvm_irqchip_in_kernel() from outside of KVM and pc.c.
> The information could be stored in a MachineState, where pc.c could
> grab it for device and CPU setup.

I still don't see how we can distribute the information to all
interested devices. It's basically the same issue as with current kvm_state.

Jan
Avi Kivity Jan. 25, 2011, 10:27 a.m. UTC | #78
On 01/18/2011 04:28 PM, Jan Kiszka wrote:
> >
> >  So we can either "infect" the whole device tree with kvm (or maybe a
> >  more generic accelerator structure that also deals with Xen) or we need
> >  to pull the reference inside the device's init function from some global
> >  service (kvm_get_state).
>
> Note that this topic is still waiting for good suggestions, specifically
> from those who believe in kvm_state references :). This is not only
> blocking kvmstate merge but will affect KVM irqchips as well.

I'm one of them, but I don't have anything better to suggest than adding 
"kvm_state" attribute to qdev, which seems mighty artificial.  So I'm in 
favour of eliminating it now.

> It boils down to how we reasonably pass a kvm_state reference from
> machine init code to a sysbus device. I'm probably biased, but I don't
> see any way that does not work against the idea of confining access to
> kvm_state or breaks device instantiation from the command line or a
> config file.

I'm biased in the other direction, but I agree.
Avi Kivity Jan. 25, 2011, 10:34 a.m. UTC | #79
On 01/18/2011 05:50 PM, Anthony Liguori wrote:
>> This design is in conflict with the requirement to attach KVM-assisted
>> devices also to their home bus, e.g. an assigned PCI device to the PCI
>> bus. We don't support multi-homed qdev devices.
>
> The bus topology reflects how I/O flows in and out of a device.  We do 
> not model a perfect PC bus architecture and I don't think we ever 
> intend to.  Instead, we model a functional architecture.

A KVM bus is far from a functional architecture.  It simply departs from 
both what a real PC looks like, and what a KVM PC looks like to a guest, 
for no reason except to force a particular object model.

It's completely artificial.  If kvm were not behind the kernel/user 
interface, and an unchangeable ABI, we'd refactor it.  However, we can't 
refactor it, and you're trying to warp the device model to adapt to this 
design problem instead of working around it.  You're elevating a kink 
into an architectural feature.

>
> I/O from an assigned device does not flow through the emulated PCI 
> bus.  Therefore, it does not belong on the emulated PCI bus.

Yes it does.  Config space and some mmio flows through qemu and the 
emulated PCI bus.  So do things like hotplug/hotunplug.


>
> Assigned devices need to interact with the emulated PCI bus, but they 
> shouldn't be children of it.
>

What's the difference, from the guest point of view, from an assigned 
RTL8139 card, and an emulated RTL8139 card?

If you believe there is no difference, what better way to model this 
than implement them the same way using the same interfaces?
Avi Kivity Jan. 25, 2011, 11:06 a.m. UTC | #80
On 01/19/2011 06:57 PM, Anthony Liguori wrote:
> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>> So they interact with KVM (need kvm_state), and they interact with the
>> emulated PCI bus.  Could you elaborate on the fundamental difference
>> between the two interactions that makes you choose the (hypothetical)
>> KVM bus over the PCI bus as device parent?
>
> It's almost arbitrary, but I would say it's the direction that I/Os flow.
>

In the case of kvm, things are somewhat misleading.  I/O still flows 
through the (virtual) PCI bus, it's just short-circuited to a real 
device.  Similarly when attaching an ioeventfd to a virtio kick 
register, things still logically flow the same way as without ioeventfd; 
we simply add a fast path for the operation.  But it doesn't change the 
logical view of things.
Avi Kivity Jan. 25, 2011, 11:10 a.m. UTC | #81
On 01/20/2011 11:22 PM, Jan Kiszka wrote:
> On 2011-01-20 20:27, Blue Swirl wrote:
> >  On Thu, Jan 20, 2011 at 9:33 AM, Jan Kiszka<jan.kiszka@siemens.com>  wrote:
> >>  On 2011-01-19 20:32, Blue Swirl wrote:
> >>>  On Wed, Jan 19, 2011 at 4:57 PM, Anthony Liguori
> >>>  <aliguori@linux.vnet.ibm.com>  wrote:
> >>>>  On 01/19/2011 07:15 AM, Markus Armbruster wrote:
> >>>>>
> >>>>>  So they interact with KVM (need kvm_state), and they interact with the
> >>>>>  emulated PCI bus.  Could you elaborate on the fundamental difference
> >>>>>  between the two interactions that makes you choose the (hypothetical)
> >>>>>  KVM bus over the PCI bus as device parent?
> >>>>>
> >>>>
> >>>>  It's almost arbitrary, but I would say it's the direction that I/Os flow.
> >>>>
> >>>>  But if the underlying observation is that the device tree is not really a
> >>>>  tree, you're 100% correct.  This is part of why a factory interface that
> >>>>  just takes a parent bus is too simplistic.
> >>>>
> >>>>  I think we ought to introduce a -pci-device option that is specifically for
> >>>>  creating PCI devices that doesn't require a parent bus argument but provides
> >>>>  a way to specify stable addressing (for instancing, using a linear index).
> >>>
> >>>  I think kvm_state should not be a property of any device or bus. It
> >>>  should be split to more logical pieces.
> >>>
> >>>  Some parts of it could remain in CPUState, because they are associated
> >>>  with a VCPU.
> >>>
> >>>  Also, for example irqfd could be considered to be similar object to
> >>>  char or block devices provided by QEMU to devices. Would it make sense
> >>>  to introduce new host types for passing parts of kvm_state to devices?
> >>>
> >>>  I'd also make coalesced MMIO stuff part of memory object. We are not
> >>>  passing any state references when using cpu_physical_memory_rw(), but
> >>>  that could be changed.
> >>
> >>  There are currently no VCPU-specific bits remaining in kvm_state.
> >
> >  I think fields vcpu_events, robust_singlestep, debugregs,
> >  kvm_sw_breakpoints, xsave, xcrs belong to CPUX86State. They may be the
> >  same for all VCPUs but still they are sort of CPU properties. I'm not
> >  sure about fd field.
>
> They are all properties of the currently loaded KVM subsystem in the
> host kernel. They can't change while KVM's root fd is opened.
> Replicating this static information into each and every VCPU state would
> be crazy.

Perhaps they should be renamed to have_xsave or features.xsave, and be 
made bools, to improve readability.
Anthony Liguori Jan. 25, 2011, 1:58 p.m. UTC | #82
On 01/25/2011 04:27 AM, Avi Kivity wrote:
>> It boils down to how we reasonably pass a kvm_state reference from
>> machine init code to a sysbus device. I'm probably biased, but I don't
>> see any way that does not work against the idea of confining access to
>> kvm_state or breaks device instantiation from the command line or a
>> config file.
>
> I'm biased in the other direction, but I agree.

Just #include "kvm.h" and reference the global kvm_state once in the 
initfn.  We don't have to solve this problem yet.  References to the 
global kvm_state become placeholders of where things need to be fixed up.

Regards,

Anthony Liguori
Anthony Liguori Jan. 25, 2011, 2:30 p.m. UTC | #83
On 01/25/2011 05:06 AM, Avi Kivity wrote:
> On 01/19/2011 06:57 PM, Anthony Liguori wrote:
>> On 01/19/2011 07:15 AM, Markus Armbruster wrote:
>>> So they interact with KVM (need kvm_state), and they interact with the
>>> emulated PCI bus.  Could you elaborate on the fundamental difference
>>> between the two interactions that makes you choose the (hypothetical)
>>> KVM bus over the PCI bus as device parent?
>>
>> It's almost arbitrary, but I would say it's the direction that I/Os 
>> flow.
>>
>
> In the case of kvm, things are somewhat misleading.  I/O still flows 
> through the (virtual) PCI bus, it's just short-circuited to a real device.

It doesn't.  If we have a PCI bus that transforms I/O or remaps I/O via 
an IOMMU, that device doesn't participate in it.

But this whole discussion is way off track.

We don't have to solve any of these problems today.  Just don't remove 
kvm_state and grab a global reference to it when we need to (which is 
*at best* one place in the code today) and let's move on with our lives.

Regards,

Anthony Liguori

>   Similarly when attaching an ioeventfd to a virtio kick register, 
> things still logically flow the same way as without ioeventfd; we 
> simply add a fast path for the operation.  But it doesn't change the 
> logical view of things.
>
diff mbox

Patch

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 69b8234..47cb22b 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -29,6 +29,7 @@ 
 #include "hw/apic.h"
 #include "ioport.h"
 #include "kvm_x86.h"
+#include "hw/sysbus.h"
 
 #ifdef CONFIG_KVM_PARA
 #include <linux/kvm_para.h>
@@ -309,6 +310,85 @@  void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
 #endif
 }
 
+#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
+typedef struct KVMClockState {
+    SysBusDevice busdev;
+    uint64_t clock;
+    bool clock_valid;
+} KVMClockState;
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+    struct kvm_clock_data data;
+    int ret;
+
+    if (s->clock_valid) {
+        return;
+    }
+    ret = kvm_vm_ioctl(KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+        data.clock = 0;
+    }
+    s->clock = data.clock;
+    /*
+     * If the VM is stopped, declare the clock state valid to avoid re-reading
+     * it on next vmsave (which would return a different value). Will be reset
+     * when the VM is continued.
+     */
+    s->clock_valid = !vm_running;
+}
+
+static int kvmclock_post_load(void *opaque, int version_id)
+{
+    KVMClockState *s = opaque;
+    struct kvm_clock_data data;
+
+    data.clock = s->clock;
+    data.flags = 0;
+    return kvm_vm_ioctl(KVM_SET_CLOCK, &data);
+}
+
+static void kvmclock_vm_state_change(void *opaque, int running, int reason)
+{
+    KVMClockState *s = opaque;
+
+    if (running) {
+        s->clock_valid = false;
+    }
+}
+
+static int kvmclock_init(SysBusDevice *dev)
+{
+    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
+
+    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
+    return 0;
+}
+
+static const VMStateDescription kvmclock_vmsd= {
+    .name = "kvmclock",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .pre_save = kvmclock_pre_save,
+    .post_load = kvmclock_post_load,
+    .fields = (VMStateField []) {
+        VMSTATE_UINT64(clock, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static SysBusDeviceInfo kvmclock_info = {
+    .qdev.name = "kvmclock",
+    .qdev.size = sizeof(KVMClockState),
+    .qdev.vmsd = &kvmclock_vmsd,
+    .qdev.no_user = 1,
+    .init = kvmclock_init,
+};
+#endif /* CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK */
+
 int kvm_arch_init_vcpu(CPUState *env)
 {
     struct {
@@ -335,7 +415,6 @@  int kvm_arch_init_vcpu(CPUState *env)
     env->cpuid_svm_features  &= kvm_x86_get_supported_cpuid(0x8000000A,
                                                             0, R_EDX);
 
-
     cpuid_i = 0;
 
 #ifdef CONFIG_KVM_PARA
@@ -442,6 +521,13 @@  int kvm_arch_init_vcpu(CPUState *env)
     }
 #endif
 
+#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
+    if (cpu_is_bsp(env) &&
+        (env->cpuid_kvm_features & (1ULL << KVM_FEATURE_CLOCKSOURCE))) {
+        sysbus_create_simple("kvmclock", -1, NULL);
+    }
+#endif
+
     return kvm_vcpu_ioctl(env, KVM_SET_CPUID2, &cpuid_data);
 }
 
@@ -531,6 +617,10 @@  int kvm_arch_init(int smp_cpus)
     int ret;
     struct utsname utsname;
 
+#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
+    sysbus_register_withprop(&kvmclock_info);
+#endif
+
     ret = kvm_get_supported_msrs();
     if (ret < 0) {
         return ret;