diff mbox

[v2,9/9] Add -kvm option

Message ID 1254953315-5761-10-git-send-email-glommer@redhat.com
State Changes Requested
Headers show

Commit Message

Glauber Costa Oct. 7, 2009, 10:08 p.m. UTC
This option deprecates --enable-kvm. It is a more flexible option,
that makes use of qemu-opts, and allow us to pass on options to enable or
disable kernel irqchip, for example.

Signed-off-by: Glauber Costa <glommer@redhat.com>
---
 kvm-all.c       |    1 +
 kvm.h           |    1 +
 qemu-config.c   |   16 ++++++++++++++++
 qemu-config.h   |    1 +
 qemu-options.hx |   13 +++++++++----
 vl.c            |   11 +++++++++++
 6 files changed, 39 insertions(+), 4 deletions(-)

Comments

Anthony Liguori Oct. 7, 2009, 11 p.m. UTC | #1
Glauber Costa wrote:
> This option deprecates --enable-kvm. It is a more flexible option,
> that makes use of qemu-opts, and allow us to pass on options to enable or
> disable kernel irqchip, for example.
>   

With proper qdev support, you could select kvm device models based on 
-device so I think this option isn't all that useful.

What I'd like to see in the interim is a kvm specific machine type 
that's defaulted to if kvm is enabled.  I think this would be useful not 
only for enabling things like in-kernel apic, but also for selecting a 
default cpu model.

> Signed-off-by: Glauber Costa <glommer@redhat.com>
> ---
>  kvm-all.c       |    1 +
>  kvm.h           |    1 +
>  qemu-config.c   |   16 ++++++++++++++++
>  qemu-config.h   |    1 +
>  qemu-options.hx |   13 +++++++++----
>  vl.c            |   11 +++++++++++
>  6 files changed, 39 insertions(+), 4 deletions(-)
>
> diff --git a/kvm-all.c b/kvm-all.c
> index f33354d..b31d085 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -51,6 +51,7 @@ typedef struct KVMSlot
>  typedef struct kvm_dirty_log KVMDirtyLog;
>
>  int kvm_allowed = 0;
> +int kvm_use_kernel_chip = 0;
>
>  struct KVMState
>  {
> diff --git a/kvm.h b/kvm.h
> index f0c9201..49a2b56 100644
> --- a/kvm.h
> +++ b/kvm.h
> @@ -20,6 +20,7 @@
>
>  #ifdef CONFIG_KVM
>  extern int kvm_allowed;
> +extern int kvm_use_kernel_chip;
>
>  #define kvm_enabled() (kvm_allowed)
>  #else
> diff --git a/qemu-config.c b/qemu-config.c
> index bafaea2..9461766 100644
> --- a/qemu-config.c
> +++ b/qemu-config.c
> @@ -184,12 +184,28 @@ QemuOptsList qemu_rtc_opts = {
>      },
>  };
>
> +QemuOptsList qemu_kvm_opts = {
> +    .name = "kvm",
> +    .head = QTAILQ_HEAD_INITIALIZER(qemu_kvm_opts.head),
> +    .desc = {
> +        {
> +            .name = "irqchip-in-kernel",
> +            .type = QEMU_OPT_BOOL,
> +        },{
> +            .name = "enabled",
> +            .type = QEMU_OPT_BOOL,
> +        },
> +        { /* end if list */ }
> +    },
> +};
> +
>  static QemuOptsList *lists[] = {
>      &qemu_drive_opts,
>      &qemu_chardev_opts,
>      &qemu_device_opts,
>      &qemu_net_opts,
>      &qemu_rtc_opts,
> +    &qemu_kvm_opts,
>      NULL,
>  };
>
> diff --git a/qemu-config.h b/qemu-config.h
> index cdad5ac..58cead2 100644
> --- a/qemu-config.h
> +++ b/qemu-config.h
> @@ -6,6 +6,7 @@ extern QemuOptsList qemu_chardev_opts;
>  extern QemuOptsList qemu_device_opts;
>  extern QemuOptsList qemu_net_opts;
>  extern QemuOptsList qemu_rtc_opts;
> +extern QemuOptsList qemu_kvm_opts;
>
>  int qemu_set_option(const char *str);
>
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 3dd76b3..1cb8431 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -1431,15 +1431,20 @@ Set the filename for the BIOS.
>  ETEXI
>
>  #ifdef CONFIG_KVM
> -DEF("enable-kvm", 0, QEMU_OPTION_enable_kvm, \
> -    "-enable-kvm     enable KVM full virtualization support\n")
> -#endif
> +HXCOMM Options deprecated by -kvm
> +DEF("enable-kvm", 0, QEMU_OPTION_enable_kvm, "")
> +
> +DEF("kvm", HAS_ARG, QEMU_OPTION_kvm, \
> +    "-kvm enable=on|off,irqchip-in-kernel=on|off\n" \
> +    "                enable KVM full virtualization support\n")
>  STEXI
> -@item -enable-kvm
> +@item -kvm [enable=on|off][,irqchip-in-kernel=on|off]
>  Enable KVM full virtualization support. This option is only available
>  if KVM support is enabled when compiling.
>  ETEXI
>
> +#endif
> +
>  #ifdef CONFIG_XEN
>  DEF("xen-domid", HAS_ARG, QEMU_OPTION_xen_domid,
>      "-xen-domid id   specify xen guest domain id\n")
> diff --git a/vl.c b/vl.c
> index afe01af..a6f9eb7 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -5353,6 +5353,17 @@ int main(int argc, char **argv, char **envp)
>              case QEMU_OPTION_enable_kvm:
>                  kvm_allowed = 1;
>                  break;
> +            case QEMU_OPTION_kvm:
> +
> +                opts = qemu_opts_parse(&qemu_kvm_opts, optarg, NULL);
> +                if (!opts) {
> +                    fprintf(stderr, "parse error: %s\n", optarg);
> +                    exit(1);
> +                }
> +
> +                kvm_allowed = qemu_opt_get_bool(opts, "enabled", 1);
> +                kvm_use_kernel_chip = qemu_opt_get_bool(opts, "irqchip-in-kernel", 1);
> +                break;
>  #endif
>              case QEMU_OPTION_usb:
>                  usb_enabled = 1;
>
Glauber Costa Oct. 7, 2009, 11:14 p.m. UTC | #2
On Wed, Oct 07, 2009 at 06:00:34PM -0500, Anthony Liguori wrote:
> Glauber Costa wrote:
>> This option deprecates --enable-kvm. It is a more flexible option,
>> that makes use of qemu-opts, and allow us to pass on options to enable or
>> disable kernel irqchip, for example.
>>   
>
> With proper qdev support, you could select kvm device models based on  
> -device so I think this option isn't all that useful.
>
> What I'd like to see in the interim is a kvm specific machine type  
> that's defaulted to if kvm is enabled.  I think this would be useful not  
> only for enabling things like in-kernel apic, but also for selecting a  
> default cpu model.
I don't really follow.

even if we have qdev on the irq controllers, one could still come up with
situations in which we'd like to force the use of one device over another.

>
>> Signed-off-by: Glauber Costa <glommer@redhat.com>
>> ---
>>  kvm-all.c       |    1 +
>>  kvm.h           |    1 +
>>  qemu-config.c   |   16 ++++++++++++++++
>>  qemu-config.h   |    1 +
>>  qemu-options.hx |   13 +++++++++----
>>  vl.c            |   11 +++++++++++
>>  6 files changed, 39 insertions(+), 4 deletions(-)
>>
>> diff --git a/kvm-all.c b/kvm-all.c
>> index f33354d..b31d085 100644
>> --- a/kvm-all.c
>> +++ b/kvm-all.c
>> @@ -51,6 +51,7 @@ typedef struct KVMSlot
>>  typedef struct kvm_dirty_log KVMDirtyLog;
>>
>>  int kvm_allowed = 0;
>> +int kvm_use_kernel_chip = 0;
>>
>>  struct KVMState
>>  {
>> diff --git a/kvm.h b/kvm.h
>> index f0c9201..49a2b56 100644
>> --- a/kvm.h
>> +++ b/kvm.h
>> @@ -20,6 +20,7 @@
>>
>>  #ifdef CONFIG_KVM
>>  extern int kvm_allowed;
>> +extern int kvm_use_kernel_chip;
>>
>>  #define kvm_enabled() (kvm_allowed)
>>  #else
>> diff --git a/qemu-config.c b/qemu-config.c
>> index bafaea2..9461766 100644
>> --- a/qemu-config.c
>> +++ b/qemu-config.c
>> @@ -184,12 +184,28 @@ QemuOptsList qemu_rtc_opts = {
>>      },
>>  };
>>
>> +QemuOptsList qemu_kvm_opts = {
>> +    .name = "kvm",
>> +    .head = QTAILQ_HEAD_INITIALIZER(qemu_kvm_opts.head),
>> +    .desc = {
>> +        {
>> +            .name = "irqchip-in-kernel",
>> +            .type = QEMU_OPT_BOOL,
>> +        },{
>> +            .name = "enabled",
>> +            .type = QEMU_OPT_BOOL,
>> +        },
>> +        { /* end if list */ }
>> +    },
>> +};
>> +
>>  static QemuOptsList *lists[] = {
>>      &qemu_drive_opts,
>>      &qemu_chardev_opts,
>>      &qemu_device_opts,
>>      &qemu_net_opts,
>>      &qemu_rtc_opts,
>> +    &qemu_kvm_opts,
>>      NULL,
>>  };
>>
>> diff --git a/qemu-config.h b/qemu-config.h
>> index cdad5ac..58cead2 100644
>> --- a/qemu-config.h
>> +++ b/qemu-config.h
>> @@ -6,6 +6,7 @@ extern QemuOptsList qemu_chardev_opts;
>>  extern QemuOptsList qemu_device_opts;
>>  extern QemuOptsList qemu_net_opts;
>>  extern QemuOptsList qemu_rtc_opts;
>> +extern QemuOptsList qemu_kvm_opts;
>>
>>  int qemu_set_option(const char *str);
>>
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index 3dd76b3..1cb8431 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -1431,15 +1431,20 @@ Set the filename for the BIOS.
>>  ETEXI
>>
>>  #ifdef CONFIG_KVM
>> -DEF("enable-kvm", 0, QEMU_OPTION_enable_kvm, \
>> -    "-enable-kvm     enable KVM full virtualization support\n")
>> -#endif
>> +HXCOMM Options deprecated by -kvm
>> +DEF("enable-kvm", 0, QEMU_OPTION_enable_kvm, "")
>> +
>> +DEF("kvm", HAS_ARG, QEMU_OPTION_kvm, \
>> +    "-kvm enable=on|off,irqchip-in-kernel=on|off\n" \
>> +    "                enable KVM full virtualization support\n")
>>  STEXI
>> -@item -enable-kvm
>> +@item -kvm [enable=on|off][,irqchip-in-kernel=on|off]
>>  Enable KVM full virtualization support. This option is only available
>>  if KVM support is enabled when compiling.
>>  ETEXI
>>
>> +#endif
>> +
>>  #ifdef CONFIG_XEN
>>  DEF("xen-domid", HAS_ARG, QEMU_OPTION_xen_domid,
>>      "-xen-domid id   specify xen guest domain id\n")
>> diff --git a/vl.c b/vl.c
>> index afe01af..a6f9eb7 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -5353,6 +5353,17 @@ int main(int argc, char **argv, char **envp)
>>              case QEMU_OPTION_enable_kvm:
>>                  kvm_allowed = 1;
>>                  break;
>> +            case QEMU_OPTION_kvm:
>> +
>> +                opts = qemu_opts_parse(&qemu_kvm_opts, optarg, NULL);
>> +                if (!opts) {
>> +                    fprintf(stderr, "parse error: %s\n", optarg);
>> +                    exit(1);
>> +                }
>> +
>> +                kvm_allowed = qemu_opt_get_bool(opts, "enabled", 1);
>> +                kvm_use_kernel_chip = qemu_opt_get_bool(opts, "irqchip-in-kernel", 1);
>> +                break;
>>  #endif
>>              case QEMU_OPTION_usb:
>>                  usb_enabled = 1;
>>   
>
>
> -- 
> Regards,
>
> Anthony Liguori
>
Anthony Liguori Oct. 7, 2009, 11:28 p.m. UTC | #3
Glauber Costa wrote:
> On Wed, Oct 07, 2009 at 06:00:34PM -0500, Anthony Liguori wrote:
>   
>> Glauber Costa wrote:
>>     
>>> This option deprecates --enable-kvm. It is a more flexible option,
>>> that makes use of qemu-opts, and allow us to pass on options to enable or
>>> disable kernel irqchip, for example.
>>>   
>>>       
>> With proper qdev support, you could select kvm device models based on  
>> -device so I think this option isn't all that useful.
>>
>> What I'd like to see in the interim is a kvm specific machine type  
>> that's defaulted to if kvm is enabled.  I think this would be useful not  
>> only for enabling things like in-kernel apic, but also for selecting a  
>> default cpu model.
>>     
> I don't really follow.
>
> even if we have qdev on the irq controllers, one could still come up with
> situations in which we'd like to force the use of one device over another.
>   

Right, my assumption is that the devices will have different qdev names 
and therefore a user can select which one gets used.

Right now, -device is just additive.  I'm not sure the best way to 
express something like, replace this existing device with this new 
device.  Maybe some trickery with id=.  Gerd, any thoughts?

Regards,

Anthony Liguori
Avi Kivity Oct. 8, 2009, 2:07 p.m. UTC | #4
On 10/08/2009 01:00 AM, Anthony Liguori wrote:
> Glauber Costa wrote:
>> This option deprecates --enable-kvm. It is a more flexible option,
>> that makes use of qemu-opts, and allow us to pass on options to 
>> enable or
>> disable kernel irqchip, for example.
>
> With proper qdev support, you could select kvm device models based on 
> -device so I think this option isn't all that useful.

qdev (mostly?) is about the guest interface; this is more about the host 
implementation, so akin to -drive file= and cache=.

> What I'd like to see in the interim is a kvm specific machine type 
> that's defaulted to if kvm is enabled.  I think this would be useful 
> not only for enabling things like in-kernel apic, but also for 
> selecting a default cpu model.

We should make a distinction between guest-visible changes and 
accelerators like kvm-irqchip and vhost.
Anthony Liguori Oct. 8, 2009, 2:23 p.m. UTC | #5
Avi Kivity wrote:
> On 10/08/2009 01:00 AM, Anthony Liguori wrote:
>> Glauber Costa wrote:
>>> This option deprecates --enable-kvm. It is a more flexible option,
>>> that makes use of qemu-opts, and allow us to pass on options to 
>>> enable or
>>> disable kernel irqchip, for example.
>>
>> With proper qdev support, you could select kvm device models based on 
>> -device so I think this option isn't all that useful.
>
> qdev (mostly?) is about the guest interface; this is more about the 
> host implementation, so akin to -drive file= and cache=.

If we make the kvm device a separate device, then it's about choosing 
which device to use.

Regardless, even if it were a single device, it's still a property of 
the device so a qdev property would be the logical way to expose it.

>> What I'd like to see in the interim is a kvm specific machine type 
>> that's defaulted to if kvm is enabled.  I think this would be useful 
>> not only for enabling things like in-kernel apic, but also for 
>> selecting a default cpu model.
>
> We should make a distinction between guest-visible changes and 
> accelerators like kvm-irqchip and vhost.

They are different device models that happen to implement the same 
guest-visible device.

Regards,

Anthony Liguori
Avi Kivity Oct. 8, 2009, 2:30 p.m. UTC | #6
On 10/08/2009 04:23 PM, Anthony Liguori wrote:
>>> What I'd like to see in the interim is a kvm specific machine type 
>>> that's defaulted to if kvm is enabled.  I think this would be useful 
>>> not only for enabling things like in-kernel apic, but also for 
>>> selecting a default cpu model.
>>
>> We should make a distinction between guest-visible changes and 
>> accelerators like kvm-irqchip and vhost.
>
> They are different device models that happen to implement the same 
> guest-visible device. 

I think this separation is artificial.  From both the guest's view and 
the user's view, there's no difference between qemu ioapic and kvm 
ioapic (modulo bugs).  To me this indicates they're the same device.

We'll have the same problem with vhost-net, only there the duplication 
will be much greater if we split the implementation.
Anthony Liguori Oct. 8, 2009, 2:33 p.m. UTC | #7
Avi Kivity wrote:
> On 10/08/2009 04:23 PM, Anthony Liguori wrote:
>>>> What I'd like to see in the interim is a kvm specific machine type 
>>>> that's defaulted to if kvm is enabled.  I think this would be 
>>>> useful not only for enabling things like in-kernel apic, but also 
>>>> for selecting a default cpu model.
>>>
>>> We should make a distinction between guest-visible changes and 
>>> accelerators like kvm-irqchip and vhost.
>>
>> They are different device models that happen to implement the same 
>> guest-visible device. 
>
> I think this separation is artificial.  From both the guest's view and 
> the user's view, there's no difference between qemu ioapic and kvm 
> ioapic (modulo bugs).  To me this indicates they're the same device.

That's not always been true historically.  For instance, the in-kernel 
pit does interrupt catchup whereas the userspace pit does not.

> We'll have the same problem with vhost-net, only there the duplication 
> will be much greater if we split the implementation.

I'd rather not treat vhost-net like a device model and instead treat it 
like a -net backend.  If we can bounce requests through userspace (and 
we can), we should be able to use it for any device model.  In the case 
of virtio-net, we should be able to short-circuit things such that we 
don't have to go through userspace.  It's admittedly very hairy without 
a point-to-point net abstraction.

Regards,

Anthony Liguori
Gleb Natapov Oct. 8, 2009, 2:34 p.m. UTC | #8
On Thu, Oct 08, 2009 at 04:30:17PM +0200, Avi Kivity wrote:
> On 10/08/2009 04:23 PM, Anthony Liguori wrote:
> >>>What I'd like to see in the interim is a kvm specific machine
> >>>type that's defaulted to if kvm is enabled.  I think this
> >>>would be useful not only for enabling things like in-kernel
> >>>apic, but also for selecting a default cpu model.
> >>
> >>We should make a distinction between guest-visible changes and
> >>accelerators like kvm-irqchip and vhost.
> >
> >They are different device models that happen to implement the same
> >guest-visible device.
> 
> I think this separation is artificial.  From both the guest's view
> and the user's view, there's no difference between qemu ioapic and
> kvm ioapic (modulo bugs).  To me this indicates they're the same
> device.
> 
Same device but different implementation. The only two things that are
used from qemu apic implementation with in-kernel apic are migration and
reset. Both depends on implementation. Reusing code will require us to
keep implementation in sync. This reuse is a constant source of bugs
BTW.

> We'll have the same problem with vhost-net, only there the
> duplication will be much greater if we split the implementation.
> 
> -- 
> error compiling committee.c: too many arguments to function
> 
> 

--
			Gleb.
Avi Kivity Oct. 8, 2009, 2:41 p.m. UTC | #9
On 10/08/2009 04:33 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>> On 10/08/2009 04:23 PM, Anthony Liguori wrote:
>>>>> What I'd like to see in the interim is a kvm specific machine type 
>>>>> that's defaulted to if kvm is enabled.  I think this would be 
>>>>> useful not only for enabling things like in-kernel apic, but also 
>>>>> for selecting a default cpu model.
>>>>
>>>> We should make a distinction between guest-visible changes and 
>>>> accelerators like kvm-irqchip and vhost.
>>>
>>> They are different device models that happen to implement the same 
>>> guest-visible device. 
>>
>> I think this separation is artificial.  From both the guest's view 
>> and the user's view, there's no difference between qemu ioapic and 
>> kvm ioapic (modulo bugs).  To me this indicates they're the same device.
>
> That's not always been true historically.  For instance, the in-kernel 
> pit does interrupt catchup whereas the userspace pit does not.

That's a bug in the userspace pit :)

I'm worried about things like 'info pit' needing dual implementations.

>> We'll have the same problem with vhost-net, only there the 
>> duplication will be much greater if we split the implementation.
>
> I'd rather not treat vhost-net like a device model and instead treat 
> it like a -net backend.  If we can bounce requests through userspace 
> (and we can), we should be able to use it for any device model.  In 
> the case of virtio-net, we should be able to short-circuit things such 
> that we don't have to go through userspace.  It's admittedly very 
> hairy without a point-to-point net abstraction.
>

Interesting idea.  I think it'll be hairy even with a point-to-point net 
abstraction, but it's worth trying.
Anthony Liguori Oct. 8, 2009, 2:56 p.m. UTC | #10
Avi Kivity wrote:
> That's a bug in the userspace pit :)
>
> I'm worried about things like 'info pit' needing dual implementations.

You mean info qdm?

It'll Just Work.

>>> We'll have the same problem with vhost-net, only there the 
>>> duplication will be much greater if we split the implementation.
>>
>> I'd rather not treat vhost-net like a device model and instead treat 
>> it like a -net backend.  If we can bounce requests through userspace 
>> (and we can), we should be able to use it for any device model.  In 
>> the case of virtio-net, we should be able to short-circuit things 
>> such that we don't have to go through userspace.  It's admittedly 
>> very hairy without a point-to-point net abstraction.
>>
>
> Interesting idea.  I think it'll be hairy even with a point-to-point 
> net abstraction, but it's worth trying.

Could be interesting trying to tie the e1000 to vhost-net performance 
wise...

Regards,

Anthony Liguori
Avi Kivity Oct. 8, 2009, 3:05 p.m. UTC | #11
On 10/08/2009 04:56 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>> That's a bug in the userspace pit :)
>>
>> I'm worried about things like 'info pit' needing dual implementations.
>
> You mean info qdm?
>
> It'll Just Work.
>

I mean

    (qemu) info pit
    pit: counter 0x1234 period 0x3453

or something.

>> Interesting idea.  I think it'll be hairy even with a point-to-point 
>> net abstraction, but it's worth trying.
>
> Could be interesting trying to tie the e1000 to vhost-net performance 
> wise...

I don't think there's much chance of that - e1000 places important 
registers (like interrupt disable) in BARs whereas virtio puts them in 
the ring.
Anthony Liguori Oct. 8, 2009, 3:14 p.m. UTC | #12
Avi Kivity wrote:
> On 10/08/2009 04:56 PM, Anthony Liguori wrote:
>> Avi Kivity wrote:
>>> That's a bug in the userspace pit :)
>>>
>>> I'm worried about things like 'info pit' needing dual implementations.
>>
>> You mean info qdm?
>>
>> It'll Just Work.
>>
>
> I mean
>
>    (qemu) info pit
>    pit: counter 0x1234 period 0x3453
>
> or something.

When the pit is qdev converted, it will have properties.  counter and 
period could be two of those properties in which case, 'info qtree 
i8254' would show the above output or 'info qtree i8254-kvm' would show it.

In fact, a very good reason to have two device models is that there are 
certain properties that make sense for i8254-kvm that don't make sense 
for i8254.  For instance, you could expose information about interrupt 
catch up.

Regards,

Anthony Liguori
Gerd Hoffmann Oct. 12, 2009, 11:15 a.m. UTC | #13
>> (qemu) info pit
>> pit: counter 0x1234 period 0x3453
>>
>> or something.
>
> When the pit is qdev converted, it will have properties. counter and
> period could be two of those properties in which case, 'info qtree
> i8254' would show the above output or 'info qtree i8254-kvm' would show it.

Device state belongs more into vmstate than into properties.  Ideally 
each qdev device has the vmstate entry filled as well, so looking up the 
device in the tree then dump the state can be done.

cheers,
   Gerd
Gerd Hoffmann Oct. 12, 2009, 11:58 a.m. UTC | #14
On 10/08/09 01:28, Anthony Liguori wrote:
> Glauber Costa wrote:
>> even if we have qdev on the irq controllers, one could still come up with
>> situations in which we'd like to force the use of one device over
>> another.
>
> Right, my assumption is that the devices will have different qdev names
> and therefore a user can select which one gets used.
>
> Right now, -device is just additive. I'm not sure the best way to
> express something like, replace this existing device with this new
> device. Maybe some trickery with id=. Gerd, any thoughts?

-M $machine gives you a barebone machine with all core devices which 
belong to it. You can't easily remove and/or replace devices. 
Especially not something central as the IRQ controller.  But also no 
other core components, i.e. you wouldn't stick a piix4 ide controller 
into a Q35 machine.  Just say 'no'.

That leaves two options:

   (1) create two devices, create new machines which use the kvm
       versions (aka -M pc-kvm).
   (2) make using the in-kernel kvm code a device property.

I think (2) is the way to go.  Especially with multiple devices having 
kvm support (apic, pit, more?) (1) becomes unmanageable.

Right now there is no way to set properties for devices *not* added via 
-device[1].  We'll need support for that anyway though (to set rtc 
properties for example), so we can have a apic.kvm property and switch 
between user/kernel implementions using it.

cheers,
   Gerd

[1] my idea for implementing that is extending the compat property
     mechanism.
Anthony Liguori Oct. 12, 2009, 2:04 p.m. UTC | #15
Gerd Hoffmann wrote:
> On 10/08/09 01:28, Anthony Liguori wrote:
>> Glauber Costa wrote:
>>> even if we have qdev on the irq controllers, one could still come up 
>>> with
>>> situations in which we'd like to force the use of one device over
>>> another.
>>
>> Right, my assumption is that the devices will have different qdev names
>> and therefore a user can select which one gets used.
>>
>> Right now, -device is just additive. I'm not sure the best way to
>> express something like, replace this existing device with this new
>> device. Maybe some trickery with id=. Gerd, any thoughts?
>
> -M $machine gives you a barebone machine with all core devices which 
> belong to it. You can't easily remove and/or replace devices. 
> Especially not something central as the IRQ controller.  But also no 
> other core components, i.e. you wouldn't stick a piix4 ide controller 
> into a Q35 machine.  Just say 'no'.

This seems fundamentally flawed to me.  If you cannot remove a device 
from a machine using command line options, then how do we support 
something like -net none?  Do we make the default machine not contain a nic?

The value of a machine type to a user is that it presents a useful 
machine--not a barebones machine.  A user should not have to think about 
which type of nic they need or whether they want to enable usb.

>
> That leaves two options:
>
>   (1) create two devices, create new machines which use the kvm
>       versions (aka -M pc-kvm).
>   (2) make using the in-kernel kvm code a device property.

(3) Add the ability to remove device from a machine type.
Gerd Hoffmann Oct. 12, 2009, 3:34 p.m. UTC | #16
On 10/12/09 16:04, Anthony Liguori wrote:
> Gerd Hoffmann wrote:
>> -M $machine gives you a barebone machine with all core devices which
>> belong to it. You can't easily remove and/or replace devices.
>> Especially not something central as the IRQ controller. But also no
>> other core components, i.e. you wouldn't stick a piix4 ide controller
>> into a Q35 machine. Just say 'no'.
>
> This seems fundamentally flawed to me. If you cannot remove a device
> from a machine using command line options, then how do we support
> something like -net none?

'-net none' does not remove a nic.  It makes qemu not add the default nic.

> Do we make the default machine not contain a nic?
 >
> The value of a machine type to a user is that it presents a useful
> machine--not a barebones machine. A user should not have to think about
> which type of nic they need or whether they want to enable usb.

There are a bunch of places where qemu does something like

    if (user-did-not-specify-a-$foo-device) {
        add_$foo_device_with_defaults()
    }

That applies (for pc) to:

   * nic.
   * vga.
   * serial port.
   * parallel port.
   * cdrom drive.

For all of these (except cdrom) you can disable the creation of the 
default device via "-$dev none".  I don't consider them core devices.
Core devices are the ones which a machine can't live without, i.e. rtc, 
pic, ...

You can't add/remove core stuff like rtc, interrupt controller.  The 
virtual machine will simply not work then ...

>> (1) create two devices, create new machines which use the kvm
>> versions (aka -M pc-kvm).
>> (2) make using the in-kernel kvm code a device property.
>
> (3) Add the ability to remove device from a machine type.

-rtc none ?  Have fun booting your machine.

cheers,
   Gerd
Anthony Liguori Oct. 12, 2009, 4:31 p.m. UTC | #17
Gerd Hoffmann wrote:
> On 10/12/09 16:04, Anthony Liguori wrote:
>> Gerd Hoffmann wrote:
>>> -M $machine gives you a barebone machine with all core devices which
>>> belong to it. You can't easily remove and/or replace devices.
>>> Especially not something central as the IRQ controller. But also no
>>> other core components, i.e. you wouldn't stick a piix4 ide controller
>>> into a Q35 machine. Just say 'no'.
>>
>> This seems fundamentally flawed to me. If you cannot remove a device
>> from a machine using command line options, then how do we support
>> something like -net none?
>
> '-net none' does not remove a nic.  It makes qemu not add the default 
> nic.

qemu should not have the concept of a default nic.   Instead, if you 
choose the pc machine type, by default you get an e1000.  I think it's 
also reasonable to get a USB controller too along with the balloon 
driver and anything else we think a user could benefit from.

If you give devices well known ids, then changing a default device is 
not really that big of a deal.  It involves removing the device and 
adding a new device with that same device id.

For platform devices, like the interrupt controller/pic, the same 
principle could be applied to switch out a userspace irqchip/pit with 
the kvm kernel implementations.
Gerd Hoffmann Oct. 13, 2009, 8:05 a.m. UTC | #18
On 10/12/09 18:31, Anthony Liguori wrote:
> Gerd Hoffmann wrote:
>> On 10/12/09 16:04, Anthony Liguori wrote:
>>> Gerd Hoffmann wrote:
>>>> -M $machine gives you a barebone machine with all core devices which
>>>> belong to it. You can't easily remove and/or replace devices.
>>>> Especially not something central as the IRQ controller. But also no
>>>> other core components, i.e. you wouldn't stick a piix4 ide controller
>>>> into a Q35 machine. Just say 'no'.
>>>
>>> This seems fundamentally flawed to me. If you cannot remove a device
>>> from a machine using command line options, then how do we support
>>> something like -net none?
>>
>> '-net none' does not remove a nic. It makes qemu not add the default nic.
>
> qemu should not have the concept of a default nic.

Well, right now it has (likewise for serial, vga, ...).  And it causes 
problems with the qdev way of managing devices, so we have to find a way 
to deal with it.

Right now the default devices are tied to a command line switch, i.e. in 
case there isn't a -serial switch specified qemu will add a default 
serial device, even in case one was added via -device isa-serial.

> Instead, if you
> choose the pc machine type, by default you get an e1000. I think it's
> also reasonable to get a USB controller too along with the balloon
> driver and anything else we think a user could benefit from.

Hmm.  It makes sense to have a usable default or example configuration.

I don't think the machine type should be that though.  IMO the machine 
type should be more like a chipset, i.e. the current pc type should 
include all core stuff (pit, pic, apic, rtc, ...) and the piix devices 
(host bridge, isa bridge, apci controller, ide controller, usb 
controller) but nothing else.

> If you give devices well known ids, then changing a default device is
> not really that big of a deal. It involves removing the device and
> adding a new device with that same device id.

Might be workable for anything added via -device, because you could do 
the operations in QemuOpts space, before actually creating qdev device 
instances.  And the user could switch the nic type via
'-set device.defaultnic.driver=rtl8139' then.

I still don't like the concept though.  Configuring a second nic would 
be different from configuring the first nic, because for the first 
you'll modify the default device, the second is added instead.  libvirt 
folks will hate us for doing this.

> For platform devices, like the interrupt controller/pic, the same
> principle could be applied to switch out a userspace irqchip/pit with
> the kvm kernel implementations.

Doesn't fly.  You can't simply add interrupt controllers via -device. 
They are tied way to much with the other core devices.

cheers,
   Gerd
Anthony Liguori Oct. 13, 2009, 3:33 p.m. UTC | #19
Gerd Hoffmann wrote:
>> qemu should not have the concept of a default nic.
>
> Well, right now it has (likewise for serial, vga, ...).  And it causes 
> problems with the qdev way of managing devices, so we have to find a 
> way to deal with it.

There are a few separate issues that should be treated differently.  For 
vga, it should just be a default device in the machine description.  A 
special id should be used for the default vga card and -vga std should 
have the effect of modifying the a vga device of that id to be std.  
IOW, it should be -device-remove id=default_vga -device 
stdvga,id=default_vga.

For serial, we default to having one port.  Again, it should have a well 
known id and -serial none should be -device-remove id=default_serial.  
For nics, it's a bit complicated by the fact that -net nic doesn't 
require a model and by default, each machine may have a default model.  
We will likely have to preserve this as a config option which is 
unfortunate.

That said, without -net none, we get a single network device.  Again, it 
should have a well known id and -net none should remove that device.
> Right now the default devices are tied to a command line switch, i.e. 
> in case there isn't a -serial switch specified qemu will add a default 
> serial device, even in case one was added via -device isa-serial.

I don't think -device has to have the same property of -serial.  That 
is, -device isa-serial doesn't need to replace the default serial device 
which is connected to a vc.  Rather, we should allow a user to modify 
the char device associated with that default serial device via -set.
 
We shouldn't think of it like we add a default -serial device if no 
-serial switch was used.  Instead, there has always been a default 
isa-serial device.  You can modify it via -set or you can remove it.  
-serial none removes it.  For the first occurrence of -serial, it 
behaves like -set.  Future occurrences of -set behave like -device 
isa-serial.
Anthony Liguori Oct. 13, 2009, 3:36 p.m. UTC | #20
Gerd Hoffmann wrote:
>> For platform devices, like the interrupt controller/pic, the same
>> principle could be applied to switch out a userspace irqchip/pit with
>> the kvm kernel implementations.
>
> Doesn't fly.  You can't simply add interrupt controllers via -device. 
> They are tied way to much with the other core devices.

We definitely want a pc machine type and a kvm-pc machine type.  The 
later would change the default cpu to kvm64 and would use the in-kernel 
apic.

If we want to support non in-kernel apic for debugging, I'd suggest that 
we support it by introducing a third machine type: kvm-pc-debug.

When we complete the qdev conversion of pc, different machine types are 
just config files so it seems like a reasonable thing to do.

> cheers,
>   Gerd
Markus Armbruster Oct. 13, 2009, 7:26 p.m. UTC | #21
Anthony Liguori <aliguori@us.ibm.com> writes:

> Gerd Hoffmann wrote:
>>> qemu should not have the concept of a default nic.
>>
>> Well, right now it has (likewise for serial, vga, ...).  And it
>> causes problems with the qdev way of managing devices, so we have to
>> find a way to deal with it.
>
> There are a few separate issues that should be treated differently.
> For vga, it should just be a default device in the machine
> description.  A special id should be used for the default vga card and
> -vga std should have the effect of modifying the a vga device of that
> id to be std.  IOW, it should be -device-remove id=default_vga -device
> stdvga,id=default_vga.
>
> For serial, we default to having one port.  Again, it should have a
> well known id and -serial none should be -device-remove
> id=default_serial.  For nics, it's a bit complicated by the fact that
> -net nic doesn't require a model and by default, each machine may have
> a default model.  We will likely have to preserve this as a config
> option which is unfortunate.
>
> That said, without -net none, we get a single network device.  Again,
> it should have a well known id and -net none should remove that
> device.
>> Right now the default devices are tied to a command line switch,
>> i.e. in case there isn't a -serial switch specified qemu will add a
>> default serial device, even in case one was added via -device
>> isa-serial.
>
> I don't think -device has to have the same property of -serial.  That
> is, -device isa-serial doesn't need to replace the default serial
> device which is connected to a vc.  Rather, we should allow a user to
> modify the char device associated with that default serial device via
> -set.
>
> We shouldn't think of it like we add a default -serial device if no
> -serial switch was used.  Instead, there has always been a default
> isa-serial device.  You can modify it via -set or you can remove it.
> -serial none removes it.  For the first occurrence of -serial, it
> behaves like -set.  Future occurrences of -set behave like -device
> isa-serial.

The problem with that approach is in a part of Gerd's reply you snipped:

>> I still don't like the concept though.  Configuring a second nic would
>> be different from configuring the first nic, because for the first
>> you'll modify the default device, the second is added instead.
>> libvirt folks will hate us for doing this.

Having to use -set for configuring the first device of a kind, and
-device for the second is a bad user interface.  It's made worse by the
fact that you need -set only for some kinds of devices, namely the kinds
where QEMU provides a default.

I agree with Gerd that we should distinguish between required and
optional devices.  It's fine to require -set for modifying required
devices like RTC.  But when I configure my first and second NIC, I don't
really want to know that I'm actually modifying the first NIC and adding
the second NIC.

What about this: give the user a default FOO (for FOO in serial, NIC,
...) if he didn't configure one (no matter how, be it -device or some
legacy option to ask for FOOs).  This way, you ask for your first FOO
exactly like any other: -device.

Extra points for providing a way to say "do not give me any optional
devices I didn't explicitely ask for".
Anthony Liguori Oct. 13, 2009, 8:43 p.m. UTC | #22
>
>>> I still don't like the concept though.  Configuring a second nic would
>>> be different from configuring the first nic, because for the first
>>> you'll modify the default device, the second is added instead.
>>> libvirt folks will hate us for doing this.
>>>       
>
> Having to use -set for configuring the first device of a kind, and
> -device for the second is a bad user interface.  It's made worse by the
> fact that you need -set only for some kinds of devices, namely the kinds
> where QEMU provides a default.
>   

The guest already has a nic, if you want to modify an existing nic, then 
you use -set.  If you want to add a second nic, you use -device.  This 
isn't two difference behaviors.

> I agree with Gerd that we should distinguish between required and
> optional devices.  It's fine to require -set for modifying required
> devices like RTC.  But when I configure my first and second NIC, I don't
> really want to know that I'm actually modifying the first NIC and adding
> the second NIC.
>   

I think what you're arguing for is a truly bare bones machine type where 
there are no assumptions about having a first nic.

But let's look at this from a much higher level.  What is a user 
typically going to want to do?  They're probably going to want to change 
the networking settings globally.  They'll either want to always use tap 
and use the default network device or they're going to want to always 
have two nics because they have to physical devices with separate networks.

A user would either create a new machine type to use for all of their 
machines, or they would modify a global host config to change from slirp 
to tap.  It shouldn't be the common case that they are manually 
specifying -device command line options.  -device is pretty obtuse and 
really is an expert option for things like libvirt and as a placeholder 
for a proper config.

> What about this: give the user a default FOO (for FOO in serial, NIC,
> ...) if he didn't configure one (no matter how, be it -device or some
> legacy option to ask for FOOs).  This way, you ask for your first FOO
> exactly like any other: -device.
>   

You cannot express the concept "give a user FOO if they didn't ask for 
one" in a machine config file.  That forces machine information to be 
baked into qemu which would be unfortunate.  It suggests that you need 
another mechanism to configure what the type of these defaults are.

Regards,

Anthony Liguori
Jamie Lokier Oct. 13, 2009, 10:57 p.m. UTC | #23
Anthony Liguori wrote:
> We definitely want a pc machine type and a kvm-pc machine type.  The 
> later would change the default cpu to kvm64 and would use the in-kernel 
> apic.

kvm32-pc for 32-bit hosts?

-- Jamie
Gerd Hoffmann Oct. 14, 2009, 8:08 a.m. UTC | #24
Hi,

> I think what you're arguing for is a truly bare bones machine type where
> there are no assumptions about having a first nic.
>
> But let's look at this from a much higher level. What is a user
> typically going to want to do? They're probably going to want to change
> the networking settings globally. They'll either want to always use tap
> and use the default network device or they're going to want to always
> have two nics because they have to physical devices with separate networks.
>
> A user would either create a new machine type to use for all of their
> machines, or they would modify a global host config to change from slirp
> to tap. It shouldn't be the common case that they are manually
> specifying -device command line options. -device is pretty obtuse and
> really is an expert option for things like libvirt and as a placeholder
> for a proper config.

Time to polish and post my config file patches for discussion ...

I expect we'll have *two* kinds of config files:

#1 describes a machine type, i.e. -M $machine with only the really 
essential hardware.  Users should not have to deal with this at all. 
Something like the dtc bits from pbrook.

#2 describes the configuration of a virtual machine, i.e. what 
additional devices are plugged in, virtual disks, network setup, ...

I have patches for #2 which essentially read/write QemuOpts to a 
git-style config file.

We could simply ship a type #2 default config file with nic, vga, serial 
etc.  Or two, with the second being the -nographic variant.

cheers,
   Gerd
diff mbox

Patch

diff --git a/kvm-all.c b/kvm-all.c
index f33354d..b31d085 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -51,6 +51,7 @@  typedef struct KVMSlot
 typedef struct kvm_dirty_log KVMDirtyLog;
 
 int kvm_allowed = 0;
+int kvm_use_kernel_chip = 0;
 
 struct KVMState
 {
diff --git a/kvm.h b/kvm.h
index f0c9201..49a2b56 100644
--- a/kvm.h
+++ b/kvm.h
@@ -20,6 +20,7 @@ 
 
 #ifdef CONFIG_KVM
 extern int kvm_allowed;
+extern int kvm_use_kernel_chip;
 
 #define kvm_enabled() (kvm_allowed)
 #else
diff --git a/qemu-config.c b/qemu-config.c
index bafaea2..9461766 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -184,12 +184,28 @@  QemuOptsList qemu_rtc_opts = {
     },
 };
 
+QemuOptsList qemu_kvm_opts = {
+    .name = "kvm",
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_kvm_opts.head),
+    .desc = {
+        {
+            .name = "irqchip-in-kernel",
+            .type = QEMU_OPT_BOOL,
+        },{
+            .name = "enabled",
+            .type = QEMU_OPT_BOOL,
+        },
+        { /* end if list */ }
+    },
+};
+
 static QemuOptsList *lists[] = {
     &qemu_drive_opts,
     &qemu_chardev_opts,
     &qemu_device_opts,
     &qemu_net_opts,
     &qemu_rtc_opts,
+    &qemu_kvm_opts,
     NULL,
 };
 
diff --git a/qemu-config.h b/qemu-config.h
index cdad5ac..58cead2 100644
--- a/qemu-config.h
+++ b/qemu-config.h
@@ -6,6 +6,7 @@  extern QemuOptsList qemu_chardev_opts;
 extern QemuOptsList qemu_device_opts;
 extern QemuOptsList qemu_net_opts;
 extern QemuOptsList qemu_rtc_opts;
+extern QemuOptsList qemu_kvm_opts;
 
 int qemu_set_option(const char *str);
 
diff --git a/qemu-options.hx b/qemu-options.hx
index 3dd76b3..1cb8431 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1431,15 +1431,20 @@  Set the filename for the BIOS.
 ETEXI
 
 #ifdef CONFIG_KVM
-DEF("enable-kvm", 0, QEMU_OPTION_enable_kvm, \
-    "-enable-kvm     enable KVM full virtualization support\n")
-#endif
+HXCOMM Options deprecated by -kvm
+DEF("enable-kvm", 0, QEMU_OPTION_enable_kvm, "")
+
+DEF("kvm", HAS_ARG, QEMU_OPTION_kvm, \
+    "-kvm enable=on|off,irqchip-in-kernel=on|off\n" \
+    "                enable KVM full virtualization support\n")
 STEXI
-@item -enable-kvm
+@item -kvm [enable=on|off][,irqchip-in-kernel=on|off]
 Enable KVM full virtualization support. This option is only available
 if KVM support is enabled when compiling.
 ETEXI
 
+#endif
+
 #ifdef CONFIG_XEN
 DEF("xen-domid", HAS_ARG, QEMU_OPTION_xen_domid,
     "-xen-domid id   specify xen guest domain id\n")
diff --git a/vl.c b/vl.c
index afe01af..a6f9eb7 100644
--- a/vl.c
+++ b/vl.c
@@ -5353,6 +5353,17 @@  int main(int argc, char **argv, char **envp)
             case QEMU_OPTION_enable_kvm:
                 kvm_allowed = 1;
                 break;
+            case QEMU_OPTION_kvm:
+
+                opts = qemu_opts_parse(&qemu_kvm_opts, optarg, NULL);
+                if (!opts) {
+                    fprintf(stderr, "parse error: %s\n", optarg);
+                    exit(1);
+                }
+
+                kvm_allowed = qemu_opt_get_bool(opts, "enabled", 1);
+                kvm_use_kernel_chip = qemu_opt_get_bool(opts, "irqchip-in-kernel", 1);
+                break;
 #endif
             case QEMU_OPTION_usb:
                 usb_enabled = 1;