diff mbox

[v7] kvm: notify host when the guest is panicked

Message ID 500A565A.8080403@cn.fujitsu.com
State New
Headers show

Commit Message

Wen Congyang July 21, 2012, 7:12 a.m. UTC
We can know the guest is panicked when the guest runs on xen.
But we do not have such feature on kvm.

Another purpose of this feature is: management app(for example:
libvirt) can do auto dump when the guest is panicked. If management
app does not do auto dump, the guest's user can do dump by hand if
he sees the guest is panicked.

We have three solutions to implement this feature:
1. use vmcall
2. use I/O port
3. use virtio-serial.

We have decided to avoid touching hypervisor. The reason why I choose
choose the I/O port is:
1. it is easier to implememt
2. it does not depend any virtual device
3. it can work when startint the kernel

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 arch/ia64/include/asm/kvm_para.h    |    5 +++++
 arch/powerpc/include/asm/kvm_para.h |    5 +++++
 arch/s390/include/asm/kvm_para.h    |    5 +++++
 arch/x86/include/asm/kvm_para.h     |    7 +++++++
 arch/x86/kernel/kvm.c               |   14 ++++++++++++++
 include/linux/kvm_para.h            |   13 +++++++++++++
 6 files changed, 49 insertions(+), 0 deletions(-)

Comments

Jan Kiszka July 21, 2012, 7:19 a.m. UTC | #1
On 2012-07-21 09:12, Wen Congyang wrote:
> We can know the guest is panicked when the guest runs on xen.
> But we do not have such feature on kvm.
> 
> Another purpose of this feature is: management app(for example:
> libvirt) can do auto dump when the guest is panicked. If management
> app does not do auto dump, the guest's user can do dump by hand if
> he sees the guest is panicked.
> 
> We have three solutions to implement this feature:
> 1. use vmcall
> 2. use I/O port
> 3. use virtio-serial.
> 
> We have decided to avoid touching hypervisor. The reason why I choose
> choose the I/O port is:
> 1. it is easier to implememt
> 2. it does not depend any virtual device
> 3. it can work when startint the kernel
> 
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> ---
>  arch/ia64/include/asm/kvm_para.h    |    5 +++++
>  arch/powerpc/include/asm/kvm_para.h |    5 +++++
>  arch/s390/include/asm/kvm_para.h    |    5 +++++
>  arch/x86/include/asm/kvm_para.h     |    7 +++++++
>  arch/x86/kernel/kvm.c               |   14 ++++++++++++++
>  include/linux/kvm_para.h            |   13 +++++++++++++
>  6 files changed, 49 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h
> index 2019cb9..187c0e2 100644
> --- a/arch/ia64/include/asm/kvm_para.h
> +++ b/arch/ia64/include/asm/kvm_para.h
> @@ -31,6 +31,11 @@ static inline bool kvm_check_and_clear_guest_paused(void)
>  	return false;
>  }
>  
> +static inline unsigned int kvm_arch_pv_features(void)
> +{
> +	return 0;
> +}
> +
>  #endif
>  
>  #endif
> diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
> index c18916b..be81aac 100644
> --- a/arch/powerpc/include/asm/kvm_para.h
> +++ b/arch/powerpc/include/asm/kvm_para.h
> @@ -211,6 +211,11 @@ static inline bool kvm_check_and_clear_guest_paused(void)
>  	return false;
>  }
>  
> +static inline unsigned int kvm_arch_pv_features(void)
> +{
> +	return 0;
> +}
> +
>  #endif /* __KERNEL__ */
>  
>  #endif /* __POWERPC_KVM_PARA_H__ */
> diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
> index a988329..3d993b7 100644
> --- a/arch/s390/include/asm/kvm_para.h
> +++ b/arch/s390/include/asm/kvm_para.h
> @@ -154,6 +154,11 @@ static inline bool kvm_check_and_clear_guest_paused(void)
>  	return false;
>  }
>  
> +static inline unsigned int kvm_arch_pv_features(void)
> +{
> +	return 0;
> +}
> +
>  #endif
>  
>  #endif /* __S390_KVM_PARA_H */
> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> index 63ab166..c8ad86e 100644
> --- a/arch/x86/include/asm/kvm_para.h
> +++ b/arch/x86/include/asm/kvm_para.h
> @@ -89,6 +89,8 @@ struct kvm_vcpu_pv_apf_data {
>  	__u32 enabled;
>  };
>  
> +#define KVM_PV_PORT	(0x505UL)
> +
>  #ifdef __KERNEL__
>  #include <asm/processor.h>
>  
> @@ -221,6 +223,11 @@ static inline void kvm_disable_steal_time(void)
>  }
>  #endif
>  
> +static inline unsigned int kvm_arch_pv_features(void)
> +{
> +	return inl(KVM_PV_PORT);
> +}
> +
>  #endif /* __KERNEL__ */
>  
>  #endif /* _ASM_X86_KVM_PARA_H */
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index e554e5a..9a97f7e 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -328,6 +328,17 @@ static struct notifier_block kvm_pv_reboot_nb = {
>  	.notifier_call = kvm_pv_reboot_notify,
>  };
>  
> +static int
> +kvm_pv_panic_notify(struct notifier_block *nb, unsigned long code, void *unused)
> +{
> +	outl(KVM_PV_PANICKED, KVM_PV_PORT);
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block kvm_pv_panic_nb = {
> +	.notifier_call = kvm_pv_panic_notify,
> +};
> +
>  static u64 kvm_steal_clock(int cpu)
>  {
>  	u64 steal;
> @@ -414,6 +425,9 @@ void __init kvm_guest_init(void)
>  
>  	paravirt_ops_setup();
>  	register_reboot_notifier(&kvm_pv_reboot_nb);
> +	if (kvm_pv_has_feature(KVM_PV_FEATURE_PANICKED))
> +		atomic_notifier_chain_register(&panic_notifier_list,
> +			&kvm_pv_panic_nb);
>  	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
>  		spin_lock_init(&async_pf_sleepers[i].lock);
>  	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
> diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
> index ff476dd..e73efcf 100644
> --- a/include/linux/kvm_para.h
> +++ b/include/linux/kvm_para.h
> @@ -20,6 +20,12 @@
>  #define KVM_HC_FEATURES			3
>  #define KVM_HC_PPC_MAP_MAGIC_PAGE	4
>  
> +/* The bit of the value read from KVM_PV_PORT */
> +#define KVM_PV_FEATURE_PANICKED	0
> +
> +/* The value writen to KVM_PV_PORT */
> +#define KVM_PV_PANICKED		1
> +
>  /*
>   * hypercalls use architecture specific
>   */
> @@ -33,5 +39,12 @@ static inline int kvm_para_has_feature(unsigned int feature)
>  		return 1;
>  	return 0;
>  }
> +
> +static inline int kvm_pv_has_feature(unsigned int feature)
> +{
> +	if (kvm_arch_pv_features() & (1UL << feature))

Reading from an invalid I/O port will return -1. So your test will
deliver a wrong result on a platform that doesn't support this PV channel.

Jan

> +		return 1;
> +	return 0;
> +}
>  #endif /* __KERNEL__ */
>  #endif /* __LINUX_KVM_PARA_H */
>
Wen Congyang July 21, 2012, 8:41 a.m. UTC | #2
At 07/21/2012 03:19 PM, Jan Kiszka Wrote:
> On 2012-07-21 09:12, Wen Congyang wrote:
>> We can know the guest is panicked when the guest runs on xen.
>> But we do not have such feature on kvm.
>>
>> Another purpose of this feature is: management app(for example:
>> libvirt) can do auto dump when the guest is panicked. If management
>> app does not do auto dump, the guest's user can do dump by hand if
>> he sees the guest is panicked.
>>
>> We have three solutions to implement this feature:
>> 1. use vmcall
>> 2. use I/O port
>> 3. use virtio-serial.
>>
>> We have decided to avoid touching hypervisor. The reason why I choose
>> choose the I/O port is:
>> 1. it is easier to implememt
>> 2. it does not depend any virtual device
>> 3. it can work when startint the kernel
>>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>> ---
>>  arch/ia64/include/asm/kvm_para.h    |    5 +++++
>>  arch/powerpc/include/asm/kvm_para.h |    5 +++++
>>  arch/s390/include/asm/kvm_para.h    |    5 +++++
>>  arch/x86/include/asm/kvm_para.h     |    7 +++++++
>>  arch/x86/kernel/kvm.c               |   14 ++++++++++++++
>>  include/linux/kvm_para.h            |   13 +++++++++++++
>>  6 files changed, 49 insertions(+), 0 deletions(-)
>>
>> diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h
>> index 2019cb9..187c0e2 100644
>> --- a/arch/ia64/include/asm/kvm_para.h
>> +++ b/arch/ia64/include/asm/kvm_para.h
>> @@ -31,6 +31,11 @@ static inline bool kvm_check_and_clear_guest_paused(void)
>>  	return false;
>>  }
>>  
>> +static inline unsigned int kvm_arch_pv_features(void)
>> +{
>> +	return 0;
>> +}
>> +
>>  #endif
>>  
>>  #endif
>> diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
>> index c18916b..be81aac 100644
>> --- a/arch/powerpc/include/asm/kvm_para.h
>> +++ b/arch/powerpc/include/asm/kvm_para.h
>> @@ -211,6 +211,11 @@ static inline bool kvm_check_and_clear_guest_paused(void)
>>  	return false;
>>  }
>>  
>> +static inline unsigned int kvm_arch_pv_features(void)
>> +{
>> +	return 0;
>> +}
>> +
>>  #endif /* __KERNEL__ */
>>  
>>  #endif /* __POWERPC_KVM_PARA_H__ */
>> diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
>> index a988329..3d993b7 100644
>> --- a/arch/s390/include/asm/kvm_para.h
>> +++ b/arch/s390/include/asm/kvm_para.h
>> @@ -154,6 +154,11 @@ static inline bool kvm_check_and_clear_guest_paused(void)
>>  	return false;
>>  }
>>  
>> +static inline unsigned int kvm_arch_pv_features(void)
>> +{
>> +	return 0;
>> +}
>> +
>>  #endif
>>  
>>  #endif /* __S390_KVM_PARA_H */
>> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
>> index 63ab166..c8ad86e 100644
>> --- a/arch/x86/include/asm/kvm_para.h
>> +++ b/arch/x86/include/asm/kvm_para.h
>> @@ -89,6 +89,8 @@ struct kvm_vcpu_pv_apf_data {
>>  	__u32 enabled;
>>  };
>>  
>> +#define KVM_PV_PORT	(0x505UL)
>> +
>>  #ifdef __KERNEL__
>>  #include <asm/processor.h>
>>  
>> @@ -221,6 +223,11 @@ static inline void kvm_disable_steal_time(void)
>>  }
>>  #endif
>>  
>> +static inline unsigned int kvm_arch_pv_features(void)
>> +{
>> +	return inl(KVM_PV_PORT);
>> +}
>> +
>>  #endif /* __KERNEL__ */
>>  
>>  #endif /* _ASM_X86_KVM_PARA_H */
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index e554e5a..9a97f7e 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -328,6 +328,17 @@ static struct notifier_block kvm_pv_reboot_nb = {
>>  	.notifier_call = kvm_pv_reboot_notify,
>>  };
>>  
>> +static int
>> +kvm_pv_panic_notify(struct notifier_block *nb, unsigned long code, void *unused)
>> +{
>> +	outl(KVM_PV_PANICKED, KVM_PV_PORT);
>> +	return NOTIFY_DONE;
>> +}
>> +
>> +static struct notifier_block kvm_pv_panic_nb = {
>> +	.notifier_call = kvm_pv_panic_notify,
>> +};
>> +
>>  static u64 kvm_steal_clock(int cpu)
>>  {
>>  	u64 steal;
>> @@ -414,6 +425,9 @@ void __init kvm_guest_init(void)
>>  
>>  	paravirt_ops_setup();
>>  	register_reboot_notifier(&kvm_pv_reboot_nb);
>> +	if (kvm_pv_has_feature(KVM_PV_FEATURE_PANICKED))
>> +		atomic_notifier_chain_register(&panic_notifier_list,
>> +			&kvm_pv_panic_nb);
>>  	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
>>  		spin_lock_init(&async_pf_sleepers[i].lock);
>>  	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
>> diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
>> index ff476dd..e73efcf 100644
>> --- a/include/linux/kvm_para.h
>> +++ b/include/linux/kvm_para.h
>> @@ -20,6 +20,12 @@
>>  #define KVM_HC_FEATURES			3
>>  #define KVM_HC_PPC_MAP_MAGIC_PAGE	4
>>  
>> +/* The bit of the value read from KVM_PV_PORT */
>> +#define KVM_PV_FEATURE_PANICKED	0
>> +
>> +/* The value writen to KVM_PV_PORT */
>> +#define KVM_PV_PANICKED		1
>> +
>>  /*
>>   * hypercalls use architecture specific
>>   */
>> @@ -33,5 +39,12 @@ static inline int kvm_para_has_feature(unsigned int feature)
>>  		return 1;
>>  	return 0;
>>  }
>> +
>> +static inline int kvm_pv_has_feature(unsigned int feature)
>> +{
>> +	if (kvm_arch_pv_features() & (1UL << feature))
> 
> Reading from an invalid I/O port will return -1. So your test will
> deliver a wrong result on a platform that doesn't support this PV channel.

Yes, you are right. I will update it.

Thanks
Wen Congyang

> 
> Jan
> 
>> +		return 1;
>> +	return 0;
>> +}
>>  #endif /* __KERNEL__ */
>>  #endif /* __LINUX_KVM_PARA_H */
>>
> 
> 
> 
>
Sasha Levin July 21, 2012, 10:50 a.m. UTC | #3
On 07/21/2012 09:12 AM, Wen Congyang wrote:
> +#define KVM_PV_PORT	(0x505UL)
> +
>  #ifdef __KERNEL__
>  #include <asm/processor.h>
>  
> @@ -221,6 +223,11 @@ static inline void kvm_disable_steal_time(void)
>  }
>  #endif
>  
> +static inline unsigned int kvm_arch_pv_features(void)
> +{
> +	return inl(KVM_PV_PORT);
> +}
> +

Why is this safe?

I'm not sure you can just pick any ioport you'd like and use it.
Anthony Liguori July 22, 2012, 7:22 p.m. UTC | #4
Sasha Levin <levinsasha928@gmail.com> writes:

> On 07/21/2012 09:12 AM, Wen Congyang wrote:
>> +#define KVM_PV_PORT	(0x505UL)
>> +
>>  #ifdef __KERNEL__
>>  #include <asm/processor.h>
>>  
>> @@ -221,6 +223,11 @@ static inline void kvm_disable_steal_time(void)
>>  }
>>  #endif
>>  
>> +static inline unsigned int kvm_arch_pv_features(void)
>> +{
>> +	return inl(KVM_PV_PORT);
>> +}
>> +
>
> Why is this safe?
>
> I'm not sure you can just pick any ioport you'd like and use it.

There are three ways I/O ports get used on a PC:

1) Platform devices
 - This is well defined since the vast majority of platform devices are
   implemented within a single chip.  If you're emulating an i440fx
   chipset, the PIIX4 spec has an exhaustive list.

2) PCI devices
 - Typically, PCI only allocates ports starting at 0x0d00 to avoid
   conflicts with ISA devices.

3) ISA devices
 - ISA uses subtractive decoding so any ISA device can access.  In
   theory, an ISA device could attempt to use port 0x0505 but it's
   unlikely.  In a modern guest, there aren't really any ISA devices being
   added either.

So yes, picking port 0x0505 is safe for something like this (as long as
you check to make sure that you really are under KVM).

Regards,

Anthony Liguori

> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sasha Levin July 22, 2012, 8:19 p.m. UTC | #5
On 07/22/2012 09:22 PM, Anthony Liguori wrote:
> Sasha Levin <levinsasha928@gmail.com> writes:
> 
>> On 07/21/2012 09:12 AM, Wen Congyang wrote:
>>> +#define KVM_PV_PORT	(0x505UL)
>>> +
>>>  #ifdef __KERNEL__
>>>  #include <asm/processor.h>
>>>  
>>> @@ -221,6 +223,11 @@ static inline void kvm_disable_steal_time(void)
>>>  }
>>>  #endif
>>>  
>>> +static inline unsigned int kvm_arch_pv_features(void)
>>> +{
>>> +	return inl(KVM_PV_PORT);
>>> +}
>>> +
>>
>> Why is this safe?
>>
>> I'm not sure you can just pick any ioport you'd like and use it.
> 
> There are three ways I/O ports get used on a PC:
> 
> 1) Platform devices
>  - This is well defined since the vast majority of platform devices are
>    implemented within a single chip.  If you're emulating an i440fx
>    chipset, the PIIX4 spec has an exhaustive list.
> 
> 2) PCI devices
>  - Typically, PCI only allocates ports starting at 0x0d00 to avoid
>    conflicts with ISA devices.
> 
> 3) ISA devices
>  - ISA uses subtractive decoding so any ISA device can access.  In
>    theory, an ISA device could attempt to use port 0x0505 but it's
>    unlikely.  In a modern guest, there aren't really any ISA devices being
>    added either.
> 
> So yes, picking port 0x0505 is safe for something like this (as long as
> you check to make sure that you really are under KVM).

Is there anything that actually prevents me from using PCI ports lower than 0x0d00? As you said in (3), ISA isn't really used anymore (nor is implemented by lkvm for example), so placing PCI below 0x0d00 might even make sense in that case.

Furthermore, I can place one of these brand new virtio-mmio devices which got introduced recently wherever I want right now - Having a device that uses 0x505 would cause a pretty non-obvious failure mode.

Either way, If we are going to grab an ioport, then:

 - It should be documented well somewhere in Documentation/virt/kvm
 - It should go through request_region() to actually claim those ioports.
 - It should fail gracefully if that port is taken for some reason, instead of not even checking it.
Sasha Levin July 22, 2012, 8:31 p.m. UTC | #6
On 07/22/2012 10:19 PM, Sasha Levin wrote:
> On 07/22/2012 09:22 PM, Anthony Liguori wrote:
>> Sasha Levin <levinsasha928@gmail.com> writes:
>>
>>> On 07/21/2012 09:12 AM, Wen Congyang wrote:
>>>> +#define KVM_PV_PORT	(0x505UL)
>>>> +
>>>>  #ifdef __KERNEL__
>>>>  #include <asm/processor.h>
>>>>  
>>>> @@ -221,6 +223,11 @@ static inline void kvm_disable_steal_time(void)
>>>>  }
>>>>  #endif
>>>>  
>>>> +static inline unsigned int kvm_arch_pv_features(void)
>>>> +{
>>>> +	return inl(KVM_PV_PORT);
>>>> +}
>>>> +
>>>
>>> Why is this safe?
>>>
>>> I'm not sure you can just pick any ioport you'd like and use it.
>>
>> There are three ways I/O ports get used on a PC:
>>
>> 1) Platform devices
>>  - This is well defined since the vast majority of platform devices are
>>    implemented within a single chip.  If you're emulating an i440fx
>>    chipset, the PIIX4 spec has an exhaustive list.
>>
>> 2) PCI devices
>>  - Typically, PCI only allocates ports starting at 0x0d00 to avoid
>>    conflicts with ISA devices.
>>
>> 3) ISA devices
>>  - ISA uses subtractive decoding so any ISA device can access.  In
>>    theory, an ISA device could attempt to use port 0x0505 but it's
>>    unlikely.  In a modern guest, there aren't really any ISA devices being
>>    added either.
>>
>> So yes, picking port 0x0505 is safe for something like this (as long as
>> you check to make sure that you really are under KVM).
> 
> Is there anything that actually prevents me from using PCI ports lower than 0x0d00? As you said in (3), ISA isn't really used anymore (nor is implemented by lkvm for example), so placing PCI below 0x0d00 might even make sense in that case.
> 
> Furthermore, I can place one of these brand new virtio-mmio devices which got introduced recently wherever I want right now - Having a device that uses 0x505 would cause a pretty non-obvious failure mode.
> 
> Either way, If we are going to grab an ioport, then:
> 
>  - It should be documented well somewhere in Documentation/virt/kvm
>  - It should go through request_region() to actually claim those ioports.
>  - It should fail gracefully if that port is taken for some reason, instead of not even checking it.
> 

Out of curiosity I tested that, and apparently lkvm has no problem allocating virtio-pci devices in that range:

sh-4.2# pwd
/sys/devices/pci0000:00/0000:00:01.0
sh-4.2# cat resource | head -n1
0x0000000000000500 0x00000000000005ff 0x0000000000040101

This was with the commit in question applied.
Anthony Liguori July 22, 2012, 10:20 p.m. UTC | #7
Sasha Levin <levinsasha928@gmail.com> writes:

> On 07/22/2012 09:22 PM, Anthony Liguori wrote:
>> Sasha Levin <levinsasha928@gmail.com> writes:
>> 
>>> On 07/21/2012 09:12 AM, Wen Congyang wrote:
>>>> +#define KVM_PV_PORT	(0x505UL)
>>>> +
>>>>  #ifdef __KERNEL__
>>>>  #include <asm/processor.h>
>>>>  
>>>> @@ -221,6 +223,11 @@ static inline void kvm_disable_steal_time(void)
>>>>  }
>>>>  #endif
>>>>  
>>>> +static inline unsigned int kvm_arch_pv_features(void)
>>>> +{
>>>> +	return inl(KVM_PV_PORT);
>>>> +}
>>>> +
>>>
>>> Why is this safe?
>>>
>>> I'm not sure you can just pick any ioport you'd like and use it.
>> 
>> There are three ways I/O ports get used on a PC:
>> 
>> 1) Platform devices
>>  - This is well defined since the vast majority of platform devices are
>>    implemented within a single chip.  If you're emulating an i440fx
>>    chipset, the PIIX4 spec has an exhaustive list.
>> 
>> 2) PCI devices
>>  - Typically, PCI only allocates ports starting at 0x0d00 to avoid
>>    conflicts with ISA devices.
>> 
>> 3) ISA devices
>>  - ISA uses subtractive decoding so any ISA device can access.  In
>>    theory, an ISA device could attempt to use port 0x0505 but it's
>>    unlikely.  In a modern guest, there aren't really any ISA devices being
>>    added either.
>> 
>> So yes, picking port 0x0505 is safe for something like this (as long as
>> you check to make sure that you really are under KVM).
>
> Is there anything that actually prevents me from using PCI ports lower
> than 0x0d00? As you said in (3), ISA isn't really used anymore (nor is
> implemented by lkvm for example), so placing PCI below 0x0d00 might
> even make sense in that case.

On modern systems, the OS goes by whatever is in the ACPI table
describing the PCI bus.  In QEMU, we have:

    WordIO (ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange,
          0x0000,             // Address Space Granularity
          0x0D00,             // Address Range Minimum
          0xFFFF,             // Address Range Maximum
          0x0000,             // Address Translation Offset
          0xF300,             // Address Length
          ,, , TypeStatic)

So Linux will always use 0x0D00 -> 0xFFFF for the valid
range. Practically speaking, you can't use anything below 0x0D00 because
the PCI bus configuration registers live at 0xCF8-0xCFF.  If you tried
to create the region starting at 0x0500 you'd have to limit it to 0xCF8
to avoid conflicting with the PCI host controller.

That's not a useful amount of space for I/O ports so that would be a
pretty dumb thing to do.

> Furthermore, I can place one of these brand new virtio-mmio devices
> which got introduced recently wherever I want right now - Having a
> device that uses 0x505 would cause a pretty non-obvious failure mode.

I think you're confusing PIO with MMIO.  They are separate address
spaces.

You could certainly argue that relying on PIO is way too architecture
specific since that's only available on x86.  That's a good argument but
the counter is that other architectures have their own interfaces for
this sort of thing.

> Either way, If we are going to grab an ioport, then:
>
>  - It should be documented well somewhere in Documentation/virt/kvm
>  - It should go through request_region() to actually claim those ioports.
>  - It should fail gracefully if that port is taken for some reason,
>  instead of not even checking it.

I agree with the above.

Regards,

Anthony Liguori
Anthony Liguori July 22, 2012, 10:29 p.m. UTC | #8
Sasha Levin <levinsasha928@gmail.com> writes:

> On 07/22/2012 10:19 PM, Sasha Levin wrote:
>> On 07/22/2012 09:22 PM, Anthony Liguori wrote:
>>> Sasha Levin <levinsasha928@gmail.com> writes:
>>>
>>>> On 07/21/2012 09:12 AM, Wen Congyang wrote:
>>>>> +#define KVM_PV_PORT	(0x505UL)
>>>>> +
>>>>>  #ifdef __KERNEL__
>>>>>  #include <asm/processor.h>
>>>>>  
>>>>> @@ -221,6 +223,11 @@ static inline void kvm_disable_steal_time(void)
>>>>>  }
>>>>>  #endif
>>>>>  
>>>>> +static inline unsigned int kvm_arch_pv_features(void)
>>>>> +{
>>>>> +	return inl(KVM_PV_PORT);
>>>>> +}
>>>>> +
>>>>
>>>> Why is this safe?
>>>>
>>>> I'm not sure you can just pick any ioport you'd like and use it.
>>>
>>> There are three ways I/O ports get used on a PC:
>>>
>>> 1) Platform devices
>>>  - This is well defined since the vast majority of platform devices are
>>>    implemented within a single chip.  If you're emulating an i440fx
>>>    chipset, the PIIX4 spec has an exhaustive list.
>>>
>>> 2) PCI devices
>>>  - Typically, PCI only allocates ports starting at 0x0d00 to avoid
>>>    conflicts with ISA devices.
>>>
>>> 3) ISA devices
>>>  - ISA uses subtractive decoding so any ISA device can access.  In
>>>    theory, an ISA device could attempt to use port 0x0505 but it's
>>>    unlikely.  In a modern guest, there aren't really any ISA devices being
>>>    added either.
>>>
>>> So yes, picking port 0x0505 is safe for something like this (as long as
>>> you check to make sure that you really are under KVM).
>> 
>> Is there anything that actually prevents me from using PCI ports lower than 0x0d00? As you said in (3), ISA isn't really used anymore (nor is implemented by lkvm for example), so placing PCI below 0x0d00 might even make sense in that case.
>> 
>> Furthermore, I can place one of these brand new virtio-mmio devices which got introduced recently wherever I want right now - Having a device that uses 0x505 would cause a pretty non-obvious failure mode.
>> 
>> Either way, If we are going to grab an ioport, then:
>> 
>>  - It should be documented well somewhere in Documentation/virt/kvm
>>  - It should go through request_region() to actually claim those ioports.
>>  - It should fail gracefully if that port is taken for some reason, instead of not even checking it.
>> 
>
> Out of curiosity I tested that, and apparently lkvm has no problem allocating virtio-pci devices in that range:
>
> sh-4.2# pwd
> /sys/devices/pci0000:00/0000:00:01.0
> sh-4.2# cat resource | head -n1
> 0x0000000000000500 0x00000000000005ff 0x0000000000040101
>
> This was with the commit in question applied.

With all due respect, lkvm has a half-baked implementation of PCI.  This
is why you have to pass kernel parameters to disable ACPI and disable
PCI BIOS probing.

So yeah, you can do funky things in lkvm but that doesn't mean a system
that emulated actual hardware would ever do that.

Regards,

Anthony Liguori
Sasha Levin July 22, 2012, 11:35 p.m. UTC | #9
On 07/23/2012 12:29 AM, Anthony Liguori wrote:
> Sasha Levin <levinsasha928@gmail.com> writes:
> 
>> On 07/22/2012 10:19 PM, Sasha Levin wrote:
>>> On 07/22/2012 09:22 PM, Anthony Liguori wrote:
>>>> Sasha Levin <levinsasha928@gmail.com> writes:
>>>>
>>>>> On 07/21/2012 09:12 AM, Wen Congyang wrote:
>>>>>> +#define KVM_PV_PORT	(0x505UL)
>>>>>> +
>>>>>>  #ifdef __KERNEL__
>>>>>>  #include <asm/processor.h>
>>>>>>  
>>>>>> @@ -221,6 +223,11 @@ static inline void kvm_disable_steal_time(void)
>>>>>>  }
>>>>>>  #endif
>>>>>>  
>>>>>> +static inline unsigned int kvm_arch_pv_features(void)
>>>>>> +{
>>>>>> +	return inl(KVM_PV_PORT);
>>>>>> +}
>>>>>> +
>>>>>
>>>>> Why is this safe?
>>>>>
>>>>> I'm not sure you can just pick any ioport you'd like and use it.
>>>>
>>>> There are three ways I/O ports get used on a PC:
>>>>
>>>> 1) Platform devices
>>>>  - This is well defined since the vast majority of platform devices are
>>>>    implemented within a single chip.  If you're emulating an i440fx
>>>>    chipset, the PIIX4 spec has an exhaustive list.
>>>>
>>>> 2) PCI devices
>>>>  - Typically, PCI only allocates ports starting at 0x0d00 to avoid
>>>>    conflicts with ISA devices.
>>>>
>>>> 3) ISA devices
>>>>  - ISA uses subtractive decoding so any ISA device can access.  In
>>>>    theory, an ISA device could attempt to use port 0x0505 but it's
>>>>    unlikely.  In a modern guest, there aren't really any ISA devices being
>>>>    added either.
>>>>
>>>> So yes, picking port 0x0505 is safe for something like this (as long as
>>>> you check to make sure that you really are under KVM).
>>>
>>> Is there anything that actually prevents me from using PCI ports lower than 0x0d00? As you said in (3), ISA isn't really used anymore (nor is implemented by lkvm for example), so placing PCI below 0x0d00 might even make sense in that case.
>>>
>>> Furthermore, I can place one of these brand new virtio-mmio devices which got introduced recently wherever I want right now - Having a device that uses 0x505 would cause a pretty non-obvious failure mode.
>>>
>>> Either way, If we are going to grab an ioport, then:
>>>
>>>  - It should be documented well somewhere in Documentation/virt/kvm
>>>  - It should go through request_region() to actually claim those ioports.
>>>  - It should fail gracefully if that port is taken for some reason, instead of not even checking it.
>>>
>>
>> Out of curiosity I tested that, and apparently lkvm has no problem allocating virtio-pci devices in that range:
>>
>> sh-4.2# pwd
>> /sys/devices/pci0000:00/0000:00:01.0
>> sh-4.2# cat resource | head -n1
>> 0x0000000000000500 0x00000000000005ff 0x0000000000040101
>>
>> This was with the commit in question applied.
> 
> With all due respect, lkvm has a half-baked implementation of PCI.  This
> is why you have to pass kernel parameters to disable ACPI and disable
> PCI BIOS probing.
> 
> So yeah, you can do funky things in lkvm but that doesn't mean a system
> that emulated actual hardware would ever do that.

We disable ACPI simply because we don't support it. MPtable is a perfectly valid mechanism to do everything we need so far, so implementing ACPI didn't interest either of us too much. What's more - why implement a "complete design disaster in every way" ;)

Regarding PCI probing, while we do force the use of direct memory probing this is because we lack anything which reassembles a BIOS. Like the above, this wasn't too interesting in a virtualized environment, and the kernel is pretty happy running without it. PCI probing does happen in a standard way.

I think that the interesting part in that test was not that you could actually put a PCI device in the 0x500 range, but that nothing failed and no one yelled at me (with the panic commit applied).

I'm not worried about port 0x505 being taken, I'm worried that it'll silently break a (although not very common/reasonable/typical) perfectly valid use case.
Wen Congyang July 23, 2012, 6:27 a.m. UTC | #10
At 07/23/2012 04:19 AM, Sasha Levin Wrote:
> On 07/22/2012 09:22 PM, Anthony Liguori wrote:
>> Sasha Levin <levinsasha928@gmail.com> writes:
>>
>>> On 07/21/2012 09:12 AM, Wen Congyang wrote:
>>>> +#define KVM_PV_PORT	(0x505UL)
>>>> +
>>>>  #ifdef __KERNEL__
>>>>  #include <asm/processor.h>
>>>>  
>>>> @@ -221,6 +223,11 @@ static inline void kvm_disable_steal_time(void)
>>>>  }
>>>>  #endif
>>>>  
>>>> +static inline unsigned int kvm_arch_pv_features(void)
>>>> +{
>>>> +	return inl(KVM_PV_PORT);
>>>> +}
>>>> +
>>>
>>> Why is this safe?
>>>
>>> I'm not sure you can just pick any ioport you'd like and use it.
>>
>> There are three ways I/O ports get used on a PC:
>>
>> 1) Platform devices
>>  - This is well defined since the vast majority of platform devices are
>>    implemented within a single chip.  If you're emulating an i440fx
>>    chipset, the PIIX4 spec has an exhaustive list.
>>
>> 2) PCI devices
>>  - Typically, PCI only allocates ports starting at 0x0d00 to avoid
>>    conflicts with ISA devices.
>>
>> 3) ISA devices
>>  - ISA uses subtractive decoding so any ISA device can access.  In
>>    theory, an ISA device could attempt to use port 0x0505 but it's
>>    unlikely.  In a modern guest, there aren't really any ISA devices being
>>    added either.
>>
>> So yes, picking port 0x0505 is safe for something like this (as long as
>> you check to make sure that you really are under KVM).
> 
> Is there anything that actually prevents me from using PCI ports lower than 0x0d00? As you said in (3), ISA isn't really used anymore (nor is implemented by lkvm for example), so placing PCI below 0x0d00 might even make sense in that case.
> 
> Furthermore, I can place one of these brand new virtio-mmio devices which got introduced recently wherever I want right now - Having a device that uses 0x505 would cause a pretty non-obvious failure mode.
> 
> Either way, If we are going to grab an ioport, then:
> 
>  - It should be documented well somewhere in Documentation/virt/kvm
>  - It should go through request_region() to actually claim those ioports.

Good idea.

>  - It should fail gracefully if that port is taken for some reason, instead of not even checking it.

Yes, I agree it.

I will update it.

Thanks
Wen Congyang

> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>
diff mbox

Patch

diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h
index 2019cb9..187c0e2 100644
--- a/arch/ia64/include/asm/kvm_para.h
+++ b/arch/ia64/include/asm/kvm_para.h
@@ -31,6 +31,11 @@  static inline bool kvm_check_and_clear_guest_paused(void)
 	return false;
 }
 
+static inline unsigned int kvm_arch_pv_features(void)
+{
+	return 0;
+}
+
 #endif
 
 #endif
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index c18916b..be81aac 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -211,6 +211,11 @@  static inline bool kvm_check_and_clear_guest_paused(void)
 	return false;
 }
 
+static inline unsigned int kvm_arch_pv_features(void)
+{
+	return 0;
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* __POWERPC_KVM_PARA_H__ */
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
index a988329..3d993b7 100644
--- a/arch/s390/include/asm/kvm_para.h
+++ b/arch/s390/include/asm/kvm_para.h
@@ -154,6 +154,11 @@  static inline bool kvm_check_and_clear_guest_paused(void)
 	return false;
 }
 
+static inline unsigned int kvm_arch_pv_features(void)
+{
+	return 0;
+}
+
 #endif
 
 #endif /* __S390_KVM_PARA_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 63ab166..c8ad86e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -89,6 +89,8 @@  struct kvm_vcpu_pv_apf_data {
 	__u32 enabled;
 };
 
+#define KVM_PV_PORT	(0x505UL)
+
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
@@ -221,6 +223,11 @@  static inline void kvm_disable_steal_time(void)
 }
 #endif
 
+static inline unsigned int kvm_arch_pv_features(void)
+{
+	return inl(KVM_PV_PORT);
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e554e5a..9a97f7e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -328,6 +328,17 @@  static struct notifier_block kvm_pv_reboot_nb = {
 	.notifier_call = kvm_pv_reboot_notify,
 };
 
+static int
+kvm_pv_panic_notify(struct notifier_block *nb, unsigned long code, void *unused)
+{
+	outl(KVM_PV_PANICKED, KVM_PV_PORT);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_panic_nb = {
+	.notifier_call = kvm_pv_panic_notify,
+};
+
 static u64 kvm_steal_clock(int cpu)
 {
 	u64 steal;
@@ -414,6 +425,9 @@  void __init kvm_guest_init(void)
 
 	paravirt_ops_setup();
 	register_reboot_notifier(&kvm_pv_reboot_nb);
+	if (kvm_pv_has_feature(KVM_PV_FEATURE_PANICKED))
+		atomic_notifier_chain_register(&panic_notifier_list,
+			&kvm_pv_panic_nb);
 	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
 		spin_lock_init(&async_pf_sleepers[i].lock);
 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index ff476dd..e73efcf 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -20,6 +20,12 @@ 
 #define KVM_HC_FEATURES			3
 #define KVM_HC_PPC_MAP_MAGIC_PAGE	4
 
+/* The bit of the value read from KVM_PV_PORT */
+#define KVM_PV_FEATURE_PANICKED	0
+
+/* The value writen to KVM_PV_PORT */
+#define KVM_PV_PANICKED		1
+
 /*
  * hypercalls use architecture specific
  */
@@ -33,5 +39,12 @@  static inline int kvm_para_has_feature(unsigned int feature)
 		return 1;
 	return 0;
 }
+
+static inline int kvm_pv_has_feature(unsigned int feature)
+{
+	if (kvm_arch_pv_features() & (1UL << feature))
+		return 1;
+	return 0;
+}
 #endif /* __KERNEL__ */
 #endif /* __LINUX_KVM_PARA_H */