diff mbox

Re: [PATCH 0/9] S390x KVM support

Message ID 4ADDE7E3.9090601@de.ibm.com
State New
Headers show

Commit Message

Carsten Otte Oct. 20, 2009, 4:40 p.m. UTC
Alexander Graf wrote:
> This is the resulting code. Please comment on things you like and also on the
> ones you don't :-).
I've reviewed and tested it, great work Alex :-)

> Also to actually run this code you need a patch for an ugly bug in the kernel
> module: http://alex.csgraf.de/psw.patch
Well, it was not exactly a bug. Kuli does'nt need the psw to be updated 
in kvm_run at all times. Your hotfix updates the psw in a union, even if 
the exit reason indicates that s390_sieic is not valid. The patch at the 
end of this mail moves the PSW out of the union. I think it makes most 
sense if Avi picks this patch and you adopt your series to it. This way 
the user interface of the kvm module works for both kuli and qemu.







This patch moves s390 processor status word into the base kvm_run
struct and keeps it up-to date on all userspace exits.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c |   14 ++++++++------
 include/linux/kvm.h      |    8 ++++++--
 2 files changed, 14 insertions(+), 8 deletions(-)

Comments

Alexander Graf Oct. 21, 2009, 7:20 a.m. UTC | #1
On 20.10.2009, at 18:40, Carsten Otte wrote:

> Alexander Graf wrote:
>> This is the resulting code. Please comment on things you like and  
>> also on the
>> ones you don't :-).
> I've reviewed and tested it, great work Alex :-)

Thanks :-).

>> Also to actually run this code you need a patch for an ugly bug in  
>> the kernel
>> module: http://alex.csgraf.de/psw.patch
> Well, it was not exactly a bug. Kuli does'nt need the psw to be  
> updated in kvm_run at all times. Your hotfix updates the psw in a  
> union, even if the exit reason indicates that s390_sieic is not  
> valid. The patch at the end of this mail moves the PSW out of the  
> union. I think it makes most sense if Avi picks this patch and you  
> adopt your series to it. This way the user interface of the kvm  
> module works for both kuli and qemu.

Alright, expect v2 today!

On a sidenode, please post the patch to kvm-devel too, so Marcelo has  
the chance to pick it up.


> This patch moves s390 processor status word into the base kvm_run
> struct and keeps it up-to date on all userspace exits.
>
> Signed-off-by: Carsten Otte <cotte@de.ibm.com>
> ---
> arch/s390/kvm/kvm-s390.c |   14 ++++++++------
> include/linux/kvm.h      |    8 ++++++--
> 2 files changed, 14 insertions(+), 8 deletions(-)
>
> Index: kvm/include/linux/kvm.h
> ===================================================================
> --- kvm.orig/include/linux/kvm.h	2009-10-20 15:00:40.000000000 +0200
> +++ kvm/include/linux/kvm.h	2009-10-20 16:51:48.000000000 +0200
> @@ -7,6 +7,7 @@
> * Note: you must update KVM_API_VERSION if you change this interface.
> */
> +#include <linux/autoconf.h>
> #include <linux/types.h>
> #include <linux/compiler.h>
> #include <linux/ioctl.h>
> @@ -116,6 +117,11 @@
> 	__u64 cr8;
> 	__u64 apic_base;
> +#ifdef CONFIG_S390
> +	/* the processor status word for s390 */
> +	__u64 psw_mask; /* psw upper half */
> +	__u64 psw_addr; /* psw lower half */
> +#endif
> 	union {
> 		/* KVM_EXIT_UNKNOWN */
> 		struct {
> @@ -167,8 +173,6 @@
> 		/* KVM_EXIT_S390_SIEIC */
> 		struct {
> 			__u8 icptcode;
> -			__u64 mask; /* psw upper half */
> -			__u64 addr; /* psw lower half */
> 			__u16 ipa;
> 			__u32 ipb;
> 		} s390_sieic;
> Index: kvm/arch/s390/kvm/kvm-s390.c
> ===================================================================
> --- kvm.orig/arch/s390/kvm/kvm-s390.c	2009-10-20 15:01:02.000000000  
> +0200
> +++ kvm/arch/s390/kvm/kvm-s390.c	2009-10-20 18:13:45.000000000 +0200
> @@ -421,7 +421,8 @@
> 	if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
> 		rc = -EBUSY;
> 	else
> -		vcpu->arch.sie_block->gpsw = psw;
> +		vcpu->run->psw_mask = psw.mask;
> +		vcpu->run->psw_addr = psw.addr;
> 	vcpu_put(vcpu);
> 	return rc;
> }
> @@ -509,9 +510,6 @@
> 	switch (kvm_run->exit_reason) {
> 	case KVM_EXIT_S390_SIEIC:
> -		vcpu->arch.sie_block->gpsw.mask = kvm_run->s390_sieic.mask;
> -		vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr;
> -		break;
> 	case KVM_EXIT_UNKNOWN:
> 	case KVM_EXIT_INTR:
> 	case KVM_EXIT_S390_RESET:
> @@ -520,6 +518,9 @@
> 		BUG();
> 	}

Why leave the switch? Do you want the BUG() that badly?

Most of the time that's just wasted cycles, right?

> +	vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
> +	vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
> +
> 	might_fault();
> 	do {
> @@ -539,8 +540,6 @@
> 		/* intercept cannot be handled in-kernel, prepare kvm-run */
> 		kvm_run->exit_reason         = KVM_EXIT_S390_SIEIC;
> 		kvm_run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode;
> -		kvm_run->s390_sieic.mask     = vcpu->arch.sie_block->gpsw.mask;
> -		kvm_run->s390_sieic.addr     = vcpu->arch.sie_block->gpsw.addr;
> 		kvm_run->s390_sieic.ipa      = vcpu->arch.sie_block->ipa;
> 		kvm_run->s390_sieic.ipb      = vcpu->arch.sie_block->ipb;
> 		rc = 0;
> @@ -552,6 +551,9 @@
> 		rc = 0;
> 	}
> +	kvm_run->psw_mask     = vcpu->arch.sie_block->gpsw.mask;
> +	kvm_run->psw_addr     = vcpu->arch.sie_block->gpsw.addr;
> +

I think you should not indent the equal signs here. They're not  
aligned to the ones above anymore anyways.

Apart from that, great patch!
I'll put all the pieces together today, so expect to be running Qemu  
backed virtualization on your mainframe :-).

Alex
Alexander Graf Oct. 21, 2009, 7:24 a.m. UTC | #2
On 20.10.2009, at 18:40, Carsten Otte wrote:

> Alexander Graf wrote:
>> This is the resulting code. Please comment on things you like and  
>> also on the
>> ones you don't :-).
> I've reviewed and tested it, great work Alex :-)
>
>> Also to actually run this code you need a patch for an ugly bug in  
>> the kernel
>> module: http://alex.csgraf.de/psw.patch
> Well, it was not exactly a bug. Kuli does'nt need the psw to be  
> updated in kvm_run at all times. Your hotfix updates the psw in a  
> union, even if the exit reason indicates that s390_sieic is not  
> valid. The patch at the end of this mail moves the PSW out of the  
> union. I think it makes most sense if Avi picks this patch and you  
> adopt your series to it. This way the user interface of the kvm  
> module works for both kuli and qemu.
>
>
>
>
>
>
>
> This patch moves s390 processor status word into the base kvm_run
> struct and keeps it up-to date on all userspace exits.
>
> Signed-off-by: Carsten Otte <cotte@de.ibm.com>
> ---
> arch/s390/kvm/kvm-s390.c |   14 ++++++++------
> include/linux/kvm.h      |    8 ++++++--
> 2 files changed, 14 insertions(+), 8 deletions(-)
>
> Index: kvm/include/linux/kvm.h
> ===================================================================
> --- kvm.orig/include/linux/kvm.h	2009-10-20 15:00:40.000000000 +0200
> +++ kvm/include/linux/kvm.h	2009-10-20 16:51:48.000000000 +0200
> @@ -7,6 +7,7 @@
> * Note: you must update KVM_API_VERSION if you change this interface.
> */
> +#include <linux/autoconf.h>
> #include <linux/types.h>
> #include <linux/compiler.h>
> #include <linux/ioctl.h>
> @@ -116,6 +117,11 @@
> 	__u64 cr8;
> 	__u64 apic_base;
> +#ifdef CONFIG_S390
> +	/* the processor status word for s390 */
> +	__u64 psw_mask; /* psw upper half */
> +	__u64 psw_addr; /* psw lower half */
> +#endif

While at it, I think we should either go all or nothing here.

Either we take the psw in on x86 too or we #ifdef out the x86 specific  
stuff on s390.

Alex
Alexander Graf Oct. 21, 2009, 7:26 a.m. UTC | #3
On 20.10.2009, at 18:40, Carsten Otte wrote:

> Alexander Graf wrote:
>> This is the resulting code. Please comment on things you like and  
>> also on the
>> ones you don't :-).
> I've reviewed and tested it, great work Alex :-)
>
>> Also to actually run this code you need a patch for an ugly bug in  
>> the kernel
>> module: http://alex.csgraf.de/psw.patch
> Well, it was not exactly a bug. Kuli does'nt need the psw to be  
> updated in kvm_run at all times. Your hotfix updates the psw in a  
> union, even if the exit reason indicates that s390_sieic is not  
> valid. The patch at the end of this mail moves the PSW out of the  
> union. I think it makes most sense if Avi picks this patch and you  
> adopt your series to it. This way the user interface of the kvm  
> module works for both kuli and qemu.
>
>
>
>
>
>
>
> This patch moves s390 processor status word into the base kvm_run
> struct and keeps it up-to date on all userspace exits.
>
> Signed-off-by: Carsten Otte <cotte@de.ibm.com>
> ---
> arch/s390/kvm/kvm-s390.c |   14 ++++++++------
> include/linux/kvm.h      |    8 ++++++--
> 2 files changed, 14 insertions(+), 8 deletions(-)
>
> Index: kvm/include/linux/kvm.h
> ===================================================================
> --- kvm.orig/include/linux/kvm.h	2009-10-20 15:00:40.000000000 +0200
> +++ kvm/include/linux/kvm.h	2009-10-20 16:51:48.000000000 +0200
> @@ -7,6 +7,7 @@
> * Note: you must update KVM_API_VERSION if you change this interface.
> */
> +#include <linux/autoconf.h>
> #include <linux/types.h>
> #include <linux/compiler.h>
> #include <linux/ioctl.h>
> @@ -116,6 +117,11 @@
> 	__u64 cr8;
> 	__u64 apic_base;
> +#ifdef CONFIG_S390
> +	/* the processor status word for s390 */
> +	__u64 psw_mask; /* psw upper half */
> +	__u64 psw_addr; /* psw lower half */
> +#endif
> 	union {
> 		/* KVM_EXIT_UNKNOWN */
> 		struct {
> @@ -167,8 +173,6 @@
> 		/* KVM_EXIT_S390_SIEIC */
> 		struct {
> 			__u8 icptcode;
> -			__u64 mask; /* psw upper half */
> -			__u64 addr; /* psw lower half */
> 			__u16 ipa;
> 			__u32 ipb;
> 		} s390_sieic;
> Index: kvm/arch/s390/kvm/kvm-s390.c
> ===================================================================
> --- kvm.orig/arch/s390/kvm/kvm-s390.c	2009-10-20 15:01:02.000000000  
> +0200
> +++ kvm/arch/s390/kvm/kvm-s390.c	2009-10-20 18:13:45.000000000 +0200
> @@ -421,7 +421,8 @@
> 	if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
> 		rc = -EBUSY;
> 	else
> -		vcpu->arch.sie_block->gpsw = psw;
> +		vcpu->run->psw_mask = psw.mask;
> +		vcpu->run->psw_addr = psw.addr;

Missing braces.

Alex
Carsten Otte Oct. 21, 2009, 8:18 a.m. UTC | #4
Alexander Graf wrote:
> Either we take the psw in on x86 too or we #ifdef out the x86 specific 
> stuff on s390.
Good idea, I'll #ifndef CONFIG_S390 the x86 specifics while we're at it.
Alexander Graf Oct. 21, 2009, 8:23 a.m. UTC | #5
On 21.10.2009, at 10:18, Carsten Otte wrote:

> Alexander Graf wrote:
>> Either we take the psw in on x86 too or we #ifdef out the x86  
>> specific stuff on s390.
> Good idea, I'll #ifndef CONFIG_S390 the x86 specifics while we're at  
> it.

Maybe better #if defined(TARGET_X86) || defined(TARGET_IA64). We don't  
really need them for ppc either :-).

Alex
Avi Kivity Oct. 22, 2009, 9:08 a.m. UTC | #6
On 10/20/2009 06:40 PM, Carsten Otte wrote:
>
> This patch moves s390 processor status word into the base kvm_run
> struct and keeps it up-to date on all userspace exits.
>
> +#include <linux/autoconf.h>
> #include <linux/types.h>
> #include <linux/compiler.h>
> #include <linux/ioctl.h>

Not needed.

> @@ -116,6 +117,11 @@
>     __u64 cr8;
>     __u64 apic_base;
>
> +#ifdef CONFIG_S390
> +    /* the processor status word for s390 */
> +    __u64 psw_mask; /* psw upper half */
> +    __u64 psw_addr; /* psw lower half */
> +#endif

Doesn't this break backward compatibility by changing the structure?

Best to put it after the union (and as a copy, so userspace that expects 
the previous location still works).  If you're reading it from the 
kernel, also need a way to tell the kernel which copy to read from.

Also advertise with a KVM_CAP.

Additionally, CONFIG_ in public headers are frowned upon as 
non-portable.  A workaround is to #define __KVM_S390 in <asm/kvm.h> and 
depend on that.

> --- kvm.orig/arch/s390/kvm/kvm-s390.c    2009-10-20 15:01:02.000000000 
> +0200
> +++ kvm/arch/s390/kvm/kvm-s390.c    2009-10-20 18:13:45.000000000 +0200
> @@ -421,7 +421,8 @@
>     if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
>         rc = -EBUSY;
>     else
> -        vcpu->arch.sie_block->gpsw = psw;
> +        vcpu->run->psw_mask = psw.mask;
> +        vcpu->run->psw_addr = psw.addr;

It's traditional to add braces around multi-line else blocks.

I'd also appreciate an explanation of what this is all about.
Alexander Graf Oct. 22, 2009, 9:11 a.m. UTC | #7
On 22.10.2009, at 11:08, Avi Kivity wrote:

> On 10/20/2009 06:40 PM, Carsten Otte wrote:
>>
>> This patch moves s390 processor status word into the base kvm_run
>> struct and keeps it up-to date on all userspace exits.
>>
>> +#include <linux/autoconf.h>
>> #include <linux/types.h>
>> #include <linux/compiler.h>
>> #include <linux/ioctl.h>
>
> Not needed.
>
>> @@ -116,6 +117,11 @@
>>    __u64 cr8;
>>    __u64 apic_base;
>>
>> +#ifdef CONFIG_S390
>> +    /* the processor status word for s390 */
>> +    __u64 psw_mask; /* psw upper half */
>> +    __u64 psw_addr; /* psw lower half */
>> +#endif
>
> Doesn't this break backward compatibility by changing the structure?
>
> Best to put it after the union (and as a copy, so userspace that  
> expects the previous location still works).  If you're reading it  
> from the kernel, also need a way to tell the kernel which copy to  
> read from.
>
> Also advertise with a KVM_CAP.

I don't think we need to go through the hassle here. There is  
effectively no user of that code for now and the ABI is considered  
unstable.

> Additionally, CONFIG_ in public headers are frowned upon as non- 
> portable.  A workaround is to #define __KVM_S390 in <asm/kvm.h> and  
> depend on that.
>
>> --- kvm.orig/arch/s390/kvm/kvm-s390.c    2009-10-20  
>> 15:01:02.000000000 +0200
>> +++ kvm/arch/s390/kvm/kvm-s390.c    2009-10-20 18:13:45.000000000  
>> +0200
>> @@ -421,7 +421,8 @@
>>    if (atomic_read(&vcpu->arch.sie_block->cpuflags) &  
>> CPUSTAT_RUNNING)
>>        rc = -EBUSY;
>>    else
>> -        vcpu->arch.sie_block->gpsw = psw;
>> +        vcpu->run->psw_mask = psw.mask;
>> +        vcpu->run->psw_addr = psw.addr;
>
> It's traditional to add braces around multi-line else blocks.
>
> I'd also appreciate an explanation of what this is all about.

Explanation in the code or explanation in an email reply?

Alex
Carsten Otte Oct. 22, 2009, 9:18 a.m. UTC | #8
Avi Kivity wrote:
>> @@ -116,6 +117,11 @@
>>     __u64 cr8;
>>     __u64 apic_base;
>>
>> +#ifdef CONFIG_S390
>> +    /* the processor status word for s390 */
>> +    __u64 psw_mask; /* psw upper half */
>> +    __u64 psw_addr; /* psw lower half */
>> +#endif
> 
> Doesn't this break backward compatibility by changing the structure?
Yes, but with a zero user base I think it's okay. I'd update our 
userspace. Once we pull CONFIG_EXPERIMENTAL we keep the API stable.

> Additionally, CONFIG_ in public headers are frowned upon as 
> non-portable.  A workaround is to #define __KVM_S390 in <asm/kvm.h> and 
> depend on that.
Yea, that's better.

>> --- kvm.orig/arch/s390/kvm/kvm-s390.c    2009-10-20 15:01:02.000000000 
>> +0200
>> +++ kvm/arch/s390/kvm/kvm-s390.c    2009-10-20 18:13:45.000000000 +0200
>> @@ -421,7 +421,8 @@
>>     if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
>>         rc = -EBUSY;
>>     else
>> -        vcpu->arch.sie_block->gpsw = psw;
>> +        vcpu->run->psw_mask = psw.mask;
>> +        vcpu->run->psw_addr = psw.addr;
> 
> It's traditional to add braces around multi-line else blocks.
This is a plain bug, will fix.


> I'd also appreciate an explanation of what this is all about.
The processor status word does contain various bits about the CPU's 
state, such as interrupt mask bits, current address space, and the 
current instruction address. The status is kept in the in-kernel sie 
control block data structure and has so far only been mirrored into 
kvm_run during exit_reason == s390_sieic exits because user space needs 
to work on it. It was never part of get_regs/set_regs and friends as 
performance optimization: it's needed on almost every exit, having it in 
kvm_run saves doing syscalls.
The gdb stub requires an up-to-date copy at every exit, and therefore 
the patch moves it out of the union and updates it at all userland exits.
Avi Kivity Oct. 22, 2009, 9:53 a.m. UTC | #9
On 10/22/2009 11:11 AM, Alexander Graf wrote:
>> Doesn't this break backward compatibility by changing the structure?
>>
>> Best to put it after the union (and as a copy, so userspace that 
>> expects the previous location still works).  If you're reading it 
>> from the kernel, also need a way to tell the kernel which copy to 
>> read from.
>>
>> Also advertise with a KVM_CAP.
>
>
> I don't think we need to go through the hassle here. There is 
> effectively no user of that code for now and the ABI is considered 
> unstable.

At the very least we need a KVM_CAP so qemu knows to fail on older kernels.

>>
>> I'd also appreciate an explanation of what this is all about.
>
> Explanation in the code or explanation in an email reply?


email.  I assume s390 hackers would understand why the psw needs to be 
exposed to qemu on every exit.  This is mostly for my personal interest.
Alexander Graf Oct. 22, 2009, 9:55 a.m. UTC | #10
On 22.10.2009, at 11:53, Avi Kivity wrote:

> On 10/22/2009 11:11 AM, Alexander Graf wrote:
>>> Doesn't this break backward compatibility by changing the structure?
>>>
>>> Best to put it after the union (and as a copy, so userspace that  
>>> expects the previous location still works).  If you're reading it  
>>> from the kernel, also need a way to tell the kernel which copy to  
>>> read from.
>>>
>>> Also advertise with a KVM_CAP.
>>
>>
>> I don't think we need to go through the hassle here. There is  
>> effectively no user of that code for now and the ABI is considered  
>> unstable.
>
> At the very least we need a KVM_CAP so qemu knows to fail on older  
> kernels.

Hm. Oh well :-). It can't hurt to have yet another CAP, right?

>>> I'd also appreciate an explanation of what this is all about.
>>
>> Explanation in the code or explanation in an email reply?
>
>
> email.  I assume s390 hackers would understand why the psw needs to  
> be exposed to qemu on every exit.  This is mostly for my personal  
> interest.

PSW = (eflags << 32) | pc;

:-)

Before that patch it was only synced with the "vmcb" on special  
userspace handled intercepts, now it's synced on every exit to  
userspace.

Alex
Alexander Graf Oct. 22, 2009, 9:58 a.m. UTC | #11
On 22.10.2009, at 11:55, Alexander Graf wrote:

> PSW = (eflags << 32) | pc;

Eh - 64 of course. The PSW is 128 bits.

Alex
Avi Kivity Oct. 22, 2009, 10:02 a.m. UTC | #12
On 10/22/2009 11:18 AM, Carsten Otte wrote:
>
>> I'd also appreciate an explanation of what this is all about.
> The processor status word does contain various bits about the CPU's 
> state, such as interrupt mask bits, current address space, and the 
> current instruction address. The status is kept in the in-kernel sie 
> control block data structure and has so far only been mirrored into 
> kvm_run during exit_reason == s390_sieic exits because user space 
> needs to work on it. It was never part of get_regs/set_regs and 
> friends as performance optimization: it's needed on almost every exit, 
> having it in kvm_run saves doing syscalls.
> The gdb stub requires an up-to-date copy at every exit, and therefore 
> the patch moves it out of the union and updates it at all userland exits.

gdb is hardly performance critical.  Is that the only reason for the change?
Avi Kivity Oct. 22, 2009, 10:03 a.m. UTC | #13
On 10/22/2009 11:55 AM, Alexander Graf wrote:
>
> On 22.10.2009, at 11:53, Avi Kivity wrote:
>
>> On 10/22/2009 11:11 AM, Alexander Graf wrote:
>>>> Doesn't this break backward compatibility by changing the structure?
>>>>
>>>> Best to put it after the union (and as a copy, so userspace that 
>>>> expects the previous location still works).  If you're reading it 
>>>> from the kernel, also need a way to tell the kernel which copy to 
>>>> read from.
>>>>
>>>> Also advertise with a KVM_CAP.
>>>
>>>
>>> I don't think we need to go through the hassle here. There is 
>>> effectively no user of that code for now and the ABI is considered 
>>> unstable.
>>
>> At the very least we need a KVM_CAP so qemu knows to fail on older 
>> kernels.
>
> Hm. Oh well :-). It can't hurt to have yet another CAP, right?
>
>>>> I'd also appreciate an explanation of what this is all about.
>>>
>>> Explanation in the code or explanation in an email reply?
>>
>>
>> email.  I assume s390 hackers would understand why the psw needs to 
>> be exposed to qemu on every exit.  This is mostly for my personal 
>> interest.
>
> PSW = (eflags << 32) | pc;
>
> :-)
>
> Before that patch it was only synced with the "vmcb" on special 
> userspace handled intercepts, now it's synced on every exit to userspace.

Right, but why?  x86 qemu doesn't care about either pc or eflags (with 
in-kernel irqchip, which s390 essentially is).
Alexander Graf Oct. 22, 2009, 10:13 a.m. UTC | #14
On 22.10.2009, at 12:03, Avi Kivity wrote:

> On 10/22/2009 11:55 AM, Alexander Graf wrote:
>>
>> On 22.10.2009, at 11:53, Avi Kivity wrote:
>>
>>> On 10/22/2009 11:11 AM, Alexander Graf wrote:
>>>>> Doesn't this break backward compatibility by changing the  
>>>>> structure?
>>>>>
>>>>> Best to put it after the union (and as a copy, so userspace that  
>>>>> expects the previous location still works).  If you're reading  
>>>>> it from the kernel, also need a way to tell the kernel which  
>>>>> copy to read from.
>>>>>
>>>>> Also advertise with a KVM_CAP.
>>>>
>>>>
>>>> I don't think we need to go through the hassle here. There is  
>>>> effectively no user of that code for now and the ABI is  
>>>> considered unstable.
>>>
>>> At the very least we need a KVM_CAP so qemu knows to fail on older  
>>> kernels.
>>
>> Hm. Oh well :-). It can't hurt to have yet another CAP, right?
>>
>>>>> I'd also appreciate an explanation of what this is all about.
>>>>
>>>> Explanation in the code or explanation in an email reply?
>>>
>>>
>>> email.  I assume s390 hackers would understand why the psw needs  
>>> to be exposed to qemu on every exit.  This is mostly for my  
>>> personal interest.
>>
>> PSW = (eflags << 32) | pc;
>>
>> :-)
>>
>> Before that patch it was only synced with the "vmcb" on special  
>> userspace handled intercepts, now it's synced on every exit to  
>> userspace.
>
> Right, but why?  x86 qemu doesn't care about either pc or eflags  
> (with in-kernel irqchip, which s390 essentially is).

The old code would only sync the psw for us on "intercepts that the  
kernel knows require userspace interaction". We'd "x /i $pc" somewhere  
along the way and would get an old psw, but GET_REGS would give us  
recent register states, not including the psw. That's a total nightmare.

To solve the psw nightmare there were basically 2 roads possible to  
take here:

1) Sync the PSW in the GET_REGS ioctl
2) Sync the PSW in kvm_run on all exits to userspace

Otherwise we can't debug guests.

The same goes for setting the PSW. Setting the PSW was only  
implemented when kvm_run->exit_reason == "the kernel thinks userspace  
needs to do something". So If you wanted to change the pc along the  
way, you couldn't.

Furthermore, there was a nasty bug in this whole optimization that  
lead me to think it's better to just get rid of the whole "kernel  
tries to be clever" implementation and just sync the psw at all times.  
Most exits to userspace are of the "userspace needs to do something"  
type anyways, so we're not really losing anything here. Implementing  
2) was just really easy.

So I guess Carsten thought the same way and improved my original patch.


I hope that clarifies things.

Alex
Carsten Otte Oct. 22, 2009, 10:20 a.m. UTC | #15
Avi Kivity wrote:
> gdb is hardly performance critical.  Is that the only reason for the 
> change?
Right, gdb is not performance critical. gdb is the reason for moving it 
out of the union, performance is the reason for having it in kvm_run at all.
There's only more reason to it: with a little tweaking in kuli, we'll be 
able to get rid of KVM_S390_SET_INITIAL_PSW ioctl with this change. That 
removes one oddity that makes us different from other platforms.
Carsten Otte Oct. 22, 2009, 10:22 a.m. UTC | #16
Avi Kivity wrote:
> Right, but why?  x86 qemu doesn't care about either pc or eflags (with 
> in-kernel irqchip, which s390 essentially is).
For different reasons. Most prominent for setting the condition code,
which is a sideband result of most instructions that indicates whether or
not the instruction actually worked - similar to the exception model in
high level programming languages.
Avi Kivity Oct. 22, 2009, 10:28 a.m. UTC | #17
On 10/22/2009 12:22 PM, Carsten Otte wrote:
> Avi Kivity wrote:
>> Right, but why?  x86 qemu doesn't care about either pc or eflags 
>> (with in-kernel irqchip, which s390 essentially is).
> For different reasons. Most prominent for setting the condition code,
> which is a sideband result of most instructions that indicates whether or
> not the instruction actually worked - similar to the exception model in
> high level programming languages.

Ok.  Thanks for the explanation.

On x86 we avoid emulating instructions in userspace.  Instead the kernel 
requests userspace to do something (triggered by the instruction), and 
the kernel does anything which might be implied by the instruction (like 
copying the result into a register, or updating pc).

An example is port I/O.  instead of userspace reading %edx to query the 
port number and setting %eax to indicate the result, userspace reads a 
port number struct field and writes an I/O result struct field.  Only 
the kernel accesses registers.

I don't know whether that model makes sense or not for s390, but please 
consider it.
Avi Kivity Oct. 22, 2009, 10:29 a.m. UTC | #18
On 10/22/2009 12:20 PM, Carsten Otte wrote:
> Avi Kivity wrote:
>> gdb is hardly performance critical.  Is that the only reason for the 
>> change?
> Right, gdb is not performance critical. gdb is the reason for moving 
> it out of the union, performance is the reason for having it in 
> kvm_run at all.
> There's only more reason to it: with a little tweaking in kuli, we'll 
> be able to get rid of KVM_S390_SET_INITIAL_PSW ioctl with this change. 
> That removes one oddity that makes us different from other platforms.

In fact qemu/x86 does a KVM_SET_SREGS (which updates pc) on reset.
Carsten Otte Oct. 22, 2009, 10:43 a.m. UTC | #19
Avi Kivity wrote:
> On x86 we avoid emulating instructions in userspace.  Instead the kernel 
> requests userspace to do something (triggered by the instruction), and 
> the kernel does anything which might be implied by the instruction (like 
> copying the result into a register, or updating pc).
> 
> An example is port I/O.  instead of userspace reading %edx to query the 
> port number and setting %eax to indicate the result, userspace reads a 
> port number struct field and writes an I/O result struct field.  Only 
> the kernel accesses registers.
> 
> I don't know whether that model makes sense or not for s390, but please 
> consider it.
We do the same for many instructions (arch/s390/kvm/instruction.c). User 
exits
are only performed for isntructions that cannot be handled in kernel. 
Also, we do exit with requests to userspace, see the s390_reset exit 
reason. I think in this regard, our implementation is very similar to 
x86. Btw: this is something I did copycat from your implementation on 
integration into kvm. The original zlive code did handle all 
instructions in userland.
Avi Kivity Oct. 22, 2009, 10:49 a.m. UTC | #20
On 10/22/2009 12:43 PM, Carsten Otte wrote:
> Avi Kivity wrote:
>> On x86 we avoid emulating instructions in userspace.  Instead the 
>> kernel requests userspace to do something (triggered by the 
>> instruction), and the kernel does anything which might be implied by 
>> the instruction (like copying the result into a register, or updating 
>> pc).
>>
>> An example is port I/O.  instead of userspace reading %edx to query 
>> the port number and setting %eax to indicate the result, userspace 
>> reads a port number struct field and writes an I/O result struct 
>> field.  Only the kernel accesses registers.
>>
>> I don't know whether that model makes sense or not for s390, but 
>> please consider it.
> We do the same for many instructions (arch/s390/kvm/instruction.c). 
> User exits
> are only performed for isntructions that cannot be handled in kernel. 
> Also, we do exit with requests to userspace, see the s390_reset exit 
> reason. I think in this regard, our implementation is very similar to 
> x86. Btw: this is something I did copycat from your implementation on 
> integration into kvm. The original zlive code did handle all 
> instructions in userland.

So why not do it for this instruction as well?  Instead of updating the 
psw, return a success/error code and let the kernel update psw.
Carsten Otte Oct. 22, 2009, 11:10 a.m. UTC | #21
Avi Kivity wrote:
> So why not do it for this instruction as well?  Instead of updating the 
> psw, return a success/error code and let the kernel update psw.
It's not a single instruction, but a set of reasons we need the psw in 
userspace:
- for logging the instruction address on exits
- to check if a wait-type psw is active, and if so to switch between:
	- enabled wait psws, where interrupts could wake the cpu (aka
	  /halt, which needs to be handled in-kernel)
	- disabled wait psws, where CPUs are suspended due to cpu
           hotremove, system dump on panic
- to handle cpus in stopped state, typically on regular
   reboots/shutdowns

- for setting the condition code during implementation of:
   - all s/390 I/O instructions
   - IPI for restarting remote CPUs (that are not in KVM_RUN)
   - IPI for storing remote CPU status into the remote cpu's lowcore page
   - IPI for resetting remote CPUs
   - for all IPIs where the remote CPU is not known to the kernel, such
     as the detection loop for all 65535 cpu slots on startup of the
     guest kernel
   - for interpretion of service calls (usually to the machine's service
     processor)
- virtio I/O

The sweet thing about having the psw in userspace is that for almost all 
above reasons, there's nothing more than the psw that we need to have in 
order to perform that operation. Giving the 128-bit psw to userspace 
does simply save us having special cases for above list, with an 
exit_reason and a struct for each of them.
Alexander Graf Nov. 2, 2009, 8:23 p.m. UTC | #22
Carsten Otte wrote:
> Avi Kivity wrote:
>> So why not do it for this instruction as well?  Instead of updating
>> the psw, return a success/error code and let the kernel update psw.
> It's not a single instruction, but a set of reasons we need the psw in
> userspace:
> - for logging the instruction address on exits
> - to check if a wait-type psw is active, and if so to switch between:
>     - enabled wait psws, where interrupts could wake the cpu (aka
>       /halt, which needs to be handled in-kernel)
>     - disabled wait psws, where CPUs are suspended due to cpu
>           hotremove, system dump on panic
> - to handle cpus in stopped state, typically on regular
>   reboots/shutdowns
>
> - for setting the condition code during implementation of:
>   - all s/390 I/O instructions
>   - IPI for restarting remote CPUs (that are not in KVM_RUN)
>   - IPI for storing remote CPU status into the remote cpu's lowcore page
>   - IPI for resetting remote CPUs
>   - for all IPIs where the remote CPU is not known to the kernel, such
>     as the detection loop for all 65535 cpu slots on startup of the
>     guest kernel
>   - for interpretion of service calls (usually to the machine's service
>     processor)
> - virtio I/O
>
> The sweet thing about having the psw in userspace is that for almost
> all above reasons, there's nothing more than the psw that we need to
> have in order to perform that operation. Giving the 128-bit psw to
> userspace does simply save us having special cases for above list,
> with an exit_reason and a struct for each of them.

Any progress on the patch? This is really important to make KVM work
properly on S390. I'd even go as far as suggesting it for linux-stable.

Alex
Avi Kivity Nov. 3, 2009, 8:55 a.m. UTC | #23
On 11/02/2009 10:23 PM, Alexander Graf wrote:
> Any progress on the patch? This is really important to make KVM work
> properly on S390. I'd even go as far as suggesting it for linux-stable.
>
>    

I forgot all about it, sorry.  Marcelo, can you commit it?
diff mbox

Patch

Index: kvm/include/linux/kvm.h
===================================================================
--- kvm.orig/include/linux/kvm.h	2009-10-20 15:00:40.000000000 +0200
+++ kvm/include/linux/kvm.h	2009-10-20 16:51:48.000000000 +0200
@@ -7,6 +7,7 @@ 
  * Note: you must update KVM_API_VERSION if you change this interface.
  */
 
+#include <linux/autoconf.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/ioctl.h>
@@ -116,6 +117,11 @@ 
 	__u64 cr8;
 	__u64 apic_base;
 
+#ifdef CONFIG_S390
+	/* the processor status word for s390 */
+	__u64 psw_mask; /* psw upper half */
+	__u64 psw_addr; /* psw lower half */
+#endif
 	union {
 		/* KVM_EXIT_UNKNOWN */
 		struct {
@@ -167,8 +173,6 @@ 
 		/* KVM_EXIT_S390_SIEIC */
 		struct {
 			__u8 icptcode;
-			__u64 mask; /* psw upper half */
-			__u64 addr; /* psw lower half */
 			__u16 ipa;
 			__u32 ipb;
 		} s390_sieic;
Index: kvm/arch/s390/kvm/kvm-s390.c
===================================================================
--- kvm.orig/arch/s390/kvm/kvm-s390.c	2009-10-20 15:01:02.000000000 +0200
+++ kvm/arch/s390/kvm/kvm-s390.c	2009-10-20 18:13:45.000000000 +0200
@@ -421,7 +421,8 @@ 
 	if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
 		rc = -EBUSY;
 	else
-		vcpu->arch.sie_block->gpsw = psw;
+		vcpu->run->psw_mask = psw.mask;
+		vcpu->run->psw_addr = psw.addr;
 	vcpu_put(vcpu);
 	return rc;
 }
@@ -509,9 +510,6 @@ 
 
 	switch (kvm_run->exit_reason) {
 	case KVM_EXIT_S390_SIEIC:
-		vcpu->arch.sie_block->gpsw.mask = kvm_run->s390_sieic.mask;
-		vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr;
-		break;
 	case KVM_EXIT_UNKNOWN:
 	case KVM_EXIT_INTR:
 	case KVM_EXIT_S390_RESET:
@@ -520,6 +518,9 @@ 
 		BUG();
 	}
 
+	vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
+	vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
+
 	might_fault();
 
 	do {
@@ -539,8 +540,6 @@ 
 		/* intercept cannot be handled in-kernel, prepare kvm-run */
 		kvm_run->exit_reason         = KVM_EXIT_S390_SIEIC;
 		kvm_run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode;
-		kvm_run->s390_sieic.mask     = vcpu->arch.sie_block->gpsw.mask;
-		kvm_run->s390_sieic.addr     = vcpu->arch.sie_block->gpsw.addr;
 		kvm_run->s390_sieic.ipa      = vcpu->arch.sie_block->ipa;
 		kvm_run->s390_sieic.ipb      = vcpu->arch.sie_block->ipb;
 		rc = 0;
@@ -552,6 +551,9 @@ 
 		rc = 0;
 	}
 
+	kvm_run->psw_mask     = vcpu->arch.sie_block->gpsw.mask;
+	kvm_run->psw_addr     = vcpu->arch.sie_block->gpsw.addr;
+
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);