diff mbox

[RFC,1/1] KVM: s390: Add MEMOP ioctl for reading/writing guest memory

Message ID 1422965498-11500-2-git-send-email-thuth@linux.vnet.ibm.com
State New
Headers show

Commit Message

Thomas Huth Feb. 3, 2015, 12:11 p.m. UTC
On s390, we've got to make sure to hold the IPTE lock while accessing
virtual memory. So let's add an ioctl for reading and writing virtual
memory to provide this feature for userspace, too.

Signed-off-by: Thomas Huth <thuth@linux.vnet.ibm.com>
Reviewed-by: Dominik Dingel <dingel@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
---
 Documentation/virtual/kvm/api.txt |   44 +++++++++++++++++++++++++
 arch/s390/kvm/gaccess.c           |   22 +++++++++++++
 arch/s390/kvm/gaccess.h           |    2 +
 arch/s390/kvm/kvm-s390.c          |   63 +++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h          |   21 ++++++++++++
 5 files changed, 152 insertions(+), 0 deletions(-)

Comments

Paolo Bonzini Feb. 3, 2015, 1:04 p.m. UTC | #1
On 03/02/2015 13:11, Thomas Huth wrote:
> On s390, we've got to make sure to hold the IPTE lock while accessing
> virtual memory. So let's add an ioctl for reading and writing virtual
> memory to provide this feature for userspace, too.
> 
> Signed-off-by: Thomas Huth <thuth@linux.vnet.ibm.com>
> Reviewed-by: Dominik Dingel <dingel@linux.vnet.ibm.com>
> Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
> ---
>  Documentation/virtual/kvm/api.txt |   44 +++++++++++++++++++++++++
>  arch/s390/kvm/gaccess.c           |   22 +++++++++++++
>  arch/s390/kvm/gaccess.h           |    2 +
>  arch/s390/kvm/kvm-s390.c          |   63 +++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h          |   21 ++++++++++++
>  5 files changed, 152 insertions(+), 0 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index b112efc..bf44b53 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2716,6 +2716,50 @@ The fields in each entry are defined as follows:
>     eax, ebx, ecx, edx: the values returned by the cpuid instruction for
>           this function/index combination
>  
> +4.89 KVM_GUEST_MEM_OP
> +
> +Capability: KVM_CAP_MEM_OP

Put "virtual" somewhere in the ioctl name and capability?

> +Architectures: s390
> +Type: vcpu ioctl
> +Parameters: struct kvm_guest_mem_op (in)
> +Returns: = 0 on success,
> +         < 0 on generic error (e.g. -EFAULT or -ENOMEM),
> +         > 0 if an exception occurred while walking the page tables
> +
> +Read or write data from/to the virtual memory of a VPCU.
> +
> +Parameters are specified via the following structure:
> +
> +struct kvm_guest_mem_op {
> +	__u64 gaddr;		/* the guest address */
> +	__u64 flags;		/* arch specific flags */
> +	__u32 size;		/* amount of bytes */
> +	__u32 op;		/* type of operation */
> +	__u64 buf;		/* buffer in userspace */
> +	__u8 reserved[32];	/* should be set to 0 */
> +};
> +
> +The type of operation is specified in the "op" field, either KVM_MEMOP_VIRTREAD
> +for reading from memory, KVM_MEMOP_VIRTWRITE for writing to memory, or
> +KVM_MEMOP_CHECKVIRTREAD or KVM_MEMOP_CHECKVIRTWRITE to check whether the

Better:

#define KVM_MEMOP_READ       0
#define KVM_MEMOP_WRITE      1

and in the flags field:

#define KVM_MEMOP_F_CHECK_ONLY (1 << 1)

> +corresponding memory access would create an access exception (without
> +changing the data in the memory at the destination). In case an access
> +exception occurred while walking the MMU tables of the guest, the ioctl
> +returns a positive error number to indicate the type of exception. The
> +exception is raised directly at the corresponding VCPU if the bit
> +KVM_MEMOP_F_INJECT_EXC is set in the "flags" field.

KVM_MEMOP_F_INJECT_EXCEPTION.

> +The logical (virtual) start address of the memory region has to be specified
> +in the "gaddr" field, and the length of the region in the "size" field.
> +"buf" is the buffer supplied by the userspace application where the read data
> +should be written to for KVM_MEMOP_VIRTREAD, or where the data that should
> +be written is stored for a KVM_MEMOP_VIRTWRITE. "buf" can be NULL for both
> +CHECK operations.

"buf" is unused and can be NULL for both CHECK operations.

> +The "reserved" field is meant for future extensions. It must currently be
> +set to 0.

Not really true, as you don't check it.  So "It is not used by KVM with
the currently defined set of flags" is a better explanation.

Paolo

> +
>  5. The kvm_run structure
>  ------------------------
>  
> diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
> index 8a1be90..d912362 100644
> --- a/arch/s390/kvm/gaccess.c
> +++ b/arch/s390/kvm/gaccess.c
> @@ -697,6 +697,28 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
>  }
>  
>  /**
> + * check_gva_range - test a range of guest virtual addresses for accessibility
> + */
> +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva,
> +		    unsigned long length, int is_write)
> +{
> +	unsigned long gpa;
> +	unsigned long currlen;
> +	int rc = 0;
> +
> +	ipte_lock(vcpu);
> +	while (length > 0 && !rc) {
> +		currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE));
> +		rc = guest_translate_address(vcpu, gva, &gpa, is_write);
> +		gva += currlen;
> +		length -= currlen;
> +	}
> +	ipte_unlock(vcpu);
> +
> +	return rc;
> +}
> +
> +/**
>   * kvm_s390_check_low_addr_protection - check for low-address protection
>   * @ga: Guest address
>   *
> diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
> index 0149cf1..268beb7 100644
> --- a/arch/s390/kvm/gaccess.h
> +++ b/arch/s390/kvm/gaccess.h
> @@ -157,6 +157,8 @@ int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
>  
>  int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
>  			    unsigned long *gpa, int write);
> +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva,
> +		    unsigned long length, int is_write);
>  
>  int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
>  		 unsigned long len, int write);
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index b2371c0..c80a640 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -168,6 +168,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_VM_ATTRIBUTES:
>  	case KVM_CAP_MP_STATE:
>  	case KVM_CAP_S390_USER_SIGP:
> +	case KVM_CAP_MEM_OP:
>  		r = 1;
>  		break;
>  	case KVM_CAP_NR_VCPUS:
> @@ -1886,6 +1887,59 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>  	return r;
>  }
>  
> +static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
> +				  struct kvm_guest_mem_op *mop)
> +{
> +	void __user *uaddr = (void __user *)mop->buf;
> +	void *tmpbuf = NULL;
> +	int r, srcu_idx;
> +
> +	if ((mop->flags & ~KVM_MEMOP_F_INJECT_EXC) != 0)
> +		return -EINVAL;
> +
> +	if (mop->op == KVM_MEMOP_VIRTREAD || mop->op == KVM_MEMOP_VIRTWRITE) {
> +		if (mop->size > 16 * PAGE_SIZE)
> +			return -E2BIG;
> +		tmpbuf = vmalloc(mop->size);
> +		if (!tmpbuf)
> +			return -ENOMEM;
> +	}
> +
> +	srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> +
> +	switch (mop->op) {
> +	case KVM_MEMOP_VIRTREAD:
> +		r = read_guest(vcpu, mop->gaddr, tmpbuf, mop->size);
> +		if (r == 0) {
> +			if (copy_to_user(uaddr, tmpbuf, mop->size))
> +				r = -EFAULT;
> +		}
> +		break;
> +	case KVM_MEMOP_VIRTWRITE:
> +		if (copy_from_user(tmpbuf, uaddr, mop->size)) {
> +			r = -EFAULT;
> +			break;
> +		}
> +		r = write_guest(vcpu, mop->gaddr, tmpbuf, mop->size);
> +		break;
> +	case KVM_MEMOP_CHECKVIRTREAD:
> +	case KVM_MEMOP_CHECKVIRTWRITE:
> +		r = check_gva_range(vcpu, mop->gaddr, mop->size,
> +				    mop->op == KVM_MEMOP_CHECKVIRTWRITE);
> +		break;
> +	default:
> +		r = -EINVAL;
> +	}
> +
> +	srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
> +
> +	if (r > 0 && (mop->flags & KVM_MEMOP_F_INJECT_EXC) != 0)
> +		kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
> +
> +	vfree(tmpbuf);
> +	return r;
> +}
> +
>  long kvm_arch_vcpu_ioctl(struct file *filp,
>  			 unsigned int ioctl, unsigned long arg)
>  {
> @@ -1985,6 +2039,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
>  		break;
>  	}
> +	case KVM_GUEST_MEM_OP: {
> +		struct kvm_guest_mem_op mem_op;
> +
> +		if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
> +			r = kvm_s390_guest_mem_op(vcpu, &mem_op);
> +		else
> +			r = -EFAULT;
> +		break;
> +	}
>  	default:
>  		r = -ENOTTY;
>  	}
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 8055706..528a3d4 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -365,6 +365,24 @@ struct kvm_translation {
>  	__u8  pad[5];
>  };
>  
> +/* for KVM_GUEST_MEM_OP */
> +struct kvm_guest_mem_op {
> +	/* in */
> +	__u64 gaddr;		/* the guest address */
> +	__u64 flags;		/* arch specific flags */
> +	__u32 size;		/* amount of bytes */
> +	__u32 op;		/* type of operation */
> +	__u64 buf;		/* buffer in userspace */
> +	__u8 reserved[32];	/* should be set to 0 */
> +};
> +/* types for kvm_guest_mem_op->op */
> +#define KVM_MEMOP_VIRTREAD		0
> +#define KVM_MEMOP_VIRTWRITE		1
> +#define KVM_MEMOP_CHECKVIRTREAD		2
> +#define KVM_MEMOP_CHECKVIRTWRITE	3
> +/* flags for kvm_guest_mem_op->flags */
> +#define KVM_MEMOP_F_INJECT_EXC	0x0001UL	/* Inject exception on faults */
> +
>  /* for KVM_INTERRUPT */
>  struct kvm_interrupt {
>  	/* in */
> @@ -760,6 +778,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_PPC_ENABLE_HCALL 104
>  #define KVM_CAP_CHECK_EXTENSION_VM 105
>  #define KVM_CAP_S390_USER_SIGP 106
> +#define KVM_CAP_MEM_OP 107
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1135,6 +1154,8 @@ struct kvm_s390_ucas_mapping {
>  #define KVM_ARM_VCPU_INIT	  _IOW(KVMIO,  0xae, struct kvm_vcpu_init)
>  #define KVM_ARM_PREFERRED_TARGET  _IOR(KVMIO,  0xaf, struct kvm_vcpu_init)
>  #define KVM_GET_REG_LIST	  _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
> +/* Available with KVM_CAP_MEM_OP */
> +#define KVM_GUEST_MEM_OP	  _IOW(KVMIO,  0xb1, struct kvm_guest_mem_op)
>  
>  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
>  #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
>
Thomas Huth Feb. 3, 2015, 3:16 p.m. UTC | #2
On Tue, 03 Feb 2015 14:04:43 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 03/02/2015 13:11, Thomas Huth wrote:
> > On s390, we've got to make sure to hold the IPTE lock while accessing
> > virtual memory. So let's add an ioctl for reading and writing virtual
> > memory to provide this feature for userspace, too.
> > 
> > Signed-off-by: Thomas Huth <thuth@linux.vnet.ibm.com>
> > Reviewed-by: Dominik Dingel <dingel@linux.vnet.ibm.com>
> > Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
> > ---
> >  Documentation/virtual/kvm/api.txt |   44 +++++++++++++++++++++++++
> >  arch/s390/kvm/gaccess.c           |   22 +++++++++++++
> >  arch/s390/kvm/gaccess.h           |    2 +
> >  arch/s390/kvm/kvm-s390.c          |   63 +++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/kvm.h          |   21 ++++++++++++
> >  5 files changed, 152 insertions(+), 0 deletions(-)
> > 
> > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > index b112efc..bf44b53 100644
> > --- a/Documentation/virtual/kvm/api.txt
> > +++ b/Documentation/virtual/kvm/api.txt
> > @@ -2716,6 +2716,50 @@ The fields in each entry are defined as follows:
> >     eax, ebx, ecx, edx: the values returned by the cpuid instruction for
> >           this function/index combination
> >  
> > +4.89 KVM_GUEST_MEM_OP
> > +
> > +Capability: KVM_CAP_MEM_OP
> 
> Put "virtual" somewhere in the ioctl name and capability?

Actually, I'd prefer to keep the "virtual" in the defines for the type
of operation below: When it comes to s390 storage keys, we likely might
need some calls for reading and writing to physical memory, too. Then
we could simply extend this ioctl instead of inventing a new one.

> > +Architectures: s390
> > +Type: vcpu ioctl
> > +Parameters: struct kvm_guest_mem_op (in)
> > +Returns: = 0 on success,
> > +         < 0 on generic error (e.g. -EFAULT or -ENOMEM),
> > +         > 0 if an exception occurred while walking the page tables
> > +
> > +Read or write data from/to the virtual memory of a VPCU.
> > +
> > +Parameters are specified via the following structure:
> > +
> > +struct kvm_guest_mem_op {
> > +	__u64 gaddr;		/* the guest address */
> > +	__u64 flags;		/* arch specific flags */
> > +	__u32 size;		/* amount of bytes */
> > +	__u32 op;		/* type of operation */
> > +	__u64 buf;		/* buffer in userspace */
> > +	__u8 reserved[32];	/* should be set to 0 */
> > +};
> > +
> > +The type of operation is specified in the "op" field, either KVM_MEMOP_VIRTREAD
> > +for reading from memory, KVM_MEMOP_VIRTWRITE for writing to memory, or
> > +KVM_MEMOP_CHECKVIRTREAD or KVM_MEMOP_CHECKVIRTWRITE to check whether the
> 
> Better:
> 
> #define KVM_MEMOP_READ       0
> #define KVM_MEMOP_WRITE      1
> 
> and in the flags field:
> 
> #define KVM_MEMOP_F_CHECK_ONLY (1 << 1)

Ok, a flag for the check operations is fine for me, too.

...
> > +The logical (virtual) start address of the memory region has to be specified
> > +in the "gaddr" field, and the length of the region in the "size" field.
> > +"buf" is the buffer supplied by the userspace application where the read data
> > +should be written to for KVM_MEMOP_VIRTREAD, or where the data that should
> > +be written is stored for a KVM_MEMOP_VIRTWRITE. "buf" can be NULL for both
> > +CHECK operations.
> 
> "buf" is unused and can be NULL for both CHECK operations.
> 
> > +The "reserved" field is meant for future extensions. It must currently be
> > +set to 0.
> 
> Not really true, as you don't check it.  So "It is not used by KVM with
> the currently defined set of flags" is a better explanation.

ok ... and maybe add "should be set to zero" ?

> Paolo

Thanks for the review!

 Thomas
Paolo Bonzini Feb. 3, 2015, 3:22 p.m. UTC | #3
On 03/02/2015 16:16, Thomas Huth wrote:
> Actually, I'd prefer to keep the "virtual" in the defines for the type
> of operation below: When it comes to s390 storage keys, we likely might
> need some calls for reading and writing to physical memory, too. Then
> we could simply extend this ioctl instead of inventing a new one.

Can you explain why it is necessary to read/write physical addresses
from user space?  In the case of QEMU, I'm worried that you would have
to invent your own memory read/write APIs that are different from
everything else.

On real s390 zPCI, does bus-master DMA update storage keys?

>> Not really true, as you don't check it.  So "It is not used by KVM with
>> the currently defined set of flags" is a better explanation.
> 
> ok ... and maybe add "should be set to zero" ?

If you don't check it, it is misleading to document this.

Paolo
Thomas Huth Feb. 4, 2015, 7:53 a.m. UTC | #4
On Tue, 03 Feb 2015 16:22:32 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> 
> 
> On 03/02/2015 16:16, Thomas Huth wrote:
> > Actually, I'd prefer to keep the "virtual" in the defines for the type
> > of operation below: When it comes to s390 storage keys, we likely might
> > need some calls for reading and writing to physical memory, too. Then
> > we could simply extend this ioctl instead of inventing a new one.
> 
> Can you explain why it is necessary to read/write physical addresses
> from user space?  In the case of QEMU, I'm worried that you would have
> to invent your own memory read/write APIs that are different from
> everything else.
> 
> On real s390 zPCI, does bus-master DMA update storage keys?

Ah, I was not thinking about bus-mastering/DMA here: AFAIK there are
some CPU instructions that access a parameter block in physical memory,
for example the SCLP instruction (see hw/s390x/sclp.c) - it's already
doing a cpu_physical_memory_read and ..._write for the parameters.
However, I haven't checked yet whether it is also supposed to touch
the storage keys, so if not, we also might be fine without the ioctls
for reading/writing to physical memory.

> >> Not really true, as you don't check it.  So "It is not used by KVM with
> >> the currently defined set of flags" is a better explanation.
> > 
> > ok ... and maybe add "should be set to zero" ?
> 
> If you don't check it, it is misleading to document this.

True... so I'll omit that.

 Thomas
Christian Borntraeger Feb. 4, 2015, 8:26 a.m. UTC | #5
Am 03.02.2015 um 16:22 schrieb Paolo Bonzini:
> 
> 
> On 03/02/2015 16:16, Thomas Huth wrote:
>> Actually, I'd prefer to keep the "virtual" in the defines for the type
>> of operation below: When it comes to s390 storage keys, we likely might
>> need some calls for reading and writing to physical memory, too. Then
>> we could simply extend this ioctl instead of inventing a new one.

Rereading that. Shall we replace "virtual" with "logical"? That is what is
used architecturally when we mean "do whatever is appropriate right now"
That can boil down to virtual via DAT, virtual via access register mode, 
real if DAT is off... and if fact your kernel implementation does that.


> 
> Can you explain why it is necessary to read/write physical addresses
> from user space?  In the case of QEMU, I'm worried that you would have
> to invent your own memory read/write APIs that are different from
> everything else.
> 
> On real s390 zPCI, does bus-master DMA update storage keys?

the classic channel I/O does set the storage key change/reference and
also triggers errors in the storage key protection value mismatches.

The PCI IOTA structure does contain a storage key value for accesses,
so I assume its the same here, but I dont know for sure.

Conny:
I am asking myself, if we should explicitly add a comment in the 
virtio-ccw spec, that all accesses are assumed to be with key 0 and 
thus never cause key protection. The change/reference bit is set
by the underlying I/O or memory copy anyway.
We can then add a ccw later on to set a different key if we ever need
that.


> 
>>> Not really true, as you don't check it.  So "It is not used by KVM with
>>> the currently defined set of flags" is a better explanation.
>>
>> ok ... and maybe add "should be set to zero" ?
> 
> If you don't check it, it is misleading to document this.
> 
> Paolo
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Paolo Bonzini Feb. 4, 2015, 10:39 a.m. UTC | #6
On 04/02/2015 09:26, Christian Borntraeger wrote:
> Am 03.02.2015 um 16:22 schrieb Paolo Bonzini:
>> On 03/02/2015 16:16, Thomas Huth wrote:
>>> Actually, I'd prefer to keep the "virtual" in the defines for the type
>>> of operation below: When it comes to s390 storage keys, we likely might
>>> need some calls for reading and writing to physical memory, too. Then
>>> we could simply extend this ioctl instead of inventing a new one.
> 
> Rereading that. Shall we replace "virtual" with "logical"? That is what is
> used architecturally when we mean "do whatever is appropriate right now"
> That can boil down to virtual via DAT, virtual via access register mode, 
> real if DAT is off... and if fact your kernel implementation does that.

That makes sense.

>> Can you explain why it is necessary to read/write physical addresses
>> from user space?  In the case of QEMU, I'm worried that you would have
>> to invent your own memory read/write APIs that are different from
>> everything else.
>>
>> On real s390 zPCI, does bus-master DMA update storage keys?
> 
> the classic channel I/O does set the storage key change/reference and
> also triggers errors in the storage key protection value mismatches.
> 
> The PCI IOTA structure does contain a storage key value for accesses,
> so I assume its the same here, but I dont know for sure.

Emulating that in QEMU would be very hard.  Every DMA read/write would
have to go through a bounce buffer, but QEMU block device models for
example try hard to read from host disk directly into guest memory.

> Conny:
> I am asking myself, if we should explicitly add a comment in the 
> virtio-ccw spec, that all accesses are assumed to be with key 0 and 
> thus never cause key protection. The change/reference bit is set
> by the underlying I/O or memory copy anyway.

Can you explain the last sentence? :)

Paolo

> We can then add a ccw later on to set a different key if we ever need
> that.
> 
> 
>>
>>>> Not really true, as you don't check it.  So "It is not used by KVM with
>>>> the currently defined set of flags" is a better explanation.
>>>
>>> ok ... and maybe add "should be set to zero" ?
>>
>> If you don't check it, it is misleading to document this.
>>
>> Paolo
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Thomas Huth Feb. 4, 2015, 10:57 a.m. UTC | #7
On Wed, 04 Feb 2015 09:26:11 +0100
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> Am 03.02.2015 um 16:22 schrieb Paolo Bonzini:
> > 
> > 
> > On 03/02/2015 16:16, Thomas Huth wrote:
> >> Actually, I'd prefer to keep the "virtual" in the defines for the type
> >> of operation below: When it comes to s390 storage keys, we likely might
> >> need some calls for reading and writing to physical memory, too. Then
> >> we could simply extend this ioctl instead of inventing a new one.
> 
> Rereading that. Shall we replace "virtual" with "logical"? That is what is
> used architecturally when we mean "do whatever is appropriate right now"
> That can boil down to virtual via DAT, virtual via access register mode, 
> real if DAT is off... and if fact your kernel implementation does that.

True, but so far I tried to avoid to include s390-only wording into
this ioctl in case other architectures might need such a functionality
later, too (then this ioctl could be simply used there, too).

OTOH, maybe the memory model on s390 is just too special to try to keep
this ioctl generic ... but then I guess I also should rename the
define KVM_GUEST_MEM_OP into KVM_S390_GUEST_MEM_OP, for example?

 Thomas
Christian Borntraeger Feb. 4, 2015, 11:25 a.m. UTC | #8
Am 04.02.2015 um 11:39 schrieb Paolo Bonzini:
>> Conny:
>> I am asking myself, if we should explicitly add a comment in the 
>> virtio-ccw spec, that all accesses are assumed to be with key 0 and 
>> thus never cause key protection. The change/reference bit is set
>> by the underlying I/O or memory copy anyway.
> 
> Can you explain the last sentence? :)

Whenever vhost or qemu or a finished aio request wrote content into a
virtio buffer, the HW has set the storage key for that physical page,
which  makes it automatically dirty/referenced in the guest visible
storage key. 


For completeness sake: 
Now, if the guest does not use the storage key, but instead the new fault
based software dirty tracking, it wont notice the change bit. The guest 
I/O itself when finished will mark the struct page as Dirty, just like on
x86.

Makes sense?

Christian
Paolo Bonzini Feb. 4, 2015, 11:42 a.m. UTC | #9
On 04/02/2015 12:25, Christian Borntraeger wrote:
> Whenever vhost or qemu or a finished aio request wrote content into a
> virtio buffer, the HW has set the storage key for that physical page,
> which  makes it automatically dirty/referenced in the guest visible
> storage key. 

Ah, I knew the storage keys were per-physical page, but I wasn't sure if
they were separate for the host and the guest.  That's obvious now.

Can we detect non-zero storage key in emulated zPCI requests, and fail
the request somehow?

Paolo
Christian Borntraeger Feb. 4, 2015, 12:16 p.m. UTC | #10
Am 04.02.2015 um 12:42 schrieb Paolo Bonzini:
> 
> 
> On 04/02/2015 12:25, Christian Borntraeger wrote:
>> Whenever vhost or qemu or a finished aio request wrote content into a
>> virtio buffer, the HW has set the storage key for that physical page,
>> which  makes it automatically dirty/referenced in the guest visible
>> storage key. 
> 
> Ah, I knew the storage keys were per-physical page, but I wasn't sure if
> they were separate for the host and the guest.  That's obvious now.

Just something on top:
the storage key is per physical page. Just once. It contains C/R/ACC/F
(change, reference, access key, fetch protection)

But: there is also the pgste page table extension. That is used to  separate
both by doing logically ORs. The host and millicode will do the right shifting
copying  to keep both values separate, but when the physical storage key gets
dirty, the host and the guest view is now "changed==yes"

> 
> Can we detect non-zero storage key in emulated zPCI requests, and fail
> the request somehow?

Not right now. Even the kernel KVM module does not do this for emulated
instructions (as Linux has always key 0). Somewhen we might want to add
that capability, but its obviously not trivial for I/O like things. It
would get easier if we avoid VFIO etc and just had used the hardware support,
though. but as far as I can see this is not an option in QEMU.



Christian
Cornelia Huck Feb. 5, 2015, 1 p.m. UTC | #11
On Wed, 04 Feb 2015 09:26:11 +0100
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> the classic channel I/O does set the storage key change/reference and
> also triggers errors in the storage key protection value mismatches.

Just a bit of background for the benefit of innocent bystanders:

When classic channel I/O is initiated, the caller provides a so-called
operation request block (orb) which references the actual channel
program it wants to start. In this same orb, the caller also provides
the storage key to be used for all memory fetches (either of the
individual channel commands or of the memory they reference).

qemu interpretation of channel commands currently ignores all of this.

> Conny:
> I am asking myself, if we should explicitly add a comment in the 
> virtio-ccw spec, that all accesses are assumed to be with key 0 and 
> thus never cause key protection. The change/reference bit is set
> by the underlying I/O or memory copy anyway.
> We can then add a ccw later on to set a different key if we ever need
> that.

I don't think we need to clarify anything for the normal channel
commands used by virtio: They are treated as any other channel command
wrt key protection; and the lack of key checking is a feature of qemu,
not of the spec.

For the memory used for the virtqueues, I agree that we should clarify
that we assume key 0. I'm not sure whether there's a case for extending
this in the future, but who knows.

I'll probably have to open an issue against the virtio spec for this.
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index b112efc..bf44b53 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2716,6 +2716,50 @@  The fields in each entry are defined as follows:
    eax, ebx, ecx, edx: the values returned by the cpuid instruction for
          this function/index combination
 
+4.89 KVM_GUEST_MEM_OP
+
+Capability: KVM_CAP_MEM_OP
+Architectures: s390
+Type: vcpu ioctl
+Parameters: struct kvm_guest_mem_op (in)
+Returns: = 0 on success,
+         < 0 on generic error (e.g. -EFAULT or -ENOMEM),
+         > 0 if an exception occurred while walking the page tables
+
+Read or write data from/to the virtual memory of a VPCU.
+
+Parameters are specified via the following structure:
+
+struct kvm_guest_mem_op {
+	__u64 gaddr;		/* the guest address */
+	__u64 flags;		/* arch specific flags */
+	__u32 size;		/* amount of bytes */
+	__u32 op;		/* type of operation */
+	__u64 buf;		/* buffer in userspace */
+	__u8 reserved[32];	/* should be set to 0 */
+};
+
+The type of operation is specified in the "op" field, either KVM_MEMOP_VIRTREAD
+for reading from memory, KVM_MEMOP_VIRTWRITE for writing to memory, or
+KVM_MEMOP_CHECKVIRTREAD or KVM_MEMOP_CHECKVIRTWRITE to check whether the
+corresponding memory access would create an access exception (without
+changing the data in the memory at the destination). In case an access
+exception occurred while walking the MMU tables of the guest, the ioctl
+returns a positive error number to indicate the type of exception. The
+exception is raised directly at the corresponding VCPU if the bit
+KVM_MEMOP_F_INJECT_EXC is set in the "flags" field.
+
+The logical (virtual) start address of the memory region has to be specified
+in the "gaddr" field, and the length of the region in the "size" field.
+"buf" is the buffer supplied by the userspace application where the read data
+should be written to for KVM_MEMOP_VIRTREAD, or where the data that should
+be written is stored for a KVM_MEMOP_VIRTWRITE. "buf" can be NULL for both
+CHECK operations.
+
+The "reserved" field is meant for future extensions. It must currently be
+set to 0.
+
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 8a1be90..d912362 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -697,6 +697,28 @@  int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
 }
 
 /**
+ * check_gva_range - test a range of guest virtual addresses for accessibility
+ */
+int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva,
+		    unsigned long length, int is_write)
+{
+	unsigned long gpa;
+	unsigned long currlen;
+	int rc = 0;
+
+	ipte_lock(vcpu);
+	while (length > 0 && !rc) {
+		currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE));
+		rc = guest_translate_address(vcpu, gva, &gpa, is_write);
+		gva += currlen;
+		length -= currlen;
+	}
+	ipte_unlock(vcpu);
+
+	return rc;
+}
+
+/**
  * kvm_s390_check_low_addr_protection - check for low-address protection
  * @ga: Guest address
  *
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 0149cf1..268beb7 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -157,6 +157,8 @@  int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
 
 int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
 			    unsigned long *gpa, int write);
+int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva,
+		    unsigned long length, int is_write);
 
 int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
 		 unsigned long len, int write);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b2371c0..c80a640 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -168,6 +168,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_VM_ATTRIBUTES:
 	case KVM_CAP_MP_STATE:
 	case KVM_CAP_S390_USER_SIGP:
+	case KVM_CAP_MEM_OP:
 		r = 1;
 		break;
 	case KVM_CAP_NR_VCPUS:
@@ -1886,6 +1887,59 @@  static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 	return r;
 }
 
+static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
+				  struct kvm_guest_mem_op *mop)
+{
+	void __user *uaddr = (void __user *)mop->buf;
+	void *tmpbuf = NULL;
+	int r, srcu_idx;
+
+	if ((mop->flags & ~KVM_MEMOP_F_INJECT_EXC) != 0)
+		return -EINVAL;
+
+	if (mop->op == KVM_MEMOP_VIRTREAD || mop->op == KVM_MEMOP_VIRTWRITE) {
+		if (mop->size > 16 * PAGE_SIZE)
+			return -E2BIG;
+		tmpbuf = vmalloc(mop->size);
+		if (!tmpbuf)
+			return -ENOMEM;
+	}
+
+	srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	switch (mop->op) {
+	case KVM_MEMOP_VIRTREAD:
+		r = read_guest(vcpu, mop->gaddr, tmpbuf, mop->size);
+		if (r == 0) {
+			if (copy_to_user(uaddr, tmpbuf, mop->size))
+				r = -EFAULT;
+		}
+		break;
+	case KVM_MEMOP_VIRTWRITE:
+		if (copy_from_user(tmpbuf, uaddr, mop->size)) {
+			r = -EFAULT;
+			break;
+		}
+		r = write_guest(vcpu, mop->gaddr, tmpbuf, mop->size);
+		break;
+	case KVM_MEMOP_CHECKVIRTREAD:
+	case KVM_MEMOP_CHECKVIRTWRITE:
+		r = check_gva_range(vcpu, mop->gaddr, mop->size,
+				    mop->op == KVM_MEMOP_CHECKVIRTWRITE);
+		break;
+	default:
+		r = -EINVAL;
+	}
+
+	srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+
+	if (r > 0 && (mop->flags & KVM_MEMOP_F_INJECT_EXC) != 0)
+		kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+
+	vfree(tmpbuf);
+	return r;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -1985,6 +2039,15 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 		break;
 	}
+	case KVM_GUEST_MEM_OP: {
+		struct kvm_guest_mem_op mem_op;
+
+		if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
+			r = kvm_s390_guest_mem_op(vcpu, &mem_op);
+		else
+			r = -EFAULT;
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 8055706..528a3d4 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -365,6 +365,24 @@  struct kvm_translation {
 	__u8  pad[5];
 };
 
+/* for KVM_GUEST_MEM_OP */
+struct kvm_guest_mem_op {
+	/* in */
+	__u64 gaddr;		/* the guest address */
+	__u64 flags;		/* arch specific flags */
+	__u32 size;		/* amount of bytes */
+	__u32 op;		/* type of operation */
+	__u64 buf;		/* buffer in userspace */
+	__u8 reserved[32];	/* should be set to 0 */
+};
+/* types for kvm_guest_mem_op->op */
+#define KVM_MEMOP_VIRTREAD		0
+#define KVM_MEMOP_VIRTWRITE		1
+#define KVM_MEMOP_CHECKVIRTREAD		2
+#define KVM_MEMOP_CHECKVIRTWRITE	3
+/* flags for kvm_guest_mem_op->flags */
+#define KVM_MEMOP_F_INJECT_EXC	0x0001UL	/* Inject exception on faults */
+
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
 	/* in */
@@ -760,6 +778,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_ENABLE_HCALL 104
 #define KVM_CAP_CHECK_EXTENSION_VM 105
 #define KVM_CAP_S390_USER_SIGP 106
+#define KVM_CAP_MEM_OP 107
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1135,6 +1154,8 @@  struct kvm_s390_ucas_mapping {
 #define KVM_ARM_VCPU_INIT	  _IOW(KVMIO,  0xae, struct kvm_vcpu_init)
 #define KVM_ARM_PREFERRED_TARGET  _IOR(KVMIO,  0xaf, struct kvm_vcpu_init)
 #define KVM_GET_REG_LIST	  _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
+/* Available with KVM_CAP_MEM_OP */
+#define KVM_GUEST_MEM_OP	  _IOW(KVMIO,  0xb1, struct kvm_guest_mem_op)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)