diff mbox

[v4,1/6] kvm: add device control API

Message ID 1365811727-24431-2-git-send-email-scottwood@freescale.com
State New, archived
Headers show

Commit Message

Scott Wood April 13, 2013, 12:08 a.m. UTC
Currently, devices that are emulated inside KVM are configured in a
hardcoded manner based on an assumption that any given architecture
only has one way to do it.  If there's any need to access device state,
it is done through inflexible one-purpose-only IOCTLs (e.g.
KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
cumbersome and depletes a limited numberspace.

This API provides a mechanism to instantiate a device of a certain
type, returning an ID that can be used to set/get attributes of the
device.  Attributes may include configuration parameters (e.g.
register base address), device state, operational commands, etc.  It
is similar to the ONE_REG API, except that it acts on devices rather
than vcpus.

Both device types and individual attributes can be tested without having
to create the device or get/set the attribute, without the need for
separately managing enumerated capabilities.

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
v4:
 - Move some boilerplate back into generic code, as requested by Gleb.
   File descriptor management and reference counting is no longer the
   concern of the device implementation.

 - Don't hold kvm->lock during create.  The original reasons
   for doing so have vanished as for as MPIC is concerned, and
   this avoids needing to answer the question of whether to
   hold the lock during destroy as well.

   Paul, you may need to acquire the lock yourself in kvm_create_xics()
   to protect the -EEXIST check.

v3: remove some changes that were merged into this patch by accident,
and fix the error documentation for KVM_CREATE_DEVICE.
---
 Documentation/virtual/kvm/api.txt        |   70 ++++++++++++++++
 Documentation/virtual/kvm/devices/README |    1 +
 include/linux/kvm_host.h                 |   35 ++++++++
 include/uapi/linux/kvm.h                 |   27 +++++++
 virt/kvm/kvm_main.c                      |  129 ++++++++++++++++++++++++++++++
 5 files changed, 262 insertions(+)
 create mode 100644 Documentation/virtual/kvm/devices/README

Comments

Gleb Natapov April 25, 2013, 9:43 a.m. UTC | #1
On Fri, Apr 12, 2013 at 07:08:42PM -0500, Scott Wood wrote:
> Currently, devices that are emulated inside KVM are configured in a
> hardcoded manner based on an assumption that any given architecture
> only has one way to do it.  If there's any need to access device state,
> it is done through inflexible one-purpose-only IOCTLs (e.g.
> KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
> cumbersome and depletes a limited numberspace.
> 
> This API provides a mechanism to instantiate a device of a certain
> type, returning an ID that can be used to set/get attributes of the
> device.  Attributes may include configuration parameters (e.g.
> register base address), device state, operational commands, etc.  It
> is similar to the ONE_REG API, except that it acts on devices rather
> than vcpus.
> 
> Both device types and individual attributes can be tested without having
> to create the device or get/set the attribute, without the need for
> separately managing enumerated capabilities.
> 
> Signed-off-by: Scott Wood <scottwood@freescale.com>
> ---
> v4:
>  - Move some boilerplate back into generic code, as requested by Gleb.
>    File descriptor management and reference counting is no longer the
>    concern of the device implementation.
> 
>  - Don't hold kvm->lock during create.  The original reasons
>    for doing so have vanished as for as MPIC is concerned, and
>    this avoids needing to answer the question of whether to
>    hold the lock during destroy as well.
> 
>    Paul, you may need to acquire the lock yourself in kvm_create_xics()
>    to protect the -EEXIST check.
> 
> v3: remove some changes that were merged into this patch by accident,
> and fix the error documentation for KVM_CREATE_DEVICE.
> ---
>  Documentation/virtual/kvm/api.txt        |   70 ++++++++++++++++
>  Documentation/virtual/kvm/devices/README |    1 +
>  include/linux/kvm_host.h                 |   35 ++++++++
>  include/uapi/linux/kvm.h                 |   27 +++++++
>  virt/kvm/kvm_main.c                      |  129 ++++++++++++++++++++++++++++++
>  5 files changed, 262 insertions(+)
>  create mode 100644 Documentation/virtual/kvm/devices/README
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 976eb65..d52f3f9 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2173,6 +2173,76 @@ header; first `n_valid' valid entries with contents from the data
>  written, then `n_invalid' invalid entries, invalidating any previously
>  valid entries found.
>  
> +4.79 KVM_CREATE_DEVICE
> +
> +Capability: KVM_CAP_DEVICE_CTRL
> +Type: vm ioctl
> +Parameters: struct kvm_create_device (in/out)
> +Returns: 0 on success, -1 on error
> +Errors:
> +  ENODEV: The device type is unknown or unsupported
> +  EEXIST: Device already created, and this type of device may not
> +          be instantiated multiple times
> +
> +  Other error conditions may be defined by individual device types or
> +  have their standard meanings.
> +
> +Creates an emulated device in the kernel.  The file descriptor returned
> +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
> +
> +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
> +device type is supported (not necessarily whether it can be created
> +in the current vm).
> +
> +Individual devices should not define flags.  Attributes should be used
> +for specifying any behavior that is not implied by the device type
> +number.
> +
> +struct kvm_create_device {
> +	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
> +	__u32	fd;	/* out: device handle */
> +	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
> +};
Should we add __u32 padding here to make struct size multiple of u64?

> +
> +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
> +
> +Capability: KVM_CAP_DEVICE_CTRL
> +Type: device ioctl
> +Parameters: struct kvm_device_attr
> +Returns: 0 on success, -1 on error
> +Errors:
> +  ENXIO:  The group or attribute is unknown/unsupported for this device
> +  EPERM:  The attribute cannot (currently) be accessed this way
> +          (e.g. read-only attribute, or attribute that only makes
> +          sense when the device is in a different state)
> +
> +  Other error conditions may be defined by individual device types.
> +
> +Gets/sets a specified piece of device configuration and/or state.  The
> +semantics are device-specific.  See individual device documentation in
> +the "devices" directory.  As with ONE_REG, the size of the data
> +transferred is defined by the particular attribute.
> +
> +struct kvm_device_attr {
> +	__u32	flags;		/* no flags currently defined */
> +	__u32	group;		/* device-defined */
> +	__u64	attr;		/* group-defined */
> +	__u64	addr;		/* userspace address of attr data */
> +};
> +
> +4.81 KVM_HAS_DEVICE_ATTR
> +
> +Capability: KVM_CAP_DEVICE_CTRL
> +Type: device ioctl
> +Parameters: struct kvm_device_attr
> +Returns: 0 on success, -1 on error
> +Errors:
> +  ENXIO:  The group or attribute is unknown/unsupported for this device
> +
> +Tests whether a device supports a particular attribute.  A successful
> +return indicates the attribute is implemented.  It does not necessarily
> +indicate that the attribute can be read or written in the device's
> +current state.  "addr" is ignored.
>  
>  4.77 KVM_ARM_VCPU_INIT
>  
> diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virtual/kvm/devices/README
> new file mode 100644
> index 0000000..34a6983
> --- /dev/null
> +++ b/Documentation/virtual/kvm/devices/README
> @@ -0,0 +1 @@
> +This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL.
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 20d77d2..8fce9bc 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1063,6 +1063,41 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
>  
>  extern bool kvm_rebooting;
>  
> +struct kvm_device_ops;
> +
> +struct kvm_device {
> +	struct kvm_device_ops *ops;
> +	struct kvm *kvm;
> +	atomic_t users;
> +	void *private;
> +};
> +
> +/* create, destroy, and name are mandatory */
> +struct kvm_device_ops {
> +	const char *name;
> +	int (*create)(struct kvm_device *dev, u32 type);
> +
> +	/*
> +	 * Destroy is responsible for freeing dev.
> +	 *
> +	 * Destroy may be called before or after destructors are called
> +	 * on emulated I/O regions, depending on whether a reference is
> +	 * held by a vcpu or other kvm component that gets destroyed
> +	 * after the emulated I/O.
> +	 */
> +	void (*destroy)(struct kvm_device *dev);
> +
> +	int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
> +	int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
> +	int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
> +	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
> +		      unsigned long arg);
> +};
> +
> +void kvm_device_get(struct kvm_device *dev);
> +void kvm_device_put(struct kvm_device *dev);
> +struct kvm_device *kvm_device_from_filp(struct file *filp);
> +
>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>  
>  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 74d0ff3..20ce2d2 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -668,6 +668,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_PPC_EPR 86
>  #define KVM_CAP_ARM_PSCI 87
>  #define KVM_CAP_ARM_SET_DEVICE_ADDR 88
> +#define KVM_CAP_DEVICE_CTRL 89
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -909,6 +910,32 @@ struct kvm_s390_ucas_mapping {
>  #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
>  
>  /*
> + * Device control API, available with KVM_CAP_DEVICE_CTRL
> + */
> +#define KVM_CREATE_DEVICE_TEST		1
> +
> +struct kvm_create_device {
> +	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
> +	__u32	fd;	/* out: device handle */
> +	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
> +};
> +
> +struct kvm_device_attr {
> +	__u32	flags;		/* no flags currently defined */
> +	__u32	group;		/* device-defined */
> +	__u64	attr;		/* group-defined */
> +	__u64	addr;		/* userspace address of attr data */
> +};
Please move struct definitions and KVM_CREATE_DEVICE_TEST define out
from ioctl definition block.

> +
> +/* ioctl for vm fd */
> +#define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
> +
> +/* ioctls for fds returned by KVM_CREATE_DEVICE */
> +#define KVM_SET_DEVICE_ATTR	  _IOW(KVMIO,  0xe1, struct kvm_device_attr)
> +#define KVM_GET_DEVICE_ATTR	  _IOW(KVMIO,  0xe2, struct kvm_device_attr)
> +#define KVM_HAS_DEVICE_ATTR	  _IOW(KVMIO,  0xe3, struct kvm_device_attr)
> +
> +/*
>   * ioctls for vcpu fds
>   */
>  #define KVM_RUN                   _IO(KVMIO,   0x80)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 5cc53c9..e2b18af 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2158,6 +2158,117 @@ out:
>  }
>  #endif
>  
> +static int kvm_device_ioctl_attr(struct kvm_device *dev,
> +				 int (*accessor)(struct kvm_device *dev,
> +						 struct kvm_device_attr *attr),
> +				 unsigned long arg)
> +{
> +	struct kvm_device_attr attr;
> +
> +	if (!accessor)
> +		return -EPERM;
> +
> +	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
> +		return -EFAULT;
> +
> +	return accessor(dev, &attr);
> +}
> +
> +static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
> +			     unsigned long arg)
> +{
> +	struct kvm_device *dev = filp->private_data;
> +
> +	switch (ioctl) {
> +	case KVM_SET_DEVICE_ATTR:
> +		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
> +	case KVM_GET_DEVICE_ATTR:
> +		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
> +	case KVM_HAS_DEVICE_ATTR:
> +		return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
> +	default:
> +		if (dev->ops->ioctl)
> +			return dev->ops->ioctl(dev, ioctl, arg);
> +
> +		return -ENOTTY;
> +	}
> +}
> +
> +void kvm_device_get(struct kvm_device *dev)
> +{
> +	atomic_inc(&dev->users);
> +}
> +
> +void kvm_device_put(struct kvm_device *dev)
> +{
> +	if (atomic_dec_and_test(&dev->users))
> +		dev->ops->destroy(dev);
> +}
> +
> +static int kvm_device_release(struct inode *inode, struct file *filp)
> +{
> +	struct kvm_device *dev = filp->private_data;
> +	struct kvm *kvm = dev->kvm;
> +
> +	kvm_device_put(dev);
> +	kvm_put_kvm(kvm);
We may put kvm only if users goes to zero, otherwise kvm can be
freed while something holds a reference to a device. Why not make
kvm_device_put() do it?

> +	return 0;
> +}
> +
> +static const struct file_operations kvm_device_fops = {
> +	.unlocked_ioctl = kvm_device_ioctl,
> +	.release = kvm_device_release,
> +};
> +
> +struct kvm_device *kvm_device_from_filp(struct file *filp)
> +{
> +	if (filp->f_op != &kvm_device_fops)
> +		return NULL;
> +
> +	return filp->private_data;
> +}
> +
> +static int kvm_ioctl_create_device(struct kvm *kvm,
> +				   struct kvm_create_device *cd)
> +{
> +	struct kvm_device_ops *ops = NULL;
> +	struct kvm_device *dev;
> +	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
> +	int ret;
> +
> +	switch (cd->type) {
> +	default:
> +		return -ENODEV;
> +	}
> +
> +	if (test)
> +		return 0;
> +
> +	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> +	if (!dev)
> +		return -ENOMEM;
> +
> +	dev->ops = ops;
> +	dev->kvm = kvm;
> +	atomic_set(&dev->users, 1);
> +
> +	ret = ops->create(dev, cd->type);
> +	if (ret < 0) {
> +		kfree(dev);
> +		return ret;
> +	}
> +
> +	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR);
> +	if (ret < 0) {
> +		ops->destroy(dev);
> +		return ret;
> +	}
> +
> +	kvm_get_kvm(kvm);
> +	cd->fd = ret;
> +	return 0;
> +}
> +
>  static long kvm_vm_ioctl(struct file *filp,
>  			   unsigned int ioctl, unsigned long arg)
>  {
> @@ -2272,6 +2383,24 @@ static long kvm_vm_ioctl(struct file *filp,
>  		break;
>  	}
>  #endif
> +	case KVM_CREATE_DEVICE: {
> +		struct kvm_create_device cd;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&cd, argp, sizeof(cd)))
> +			goto out;
> +
> +		r = kvm_ioctl_create_device(kvm, &cd);
> +		if (r)
> +			goto out;
> +
> +		r = -EFAULT;
> +		if (copy_to_user(argp, &cd, sizeof(cd)))
> +			goto out;
> +
> +		r = 0;
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  		if (r == -ENOTTY)
> -- 
> 1.7.10.4
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf April 25, 2013, 10:47 a.m. UTC | #2
On 25.04.2013, at 11:43, Gleb Natapov wrote:

> On Fri, Apr 12, 2013 at 07:08:42PM -0500, Scott Wood wrote:
>> Currently, devices that are emulated inside KVM are configured in a
>> hardcoded manner based on an assumption that any given architecture
>> only has one way to do it.  If there's any need to access device state,
>> it is done through inflexible one-purpose-only IOCTLs (e.g.
>> KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
>> cumbersome and depletes a limited numberspace.
>> 
>> This API provides a mechanism to instantiate a device of a certain
>> type, returning an ID that can be used to set/get attributes of the
>> device.  Attributes may include configuration parameters (e.g.
>> register base address), device state, operational commands, etc.  It
>> is similar to the ONE_REG API, except that it acts on devices rather
>> than vcpus.
>> 
>> Both device types and individual attributes can be tested without having
>> to create the device or get/set the attribute, without the need for
>> separately managing enumerated capabilities.
>> 
>> Signed-off-by: Scott Wood <scottwood@freescale.com>
>> ---
>> v4:
>> - Move some boilerplate back into generic code, as requested by Gleb.
>>   File descriptor management and reference counting is no longer the
>>   concern of the device implementation.
>> 
>> - Don't hold kvm->lock during create.  The original reasons
>>   for doing so have vanished as for as MPIC is concerned, and
>>   this avoids needing to answer the question of whether to
>>   hold the lock during destroy as well.
>> 
>>   Paul, you may need to acquire the lock yourself in kvm_create_xics()
>>   to protect the -EEXIST check.
>> 
>> v3: remove some changes that were merged into this patch by accident,
>> and fix the error documentation for KVM_CREATE_DEVICE.
>> ---
>> Documentation/virtual/kvm/api.txt        |   70 ++++++++++++++++
>> Documentation/virtual/kvm/devices/README |    1 +
>> include/linux/kvm_host.h                 |   35 ++++++++
>> include/uapi/linux/kvm.h                 |   27 +++++++
>> virt/kvm/kvm_main.c                      |  129 ++++++++++++++++++++++++++++++
>> 5 files changed, 262 insertions(+)
>> create mode 100644 Documentation/virtual/kvm/devices/README
>> 
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index 976eb65..d52f3f9 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -2173,6 +2173,76 @@ header; first `n_valid' valid entries with contents from the data
>> written, then `n_invalid' invalid entries, invalidating any previously
>> valid entries found.
>> 
>> +4.79 KVM_CREATE_DEVICE
>> +
>> +Capability: KVM_CAP_DEVICE_CTRL
>> +Type: vm ioctl
>> +Parameters: struct kvm_create_device (in/out)
>> +Returns: 0 on success, -1 on error
>> +Errors:
>> +  ENODEV: The device type is unknown or unsupported
>> +  EEXIST: Device already created, and this type of device may not
>> +          be instantiated multiple times
>> +
>> +  Other error conditions may be defined by individual device types or
>> +  have their standard meanings.
>> +
>> +Creates an emulated device in the kernel.  The file descriptor returned
>> +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
>> +
>> +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
>> +device type is supported (not necessarily whether it can be created
>> +in the current vm).
>> +
>> +Individual devices should not define flags.  Attributes should be used
>> +for specifying any behavior that is not implied by the device type
>> +number.
>> +
>> +struct kvm_create_device {
>> +	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
>> +	__u32	fd;	/* out: device handle */
>> +	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
>> +};
> Should we add __u32 padding here to make struct size multiple of u64?

Do you know of any arch that pads structs to u64 boundaries? x86_64 doesn't and ppc64 doesn't either.

> 
>> +
>> +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
>> +
>> +Capability: KVM_CAP_DEVICE_CTRL
>> +Type: device ioctl
>> +Parameters: struct kvm_device_attr
>> +Returns: 0 on success, -1 on error
>> +Errors:
>> +  ENXIO:  The group or attribute is unknown/unsupported for this device
>> +  EPERM:  The attribute cannot (currently) be accessed this way
>> +          (e.g. read-only attribute, or attribute that only makes
>> +          sense when the device is in a different state)
>> +
>> +  Other error conditions may be defined by individual device types.
>> +
>> +Gets/sets a specified piece of device configuration and/or state.  The
>> +semantics are device-specific.  See individual device documentation in
>> +the "devices" directory.  As with ONE_REG, the size of the data
>> +transferred is defined by the particular attribute.
>> +
>> +struct kvm_device_attr {
>> +	__u32	flags;		/* no flags currently defined */
>> +	__u32	group;		/* device-defined */
>> +	__u64	attr;		/* group-defined */
>> +	__u64	addr;		/* userspace address of attr data */
>> +};
>> +
>> +4.81 KVM_HAS_DEVICE_ATTR
>> +
>> +Capability: KVM_CAP_DEVICE_CTRL
>> +Type: device ioctl
>> +Parameters: struct kvm_device_attr
>> +Returns: 0 on success, -1 on error
>> +Errors:
>> +  ENXIO:  The group or attribute is unknown/unsupported for this device
>> +
>> +Tests whether a device supports a particular attribute.  A successful
>> +return indicates the attribute is implemented.  It does not necessarily
>> +indicate that the attribute can be read or written in the device's
>> +current state.  "addr" is ignored.
>> 
>> 4.77 KVM_ARM_VCPU_INIT
>> 
>> diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virtual/kvm/devices/README
>> new file mode 100644
>> index 0000000..34a6983
>> --- /dev/null
>> +++ b/Documentation/virtual/kvm/devices/README
>> @@ -0,0 +1 @@
>> +This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL.
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 20d77d2..8fce9bc 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -1063,6 +1063,41 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
>> 
>> extern bool kvm_rebooting;
>> 
>> +struct kvm_device_ops;
>> +
>> +struct kvm_device {
>> +	struct kvm_device_ops *ops;
>> +	struct kvm *kvm;
>> +	atomic_t users;
>> +	void *private;
>> +};
>> +
>> +/* create, destroy, and name are mandatory */
>> +struct kvm_device_ops {
>> +	const char *name;
>> +	int (*create)(struct kvm_device *dev, u32 type);
>> +
>> +	/*
>> +	 * Destroy is responsible for freeing dev.
>> +	 *
>> +	 * Destroy may be called before or after destructors are called
>> +	 * on emulated I/O regions, depending on whether a reference is
>> +	 * held by a vcpu or other kvm component that gets destroyed
>> +	 * after the emulated I/O.
>> +	 */
>> +	void (*destroy)(struct kvm_device *dev);
>> +
>> +	int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
>> +	int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
>> +	int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
>> +	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
>> +		      unsigned long arg);
>> +};
>> +
>> +void kvm_device_get(struct kvm_device *dev);
>> +void kvm_device_put(struct kvm_device *dev);
>> +struct kvm_device *kvm_device_from_filp(struct file *filp);
>> +
>> #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>> 
>> static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 74d0ff3..20ce2d2 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -668,6 +668,7 @@ struct kvm_ppc_smmu_info {
>> #define KVM_CAP_PPC_EPR 86
>> #define KVM_CAP_ARM_PSCI 87
>> #define KVM_CAP_ARM_SET_DEVICE_ADDR 88
>> +#define KVM_CAP_DEVICE_CTRL 89
>> 
>> #ifdef KVM_CAP_IRQ_ROUTING
>> 
>> @@ -909,6 +910,32 @@ struct kvm_s390_ucas_mapping {
>> #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
>> 
>> /*
>> + * Device control API, available with KVM_CAP_DEVICE_CTRL
>> + */
>> +#define KVM_CREATE_DEVICE_TEST		1
>> +
>> +struct kvm_create_device {
>> +	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
>> +	__u32	fd;	/* out: device handle */
>> +	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
>> +};
>> +
>> +struct kvm_device_attr {
>> +	__u32	flags;		/* no flags currently defined */
>> +	__u32	group;		/* device-defined */
>> +	__u64	attr;		/* group-defined */
>> +	__u64	addr;		/* userspace address of attr data */
>> +};
> Please move struct definitions and KVM_CREATE_DEVICE_TEST define out
> from ioctl definition block.

Let me change that in my tree...

> 
>> +
>> +/* ioctl for vm fd */
>> +#define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
>> +
>> +/* ioctls for fds returned by KVM_CREATE_DEVICE */
>> +#define KVM_SET_DEVICE_ATTR	  _IOW(KVMIO,  0xe1, struct kvm_device_attr)
>> +#define KVM_GET_DEVICE_ATTR	  _IOW(KVMIO,  0xe2, struct kvm_device_attr)
>> +#define KVM_HAS_DEVICE_ATTR	  _IOW(KVMIO,  0xe3, struct kvm_device_attr)
>> +
>> +/*
>>  * ioctls for vcpu fds
>>  */
>> #define KVM_RUN                   _IO(KVMIO,   0x80)
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 5cc53c9..e2b18af 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -2158,6 +2158,117 @@ out:
>> }
>> #endif
>> 
>> +static int kvm_device_ioctl_attr(struct kvm_device *dev,
>> +				 int (*accessor)(struct kvm_device *dev,
>> +						 struct kvm_device_attr *attr),
>> +				 unsigned long arg)
>> +{
>> +	struct kvm_device_attr attr;
>> +
>> +	if (!accessor)
>> +		return -EPERM;
>> +
>> +	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
>> +		return -EFAULT;
>> +
>> +	return accessor(dev, &attr);
>> +}
>> +
>> +static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
>> +			     unsigned long arg)
>> +{
>> +	struct kvm_device *dev = filp->private_data;
>> +
>> +	switch (ioctl) {
>> +	case KVM_SET_DEVICE_ATTR:
>> +		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
>> +	case KVM_GET_DEVICE_ATTR:
>> +		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
>> +	case KVM_HAS_DEVICE_ATTR:
>> +		return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
>> +	default:
>> +		if (dev->ops->ioctl)
>> +			return dev->ops->ioctl(dev, ioctl, arg);
>> +
>> +		return -ENOTTY;
>> +	}
>> +}
>> +
>> +void kvm_device_get(struct kvm_device *dev)
>> +{
>> +	atomic_inc(&dev->users);
>> +}
>> +
>> +void kvm_device_put(struct kvm_device *dev)
>> +{
>> +	if (atomic_dec_and_test(&dev->users))
>> +		dev->ops->destroy(dev);
>> +}
>> +
>> +static int kvm_device_release(struct inode *inode, struct file *filp)
>> +{
>> +	struct kvm_device *dev = filp->private_data;
>> +	struct kvm *kvm = dev->kvm;
>> +
>> +	kvm_device_put(dev);
>> +	kvm_put_kvm(kvm);
> We may put kvm only if users goes to zero, otherwise kvm can be
> freed while something holds a reference to a device. Why not make
> kvm_device_put() do it?

Nice catch. I'll change the patch so it does the kvm_put_kvm inside kvm_device_put's destroy branch.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov April 25, 2013, 12:07 p.m. UTC | #3
On Thu, Apr 25, 2013 at 12:47:39PM +0200, Alexander Graf wrote:
> 
> On 25.04.2013, at 11:43, Gleb Natapov wrote:
> 
> > On Fri, Apr 12, 2013 at 07:08:42PM -0500, Scott Wood wrote:
> >> Currently, devices that are emulated inside KVM are configured in a
> >> hardcoded manner based on an assumption that any given architecture
> >> only has one way to do it.  If there's any need to access device state,
> >> it is done through inflexible one-purpose-only IOCTLs (e.g.
> >> KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
> >> cumbersome and depletes a limited numberspace.
> >> 
> >> This API provides a mechanism to instantiate a device of a certain
> >> type, returning an ID that can be used to set/get attributes of the
> >> device.  Attributes may include configuration parameters (e.g.
> >> register base address), device state, operational commands, etc.  It
> >> is similar to the ONE_REG API, except that it acts on devices rather
> >> than vcpus.
> >> 
> >> Both device types and individual attributes can be tested without having
> >> to create the device or get/set the attribute, without the need for
> >> separately managing enumerated capabilities.
> >> 
> >> Signed-off-by: Scott Wood <scottwood@freescale.com>
> >> ---
> >> v4:
> >> - Move some boilerplate back into generic code, as requested by Gleb.
> >>   File descriptor management and reference counting is no longer the
> >>   concern of the device implementation.
> >> 
> >> - Don't hold kvm->lock during create.  The original reasons
> >>   for doing so have vanished as for as MPIC is concerned, and
> >>   this avoids needing to answer the question of whether to
> >>   hold the lock during destroy as well.
> >> 
> >>   Paul, you may need to acquire the lock yourself in kvm_create_xics()
> >>   to protect the -EEXIST check.
> >> 
> >> v3: remove some changes that were merged into this patch by accident,
> >> and fix the error documentation for KVM_CREATE_DEVICE.
> >> ---
> >> Documentation/virtual/kvm/api.txt        |   70 ++++++++++++++++
> >> Documentation/virtual/kvm/devices/README |    1 +
> >> include/linux/kvm_host.h                 |   35 ++++++++
> >> include/uapi/linux/kvm.h                 |   27 +++++++
> >> virt/kvm/kvm_main.c                      |  129 ++++++++++++++++++++++++++++++
> >> 5 files changed, 262 insertions(+)
> >> create mode 100644 Documentation/virtual/kvm/devices/README
> >> 
> >> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> >> index 976eb65..d52f3f9 100644
> >> --- a/Documentation/virtual/kvm/api.txt
> >> +++ b/Documentation/virtual/kvm/api.txt
> >> @@ -2173,6 +2173,76 @@ header; first `n_valid' valid entries with contents from the data
> >> written, then `n_invalid' invalid entries, invalidating any previously
> >> valid entries found.
> >> 
> >> +4.79 KVM_CREATE_DEVICE
> >> +
> >> +Capability: KVM_CAP_DEVICE_CTRL
> >> +Type: vm ioctl
> >> +Parameters: struct kvm_create_device (in/out)
> >> +Returns: 0 on success, -1 on error
> >> +Errors:
> >> +  ENODEV: The device type is unknown or unsupported
> >> +  EEXIST: Device already created, and this type of device may not
> >> +          be instantiated multiple times
> >> +
> >> +  Other error conditions may be defined by individual device types or
> >> +  have their standard meanings.
> >> +
> >> +Creates an emulated device in the kernel.  The file descriptor returned
> >> +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
> >> +
> >> +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
> >> +device type is supported (not necessarily whether it can be created
> >> +in the current vm).
> >> +
> >> +Individual devices should not define flags.  Attributes should be used
> >> +for specifying any behavior that is not implied by the device type
> >> +number.
> >> +
> >> +struct kvm_create_device {
> >> +	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
> >> +	__u32	fd;	/* out: device handle */
> >> +	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
> >> +};
> > Should we add __u32 padding here to make struct size multiple of u64?
> 
> Do you know of any arch that pads structs to u64 boundaries? x86_64 doesn't and ppc64 doesn't either.
> 
Not really. I just notices that we pad some structures to that effect.

> > 
> >> +
> >> +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
> >> +
> >> +Capability: KVM_CAP_DEVICE_CTRL
> >> +Type: device ioctl
> >> +Parameters: struct kvm_device_attr
> >> +Returns: 0 on success, -1 on error
> >> +Errors:
> >> +  ENXIO:  The group or attribute is unknown/unsupported for this device
> >> +  EPERM:  The attribute cannot (currently) be accessed this way
> >> +          (e.g. read-only attribute, or attribute that only makes
> >> +          sense when the device is in a different state)
> >> +
> >> +  Other error conditions may be defined by individual device types.
> >> +
> >> +Gets/sets a specified piece of device configuration and/or state.  The
> >> +semantics are device-specific.  See individual device documentation in
> >> +the "devices" directory.  As with ONE_REG, the size of the data
> >> +transferred is defined by the particular attribute.
> >> +
> >> +struct kvm_device_attr {
> >> +	__u32	flags;		/* no flags currently defined */
> >> +	__u32	group;		/* device-defined */
> >> +	__u64	attr;		/* group-defined */
> >> +	__u64	addr;		/* userspace address of attr data */
> >> +};
> >> +
> >> +4.81 KVM_HAS_DEVICE_ATTR
> >> +
> >> +Capability: KVM_CAP_DEVICE_CTRL
> >> +Type: device ioctl
> >> +Parameters: struct kvm_device_attr
> >> +Returns: 0 on success, -1 on error
> >> +Errors:
> >> +  ENXIO:  The group or attribute is unknown/unsupported for this device
> >> +
> >> +Tests whether a device supports a particular attribute.  A successful
> >> +return indicates the attribute is implemented.  It does not necessarily
> >> +indicate that the attribute can be read or written in the device's
> >> +current state.  "addr" is ignored.
> >> 
> >> 4.77 KVM_ARM_VCPU_INIT
> >> 
> >> diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virtual/kvm/devices/README
> >> new file mode 100644
> >> index 0000000..34a6983
> >> --- /dev/null
> >> +++ b/Documentation/virtual/kvm/devices/README
> >> @@ -0,0 +1 @@
> >> +This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL.
> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> >> index 20d77d2..8fce9bc 100644
> >> --- a/include/linux/kvm_host.h
> >> +++ b/include/linux/kvm_host.h
> >> @@ -1063,6 +1063,41 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
> >> 
> >> extern bool kvm_rebooting;
> >> 
> >> +struct kvm_device_ops;
> >> +
> >> +struct kvm_device {
> >> +	struct kvm_device_ops *ops;
> >> +	struct kvm *kvm;
> >> +	atomic_t users;
> >> +	void *private;
> >> +};
> >> +
> >> +/* create, destroy, and name are mandatory */
> >> +struct kvm_device_ops {
> >> +	const char *name;
> >> +	int (*create)(struct kvm_device *dev, u32 type);
> >> +
> >> +	/*
> >> +	 * Destroy is responsible for freeing dev.
> >> +	 *
> >> +	 * Destroy may be called before or after destructors are called
> >> +	 * on emulated I/O regions, depending on whether a reference is
> >> +	 * held by a vcpu or other kvm component that gets destroyed
> >> +	 * after the emulated I/O.
> >> +	 */
> >> +	void (*destroy)(struct kvm_device *dev);
> >> +
> >> +	int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
> >> +	int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
> >> +	int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
> >> +	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
> >> +		      unsigned long arg);
> >> +};
> >> +
> >> +void kvm_device_get(struct kvm_device *dev);
> >> +void kvm_device_put(struct kvm_device *dev);
> >> +struct kvm_device *kvm_device_from_filp(struct file *filp);
> >> +
> >> #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> >> 
> >> static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
> >> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> >> index 74d0ff3..20ce2d2 100644
> >> --- a/include/uapi/linux/kvm.h
> >> +++ b/include/uapi/linux/kvm.h
> >> @@ -668,6 +668,7 @@ struct kvm_ppc_smmu_info {
> >> #define KVM_CAP_PPC_EPR 86
> >> #define KVM_CAP_ARM_PSCI 87
> >> #define KVM_CAP_ARM_SET_DEVICE_ADDR 88
> >> +#define KVM_CAP_DEVICE_CTRL 89
> >> 
> >> #ifdef KVM_CAP_IRQ_ROUTING
> >> 
> >> @@ -909,6 +910,32 @@ struct kvm_s390_ucas_mapping {
> >> #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
> >> 
> >> /*
> >> + * Device control API, available with KVM_CAP_DEVICE_CTRL
> >> + */
> >> +#define KVM_CREATE_DEVICE_TEST		1
> >> +
> >> +struct kvm_create_device {
> >> +	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
> >> +	__u32	fd;	/* out: device handle */
> >> +	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
> >> +};
> >> +
> >> +struct kvm_device_attr {
> >> +	__u32	flags;		/* no flags currently defined */
> >> +	__u32	group;		/* device-defined */
> >> +	__u64	attr;		/* group-defined */
> >> +	__u64	addr;		/* userspace address of attr data */
> >> +};
> > Please move struct definitions and KVM_CREATE_DEVICE_TEST define out
> > from ioctl definition block.
> 
> Let me change that in my tree...
> 
So are you sending this via your tree and I should not apply it directly?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf April 25, 2013, 1:45 p.m. UTC | #4
On 25.04.2013, at 14:07, Gleb Natapov wrote:

> On Thu, Apr 25, 2013 at 12:47:39PM +0200, Alexander Graf wrote:
>> 
>> On 25.04.2013, at 11:43, Gleb Natapov wrote:
>> 
>>> On Fri, Apr 12, 2013 at 07:08:42PM -0500, Scott Wood wrote:
>>>> Currently, devices that are emulated inside KVM are configured in a
>>>> hardcoded manner based on an assumption that any given architecture
>>>> only has one way to do it.  If there's any need to access device state,
>>>> it is done through inflexible one-purpose-only IOCTLs (e.g.
>>>> KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
>>>> cumbersome and depletes a limited numberspace.
>>>> 
>>>> This API provides a mechanism to instantiate a device of a certain
>>>> type, returning an ID that can be used to set/get attributes of the
>>>> device.  Attributes may include configuration parameters (e.g.
>>>> register base address), device state, operational commands, etc.  It
>>>> is similar to the ONE_REG API, except that it acts on devices rather
>>>> than vcpus.
>>>> 
>>>> Both device types and individual attributes can be tested without having
>>>> to create the device or get/set the attribute, without the need for
>>>> separately managing enumerated capabilities.
>>>> 
>>>> Signed-off-by: Scott Wood <scottwood@freescale.com>
>>>> ---
>>>> v4:
>>>> - Move some boilerplate back into generic code, as requested by Gleb.
>>>>  File descriptor management and reference counting is no longer the
>>>>  concern of the device implementation.
>>>> 
>>>> - Don't hold kvm->lock during create.  The original reasons
>>>>  for doing so have vanished as for as MPIC is concerned, and
>>>>  this avoids needing to answer the question of whether to
>>>>  hold the lock during destroy as well.
>>>> 
>>>>  Paul, you may need to acquire the lock yourself in kvm_create_xics()
>>>>  to protect the -EEXIST check.
>>>> 
>>>> v3: remove some changes that were merged into this patch by accident,
>>>> and fix the error documentation for KVM_CREATE_DEVICE.
>>>> ---
>>>> Documentation/virtual/kvm/api.txt        |   70 ++++++++++++++++
>>>> Documentation/virtual/kvm/devices/README |    1 +
>>>> include/linux/kvm_host.h                 |   35 ++++++++
>>>> include/uapi/linux/kvm.h                 |   27 +++++++
>>>> virt/kvm/kvm_main.c                      |  129 ++++++++++++++++++++++++++++++
>>>> 5 files changed, 262 insertions(+)
>>>> create mode 100644 Documentation/virtual/kvm/devices/README
>>>> 
>>>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>>>> index 976eb65..d52f3f9 100644
>>>> --- a/Documentation/virtual/kvm/api.txt
>>>> +++ b/Documentation/virtual/kvm/api.txt
>>>> @@ -2173,6 +2173,76 @@ header; first `n_valid' valid entries with contents from the data
>>>> written, then `n_invalid' invalid entries, invalidating any previously
>>>> valid entries found.
>>>> 
>>>> +4.79 KVM_CREATE_DEVICE
>>>> +
>>>> +Capability: KVM_CAP_DEVICE_CTRL
>>>> +Type: vm ioctl
>>>> +Parameters: struct kvm_create_device (in/out)
>>>> +Returns: 0 on success, -1 on error
>>>> +Errors:
>>>> +  ENODEV: The device type is unknown or unsupported
>>>> +  EEXIST: Device already created, and this type of device may not
>>>> +          be instantiated multiple times
>>>> +
>>>> +  Other error conditions may be defined by individual device types or
>>>> +  have their standard meanings.
>>>> +
>>>> +Creates an emulated device in the kernel.  The file descriptor returned
>>>> +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
>>>> +
>>>> +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
>>>> +device type is supported (not necessarily whether it can be created
>>>> +in the current vm).
>>>> +
>>>> +Individual devices should not define flags.  Attributes should be used
>>>> +for specifying any behavior that is not implied by the device type
>>>> +number.
>>>> +
>>>> +struct kvm_create_device {
>>>> +	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
>>>> +	__u32	fd;	/* out: device handle */
>>>> +	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
>>>> +};
>>> Should we add __u32 padding here to make struct size multiple of u64?
>> 
>> Do you know of any arch that pads structs to u64 boundaries? x86_64 doesn't and ppc64 doesn't either.
>> 
> Not really. I just notices that we pad some structures to that effect.

I don't think we really need to :).

> 
>>> 
>>>> +
>>>> +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
>>>> +
>>>> +Capability: KVM_CAP_DEVICE_CTRL
>>>> +Type: device ioctl
>>>> +Parameters: struct kvm_device_attr
>>>> +Returns: 0 on success, -1 on error
>>>> +Errors:
>>>> +  ENXIO:  The group or attribute is unknown/unsupported for this device
>>>> +  EPERM:  The attribute cannot (currently) be accessed this way
>>>> +          (e.g. read-only attribute, or attribute that only makes
>>>> +          sense when the device is in a different state)
>>>> +
>>>> +  Other error conditions may be defined by individual device types.
>>>> +
>>>> +Gets/sets a specified piece of device configuration and/or state.  The
>>>> +semantics are device-specific.  See individual device documentation in
>>>> +the "devices" directory.  As with ONE_REG, the size of the data
>>>> +transferred is defined by the particular attribute.
>>>> +
>>>> +struct kvm_device_attr {
>>>> +	__u32	flags;		/* no flags currently defined */
>>>> +	__u32	group;		/* device-defined */
>>>> +	__u64	attr;		/* group-defined */
>>>> +	__u64	addr;		/* userspace address of attr data */
>>>> +};
>>>> +
>>>> +4.81 KVM_HAS_DEVICE_ATTR
>>>> +
>>>> +Capability: KVM_CAP_DEVICE_CTRL
>>>> +Type: device ioctl
>>>> +Parameters: struct kvm_device_attr
>>>> +Returns: 0 on success, -1 on error
>>>> +Errors:
>>>> +  ENXIO:  The group or attribute is unknown/unsupported for this device
>>>> +
>>>> +Tests whether a device supports a particular attribute.  A successful
>>>> +return indicates the attribute is implemented.  It does not necessarily
>>>> +indicate that the attribute can be read or written in the device's
>>>> +current state.  "addr" is ignored.
>>>> 
>>>> 4.77 KVM_ARM_VCPU_INIT
>>>> 
>>>> diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virtual/kvm/devices/README
>>>> new file mode 100644
>>>> index 0000000..34a6983
>>>> --- /dev/null
>>>> +++ b/Documentation/virtual/kvm/devices/README
>>>> @@ -0,0 +1 @@
>>>> +This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL.
>>>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>>>> index 20d77d2..8fce9bc 100644
>>>> --- a/include/linux/kvm_host.h
>>>> +++ b/include/linux/kvm_host.h
>>>> @@ -1063,6 +1063,41 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
>>>> 
>>>> extern bool kvm_rebooting;
>>>> 
>>>> +struct kvm_device_ops;
>>>> +
>>>> +struct kvm_device {
>>>> +	struct kvm_device_ops *ops;
>>>> +	struct kvm *kvm;
>>>> +	atomic_t users;
>>>> +	void *private;
>>>> +};
>>>> +
>>>> +/* create, destroy, and name are mandatory */
>>>> +struct kvm_device_ops {
>>>> +	const char *name;
>>>> +	int (*create)(struct kvm_device *dev, u32 type);
>>>> +
>>>> +	/*
>>>> +	 * Destroy is responsible for freeing dev.
>>>> +	 *
>>>> +	 * Destroy may be called before or after destructors are called
>>>> +	 * on emulated I/O regions, depending on whether a reference is
>>>> +	 * held by a vcpu or other kvm component that gets destroyed
>>>> +	 * after the emulated I/O.
>>>> +	 */
>>>> +	void (*destroy)(struct kvm_device *dev);
>>>> +
>>>> +	int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
>>>> +	int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
>>>> +	int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
>>>> +	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
>>>> +		      unsigned long arg);
>>>> +};
>>>> +
>>>> +void kvm_device_get(struct kvm_device *dev);
>>>> +void kvm_device_put(struct kvm_device *dev);
>>>> +struct kvm_device *kvm_device_from_filp(struct file *filp);
>>>> +
>>>> #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>>>> 
>>>> static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
>>>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>>>> index 74d0ff3..20ce2d2 100644
>>>> --- a/include/uapi/linux/kvm.h
>>>> +++ b/include/uapi/linux/kvm.h
>>>> @@ -668,6 +668,7 @@ struct kvm_ppc_smmu_info {
>>>> #define KVM_CAP_PPC_EPR 86
>>>> #define KVM_CAP_ARM_PSCI 87
>>>> #define KVM_CAP_ARM_SET_DEVICE_ADDR 88
>>>> +#define KVM_CAP_DEVICE_CTRL 89
>>>> 
>>>> #ifdef KVM_CAP_IRQ_ROUTING
>>>> 
>>>> @@ -909,6 +910,32 @@ struct kvm_s390_ucas_mapping {
>>>> #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
>>>> 
>>>> /*
>>>> + * Device control API, available with KVM_CAP_DEVICE_CTRL
>>>> + */
>>>> +#define KVM_CREATE_DEVICE_TEST		1
>>>> +
>>>> +struct kvm_create_device {
>>>> +	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
>>>> +	__u32	fd;	/* out: device handle */
>>>> +	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
>>>> +};
>>>> +
>>>> +struct kvm_device_attr {
>>>> +	__u32	flags;		/* no flags currently defined */
>>>> +	__u32	group;		/* device-defined */
>>>> +	__u64	attr;		/* group-defined */
>>>> +	__u64	addr;		/* userspace address of attr data */
>>>> +};
>>> Please move struct definitions and KVM_CREATE_DEVICE_TEST define out
>>> from ioctl definition block.
>> 
>> Let me change that in my tree...
>> 
> So are you sending this via your tree and I should not apply it directly?

I was hoping to have things ready very soon for you to just pull...


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov April 25, 2013, 1:51 p.m. UTC | #5
On Thu, Apr 25, 2013 at 03:45:14PM +0200, Alexander Graf wrote:
> >>> Please move struct definitions and KVM_CREATE_DEVICE_TEST define out
> >>> from ioctl definition block.
> >> 
> >> Let me change that in my tree...
> >> 
> > So are you sending this via your tree and I should not apply it directly?
> 
> I was hoping to have things ready very soon for you to just pull...
> 
Make sense since there are PPC patches that depend on this one. 3.10 merge windows
will very likely open next week though...

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood April 25, 2013, 4:51 p.m. UTC | #6
On 04/25/2013 05:47:39 AM, Alexander Graf wrote:
> 
> On 25.04.2013, at 11:43, Gleb Natapov wrote:
> 
> >> +void kvm_device_put(struct kvm_device *dev)
> >> +{
> >> +	if (atomic_dec_and_test(&dev->users))
> >> +		dev->ops->destroy(dev);
> >> +}
> >> +
> >> +static int kvm_device_release(struct inode *inode, struct file  
> *filp)
> >> +{
> >> +	struct kvm_device *dev = filp->private_data;
> >> +	struct kvm *kvm = dev->kvm;
> >> +
> >> +	kvm_device_put(dev);
> >> +	kvm_put_kvm(kvm);
> > We may put kvm only if users goes to zero, otherwise kvm can be
> > freed while something holds a reference to a device. Why not make
> > kvm_device_put() do it?
> 
> Nice catch. I'll change the patch so it does the kvm_put_kvm inside  
> kvm_device_put's destroy branch.

No, please don't.  The KVM reference being "put" here is associated  
with the file descriptor, not with the MPIC object.  If you make that  
change I think you'll have circular references and thus a memory leak,  
because the vcpus can hold a reference to the MPIC object.

-Scott
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov April 25, 2013, 6:22 p.m. UTC | #7
On Thu, Apr 25, 2013 at 11:51:08AM -0500, Scott Wood wrote:
> On 04/25/2013 05:47:39 AM, Alexander Graf wrote:
> >
> >On 25.04.2013, at 11:43, Gleb Natapov wrote:
> >
> >>> +void kvm_device_put(struct kvm_device *dev)
> >>> +{
> >>> +	if (atomic_dec_and_test(&dev->users))
> >>> +		dev->ops->destroy(dev);
> >>> +}
> >>> +
> >>> +static int kvm_device_release(struct inode *inode, struct file
> >*filp)
> >>> +{
> >>> +	struct kvm_device *dev = filp->private_data;
> >>> +	struct kvm *kvm = dev->kvm;
> >>> +
> >>> +	kvm_device_put(dev);
> >>> +	kvm_put_kvm(kvm);
> >> We may put kvm only if users goes to zero, otherwise kvm can be
> >> freed while something holds a reference to a device. Why not make
> >> kvm_device_put() do it?
> >
> >Nice catch. I'll change the patch so it does the kvm_put_kvm
> >inside kvm_device_put's destroy branch.
> 
> No, please don't.  The KVM reference being "put" here is associated
> with the file descriptor, not with the MPIC object.
Is it so? Device holds a pointer to kvm, so it increments kvm reference
to make sure the pointer is valid. What prevents kvm from been destroyed
while device is still in use in current code?
 

>                                                    If you make
> that change I think you'll have circular references and thus a
> memory leak, because the vcpus can hold a reference to the MPIC
> object.
> 
How circular reference can be created?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov April 26, 2013, 9:53 a.m. UTC | #8
On Thu, Apr 25, 2013 at 01:59:20PM -0500, Scott Wood wrote:
> On 04/25/2013 01:22:04 PM, Gleb Natapov wrote:
> >On Thu, Apr 25, 2013 at 11:51:08AM -0500, Scott Wood wrote:
> >> On 04/25/2013 05:47:39 AM, Alexander Graf wrote:
> >> >
> >> >On 25.04.2013, at 11:43, Gleb Natapov wrote:
> >> >
> >> >>> +void kvm_device_put(struct kvm_device *dev)
> >> >>> +{
> >> >>> +	if (atomic_dec_and_test(&dev->users))
> >> >>> +		dev->ops->destroy(dev);
> >> >>> +}
> >> >>> +
> >> >>> +static int kvm_device_release(struct inode *inode, struct file
> >> >*filp)
> >> >>> +{
> >> >>> +	struct kvm_device *dev = filp->private_data;
> >> >>> +	struct kvm *kvm = dev->kvm;
> >> >>> +
> >> >>> +	kvm_device_put(dev);
> >> >>> +	kvm_put_kvm(kvm);
> >> >> We may put kvm only if users goes to zero, otherwise kvm can be
> >> >> freed while something holds a reference to a device. Why not make
> >> >> kvm_device_put() do it?
> >> >
> >> >Nice catch. I'll change the patch so it does the kvm_put_kvm
> >> >inside kvm_device_put's destroy branch.
> >>
> >> No, please don't.  The KVM reference being "put" here is associated
> >> with the file descriptor, not with the MPIC object.
> >Is it so? Device holds a pointer to kvm, so it increments kvm
> >reference
> >to make sure the pointer is valid. What prevents kvm from been
> >destroyed
> >while device is still in use in current code?
> 
> Where will that kvm pointer be used, after all the file descriptors
> go away and the vcpus stop running?  mmio_mapped guards against
> unmapping the MMIO if it's already been unmapped due to KVM
> destruction.  We don't have any timers or other delayed work.
> 
MPIC does not, but timer device will have one.

> Well, I do see one place, that Alex added -- the NULLing out of
> dev->kvm->arch.mpic, which didn't exist in my patchset.
> 
> >> that change I think you'll have circular references and thus a
> >> memory leak, because the vcpus can hold a reference to the MPIC
> >> object.
> >>
> >How circular reference can be created?
> 
> MPIC holds reference on KVM, vcpu holds reference on MPIC, and vcpu
> is not destroyed until KVM is destroyed.
> 
Yes, you are right. So we need to think about how to fix it in a
different way. What about holding all devices in kvm->devices[] array
and destroy them during kvm destruction, like we do for vcpus?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf April 26, 2013, 9:55 a.m. UTC | #9
On 26.04.2013, at 11:53, Gleb Natapov wrote:

> On Thu, Apr 25, 2013 at 01:59:20PM -0500, Scott Wood wrote:
>> On 04/25/2013 01:22:04 PM, Gleb Natapov wrote:
>>> On Thu, Apr 25, 2013 at 11:51:08AM -0500, Scott Wood wrote:
>>>> On 04/25/2013 05:47:39 AM, Alexander Graf wrote:
>>>>> 
>>>>> On 25.04.2013, at 11:43, Gleb Natapov wrote:
>>>>> 
>>>>>>> +void kvm_device_put(struct kvm_device *dev)
>>>>>>> +{
>>>>>>> +	if (atomic_dec_and_test(&dev->users))
>>>>>>> +		dev->ops->destroy(dev);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int kvm_device_release(struct inode *inode, struct file
>>>>> *filp)
>>>>>>> +{
>>>>>>> +	struct kvm_device *dev = filp->private_data;
>>>>>>> +	struct kvm *kvm = dev->kvm;
>>>>>>> +
>>>>>>> +	kvm_device_put(dev);
>>>>>>> +	kvm_put_kvm(kvm);
>>>>>> We may put kvm only if users goes to zero, otherwise kvm can be
>>>>>> freed while something holds a reference to a device. Why not make
>>>>>> kvm_device_put() do it?
>>>>> 
>>>>> Nice catch. I'll change the patch so it does the kvm_put_kvm
>>>>> inside kvm_device_put's destroy branch.
>>>> 
>>>> No, please don't.  The KVM reference being "put" here is associated
>>>> with the file descriptor, not with the MPIC object.
>>> Is it so? Device holds a pointer to kvm, so it increments kvm
>>> reference
>>> to make sure the pointer is valid. What prevents kvm from been
>>> destroyed
>>> while device is still in use in current code?
>> 
>> Where will that kvm pointer be used, after all the file descriptors
>> go away and the vcpus stop running?  mmio_mapped guards against
>> unmapping the MMIO if it's already been unmapped due to KVM
>> destruction.  We don't have any timers or other delayed work.
>> 
> MPIC does not, but timer device will have one.
> 
>> Well, I do see one place, that Alex added -- the NULLing out of
>> dev->kvm->arch.mpic, which didn't exist in my patchset.
>> 
>>>> that change I think you'll have circular references and thus a
>>>> memory leak, because the vcpus can hold a reference to the MPIC
>>>> object.
>>>> 
>>> How circular reference can be created?
>> 
>> MPIC holds reference on KVM, vcpu holds reference on MPIC, and vcpu
>> is not destroyed until KVM is destroyed.
>> 
> Yes, you are right. So we need to think about how to fix it in a
> different way. What about holding all devices in kvm->devices[] array
> and destroy them during kvm destruction, like we do for vcpus?

You should really look at your patches in LIFO order :). A patch doing that was already sent by Scott last night and is in v4 of my patch set.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov April 26, 2013, 9:57 a.m. UTC | #10
On Fri, Apr 26, 2013 at 11:55:27AM +0200, Alexander Graf wrote:
> 
> On 26.04.2013, at 11:53, Gleb Natapov wrote:
> 
> > On Thu, Apr 25, 2013 at 01:59:20PM -0500, Scott Wood wrote:
> >> On 04/25/2013 01:22:04 PM, Gleb Natapov wrote:
> >>> On Thu, Apr 25, 2013 at 11:51:08AM -0500, Scott Wood wrote:
> >>>> On 04/25/2013 05:47:39 AM, Alexander Graf wrote:
> >>>>> 
> >>>>> On 25.04.2013, at 11:43, Gleb Natapov wrote:
> >>>>> 
> >>>>>>> +void kvm_device_put(struct kvm_device *dev)
> >>>>>>> +{
> >>>>>>> +	if (atomic_dec_and_test(&dev->users))
> >>>>>>> +		dev->ops->destroy(dev);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static int kvm_device_release(struct inode *inode, struct file
> >>>>> *filp)
> >>>>>>> +{
> >>>>>>> +	struct kvm_device *dev = filp->private_data;
> >>>>>>> +	struct kvm *kvm = dev->kvm;
> >>>>>>> +
> >>>>>>> +	kvm_device_put(dev);
> >>>>>>> +	kvm_put_kvm(kvm);
> >>>>>> We may put kvm only if users goes to zero, otherwise kvm can be
> >>>>>> freed while something holds a reference to a device. Why not make
> >>>>>> kvm_device_put() do it?
> >>>>> 
> >>>>> Nice catch. I'll change the patch so it does the kvm_put_kvm
> >>>>> inside kvm_device_put's destroy branch.
> >>>> 
> >>>> No, please don't.  The KVM reference being "put" here is associated
> >>>> with the file descriptor, not with the MPIC object.
> >>> Is it so? Device holds a pointer to kvm, so it increments kvm
> >>> reference
> >>> to make sure the pointer is valid. What prevents kvm from been
> >>> destroyed
> >>> while device is still in use in current code?
> >> 
> >> Where will that kvm pointer be used, after all the file descriptors
> >> go away and the vcpus stop running?  mmio_mapped guards against
> >> unmapping the MMIO if it's already been unmapped due to KVM
> >> destruction.  We don't have any timers or other delayed work.
> >> 
> > MPIC does not, but timer device will have one.
> > 
> >> Well, I do see one place, that Alex added -- the NULLing out of
> >> dev->kvm->arch.mpic, which didn't exist in my patchset.
> >> 
> >>>> that change I think you'll have circular references and thus a
> >>>> memory leak, because the vcpus can hold a reference to the MPIC
> >>>> object.
> >>>> 
> >>> How circular reference can be created?
> >> 
> >> MPIC holds reference on KVM, vcpu holds reference on MPIC, and vcpu
> >> is not destroyed until KVM is destroyed.
> >> 
> > Yes, you are right. So we need to think about how to fix it in a
> > different way. What about holding all devices in kvm->devices[] array
> > and destroy them during kvm destruction, like we do for vcpus?
> 
> You should really look at your patches in LIFO order :). A patch doing that was already sent by Scott last night and is in v4 of my patch set.
> 
> 
I tried! This causes starvation for some patches. I need better algorithm :)

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 976eb65..d52f3f9 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2173,6 +2173,76 @@  header; first `n_valid' valid entries with contents from the data
 written, then `n_invalid' invalid entries, invalidating any previously
 valid entries found.
 
+4.79 KVM_CREATE_DEVICE
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: vm ioctl
+Parameters: struct kvm_create_device (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  ENODEV: The device type is unknown or unsupported
+  EEXIST: Device already created, and this type of device may not
+          be instantiated multiple times
+
+  Other error conditions may be defined by individual device types or
+  have their standard meanings.
+
+Creates an emulated device in the kernel.  The file descriptor returned
+in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
+
+If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
+device type is supported (not necessarily whether it can be created
+in the current vm).
+
+Individual devices should not define flags.  Attributes should be used
+for specifying any behavior that is not implied by the device type
+number.
+
+struct kvm_create_device {
+	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
+	__u32	fd;	/* out: device handle */
+	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
+};
+
+4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: device ioctl
+Parameters: struct kvm_device_attr
+Returns: 0 on success, -1 on error
+Errors:
+  ENXIO:  The group or attribute is unknown/unsupported for this device
+  EPERM:  The attribute cannot (currently) be accessed this way
+          (e.g. read-only attribute, or attribute that only makes
+          sense when the device is in a different state)
+
+  Other error conditions may be defined by individual device types.
+
+Gets/sets a specified piece of device configuration and/or state.  The
+semantics are device-specific.  See individual device documentation in
+the "devices" directory.  As with ONE_REG, the size of the data
+transferred is defined by the particular attribute.
+
+struct kvm_device_attr {
+	__u32	flags;		/* no flags currently defined */
+	__u32	group;		/* device-defined */
+	__u64	attr;		/* group-defined */
+	__u64	addr;		/* userspace address of attr data */
+};
+
+4.81 KVM_HAS_DEVICE_ATTR
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: device ioctl
+Parameters: struct kvm_device_attr
+Returns: 0 on success, -1 on error
+Errors:
+  ENXIO:  The group or attribute is unknown/unsupported for this device
+
+Tests whether a device supports a particular attribute.  A successful
+return indicates the attribute is implemented.  It does not necessarily
+indicate that the attribute can be read or written in the device's
+current state.  "addr" is ignored.
 
 4.77 KVM_ARM_VCPU_INIT
 
diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virtual/kvm/devices/README
new file mode 100644
index 0000000..34a6983
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/README
@@ -0,0 +1 @@ 
+This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 20d77d2..8fce9bc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1063,6 +1063,41 @@  static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
 
 extern bool kvm_rebooting;
 
+struct kvm_device_ops;
+
+struct kvm_device {
+	struct kvm_device_ops *ops;
+	struct kvm *kvm;
+	atomic_t users;
+	void *private;
+};
+
+/* create, destroy, and name are mandatory */
+struct kvm_device_ops {
+	const char *name;
+	int (*create)(struct kvm_device *dev, u32 type);
+
+	/*
+	 * Destroy is responsible for freeing dev.
+	 *
+	 * Destroy may be called before or after destructors are called
+	 * on emulated I/O regions, depending on whether a reference is
+	 * held by a vcpu or other kvm component that gets destroyed
+	 * after the emulated I/O.
+	 */
+	void (*destroy)(struct kvm_device *dev);
+
+	int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
+	int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
+	int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
+	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
+		      unsigned long arg);
+};
+
+void kvm_device_get(struct kvm_device *dev);
+void kvm_device_put(struct kvm_device *dev);
+struct kvm_device *kvm_device_from_filp(struct file *filp);
+
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
 static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 74d0ff3..20ce2d2 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -668,6 +668,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_EPR 86
 #define KVM_CAP_ARM_PSCI 87
 #define KVM_CAP_ARM_SET_DEVICE_ADDR 88
+#define KVM_CAP_DEVICE_CTRL 89
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -909,6 +910,32 @@  struct kvm_s390_ucas_mapping {
 #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
 
 /*
+ * Device control API, available with KVM_CAP_DEVICE_CTRL
+ */
+#define KVM_CREATE_DEVICE_TEST		1
+
+struct kvm_create_device {
+	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
+	__u32	fd;	/* out: device handle */
+	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
+};
+
+struct kvm_device_attr {
+	__u32	flags;		/* no flags currently defined */
+	__u32	group;		/* device-defined */
+	__u64	attr;		/* group-defined */
+	__u64	addr;		/* userspace address of attr data */
+};
+
+/* ioctl for vm fd */
+#define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
+
+/* ioctls for fds returned by KVM_CREATE_DEVICE */
+#define KVM_SET_DEVICE_ATTR	  _IOW(KVMIO,  0xe1, struct kvm_device_attr)
+#define KVM_GET_DEVICE_ATTR	  _IOW(KVMIO,  0xe2, struct kvm_device_attr)
+#define KVM_HAS_DEVICE_ATTR	  _IOW(KVMIO,  0xe3, struct kvm_device_attr)
+
+/*
  * ioctls for vcpu fds
  */
 #define KVM_RUN                   _IO(KVMIO,   0x80)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5cc53c9..e2b18af 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2158,6 +2158,117 @@  out:
 }
 #endif
 
+static int kvm_device_ioctl_attr(struct kvm_device *dev,
+				 int (*accessor)(struct kvm_device *dev,
+						 struct kvm_device_attr *attr),
+				 unsigned long arg)
+{
+	struct kvm_device_attr attr;
+
+	if (!accessor)
+		return -EPERM;
+
+	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+		return -EFAULT;
+
+	return accessor(dev, &attr);
+}
+
+static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
+			     unsigned long arg)
+{
+	struct kvm_device *dev = filp->private_data;
+
+	switch (ioctl) {
+	case KVM_SET_DEVICE_ATTR:
+		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
+	case KVM_GET_DEVICE_ATTR:
+		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
+	case KVM_HAS_DEVICE_ATTR:
+		return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
+	default:
+		if (dev->ops->ioctl)
+			return dev->ops->ioctl(dev, ioctl, arg);
+
+		return -ENOTTY;
+	}
+}
+
+void kvm_device_get(struct kvm_device *dev)
+{
+	atomic_inc(&dev->users);
+}
+
+void kvm_device_put(struct kvm_device *dev)
+{
+	if (atomic_dec_and_test(&dev->users))
+		dev->ops->destroy(dev);
+}
+
+static int kvm_device_release(struct inode *inode, struct file *filp)
+{
+	struct kvm_device *dev = filp->private_data;
+	struct kvm *kvm = dev->kvm;
+
+	kvm_device_put(dev);
+	kvm_put_kvm(kvm);
+	return 0;
+}
+
+static const struct file_operations kvm_device_fops = {
+	.unlocked_ioctl = kvm_device_ioctl,
+	.release = kvm_device_release,
+};
+
+struct kvm_device *kvm_device_from_filp(struct file *filp)
+{
+	if (filp->f_op != &kvm_device_fops)
+		return NULL;
+
+	return filp->private_data;
+}
+
+static int kvm_ioctl_create_device(struct kvm *kvm,
+				   struct kvm_create_device *cd)
+{
+	struct kvm_device_ops *ops = NULL;
+	struct kvm_device *dev;
+	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
+	int ret;
+
+	switch (cd->type) {
+	default:
+		return -ENODEV;
+	}
+
+	if (test)
+		return 0;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	dev->ops = ops;
+	dev->kvm = kvm;
+	atomic_set(&dev->users, 1);
+
+	ret = ops->create(dev, cd->type);
+	if (ret < 0) {
+		kfree(dev);
+		return ret;
+	}
+
+	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR);
+	if (ret < 0) {
+		ops->destroy(dev);
+		return ret;
+	}
+
+	kvm_get_kvm(kvm);
+	cd->fd = ret;
+	return 0;
+}
+
 static long kvm_vm_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -2272,6 +2383,24 @@  static long kvm_vm_ioctl(struct file *filp,
 		break;
 	}
 #endif
+	case KVM_CREATE_DEVICE: {
+		struct kvm_create_device cd;
+
+		r = -EFAULT;
+		if (copy_from_user(&cd, argp, sizeof(cd)))
+			goto out;
+
+		r = kvm_ioctl_create_device(kvm, &cd);
+		if (r)
+			goto out;
+
+		r = -EFAULT;
+		if (copy_to_user(argp, &cd, sizeof(cd)))
+			goto out;
+
+		r = 0;
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)