Patchwork [v2,2/2] KVM: s390: add floating irq controller

login
register
mail settings
Submitter Jens Freimann
Date Sept. 6, 2013, 12:19 p.m.
Message ID <1378469957-62290-3-git-send-email-jfrei@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/273172/
State New
Headers show

Comments

Jens Freimann - Sept. 6, 2013, 12:19 p.m.
This patch adds a floating irq controller as a kvm_device.
It will be necessary for migration of floating interrupts as well
as for hardening the reset code by allowing user space to explicitly
remove all pending floating interrupts.

Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
 arch/s390/include/asm/kvm_host.h                |   1 +
 arch/s390/include/uapi/asm/kvm.h                |   5 +
 arch/s390/kvm/interrupt.c                       | 304 ++++++++++++++++++++----
 arch/s390/kvm/kvm-s390.c                        |   1 +
 include/linux/kvm_host.h                        |   1 +
 include/uapi/linux/kvm.h                        |   2 +
 virt/kvm/kvm_main.c                             |   5 +
 8 files changed, 304 insertions(+), 51 deletions(-)
Christian Borntraeger - Sept. 6, 2013, 1:22 p.m.
On 06/09/13 14:19, Jens Freimann wrote:
> This patch adds a floating irq controller as a kvm_device.
> It will be necessary for migration of floating interrupts as well
> as for hardening the reset code by allowing user space to explicitly
> remove all pending floating interrupts.
> 
> Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
> Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>

> ---
>  Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
>  arch/s390/include/asm/kvm_host.h                |   1 +
>  arch/s390/include/uapi/asm/kvm.h                |   5 +
>  arch/s390/kvm/interrupt.c                       | 304 ++++++++++++++++++++----
>  arch/s390/kvm/kvm-s390.c                        |   1 +
>  include/linux/kvm_host.h                        |   1 +
>  include/uapi/linux/kvm.h                        |   2 +
>  virt/kvm/kvm_main.c                             |   5 +
>  8 files changed, 304 insertions(+), 51 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
> new file mode 100644
> index 0000000..06aef31
> --- /dev/null
> +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
> @@ -0,0 +1,36 @@
> +FLIC (floating interrupt controller)
> +====================================
> +
> +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
> +machine check interruptions. All interrupts are stored in a per-vm list of
> +pending interrupts. FLIC performs operations on this list.
> +
> +Only one FLIC instance may be instantiated.
> +
> +FLIC provides support to
> +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
> +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
> +
> +Groups:
> +  KVM_DEV_FLIC_ENQUEUE
> +    Adds one interrupt to the list of pending floating interrupts. Interrupts
> +    are taken from this list for injection into the guest. attr contains
> +    a struct kvm_s390_irq which contains all data relevant for
> +    interrupt injection.
> +    The format of the data structure kvm_s390_irq as it is copied from userspace
> +    is defined in usr/include/linux/kvm.h.
> +    For historic reasons list members are stored in a different data structure, i.e.
> +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
> +    which can then be added to the list.
> +
> +  KVM_DEV_FLIC_DEQUEUE
> +    Takes one element off the pending interrupts list and copies it into userspace.
> +    Dequeued interrupts are not injected into the guest.
> +    attr->addr contains the userspace address of a struct kvm_s390_irq.
> +    List elements are stored in the format of struct kvm_s390_interrupt_info
> +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
> +    (usr/include/linux/kvm.h)
> +
> +  KVM_DEV_FLIC_CLEAR_IRQS
> +    Simply deletes all elements from the list of currently pending floating interrupts.
> +    No interrupts are injected into the guest.
> diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
> index adb05c5..e1cc166 100644
> --- a/arch/s390/include/asm/kvm_host.h
> +++ b/arch/s390/include/asm/kvm_host.h
> @@ -238,6 +238,7 @@ struct kvm_arch{
>  	struct sca_block *sca;
>  	debug_info_t *dbf;
>  	struct kvm_s390_float_interrupt float_int;
> +	struct kvm_device *flic;
>  	struct gmap *gmap;
>  	int css_support;
>  };
> diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
> index d25da59..33d52b8 100644
> --- a/arch/s390/include/uapi/asm/kvm.h
> +++ b/arch/s390/include/uapi/asm/kvm.h
> @@ -16,6 +16,11 @@
> 
>  #define __KVM_S390
> 
> +/* Device control API: s390-specific devices */
> +#define KVM_DEV_FLIC_DEQUEUE 1
> +#define KVM_DEV_FLIC_ENQUEUE 2
> +#define KVM_DEV_FLIC_CLEAR_IRQS 3
> +
>  /* for KVM_GET_REGS and KVM_SET_REGS */
>  struct kvm_regs {
>  	/* general purpose regs for s390 */
> diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
> index 7f35cb3..d6d5e36 100644
> --- a/arch/s390/kvm/interrupt.c
> +++ b/arch/s390/kvm/interrupt.c
> @@ -656,53 +656,86 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
>  	return inti;
>  }
> 
> -int kvm_s390_inject_vm(struct kvm *kvm,
> -		       struct kvm_s390_interrupt *s390int)
> +static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
>  {
>  	struct kvm_s390_local_interrupt *li;
>  	struct kvm_s390_float_interrupt *fi;
> -	struct kvm_s390_interrupt_info *inti, *iter;
> +	struct kvm_s390_interrupt_info *iter;
>  	int sigcpu;
> 
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;
> +	spin_lock(&fi->lock);
> +	if (!is_ioint(inti->type)) {
> +		list_add_tail(&inti->list, &fi->list);
> +	} else {
> +		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> +
> +		/* Keep I/O interrupts sorted in isc order. */
> +		list_for_each_entry(iter, &fi->list, list) {
> +			if (!is_ioint(iter->type))
> +				continue;
> +			if (int_word_to_isc_bits(iter->io.io_int_word)
> +					<= isc_bits)
> +				continue;
> +			break;
> +		}
> +		list_add_tail(&inti->list, &iter->list);
> +	}
> +	atomic_set(&fi->active, 1);
> +	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> +	if (sigcpu == KVM_MAX_VCPUS) {
> +		do {
> +			sigcpu = fi->next_rr_cpu++;
> +			if (sigcpu == KVM_MAX_VCPUS)
> +				sigcpu = fi->next_rr_cpu = 0;
> +		} while (fi->local_int[sigcpu] == NULL);
> +	}
> +	li = fi->local_int[sigcpu];
> +	spin_lock_bh(&li->lock);
> +	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> +	if (waitqueue_active(li->wq))
> +		wake_up_interruptible(li->wq);
> +	spin_unlock_bh(&li->lock);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +}
> +
> +int kvm_s390_inject_vm(struct kvm *kvm,
> +		       struct kvm_s390_interrupt *s390int)
> +{
> +	struct kvm_s390_interrupt_info *inti;
> +
>  	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
>  	if (!inti)
>  		return -ENOMEM;
> 
> -	switch (s390int->type) {
> +	inti->type = s390int->type;
> +	switch (inti->type) {
>  	case KVM_S390_INT_VIRTIO:
>  		VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
>  			 s390int->parm, s390int->parm64);
> -		inti->type = s390int->type;
>  		inti->ext.ext_params = s390int->parm;
>  		inti->ext.ext_params2 = s390int->parm64;
>  		break;
>  	case KVM_S390_INT_SERVICE:
>  		VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
> -		inti->type = s390int->type;
>  		inti->ext.ext_params = s390int->parm;
>  		break;
> -	case KVM_S390_PROGRAM_INT:
> -	case KVM_S390_SIGP_STOP:
> -	case KVM_S390_INT_EXTERNAL_CALL:
> -	case KVM_S390_INT_EMERGENCY:
> -		kfree(inti);
> -		return -EINVAL;
>  	case KVM_S390_MCHK:
>  		VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
>  			 s390int->parm64);
> -		inti->type = s390int->type;
>  		inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
>  		inti->mchk.mcic = s390int->parm64;
>  		break;
>  	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> -		if (s390int->type & IOINT_AI_MASK)
> +		if (inti->type & IOINT_AI_MASK)
>  			VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
>  		else
>  			VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
>  				 s390int->type & IOINT_CSSID_MASK,
>  				 s390int->type & IOINT_SSID_MASK,
>  				 s390int->type & IOINT_SCHID_MASK);
> -		inti->type = s390int->type;
>  		inti->io.subchannel_id = s390int->parm >> 16;
>  		inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
>  		inti->io.io_int_parm = s390int->parm64 >> 32;
> @@ -715,42 +748,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
>  	trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
>  				 2);
> 
> -	mutex_lock(&kvm->lock);
> -	fi = &kvm->arch.float_int;
> -	spin_lock(&fi->lock);
> -	if (!is_ioint(inti->type))
> -		list_add_tail(&inti->list, &fi->list);
> -	else {
> -		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> -
> -		/* Keep I/O interrupts sorted in isc order. */
> -		list_for_each_entry(iter, &fi->list, list) {
> -			if (!is_ioint(iter->type))
> -				continue;
> -			if (int_word_to_isc_bits(iter->io.io_int_word)
> -			    <= isc_bits)
> -				continue;
> -			break;
> -		}
> -		list_add_tail(&inti->list, &iter->list);
> -	}
> -	atomic_set(&fi->active, 1);
> -	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> -	if (sigcpu == KVM_MAX_VCPUS) {
> -		do {
> -			sigcpu = fi->next_rr_cpu++;
> -			if (sigcpu == KVM_MAX_VCPUS)
> -				sigcpu = fi->next_rr_cpu = 0;
> -		} while (fi->local_int[sigcpu] == NULL);
> -	}
> -	li = fi->local_int[sigcpu];
> -	spin_lock_bh(&li->lock);
> -	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> -	if (waitqueue_active(li->wq))
> -		wake_up_interruptible(li->wq);
> -	spin_unlock_bh(&li->lock);
> -	spin_unlock(&fi->lock);
> -	mutex_unlock(&kvm->lock);
> +	__inject_vm(kvm, inti);
>  	return 0;
>  }
> 
> @@ -838,3 +836,207 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
>  	mutex_unlock(&vcpu->kvm->lock);
>  	return 0;
>  }
> +
> +static void clear_floating_interrupts(struct kvm *kvm)
> +{
> +	struct kvm_s390_float_interrupt *fi;
> +	struct kvm_s390_interrupt_info	*n, *inti = NULL;
> +
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;
> +	spin_lock(&fi->lock);
> +	list_for_each_entry_safe(inti, n, &fi->list, list) {
> +		list_del(&inti->list);
> +		kfree(inti);
> +	}
> +	atomic_set(&fi->active, 0);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +}
> +
> +static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
> +				   u64 addr)
> +{
> +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> +	void __user *target;
> +	void *source;
> +	u64 size;
> +	int r = 0;
> +
> +	switch (inti->type) {
> +	case KVM_S390_INT_VIRTIO:
> +	case KVM_S390_INT_SERVICE:
> +		source = &inti->ext;
> +		target = &uptr->ext;
> +		size = sizeof(inti->ext);
> +		break;
> +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> +		source = &inti->io;
> +		target = &uptr->io;
> +		size = sizeof(inti->io);
> +		break;
> +	case KVM_S390_MCHK:
> +		source = &inti->mchk;
> +		target = &uptr->mchk;
> +		size = sizeof(inti->mchk);
> +		break;
> +	case KVM_S390_INT_MAX:
> +		goto out;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	r = put_user(inti->type, (u64 __user *) &uptr->type);
> +	if (copy_to_user(target, source, size))
> +		r = -EFAULT;
> +
> +out:
> +	return r;
> +}
> +
> +static int dequeue_floating_irq(struct kvm *kvm, __u64 addr)
> +{
> +	struct kvm_s390_interrupt_info *inti;
> +	struct kvm_s390_float_interrupt *fi;
> +	int r = 0;
> +
> +
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;
> +	spin_lock(&fi->lock);
> +	if (list_empty(&fi->list)) {
> +		mutex_unlock(&kvm->lock);
> +		spin_unlock(&fi->lock);
> +		return -ENODATA;
> +	}
> +	inti = list_first_entry(&fi->list,
> +			struct kvm_s390_interrupt_info, list);
> +	list_del(&inti->list);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +
> +	r = copy_irq_to_user(inti, addr);
> +
> +	kfree(inti);
> +	return r;
> +}
> +
> +static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> +	int r;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_FLIC_DEQUEUE:
> +		r = dequeue_floating_irq(dev->kvm, attr->addr);
> +		break;
> +	default:
> +		r = -EINVAL;
> +	}
> +
> +	return r;
> +}
> +
> +static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
> +				     u64 addr)
> +{
> +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> +	void *target = NULL;
> +	void *source;
> +	u64 size;
> +	int r = 0;
> +
> +	if (get_user(inti->type, (u64 __user *)addr))
> +		return -EFAULT;
> +	switch (inti->type) {
> +	case KVM_S390_INT_VIRTIO:
> +	case KVM_S390_INT_SERVICE:
> +		target = (void *) &inti->ext;
> +		source = &uptr->ext;
> +		size = sizeof(inti->ext);
> +		break;
> +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> +		target = (void *) &inti->io;
> +		source = &uptr->io;
> +		size = sizeof(inti->io);
> +		break;
> +	case KVM_S390_MCHK:
> +		target = (void *) &inti->mchk;
> +		source = &uptr->mchk;
> +		size = sizeof(inti->mchk);
> +		break;
> +	case KVM_S390_INT_MAX:
> +		goto out;
> +	default:
> +		r = -EINVAL;
> +		return r;
> +	}
> +
> +	if (copy_from_user(target, source, size))
> +		r = -EFAULT;
> +
> +out:
> +	return r;
> +}
> +
> +static int enqueue_floating_irq(struct kvm_device *dev,
> +				 struct kvm_device_attr *attr)
> +{
> +	struct kvm_s390_interrupt_info *inti = NULL;
> +	int r = 0;
> +
> +	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
> +	if (!inti)
> +		return -ENOMEM;
> +
> +	r = copy_irq_from_user(inti, attr->addr);
> +	if (r) {
> +		kfree(inti);
> +		return r;
> +	}
> +	__inject_vm(dev->kvm, inti);
> +
> +	return r;
> +}
> +
> +static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> +	int r = 0;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_FLIC_ENQUEUE:
> +		r = enqueue_floating_irq(dev, attr);
> +		break;
> +	case KVM_DEV_FLIC_CLEAR_IRQS:
> +		r = 0;
> +		clear_floating_interrupts(dev->kvm);
> +		break;
> +	default:
> +		r = -EINVAL;
> +	}
> +
> +	return r;
> +}
> +
> +static int flic_create(struct kvm_device *dev, u32 type)
> +{
> +	if (!dev)
> +		return -EINVAL;
> +	if (dev->kvm->arch.flic)
> +		return -EINVAL;
> +	dev->kvm->arch.flic = dev;
> +	return 0;
> +}
> +
> +static void flic_destroy(struct kvm_device *dev)
> +{
> +	dev->kvm->arch.flic = NULL;
> +}
> +
> +/* s390 floating irq controller (flic) */
> +struct kvm_device_ops kvm_flic_ops = {
> +	.name = "kvm-flic",
> +	.get_attr = flic_get_attr,
> +	.set_attr = flic_set_attr,
> +	.create = flic_create,
> +	.destroy = flic_destroy,
> +};
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 776dafe..760febf 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -157,6 +157,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_ENABLE_CAP:
>  	case KVM_CAP_S390_CSS_SUPPORT:
>  	case KVM_CAP_IOEVENTFD:
> +	case KVM_CAP_DEVICE_CTRL:
>  		r = 1;
>  		break;
>  	case KVM_CAP_NR_VCPUS:
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ca645a0..e89304d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1065,6 +1065,7 @@ struct kvm_device *kvm_device_from_filp(struct file *filp);
> 
>  extern struct kvm_device_ops kvm_mpic_ops;
>  extern struct kvm_device_ops kvm_xics_ops;
> +extern struct kvm_device_ops kvm_flic_ops;
> 
>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> 
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index eeb08a1..9c8b457 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -426,6 +426,7 @@ struct kvm_s390_psw {
>  	 ((ai) << 26))
>  #define KVM_S390_INT_IO_MIN		0x00000000u
>  #define KVM_S390_INT_IO_MAX		0xfffdffffu
> +#define KVM_S390_INT_MAX		0xffffffffu
> 
> 
>  struct kvm_s390_interrupt {
> @@ -906,6 +907,7 @@ struct kvm_device_attr {
>  #define KVM_DEV_TYPE_FSL_MPIC_20	1
>  #define KVM_DEV_TYPE_FSL_MPIC_42	2
>  #define KVM_DEV_TYPE_XICS		3
> +#define KVM_DEV_TYPE_FLIC		4
> 
>  /*
>   * ioctls for VM fds
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index bf040c4..79a2ecc 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2265,6 +2265,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
>  		ops = &kvm_xics_ops;
>  		break;
>  #endif
> +#ifdef CONFIG_S390
> +	case KVM_DEV_TYPE_FLIC:
> +		ops = &kvm_flic_ops;
> +		break;
> +#endif
>  	default:
>  		return -ENODEV;
>  	}
>
Alexander Graf - Oct. 4, 2013, 11:53 p.m.
On 06.09.2013, at 14:19, Jens Freimann wrote:

> This patch adds a floating irq controller as a kvm_device.
> It will be necessary for migration of floating interrupts as well
> as for hardening the reset code by allowing user space to explicitly
> remove all pending floating interrupts.
> 
> Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
> Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
> ---
> Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
> arch/s390/include/asm/kvm_host.h                |   1 +
> arch/s390/include/uapi/asm/kvm.h                |   5 +
> arch/s390/kvm/interrupt.c                       | 304 ++++++++++++++++++++----
> arch/s390/kvm/kvm-s390.c                        |   1 +
> include/linux/kvm_host.h                        |   1 +
> include/uapi/linux/kvm.h                        |   2 +
> virt/kvm/kvm_main.c                             |   5 +
> 8 files changed, 304 insertions(+), 51 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
> new file mode 100644
> index 0000000..06aef31
> --- /dev/null
> +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
> @@ -0,0 +1,36 @@
> +FLIC (floating interrupt controller)
> +====================================
> +
> +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
> +machine check interruptions. All interrupts are stored in a per-vm list of
> +pending interrupts. FLIC performs operations on this list.
> +
> +Only one FLIC instance may be instantiated.
> +
> +FLIC provides support to
> +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
> +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
> +
> +Groups:
> +  KVM_DEV_FLIC_ENQUEUE
> +    Adds one interrupt to the list of pending floating interrupts. Interrupts
> +    are taken from this list for injection into the guest. attr contains
> +    a struct kvm_s390_irq which contains all data relevant for
> +    interrupt injection.
> +    The format of the data structure kvm_s390_irq as it is copied from userspace
> +    is defined in usr/include/linux/kvm.h.
> +    For historic reasons list members are stored in a different data structure, i.e.
> +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
> +    which can then be added to the list.
> +
> +  KVM_DEV_FLIC_DEQUEUE
> +    Takes one element off the pending interrupts list and copies it into userspace.
> +    Dequeued interrupts are not injected into the guest.
> +    attr->addr contains the userspace address of a struct kvm_s390_irq.
> +    List elements are stored in the format of struct kvm_s390_interrupt_info
> +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
> +    (usr/include/linux/kvm.h)
> +
> +  KVM_DEV_FLIC_CLEAR_IRQS
> +    Simply deletes all elements from the list of currently pending floating interrupts.
> +    No interrupts are injected into the guest.
> diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
> index adb05c5..e1cc166 100644
> --- a/arch/s390/include/asm/kvm_host.h
> +++ b/arch/s390/include/asm/kvm_host.h
> @@ -238,6 +238,7 @@ struct kvm_arch{
> 	struct sca_block *sca;
> 	debug_info_t *dbf;
> 	struct kvm_s390_float_interrupt float_int;
> +	struct kvm_device *flic;
> 	struct gmap *gmap;
> 	int css_support;
> };
> diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
> index d25da59..33d52b8 100644
> --- a/arch/s390/include/uapi/asm/kvm.h
> +++ b/arch/s390/include/uapi/asm/kvm.h
> @@ -16,6 +16,11 @@
> 
> #define __KVM_S390
> 
> +/* Device control API: s390-specific devices */
> +#define KVM_DEV_FLIC_DEQUEUE 1
> +#define KVM_DEV_FLIC_ENQUEUE 2
> +#define KVM_DEV_FLIC_CLEAR_IRQS 3
> +
> /* for KVM_GET_REGS and KVM_SET_REGS */
> struct kvm_regs {
> 	/* general purpose regs for s390 */
> diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
> index 7f35cb3..d6d5e36 100644
> --- a/arch/s390/kvm/interrupt.c
> +++ b/arch/s390/kvm/interrupt.c
> @@ -656,53 +656,86 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
> 	return inti;
> }
> 
> -int kvm_s390_inject_vm(struct kvm *kvm,
> -		       struct kvm_s390_interrupt *s390int)
> +static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)

This really doesn't only inject, it enqueues an interrupt and also injects them then, right?

> {
> 	struct kvm_s390_local_interrupt *li;
> 	struct kvm_s390_float_interrupt *fi;
> -	struct kvm_s390_interrupt_info *inti, *iter;
> +	struct kvm_s390_interrupt_info *iter;
> 	int sigcpu;
> 
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;

You probably want to move this into your device structure eventually.

> +	spin_lock(&fi->lock);
> +	if (!is_ioint(inti->type)) {
> +		list_add_tail(&inti->list, &fi->list);
> +	} else {
> +		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> +
> +		/* Keep I/O interrupts sorted in isc order. */
> +		list_for_each_entry(iter, &fi->list, list) {
> +			if (!is_ioint(iter->type))
> +				continue;
> +			if (int_word_to_isc_bits(iter->io.io_int_word)
> +					<= isc_bits)
> +				continue;
> +			break;
> +		}
> +		list_add_tail(&inti->list, &iter->list);
> +	}
> +	atomic_set(&fi->active, 1);
> +	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> +	if (sigcpu == KVM_MAX_VCPUS) {
> +		do {
> +			sigcpu = fi->next_rr_cpu++;
> +			if (sigcpu == KVM_MAX_VCPUS)
> +				sigcpu = fi->next_rr_cpu = 0;
> +		} while (fi->local_int[sigcpu] == NULL);
> +	}
> +	li = fi->local_int[sigcpu];
> +	spin_lock_bh(&li->lock);
> +	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> +	if (waitqueue_active(li->wq))
> +		wake_up_interruptible(li->wq);

Does this kick the other vcpu to notify it that an irq is available?

The code is very spaghetti style. There's certainly a good opportunity to split things up and make it more readable here ;). I think splitting it into "enqueue" and "target cpu deliver" parts makes a lot of sense for starters.

Btw, usually the way these interrupt controllers work in KVM is that the CPU local interrupt controller part "fetches" interrupts from the floating interrupt controller. So in here you would only enqueue and maybe kick a potential receiver of the interrupt. Before a vcpu goes back into guest state, it asks the floating interrupt controller whether there's anything pending for it. If so, it delivers it.

That way even if the vcpu you chose here happens to get preempted, you still deliver the interrupt quickly through another vcpu.

> +	spin_unlock_bh(&li->lock);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +}
> +
> +int kvm_s390_inject_vm(struct kvm *kvm,
> +		       struct kvm_s390_interrupt *s390int)
> +{
> +	struct kvm_s390_interrupt_info *inti;
> +
> 	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
> 	if (!inti)
> 		return -ENOMEM;
> 
> -	switch (s390int->type) {
> +	inti->type = s390int->type;
> +	switch (inti->type) {
> 	case KVM_S390_INT_VIRTIO:
> 		VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
> 			 s390int->parm, s390int->parm64);
> -		inti->type = s390int->type;
> 		inti->ext.ext_params = s390int->parm;
> 		inti->ext.ext_params2 = s390int->parm64;
> 		break;
> 	case KVM_S390_INT_SERVICE:
> 		VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
> -		inti->type = s390int->type;
> 		inti->ext.ext_params = s390int->parm;
> 		break;
> -	case KVM_S390_PROGRAM_INT:
> -	case KVM_S390_SIGP_STOP:
> -	case KVM_S390_INT_EXTERNAL_CALL:
> -	case KVM_S390_INT_EMERGENCY:
> -		kfree(inti);
> -		return -EINVAL;
> 	case KVM_S390_MCHK:
> 		VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
> 			 s390int->parm64);
> -		inti->type = s390int->type;
> 		inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
> 		inti->mchk.mcic = s390int->parm64;
> 		break;
> 	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> -		if (s390int->type & IOINT_AI_MASK)
> +		if (inti->type & IOINT_AI_MASK)
> 			VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
> 		else
> 			VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
> 				 s390int->type & IOINT_CSSID_MASK,
> 				 s390int->type & IOINT_SSID_MASK,
> 				 s390int->type & IOINT_SCHID_MASK);
> -		inti->type = s390int->type;
> 		inti->io.subchannel_id = s390int->parm >> 16;
> 		inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
> 		inti->io.io_int_parm = s390int->parm64 >> 32;
> @@ -715,42 +748,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
> 	trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
> 				 2);
> 
> -	mutex_lock(&kvm->lock);
> -	fi = &kvm->arch.float_int;
> -	spin_lock(&fi->lock);
> -	if (!is_ioint(inti->type))
> -		list_add_tail(&inti->list, &fi->list);
> -	else {
> -		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> -
> -		/* Keep I/O interrupts sorted in isc order. */
> -		list_for_each_entry(iter, &fi->list, list) {
> -			if (!is_ioint(iter->type))
> -				continue;
> -			if (int_word_to_isc_bits(iter->io.io_int_word)
> -			    <= isc_bits)
> -				continue;
> -			break;
> -		}
> -		list_add_tail(&inti->list, &iter->list);
> -	}
> -	atomic_set(&fi->active, 1);
> -	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> -	if (sigcpu == KVM_MAX_VCPUS) {
> -		do {
> -			sigcpu = fi->next_rr_cpu++;
> -			if (sigcpu == KVM_MAX_VCPUS)
> -				sigcpu = fi->next_rr_cpu = 0;
> -		} while (fi->local_int[sigcpu] == NULL);
> -	}
> -	li = fi->local_int[sigcpu];
> -	spin_lock_bh(&li->lock);
> -	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> -	if (waitqueue_active(li->wq))
> -		wake_up_interruptible(li->wq);
> -	spin_unlock_bh(&li->lock);
> -	spin_unlock(&fi->lock);
> -	mutex_unlock(&kvm->lock);

... but as it's mostly copy&paste from here I wouldn't mind if you do the cleanup in a follow-up patch.

> +	__inject_vm(kvm, inti);
> 	return 0;
> }
> 
> @@ -838,3 +836,207 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
> 	mutex_unlock(&vcpu->kvm->lock);
> 	return 0;
> }
> +
> +static void clear_floating_interrupts(struct kvm *kvm)

This should eventually only require device scope, no?

> +{
> +	struct kvm_s390_float_interrupt *fi;
> +	struct kvm_s390_interrupt_info	*n, *inti = NULL;
> +
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;
> +	spin_lock(&fi->lock);
> +	list_for_each_entry_safe(inti, n, &fi->list, list) {
> +		list_del(&inti->list);
> +		kfree(inti);
> +	}
> +	atomic_set(&fi->active, 0);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +}
> +
> +static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
> +				   u64 addr)
> +{
> +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> +	void __user *target;
> +	void *source;
> +	u64 size;
> +	int r = 0;
> +
> +	switch (inti->type) {
> +	case KVM_S390_INT_VIRTIO:
> +	case KVM_S390_INT_SERVICE:
> +		source = &inti->ext;
> +		target = &uptr->ext;
> +		size = sizeof(inti->ext);
> +		break;
> +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> +		source = &inti->io;
> +		target = &uptr->io;
> +		size = sizeof(inti->io);
> +		break;
> +	case KVM_S390_MCHK:
> +		source = &inti->mchk;
> +		target = &uptr->mchk;
> +		size = sizeof(inti->mchk);
> +		break;
> +	case KVM_S390_INT_MAX:
> +		goto out;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	r = put_user(inti->type, (u64 __user *) &uptr->type);
> +	if (copy_to_user(target, source, size))
> +		r = -EFAULT;
> +
> +out:
> +	return r;
> +}
> +
> +static int dequeue_floating_irq(struct kvm *kvm, __u64 addr)

I don't understand the purpose of this function. Why would you want to dequeue (and obtain all information about) a floating interrupt without checking for its type? In a normal interrupt controller world you would say "lower the line for interrupt line X now".  I'm missing that "interrupt line X" parameter.


Alex

> +{
> +	struct kvm_s390_interrupt_info *inti;
> +	struct kvm_s390_float_interrupt *fi;
> +	int r = 0;
> +
> +
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;
> +	spin_lock(&fi->lock);
> +	if (list_empty(&fi->list)) {
> +		mutex_unlock(&kvm->lock);
> +		spin_unlock(&fi->lock);
> +		return -ENODATA;
> +	}
> +	inti = list_first_entry(&fi->list,
> +			struct kvm_s390_interrupt_info, list);
> +	list_del(&inti->list);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +
> +	r = copy_irq_to_user(inti, addr);
> +
> +	kfree(inti);
> +	return r;
> +}
> +
> +static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> +	int r;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_FLIC_DEQUEUE:
> +		r = dequeue_floating_irq(dev->kvm, attr->addr);
> +		break;
> +	default:
> +		r = -EINVAL;
> +	}
> +
> +	return r;
> +}
> +
> +static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
> +				     u64 addr)
> +{
> +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> +	void *target = NULL;
> +	void *source;
> +	u64 size;
> +	int r = 0;
> +
> +	if (get_user(inti->type, (u64 __user *)addr))
> +		return -EFAULT;
> +	switch (inti->type) {
> +	case KVM_S390_INT_VIRTIO:
> +	case KVM_S390_INT_SERVICE:
> +		target = (void *) &inti->ext;
> +		source = &uptr->ext;
> +		size = sizeof(inti->ext);
> +		break;
> +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> +		target = (void *) &inti->io;
> +		source = &uptr->io;
> +		size = sizeof(inti->io);
> +		break;
> +	case KVM_S390_MCHK:
> +		target = (void *) &inti->mchk;
> +		source = &uptr->mchk;
> +		size = sizeof(inti->mchk);
> +		break;
> +	case KVM_S390_INT_MAX:
> +		goto out;
> +	default:
> +		r = -EINVAL;
> +		return r;
> +	}
> +
> +	if (copy_from_user(target, source, size))
> +		r = -EFAULT;
> +
> +out:
> +	return r;
> +}
> +
> +static int enqueue_floating_irq(struct kvm_device *dev,
> +				 struct kvm_device_attr *attr)
> +{
> +	struct kvm_s390_interrupt_info *inti = NULL;
> +	int r = 0;
> +
> +	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
> +	if (!inti)
> +		return -ENOMEM;
> +
> +	r = copy_irq_from_user(inti, attr->addr);
> +	if (r) {
> +		kfree(inti);
> +		return r;
> +	}
> +	__inject_vm(dev->kvm, inti);
> +
> +	return r;
> +}
> +
> +static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> +	int r = 0;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_FLIC_ENQUEUE:
> +		r = enqueue_floating_irq(dev, attr);
> +		break;
> +	case KVM_DEV_FLIC_CLEAR_IRQS:
> +		r = 0;
> +		clear_floating_interrupts(dev->kvm);
> +		break;
> +	default:
> +		r = -EINVAL;
> +	}
> +
> +	return r;
> +}
> +
> +static int flic_create(struct kvm_device *dev, u32 type)
> +{
> +	if (!dev)
> +		return -EINVAL;
> +	if (dev->kvm->arch.flic)
> +		return -EINVAL;
> +	dev->kvm->arch.flic = dev;
> +	return 0;
> +}
> +
> +static void flic_destroy(struct kvm_device *dev)
> +{
> +	dev->kvm->arch.flic = NULL;
> +}
> +
> +/* s390 floating irq controller (flic) */
> +struct kvm_device_ops kvm_flic_ops = {
> +	.name = "kvm-flic",
> +	.get_attr = flic_get_attr,
> +	.set_attr = flic_set_attr,
> +	.create = flic_create,
> +	.destroy = flic_destroy,
> +};
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 776dafe..760febf 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -157,6 +157,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> 	case KVM_CAP_ENABLE_CAP:
> 	case KVM_CAP_S390_CSS_SUPPORT:
> 	case KVM_CAP_IOEVENTFD:
> +	case KVM_CAP_DEVICE_CTRL:
> 		r = 1;
> 		break;
> 	case KVM_CAP_NR_VCPUS:
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ca645a0..e89304d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1065,6 +1065,7 @@ struct kvm_device *kvm_device_from_filp(struct file *filp);
> 
> extern struct kvm_device_ops kvm_mpic_ops;
> extern struct kvm_device_ops kvm_xics_ops;
> +extern struct kvm_device_ops kvm_flic_ops;
> 
> #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> 
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index eeb08a1..9c8b457 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -426,6 +426,7 @@ struct kvm_s390_psw {
> 	 ((ai) << 26))
> #define KVM_S390_INT_IO_MIN		0x00000000u
> #define KVM_S390_INT_IO_MAX		0xfffdffffu
> +#define KVM_S390_INT_MAX		0xffffffffu
> 
> 
> struct kvm_s390_interrupt {
> @@ -906,6 +907,7 @@ struct kvm_device_attr {
> #define KVM_DEV_TYPE_FSL_MPIC_20	1
> #define KVM_DEV_TYPE_FSL_MPIC_42	2
> #define KVM_DEV_TYPE_XICS		3
> +#define KVM_DEV_TYPE_FLIC		4
> 
> /*
>  * ioctls for VM fds
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index bf040c4..79a2ecc 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2265,6 +2265,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
> 		ops = &kvm_xics_ops;
> 		break;
> #endif
> +#ifdef CONFIG_S390
> +	case KVM_DEV_TYPE_FLIC:
> +		ops = &kvm_flic_ops;
> +		break;
> +#endif
> 	default:
> 		return -ENODEV;
> 	}
> -- 
> 1.8.3.4
>
Jens Freimann - Oct. 8, 2013, 10:38 a.m.
On Sat, Oct 05, 2013 at 01:53:33AM +0200, Alexander Graf wrote:
> 
> On 06.09.2013, at 14:19, Jens Freimann wrote:
> 
[snip] 
> > -int kvm_s390_inject_vm(struct kvm *kvm,
> > -		       struct kvm_s390_interrupt *s390int)
> > +static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
> 
> This really doesn't only inject, it enqueues an interrupt and also injects them then, right?

hmm, maybe the term injecting is misleading. What is meant by injecting here is simply adding
it to the list. The actual "injection" into the guest lowcore is done by the do_deliver_interrupt
function.

> > {
> > 	struct kvm_s390_local_interrupt *li;
> > 	struct kvm_s390_float_interrupt *fi;
> > -	struct kvm_s390_interrupt_info *inti, *iter;
> > +	struct kvm_s390_interrupt_info *iter;
> > 	int sigcpu;
> > 
> > +	mutex_lock(&kvm->lock);
> > +	fi = &kvm->arch.float_int;
> 
> You probably want to move this into your device structure eventually.

Yes, good point.
 
> > +	spin_lock(&fi->lock);
> > +	if (!is_ioint(inti->type)) {
> > +		list_add_tail(&inti->list, &fi->list);
> > +	} else {
> > +		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> > +
> > +		/* Keep I/O interrupts sorted in isc order. */
> > +		list_for_each_entry(iter, &fi->list, list) {
> > +			if (!is_ioint(iter->type))
> > +				continue;
> > +			if (int_word_to_isc_bits(iter->io.io_int_word)
> > +					<= isc_bits)
> > +				continue;
> > +			break;
> > +		}
> > +		list_add_tail(&inti->list, &iter->list);
> > +	}
> > +	atomic_set(&fi->active, 1);
> > +	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> > +	if (sigcpu == KVM_MAX_VCPUS) {
> > +		do {
> > +			sigcpu = fi->next_rr_cpu++;
> > +			if (sigcpu == KVM_MAX_VCPUS)
> > +				sigcpu = fi->next_rr_cpu = 0;
> > +		} while (fi->local_int[sigcpu] == NULL);
> > +	}
> > +	li = fi->local_int[sigcpu];
> > +	spin_lock_bh(&li->lock);
> > +	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> > +	if (waitqueue_active(li->wq))
> > +		wake_up_interruptible(li->wq);
> 
> Does this kick the other vcpu to notify it that an irq is available?
> 
> The code is very spaghetti style. There's certainly a good opportunity to split things up and make it more readable here ;). I think splitting it into "enqueue" and "target cpu deliver" parts makes a lot of sense for starters.

I'm working on a larger patchset that changes interrupt injection and delivery.
The rationale is to make it more readable, use less locking (get rid of lists 
use bitmaps) and become more PoP-compliant. I'll try to make it more obvious.
 
> Btw, usually the way these interrupt controllers work in KVM is that the CPU local interrupt controller part "fetches" interrupts from the floating interrupt controller. So in here you would only enqueue and maybe kick a potential receiver of the interrupt. Before a vcpu goes back into guest state, it asks the floating interrupt controller whether there's anything pending for it. If so, it delivers it.

I think that's kind of what we're doing here. We enqueue by adding it to our list
of pending interrupts, look for an idle cpu, set the external interrupt flag 
and wake it up. 

> 
> That way even if the vcpu you chose here happens to get preempted, you still deliver the interrupt quickly through another vcpu.
> 
> > +	spin_unlock_bh(&li->lock);
> > +	spin_unlock(&fi->lock);
> > +	mutex_unlock(&kvm->lock);
> > +}
> > +
> > +int kvm_s390_inject_vm(struct kvm *kvm,
> > +		       struct kvm_s390_interrupt *s390int)
> > +{
> > +	struct kvm_s390_interrupt_info *inti;
> > +
> > 	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
> > 	if (!inti)
> > 		return -ENOMEM;
> > 
> > -	switch (s390int->type) {
> > +	inti->type = s390int->type;
> > +	switch (inti->type) {
> > 	case KVM_S390_INT_VIRTIO:
> > 		VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
> > 			 s390int->parm, s390int->parm64);
> > -		inti->type = s390int->type;
> > 		inti->ext.ext_params = s390int->parm;
> > 		inti->ext.ext_params2 = s390int->parm64;
> > 		break;
> > 	case KVM_S390_INT_SERVICE:
> > 		VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
> > -		inti->type = s390int->type;
> > 		inti->ext.ext_params = s390int->parm;
> > 		break;
> > -	case KVM_S390_PROGRAM_INT:
> > -	case KVM_S390_SIGP_STOP:
> > -	case KVM_S390_INT_EXTERNAL_CALL:
> > -	case KVM_S390_INT_EMERGENCY:
> > -		kfree(inti);
> > -		return -EINVAL;
> > 	case KVM_S390_MCHK:
> > 		VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
> > 			 s390int->parm64);
> > -		inti->type = s390int->type;
> > 		inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
> > 		inti->mchk.mcic = s390int->parm64;
> > 		break;
> > 	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> > -		if (s390int->type & IOINT_AI_MASK)
> > +		if (inti->type & IOINT_AI_MASK)
> > 			VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
> > 		else
> > 			VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
> > 				 s390int->type & IOINT_CSSID_MASK,
> > 				 s390int->type & IOINT_SSID_MASK,
> > 				 s390int->type & IOINT_SCHID_MASK);
> > -		inti->type = s390int->type;
> > 		inti->io.subchannel_id = s390int->parm >> 16;
> > 		inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
> > 		inti->io.io_int_parm = s390int->parm64 >> 32;
> > @@ -715,42 +748,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
> > 	trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
> > 				 2);
> > 
> > -	mutex_lock(&kvm->lock);
> > -	fi = &kvm->arch.float_int;
> > -	spin_lock(&fi->lock);
> > -	if (!is_ioint(inti->type))
> > -		list_add_tail(&inti->list, &fi->list);
> > -	else {
> > -		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> > -
> > -		/* Keep I/O interrupts sorted in isc order. */
> > -		list_for_each_entry(iter, &fi->list, list) {
> > -			if (!is_ioint(iter->type))
> > -				continue;
> > -			if (int_word_to_isc_bits(iter->io.io_int_word)
> > -			    <= isc_bits)
> > -				continue;
> > -			break;
> > -		}
> > -		list_add_tail(&inti->list, &iter->list);
> > -	}
> > -	atomic_set(&fi->active, 1);
> > -	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> > -	if (sigcpu == KVM_MAX_VCPUS) {
> > -		do {
> > -			sigcpu = fi->next_rr_cpu++;
> > -			if (sigcpu == KVM_MAX_VCPUS)
> > -				sigcpu = fi->next_rr_cpu = 0;
> > -		} while (fi->local_int[sigcpu] == NULL);
> > -	}
> > -	li = fi->local_int[sigcpu];
> > -	spin_lock_bh(&li->lock);
> > -	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> > -	if (waitqueue_active(li->wq))
> > -		wake_up_interruptible(li->wq);
> > -	spin_unlock_bh(&li->lock);
> > -	spin_unlock(&fi->lock);
> > -	mutex_unlock(&kvm->lock);
> 
> ... but as it's mostly copy&paste from here I wouldn't mind if you do the cleanup in a follow-up patch.
> 
> > +	__inject_vm(kvm, inti);
> > 	return 0;
> > }
> > 
> > @@ -838,3 +836,207 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
> > 	mutex_unlock(&vcpu->kvm->lock);
> > 	return 0;
> > }
> > +
> > +static void clear_floating_interrupts(struct kvm *kvm)
> 
> This should eventually only require device scope, no?

Yes, will change that. 
> > +{
> > +	struct kvm_s390_float_interrupt *fi;
> > +	struct kvm_s390_interrupt_info	*n, *inti = NULL;
> > +
> > +	mutex_lock(&kvm->lock);
> > +	fi = &kvm->arch.float_int;
> > +	spin_lock(&fi->lock);
> > +	list_for_each_entry_safe(inti, n, &fi->list, list) {
> > +		list_del(&inti->list);
> > +		kfree(inti);
> > +	}
> > +	atomic_set(&fi->active, 0);
> > +	spin_unlock(&fi->lock);
> > +	mutex_unlock(&kvm->lock);
> > +}
> > +
> > +static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
> > +				   u64 addr)
> > +{
> > +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> > +	void __user *target;
> > +	void *source;
> > +	u64 size;
> > +	int r = 0;
> > +
> > +	switch (inti->type) {
> > +	case KVM_S390_INT_VIRTIO:
> > +	case KVM_S390_INT_SERVICE:
> > +		source = &inti->ext;
> > +		target = &uptr->ext;
> > +		size = sizeof(inti->ext);
> > +		break;
> > +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> > +		source = &inti->io;
> > +		target = &uptr->io;
> > +		size = sizeof(inti->io);
> > +		break;
> > +	case KVM_S390_MCHK:
> > +		source = &inti->mchk;
> > +		target = &uptr->mchk;
> > +		size = sizeof(inti->mchk);
> > +		break;
> > +	case KVM_S390_INT_MAX:
> > +		goto out;
> > +	default:
> > +		return -EINVAL;
> > +	}
> > +
> > +	r = put_user(inti->type, (u64 __user *) &uptr->type);
> > +	if (copy_to_user(target, source, size))
> > +		r = -EFAULT;
> > +
> > +out:
> > +	return r;
> > +}
> > +
> > +static int dequeue_floating_irq(struct kvm *kvm, __u64 addr)
> 
> I don't understand the purpose of this function. Why would you want to dequeue (and obtain all information about) a floating interrupt without checking for its type? 

Again, I guess the name "dequeue" is misleading. It simply means to take an 
interrupt off the list. This function is currently used for migration only. 

> In a normal interrupt controller world you would say "lower the line for interrupt line X now".  I'm missing that "interrupt line X" parameter.

No "interrupt lines" in s390. We "disable" interrupts by turning off bits
in the PSW or control registers. But this function has nothing to do with that. 

I'll try to make the naming more obvious with my upcoming patchset.


Thanks for the review!

regards
Jens


> 
> 
> Alex
> 
> > +{
> > +	struct kvm_s390_interrupt_info *inti;
> > +	struct kvm_s390_float_interrupt *fi;
> > +	int r = 0;
> > +
> > +
> > +	mutex_lock(&kvm->lock);
> > +	fi = &kvm->arch.float_int;
> > +	spin_lock(&fi->lock);
> > +	if (list_empty(&fi->list)) {
> > +		mutex_unlock(&kvm->lock);
> > +		spin_unlock(&fi->lock);
> > +		return -ENODATA;
> > +	}
> > +	inti = list_first_entry(&fi->list,
> > +			struct kvm_s390_interrupt_info, list);
> > +	list_del(&inti->list);
> > +	spin_unlock(&fi->lock);
> > +	mutex_unlock(&kvm->lock);
> > +
> > +	r = copy_irq_to_user(inti, addr);
> > +
> > +	kfree(inti);
> > +	return r;
> > +}
> > +
> > +static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> > +{
> > +	int r;
> > +
> > +	switch (attr->group) {
> > +	case KVM_DEV_FLIC_DEQUEUE:
> > +		r = dequeue_floating_irq(dev->kvm, attr->addr);
> > +		break;
> > +	default:
> > +		r = -EINVAL;
> > +	}
> > +
> > +	return r;
> > +}
> > +
> > +static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
> > +				     u64 addr)
> > +{
> > +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> > +	void *target = NULL;
> > +	void *source;
> > +	u64 size;
> > +	int r = 0;
> > +
> > +	if (get_user(inti->type, (u64 __user *)addr))
> > +		return -EFAULT;
> > +	switch (inti->type) {
> > +	case KVM_S390_INT_VIRTIO:
> > +	case KVM_S390_INT_SERVICE:
> > +		target = (void *) &inti->ext;
> > +		source = &uptr->ext;
> > +		size = sizeof(inti->ext);
> > +		break;
> > +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> > +		target = (void *) &inti->io;
> > +		source = &uptr->io;
> > +		size = sizeof(inti->io);
> > +		break;
> > +	case KVM_S390_MCHK:
> > +		target = (void *) &inti->mchk;
> > +		source = &uptr->mchk;
> > +		size = sizeof(inti->mchk);
> > +		break;
> > +	case KVM_S390_INT_MAX:
> > +		goto out;
> > +	default:
> > +		r = -EINVAL;
> > +		return r;
> > +	}
> > +
> > +	if (copy_from_user(target, source, size))
> > +		r = -EFAULT;
> > +
> > +out:
> > +	return r;
> > +}
> > +
> > +static int enqueue_floating_irq(struct kvm_device *dev,
> > +				 struct kvm_device_attr *attr)
> > +{
> > +	struct kvm_s390_interrupt_info *inti = NULL;
> > +	int r = 0;
> > +
> > +	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
> > +	if (!inti)
> > +		return -ENOMEM;
> > +
> > +	r = copy_irq_from_user(inti, attr->addr);
> > +	if (r) {
> > +		kfree(inti);
> > +		return r;
> > +	}
> > +	__inject_vm(dev->kvm, inti);
> > +
> > +	return r;
> > +}
> > +
> > +static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> > +{
> > +	int r = 0;
> > +
> > +	switch (attr->group) {
> > +	case KVM_DEV_FLIC_ENQUEUE:
> > +		r = enqueue_floating_irq(dev, attr);
> > +		break;
> > +	case KVM_DEV_FLIC_CLEAR_IRQS:
> > +		r = 0;
> > +		clear_floating_interrupts(dev->kvm);
> > +		break;
> > +	default:
> > +		r = -EINVAL;
> > +	}
> > +
> > +	return r;
> > +}
> > +
> > +static int flic_create(struct kvm_device *dev, u32 type)
> > +{
> > +	if (!dev)
> > +		return -EINVAL;
> > +	if (dev->kvm->arch.flic)
> > +		return -EINVAL;
> > +	dev->kvm->arch.flic = dev;
> > +	return 0;
> > +}
> > +
> > +static void flic_destroy(struct kvm_device *dev)
> > +{
> > +	dev->kvm->arch.flic = NULL;
> > +}
> > +
> > +/* s390 floating irq controller (flic) */
> > +struct kvm_device_ops kvm_flic_ops = {
> > +	.name = "kvm-flic",
> > +	.get_attr = flic_get_attr,
> > +	.set_attr = flic_set_attr,
> > +	.create = flic_create,
> > +	.destroy = flic_destroy,
> > +};
> > diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> > index 776dafe..760febf 100644
> > --- a/arch/s390/kvm/kvm-s390.c
> > +++ b/arch/s390/kvm/kvm-s390.c
> > @@ -157,6 +157,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > 	case KVM_CAP_ENABLE_CAP:
> > 	case KVM_CAP_S390_CSS_SUPPORT:
> > 	case KVM_CAP_IOEVENTFD:
> > +	case KVM_CAP_DEVICE_CTRL:
> > 		r = 1;
> > 		break;
> > 	case KVM_CAP_NR_VCPUS:
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index ca645a0..e89304d 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1065,6 +1065,7 @@ struct kvm_device *kvm_device_from_filp(struct file *filp);
> > 
> > extern struct kvm_device_ops kvm_mpic_ops;
> > extern struct kvm_device_ops kvm_xics_ops;
> > +extern struct kvm_device_ops kvm_flic_ops;
> > 
> > #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> > 
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index eeb08a1..9c8b457 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -426,6 +426,7 @@ struct kvm_s390_psw {
> > 	 ((ai) << 26))
> > #define KVM_S390_INT_IO_MIN		0x00000000u
> > #define KVM_S390_INT_IO_MAX		0xfffdffffu
> > +#define KVM_S390_INT_MAX		0xffffffffu
> > 
> > 
> > struct kvm_s390_interrupt {
> > @@ -906,6 +907,7 @@ struct kvm_device_attr {
> > #define KVM_DEV_TYPE_FSL_MPIC_20	1
> > #define KVM_DEV_TYPE_FSL_MPIC_42	2
> > #define KVM_DEV_TYPE_XICS		3
> > +#define KVM_DEV_TYPE_FLIC		4
> > 
> > /*
> >  * ioctls for VM fds
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index bf040c4..79a2ecc 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2265,6 +2265,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
> > 		ops = &kvm_xics_ops;
> > 		break;
> > #endif
> > +#ifdef CONFIG_S390
> > +	case KVM_DEV_TYPE_FLIC:
> > +		ops = &kvm_flic_ops;
> > +		break;
> > +#endif
> > 	default:
> > 		return -ENODEV;
> > 	}
> > -- 
> > 1.8.3.4
> > 
>

Patch

diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
new file mode 100644
index 0000000..06aef31
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -0,0 +1,36 @@ 
+FLIC (floating interrupt controller)
+====================================
+
+FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
+machine check interruptions. All interrupts are stored in a per-vm list of
+pending interrupts. FLIC performs operations on this list.
+
+Only one FLIC instance may be instantiated.
+
+FLIC provides support to
+- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
+- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
+
+Groups:
+  KVM_DEV_FLIC_ENQUEUE
+    Adds one interrupt to the list of pending floating interrupts. Interrupts
+    are taken from this list for injection into the guest. attr contains
+    a struct kvm_s390_irq which contains all data relevant for
+    interrupt injection.
+    The format of the data structure kvm_s390_irq as it is copied from userspace
+    is defined in usr/include/linux/kvm.h.
+    For historic reasons list members are stored in a different data structure, i.e.
+    we need to copy the relevant data into a struct kvm_s390_interrupt_info
+    which can then be added to the list.
+
+  KVM_DEV_FLIC_DEQUEUE
+    Takes one element off the pending interrupts list and copies it into userspace.
+    Dequeued interrupts are not injected into the guest.
+    attr->addr contains the userspace address of a struct kvm_s390_irq.
+    List elements are stored in the format of struct kvm_s390_interrupt_info
+    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
+    (usr/include/linux/kvm.h)
+
+  KVM_DEV_FLIC_CLEAR_IRQS
+    Simply deletes all elements from the list of currently pending floating interrupts.
+    No interrupts are injected into the guest.
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index adb05c5..e1cc166 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -238,6 +238,7 @@  struct kvm_arch{
 	struct sca_block *sca;
 	debug_info_t *dbf;
 	struct kvm_s390_float_interrupt float_int;
+	struct kvm_device *flic;
 	struct gmap *gmap;
 	int css_support;
 };
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index d25da59..33d52b8 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -16,6 +16,11 @@ 
 
 #define __KVM_S390
 
+/* Device control API: s390-specific devices */
+#define KVM_DEV_FLIC_DEQUEUE 1
+#define KVM_DEV_FLIC_ENQUEUE 2
+#define KVM_DEV_FLIC_CLEAR_IRQS 3
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
 	/* general purpose regs for s390 */
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 7f35cb3..d6d5e36 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -656,53 +656,86 @@  struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
 	return inti;
 }
 
-int kvm_s390_inject_vm(struct kvm *kvm,
-		       struct kvm_s390_interrupt *s390int)
+static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 {
 	struct kvm_s390_local_interrupt *li;
 	struct kvm_s390_float_interrupt *fi;
-	struct kvm_s390_interrupt_info *inti, *iter;
+	struct kvm_s390_interrupt_info *iter;
 	int sigcpu;
 
+	mutex_lock(&kvm->lock);
+	fi = &kvm->arch.float_int;
+	spin_lock(&fi->lock);
+	if (!is_ioint(inti->type)) {
+		list_add_tail(&inti->list, &fi->list);
+	} else {
+		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
+
+		/* Keep I/O interrupts sorted in isc order. */
+		list_for_each_entry(iter, &fi->list, list) {
+			if (!is_ioint(iter->type))
+				continue;
+			if (int_word_to_isc_bits(iter->io.io_int_word)
+					<= isc_bits)
+				continue;
+			break;
+		}
+		list_add_tail(&inti->list, &iter->list);
+	}
+	atomic_set(&fi->active, 1);
+	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
+	if (sigcpu == KVM_MAX_VCPUS) {
+		do {
+			sigcpu = fi->next_rr_cpu++;
+			if (sigcpu == KVM_MAX_VCPUS)
+				sigcpu = fi->next_rr_cpu = 0;
+		} while (fi->local_int[sigcpu] == NULL);
+	}
+	li = fi->local_int[sigcpu];
+	spin_lock_bh(&li->lock);
+	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+	if (waitqueue_active(li->wq))
+		wake_up_interruptible(li->wq);
+	spin_unlock_bh(&li->lock);
+	spin_unlock(&fi->lock);
+	mutex_unlock(&kvm->lock);
+}
+
+int kvm_s390_inject_vm(struct kvm *kvm,
+		       struct kvm_s390_interrupt *s390int)
+{
+	struct kvm_s390_interrupt_info *inti;
+
 	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
 	if (!inti)
 		return -ENOMEM;
 
-	switch (s390int->type) {
+	inti->type = s390int->type;
+	switch (inti->type) {
 	case KVM_S390_INT_VIRTIO:
 		VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
 			 s390int->parm, s390int->parm64);
-		inti->type = s390int->type;
 		inti->ext.ext_params = s390int->parm;
 		inti->ext.ext_params2 = s390int->parm64;
 		break;
 	case KVM_S390_INT_SERVICE:
 		VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
-		inti->type = s390int->type;
 		inti->ext.ext_params = s390int->parm;
 		break;
-	case KVM_S390_PROGRAM_INT:
-	case KVM_S390_SIGP_STOP:
-	case KVM_S390_INT_EXTERNAL_CALL:
-	case KVM_S390_INT_EMERGENCY:
-		kfree(inti);
-		return -EINVAL;
 	case KVM_S390_MCHK:
 		VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
 			 s390int->parm64);
-		inti->type = s390int->type;
 		inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
 		inti->mchk.mcic = s390int->parm64;
 		break;
 	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-		if (s390int->type & IOINT_AI_MASK)
+		if (inti->type & IOINT_AI_MASK)
 			VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
 		else
 			VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
 				 s390int->type & IOINT_CSSID_MASK,
 				 s390int->type & IOINT_SSID_MASK,
 				 s390int->type & IOINT_SCHID_MASK);
-		inti->type = s390int->type;
 		inti->io.subchannel_id = s390int->parm >> 16;
 		inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
 		inti->io.io_int_parm = s390int->parm64 >> 32;
@@ -715,42 +748,7 @@  int kvm_s390_inject_vm(struct kvm *kvm,
 	trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
 				 2);
 
-	mutex_lock(&kvm->lock);
-	fi = &kvm->arch.float_int;
-	spin_lock(&fi->lock);
-	if (!is_ioint(inti->type))
-		list_add_tail(&inti->list, &fi->list);
-	else {
-		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
-
-		/* Keep I/O interrupts sorted in isc order. */
-		list_for_each_entry(iter, &fi->list, list) {
-			if (!is_ioint(iter->type))
-				continue;
-			if (int_word_to_isc_bits(iter->io.io_int_word)
-			    <= isc_bits)
-				continue;
-			break;
-		}
-		list_add_tail(&inti->list, &iter->list);
-	}
-	atomic_set(&fi->active, 1);
-	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
-	if (sigcpu == KVM_MAX_VCPUS) {
-		do {
-			sigcpu = fi->next_rr_cpu++;
-			if (sigcpu == KVM_MAX_VCPUS)
-				sigcpu = fi->next_rr_cpu = 0;
-		} while (fi->local_int[sigcpu] == NULL);
-	}
-	li = fi->local_int[sigcpu];
-	spin_lock_bh(&li->lock);
-	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-	if (waitqueue_active(li->wq))
-		wake_up_interruptible(li->wq);
-	spin_unlock_bh(&li->lock);
-	spin_unlock(&fi->lock);
-	mutex_unlock(&kvm->lock);
+	__inject_vm(kvm, inti);
 	return 0;
 }
 
@@ -838,3 +836,207 @@  int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 	mutex_unlock(&vcpu->kvm->lock);
 	return 0;
 }
+
+static void clear_floating_interrupts(struct kvm *kvm)
+{
+	struct kvm_s390_float_interrupt *fi;
+	struct kvm_s390_interrupt_info	*n, *inti = NULL;
+
+	mutex_lock(&kvm->lock);
+	fi = &kvm->arch.float_int;
+	spin_lock(&fi->lock);
+	list_for_each_entry_safe(inti, n, &fi->list, list) {
+		list_del(&inti->list);
+		kfree(inti);
+	}
+	atomic_set(&fi->active, 0);
+	spin_unlock(&fi->lock);
+	mutex_unlock(&kvm->lock);
+}
+
+static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
+				   u64 addr)
+{
+	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
+	void __user *target;
+	void *source;
+	u64 size;
+	int r = 0;
+
+	switch (inti->type) {
+	case KVM_S390_INT_VIRTIO:
+	case KVM_S390_INT_SERVICE:
+		source = &inti->ext;
+		target = &uptr->ext;
+		size = sizeof(inti->ext);
+		break;
+	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+		source = &inti->io;
+		target = &uptr->io;
+		size = sizeof(inti->io);
+		break;
+	case KVM_S390_MCHK:
+		source = &inti->mchk;
+		target = &uptr->mchk;
+		size = sizeof(inti->mchk);
+		break;
+	case KVM_S390_INT_MAX:
+		goto out;
+	default:
+		return -EINVAL;
+	}
+
+	r = put_user(inti->type, (u64 __user *) &uptr->type);
+	if (copy_to_user(target, source, size))
+		r = -EFAULT;
+
+out:
+	return r;
+}
+
+static int dequeue_floating_irq(struct kvm *kvm, __u64 addr)
+{
+	struct kvm_s390_interrupt_info *inti;
+	struct kvm_s390_float_interrupt *fi;
+	int r = 0;
+
+
+	mutex_lock(&kvm->lock);
+	fi = &kvm->arch.float_int;
+	spin_lock(&fi->lock);
+	if (list_empty(&fi->list)) {
+		mutex_unlock(&kvm->lock);
+		spin_unlock(&fi->lock);
+		return -ENODATA;
+	}
+	inti = list_first_entry(&fi->list,
+			struct kvm_s390_interrupt_info, list);
+	list_del(&inti->list);
+	spin_unlock(&fi->lock);
+	mutex_unlock(&kvm->lock);
+
+	r = copy_irq_to_user(inti, addr);
+
+	kfree(inti);
+	return r;
+}
+
+static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	int r;
+
+	switch (attr->group) {
+	case KVM_DEV_FLIC_DEQUEUE:
+		r = dequeue_floating_irq(dev->kvm, attr->addr);
+		break;
+	default:
+		r = -EINVAL;
+	}
+
+	return r;
+}
+
+static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
+				     u64 addr)
+{
+	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
+	void *target = NULL;
+	void *source;
+	u64 size;
+	int r = 0;
+
+	if (get_user(inti->type, (u64 __user *)addr))
+		return -EFAULT;
+	switch (inti->type) {
+	case KVM_S390_INT_VIRTIO:
+	case KVM_S390_INT_SERVICE:
+		target = (void *) &inti->ext;
+		source = &uptr->ext;
+		size = sizeof(inti->ext);
+		break;
+	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+		target = (void *) &inti->io;
+		source = &uptr->io;
+		size = sizeof(inti->io);
+		break;
+	case KVM_S390_MCHK:
+		target = (void *) &inti->mchk;
+		source = &uptr->mchk;
+		size = sizeof(inti->mchk);
+		break;
+	case KVM_S390_INT_MAX:
+		goto out;
+	default:
+		r = -EINVAL;
+		return r;
+	}
+
+	if (copy_from_user(target, source, size))
+		r = -EFAULT;
+
+out:
+	return r;
+}
+
+static int enqueue_floating_irq(struct kvm_device *dev,
+				 struct kvm_device_attr *attr)
+{
+	struct kvm_s390_interrupt_info *inti = NULL;
+	int r = 0;
+
+	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+	if (!inti)
+		return -ENOMEM;
+
+	r = copy_irq_from_user(inti, attr->addr);
+	if (r) {
+		kfree(inti);
+		return r;
+	}
+	__inject_vm(dev->kvm, inti);
+
+	return r;
+}
+
+static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	int r = 0;
+
+	switch (attr->group) {
+	case KVM_DEV_FLIC_ENQUEUE:
+		r = enqueue_floating_irq(dev, attr);
+		break;
+	case KVM_DEV_FLIC_CLEAR_IRQS:
+		r = 0;
+		clear_floating_interrupts(dev->kvm);
+		break;
+	default:
+		r = -EINVAL;
+	}
+
+	return r;
+}
+
+static int flic_create(struct kvm_device *dev, u32 type)
+{
+	if (!dev)
+		return -EINVAL;
+	if (dev->kvm->arch.flic)
+		return -EINVAL;
+	dev->kvm->arch.flic = dev;
+	return 0;
+}
+
+static void flic_destroy(struct kvm_device *dev)
+{
+	dev->kvm->arch.flic = NULL;
+}
+
+/* s390 floating irq controller (flic) */
+struct kvm_device_ops kvm_flic_ops = {
+	.name = "kvm-flic",
+	.get_attr = flic_get_attr,
+	.set_attr = flic_set_attr,
+	.create = flic_create,
+	.destroy = flic_destroy,
+};
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 776dafe..760febf 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -157,6 +157,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_S390_CSS_SUPPORT:
 	case KVM_CAP_IOEVENTFD:
+	case KVM_CAP_DEVICE_CTRL:
 		r = 1;
 		break;
 	case KVM_CAP_NR_VCPUS:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ca645a0..e89304d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1065,6 +1065,7 @@  struct kvm_device *kvm_device_from_filp(struct file *filp);
 
 extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
+extern struct kvm_device_ops kvm_flic_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index eeb08a1..9c8b457 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -426,6 +426,7 @@  struct kvm_s390_psw {
 	 ((ai) << 26))
 #define KVM_S390_INT_IO_MIN		0x00000000u
 #define KVM_S390_INT_IO_MAX		0xfffdffffu
+#define KVM_S390_INT_MAX		0xffffffffu
 
 
 struct kvm_s390_interrupt {
@@ -906,6 +907,7 @@  struct kvm_device_attr {
 #define KVM_DEV_TYPE_FSL_MPIC_20	1
 #define KVM_DEV_TYPE_FSL_MPIC_42	2
 #define KVM_DEV_TYPE_XICS		3
+#define KVM_DEV_TYPE_FLIC		4
 
 /*
  * ioctls for VM fds
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bf040c4..79a2ecc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2265,6 +2265,11 @@  static int kvm_ioctl_create_device(struct kvm *kvm,
 		ops = &kvm_xics_ops;
 		break;
 #endif
+#ifdef CONFIG_S390
+	case KVM_DEV_TYPE_FLIC:
+		ops = &kvm_flic_ops;
+		break;
+#endif
 	default:
 		return -ENODEV;
 	}