diff mbox series

[v2,04/16] KVM: PPC: Book3S HV: XIVE: add a control to initialize a source

Message ID 20190222112840.25000-5-clg@kaod.org
State Superseded
Headers show
Series KVM: PPC: Book3S HV: add XIVE native exploitation mode | expand

Commit Message

Cédric Le Goater Feb. 22, 2019, 11:28 a.m. UTC
The associated HW interrupt source is simply allocated at the OPAL/HW
level and then MASKED. KVM only needs to know about its type: LSI or
MSI.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 arch/powerpc/include/uapi/asm/kvm.h        |   5 +
 arch/powerpc/kvm/book3s_xive.h             |  10 ++
 arch/powerpc/kvm/book3s_xive.c             |   8 +-
 arch/powerpc/kvm/book3s_xive_native.c      | 114 +++++++++++++++++++++
 Documentation/virtual/kvm/devices/xive.txt |  15 +++
 5 files changed, 148 insertions(+), 4 deletions(-)

Comments

David Gibson Feb. 25, 2019, 2:10 a.m. UTC | #1
On Fri, Feb 22, 2019 at 12:28:28PM +0100, Cédric Le Goater wrote:
> The associated HW interrupt source is simply allocated at the OPAL/HW
> level and then MASKED. KVM only needs to know about its type: LSI or
> MSI.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/include/uapi/asm/kvm.h        |   5 +
>  arch/powerpc/kvm/book3s_xive.h             |  10 ++
>  arch/powerpc/kvm/book3s_xive.c             |   8 +-
>  arch/powerpc/kvm/book3s_xive_native.c      | 114 +++++++++++++++++++++
>  Documentation/virtual/kvm/devices/xive.txt |  15 +++
>  5 files changed, 148 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> index b002c0c67787..a9ad99f2a11b 100644
> --- a/arch/powerpc/include/uapi/asm/kvm.h
> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> @@ -677,5 +677,10 @@ struct kvm_ppc_cpu_char {
>  
>  /* POWER9 XIVE Native Interrupt Controller */
>  #define KVM_DEV_XIVE_GRP_CTRL		1
> +#define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source attributes */
> +
> +/* Layout of 64-bit XIVE source attribute values */
> +#define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
> +#define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
>  
>  #endif /* __LINUX_KVM_POWERPC_H */
> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
> index bcb1bbcf0359..f22f2d46d0f0 100644
> --- a/arch/powerpc/kvm/book3s_xive.h
> +++ b/arch/powerpc/kvm/book3s_xive.h
> @@ -12,6 +12,13 @@
>  #ifdef CONFIG_KVM_XICS
>  #include "book3s_xics.h"
>  
> +/*
> + * The XIVE IRQ number space is aligned with the XICS IRQ number
> + * space, CPU IPIs being allocated in the first 4K.

We do align these in qemu, but I don't see that the kernel part
cares: as far as it's concerned only one of XICS or XIVE is active at
a time, and the irq numbers are chosen by userspace.

> + */
> +#define KVMPPC_XIVE_FIRST_IRQ	0
> +#define KVMPPC_XIVE_NR_IRQS	KVMPPC_XICS_NR_IRQS
> +
>  /*
>   * State for one guest irq source.
>   *
> @@ -253,6 +260,9 @@ extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
>   */
>  void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu);
>  int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu);
> +struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
> +	struct kvmppc_xive *xive, int irq);
> +void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);
>  
>  #endif /* CONFIG_KVM_XICS */
>  #endif /* _KVM_PPC_BOOK3S_XICS_H */
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index d1cc18a5b1c4..6f950ecb3592 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c

I wonder if we should rename this book3s_xics_on_xive.c or something
at some point, I keep getting confused because I forget that this is
only dealing with host xive, not guest xive.

> @@ -1485,8 +1485,8 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
>  	return 0;
>  }
>  
> -static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
> -							   int irq)
> +struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
> +	struct kvmppc_xive *xive, int irq)
>  {
>  	struct kvm *kvm = xive->kvm;
>  	struct kvmppc_xive_src_block *sb;

It's odd that this function, now used from the xive-on-xive path as
well as the xics-on-xive path references KVMPPC_XICS_ICS_SHIFT a few
lines down from this change.

> @@ -1565,7 +1565,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
>  	sb = kvmppc_xive_find_source(xive, irq, &idx);
>  	if (!sb) {
>  		pr_devel("No source, creating source block...\n");
> -		sb = xive_create_src_block(xive, irq);
> +		sb = kvmppc_xive_create_src_block(xive, irq);
>  		if (!sb) {
>  			pr_devel("Failed to create block...\n");
>  			return -ENOMEM;
> @@ -1789,7 +1789,7 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
>  	xive_cleanup_irq_data(xd);
>  }
>  
> -static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
> +void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
>  {
>  	int i;
>  
> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> index 1f3da47a4a6a..a9b2d2d9af99 100644
> --- a/arch/powerpc/kvm/book3s_xive_native.c
> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> @@ -31,6 +31,29 @@
>  
>  #include "book3s_xive.h"
>  
> +/*
> + * TODO: introduce a common template file with the XIVE native layer
> + * and the XICS-on-XIVE glue for the utility functions
> + */
> +#define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_mmio))
> +#define __x_trig_page(xd)	((void __iomem *)((xd)->trig_mmio))
> +#define __x_readq	__raw_readq
> +#define __x_writeq	__raw_writeq
> +
> +static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
> +{
> +	u64 val;
> +
> +	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
> +		offset |= offset << 4;
> +
> +	val = __x_readq(__x_eoi_page(xd) + offset);
> +#ifdef __LITTLE_ENDIAN__
> +	val >>= 64-8;
> +#endif
> +	return (u8)val;
> +}
> +
>  static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
>  {
>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> @@ -153,12 +176,89 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>  	return rc;
>  }
>  
> +static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
> +					 u64 addr)
> +{
> +	struct kvmppc_xive_src_block *sb;
> +	struct kvmppc_xive_irq_state *state;
> +	u64 __user *ubufp = (u64 __user *) addr;
> +	u64 val;
> +	u16 idx;
> +
> +	pr_devel("%s irq=0x%lx\n", __func__, irq);
> +
> +	if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
> +		return -E2BIG;
> +
> +	sb = kvmppc_xive_find_source(xive, irq, &idx);
> +	if (!sb) {
> +		pr_debug("No source, creating source block...\n");
> +		sb = kvmppc_xive_create_src_block(xive, irq);
> +		if (!sb) {
> +			pr_err("Failed to create block...\n");
> +			return -ENOMEM;
> +		}
> +	}
> +	state = &sb->irq_state[idx];
> +
> +	if (get_user(val, ubufp)) {
> +		pr_err("fault getting user info !\n");
> +		return -EFAULT;
> +	}

You should validate the value loaded here to check it doesn't have any
bits set we don't know about.

> +
> +	/*
> +	 * If the source doesn't already have an IPI, allocate
> +	 * one and get the corresponding data
> +	 */
> +	if (!state->ipi_number) {
> +		state->ipi_number = xive_native_alloc_irq();
> +		if (state->ipi_number == 0) {
> +			pr_err("Failed to allocate IRQ !\n");
> +			return -ENXIO;
> +		}
> +		xive_native_populate_irq_data(state->ipi_number,
> +					      &state->ipi_data);
> +		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
> +			 state->ipi_number, irq);
> +	}
> +
> +	arch_spin_lock(&sb->lock);

Why the direct call to arch_spin_lock() rather than just spin_lock()?

> +
> +	/* Restore LSI state */
> +	if (val & KVM_XIVE_LEVEL_SENSITIVE) {
> +		state->lsi = true;
> +		if (val & KVM_XIVE_LEVEL_ASSERTED)
> +			state->asserted = true;
> +		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
> +	}
> +
> +	/* Mask IRQ to start with */
> +	state->act_server = 0;
> +	state->act_priority = MASKED;
> +	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
> +	xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
> +
> +	/* Increment the number of valid sources and mark this one valid */
> +	if (!state->valid)
> +		xive->src_count++;
> +	state->valid = true;
> +
> +	arch_spin_unlock(&sb->lock);
> +
> +	return 0;
> +}
> +
>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>  				       struct kvm_device_attr *attr)
>  {
> +	struct kvmppc_xive *xive = dev->private;
> +
>  	switch (attr->group) {
>  	case KVM_DEV_XIVE_GRP_CTRL:
>  		break;
> +	case KVM_DEV_XIVE_GRP_SOURCE:
> +		return kvmppc_xive_native_set_source(xive, attr->attr,
> +						     attr->addr);
>  	}
>  	return -ENXIO;
>  }
> @@ -175,6 +275,11 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
>  	switch (attr->group) {
>  	case KVM_DEV_XIVE_GRP_CTRL:
>  		break;
> +	case KVM_DEV_XIVE_GRP_SOURCE:
> +		if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
> +		    attr->attr < KVMPPC_XIVE_NR_IRQS)
> +			return 0;
> +		break;
>  	}
>  	return -ENXIO;
>  }
> @@ -183,6 +288,7 @@ static void kvmppc_xive_native_free(struct kvm_device *dev)
>  {
>  	struct kvmppc_xive *xive = dev->private;
>  	struct kvm *kvm = xive->kvm;
> +	int i;
>  
>  	debugfs_remove(xive->dentry);
>  
> @@ -191,6 +297,14 @@ static void kvmppc_xive_native_free(struct kvm_device *dev)
>  	if (kvm)
>  		kvm->arch.xive = NULL;
>  
> +	/* Mask and free interrupts */
> +	for (i = 0; i <= xive->max_sbid; i++) {
> +		if (xive->src_blocks[i])
> +			kvmppc_xive_free_sources(xive->src_blocks[i]);
> +		kfree(xive->src_blocks[i]);
> +		xive->src_blocks[i] = NULL;
> +	}
> +
>  	if (xive->vp_base != XIVE_INVALID_VP)
>  		xive_native_free_vp_block(xive->vp_base);
>  
> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
> index fdbd2ff92a88..cd8bfc37b72e 100644
> --- a/Documentation/virtual/kvm/devices/xive.txt
> +++ b/Documentation/virtual/kvm/devices/xive.txt
> @@ -17,3 +17,18 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>  
>    1. KVM_DEV_XIVE_GRP_CTRL
>    Provides global controls on the device
> +
> +  2. KVM_DEV_XIVE_GRP_SOURCE (write only)
> +  Initializes a new source in the XIVE device and mask it.
> +  Attributes:
> +    Interrupt source number  (64-bit)
> +  The kvm_device_attr.addr points to a __u64 value:
> +  bits:     | 63   ....  2 |   1   |   0
> +  values:   |    unused    | level | type
> +  - type:  0:MSI 1:LSI
> +  - level: assertion level in case of an LSI.
> +  Errors:
> +    -E2BIG:  Interrupt source number is out of range
> +    -ENOMEM: Could not create a new source block
> +    -EFAULT: Invalid user pointer for attr->addr.
> +    -ENXIO:  Could not allocate underlying HW interrupt
Paul Mackerras Feb. 25, 2019, 5:30 a.m. UTC | #2
On Fri, Feb 22, 2019 at 12:28:28PM +0100, Cédric Le Goater wrote:
> The associated HW interrupt source is simply allocated at the OPAL/HW
> level and then MASKED. KVM only needs to know about its type: LSI or
> MSI.

I think it would be helpful to explain to the reader here that with
XIVE, all interrupts have a hardware source, even IPIs and virtual
device interrupts, for which we allocate a software-triggerable
interrupt source in the XIVE hardware.

> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/include/uapi/asm/kvm.h        |   5 +
>  arch/powerpc/kvm/book3s_xive.h             |  10 ++
>  arch/powerpc/kvm/book3s_xive.c             |   8 +-
>  arch/powerpc/kvm/book3s_xive_native.c      | 114 +++++++++++++++++++++
>  Documentation/virtual/kvm/devices/xive.txt |  15 +++
>  5 files changed, 148 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> index b002c0c67787..a9ad99f2a11b 100644
> --- a/arch/powerpc/include/uapi/asm/kvm.h
> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> @@ -677,5 +677,10 @@ struct kvm_ppc_cpu_char {
>  
>  /* POWER9 XIVE Native Interrupt Controller */
>  #define KVM_DEV_XIVE_GRP_CTRL		1
> +#define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source attributes */
> +
> +/* Layout of 64-bit XIVE source attribute values */
> +#define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
> +#define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
>  
>  #endif /* __LINUX_KVM_POWERPC_H */
> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
> index bcb1bbcf0359..f22f2d46d0f0 100644
> --- a/arch/powerpc/kvm/book3s_xive.h
> +++ b/arch/powerpc/kvm/book3s_xive.h
> @@ -12,6 +12,13 @@
>  #ifdef CONFIG_KVM_XICS
>  #include "book3s_xics.h"
>  
> +/*
> + * The XIVE IRQ number space is aligned with the XICS IRQ number
> + * space, CPU IPIs being allocated in the first 4K.
> + */
> +#define KVMPPC_XIVE_FIRST_IRQ	0
> +#define KVMPPC_XIVE_NR_IRQS	KVMPPC_XICS_NR_IRQS
> +
>  /*
>   * State for one guest irq source.
>   *
> @@ -253,6 +260,9 @@ extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
>   */
>  void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu);
>  int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu);
> +struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
> +	struct kvmppc_xive *xive, int irq);
> +void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);

So we're using the same source block data structure (effectively a
2-level tree) that the XICS-on-XIVE code uses.  That would be worth
mentioning as a design choice in the patch description.

>  
>  #endif /* CONFIG_KVM_XICS */
>  #endif /* _KVM_PPC_BOOK3S_XICS_H */
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index d1cc18a5b1c4..6f950ecb3592 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -1485,8 +1485,8 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
>  	return 0;
>  }
>  
> -static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
> -							   int irq)
> +struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
> +	struct kvmppc_xive *xive, int irq)
>  {
>  	struct kvm *kvm = xive->kvm;
>  	struct kvmppc_xive_src_block *sb;
> @@ -1565,7 +1565,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
>  	sb = kvmppc_xive_find_source(xive, irq, &idx);
>  	if (!sb) {
>  		pr_devel("No source, creating source block...\n");
> -		sb = xive_create_src_block(xive, irq);
> +		sb = kvmppc_xive_create_src_block(xive, irq);
>  		if (!sb) {
>  			pr_devel("Failed to create block...\n");
>  			return -ENOMEM;
> @@ -1789,7 +1789,7 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
>  	xive_cleanup_irq_data(xd);
>  }
>  
> -static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
> +void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
>  {
>  	int i;
>  
> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> index 1f3da47a4a6a..a9b2d2d9af99 100644
> --- a/arch/powerpc/kvm/book3s_xive_native.c
> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> @@ -31,6 +31,29 @@
>  
>  #include "book3s_xive.h"
>  
> +/*
> + * TODO: introduce a common template file with the XIVE native layer
> + * and the XICS-on-XIVE glue for the utility functions
> + */
> +#define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_mmio))
> +#define __x_trig_page(xd)	((void __iomem *)((xd)->trig_mmio))
> +#define __x_readq	__raw_readq
> +#define __x_writeq	__raw_writeq
> +
> +static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
> +{
> +	u64 val;
> +
> +	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
> +		offset |= offset << 4;
> +
> +	val = __x_readq(__x_eoi_page(xd) + offset);

Ben originally defined the __x_* macros so that he could use the same
source code twice, once for real mode and once for virtual mode.
Since you're not doing that, is there really any reason for this
indirection?  Why not just __raw_readq, __raw_writeq etc. directly?

Paul.
Paul Mackerras Feb. 26, 2019, 4:25 a.m. UTC | #3
On Mon, Feb 25, 2019 at 01:10:12PM +1100, David Gibson wrote:
> On Fri, Feb 22, 2019 at 12:28:28PM +0100, Cédric Le Goater wrote:
> > +	/*
> > +	 * If the source doesn't already have an IPI, allocate
> > +	 * one and get the corresponding data
> > +	 */
> > +	if (!state->ipi_number) {
> > +		state->ipi_number = xive_native_alloc_irq();
> > +		if (state->ipi_number == 0) {
> > +			pr_err("Failed to allocate IRQ !\n");
> > +			return -ENXIO;
> > +		}
> > +		xive_native_populate_irq_data(state->ipi_number,
> > +					      &state->ipi_data);
> > +		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
> > +			 state->ipi_number, irq);
> > +	}
> > +
> > +	arch_spin_lock(&sb->lock);
> 
> Why the direct call to arch_spin_lock() rather than just spin_lock()?

He's sharing data structures with the xics-on-xive code, and that code
has a real-mode variant, and in real mode we don't want to risk
invoking lockdep code.  Hence sb->lock is an arch_spinlock_t, and he
has to use arch_spin_lock() on it.

Paul.
David Gibson Feb. 26, 2019, 11:20 p.m. UTC | #4
On Tue, Feb 26, 2019 at 03:25:15PM +1100, Paul Mackerras wrote:
> On Mon, Feb 25, 2019 at 01:10:12PM +1100, David Gibson wrote:
> > On Fri, Feb 22, 2019 at 12:28:28PM +0100, Cédric Le Goater wrote:
> > > +	/*
> > > +	 * If the source doesn't already have an IPI, allocate
> > > +	 * one and get the corresponding data
> > > +	 */
> > > +	if (!state->ipi_number) {
> > > +		state->ipi_number = xive_native_alloc_irq();
> > > +		if (state->ipi_number == 0) {
> > > +			pr_err("Failed to allocate IRQ !\n");
> > > +			return -ENXIO;
> > > +		}
> > > +		xive_native_populate_irq_data(state->ipi_number,
> > > +					      &state->ipi_data);
> > > +		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
> > > +			 state->ipi_number, irq);
> > > +	}
> > > +
> > > +	arch_spin_lock(&sb->lock);
> > 
> > Why the direct call to arch_spin_lock() rather than just spin_lock()?
> 
> He's sharing data structures with the xics-on-xive code, and that code
> has a real-mode variant, and in real mode we don't want to risk
> invoking lockdep code.  Hence sb->lock is an arch_spinlock_t, and he
> has to use arch_spin_lock() on it.

Ah, right.
Cédric Le Goater March 12, 2019, 3:19 p.m. UTC | #5
On 2/25/19 3:10 AM, David Gibson wrote:
> On Fri, Feb 22, 2019 at 12:28:28PM +0100, Cédric Le Goater wrote:
>> The associated HW interrupt source is simply allocated at the OPAL/HW
>> level and then MASKED. KVM only needs to know about its type: LSI or
>> MSI.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  arch/powerpc/include/uapi/asm/kvm.h        |   5 +
>>  arch/powerpc/kvm/book3s_xive.h             |  10 ++
>>  arch/powerpc/kvm/book3s_xive.c             |   8 +-
>>  arch/powerpc/kvm/book3s_xive_native.c      | 114 +++++++++++++++++++++
>>  Documentation/virtual/kvm/devices/xive.txt |  15 +++
>>  5 files changed, 148 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>> index b002c0c67787..a9ad99f2a11b 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -677,5 +677,10 @@ struct kvm_ppc_cpu_char {
>>  
>>  /* POWER9 XIVE Native Interrupt Controller */
>>  #define KVM_DEV_XIVE_GRP_CTRL		1
>> +#define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source attributes */
>> +
>> +/* Layout of 64-bit XIVE source attribute values */
>> +#define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
>> +#define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
>>  
>>  #endif /* __LINUX_KVM_POWERPC_H */
>> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
>> index bcb1bbcf0359..f22f2d46d0f0 100644
>> --- a/arch/powerpc/kvm/book3s_xive.h
>> +++ b/arch/powerpc/kvm/book3s_xive.h
>> @@ -12,6 +12,13 @@
>>  #ifdef CONFIG_KVM_XICS
>>  #include "book3s_xics.h"
>>  
>> +/*
>> + * The XIVE IRQ number space is aligned with the XICS IRQ number
>> + * space, CPU IPIs being allocated in the first 4K.
> 
> We do align these in qemu, but I don't see that the kernel part
> cares: as far as it's concerned only one of XICS or XIVE is active at
> a time, and the irq numbers are chosen by userspace.

There is some relation with userspace nevertheless. The KVM device does 
not remap the numbers to some other range today and the limits are fixed
values. Checks are being done in the has_attr() and the set_attr(). 

>> + */
>> +#define KVMPPC_XIVE_FIRST_IRQ	0
>> +#define KVMPPC_XIVE_NR_IRQS	KVMPPC_XICS_NR_IRQS
>> +
>>  /*
>>   * State for one guest irq source.
>>   *
>> @@ -253,6 +260,9 @@ extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
>>   */
>>  void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu);
>>  int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu);
>> +struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
>> +	struct kvmppc_xive *xive, int irq);
>> +void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);
>>  
>>  #endif /* CONFIG_KVM_XICS */
>>  #endif /* _KVM_PPC_BOOK3S_XICS_H */
>> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
>> index d1cc18a5b1c4..6f950ecb3592 100644
>> --- a/arch/powerpc/kvm/book3s_xive.c
>> +++ b/arch/powerpc/kvm/book3s_xive.c
> 
> I wonder if we should rename this book3s_xics_on_xive.c or something
> at some point, I keep getting confused because I forget that this is
> only dealing with host xive, not guest xive.

I am fine with renaming. Any objections ? book3s_xics_p9.c ? 

>> @@ -1485,8 +1485,8 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
>>  	return 0;
>>  }
>>  
>> -static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
>> -							   int irq)
>> +struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
>> +	struct kvmppc_xive *xive, int irq)
>>  {
>>  	struct kvm *kvm = xive->kvm;
>>  	struct kvmppc_xive_src_block *sb;
> 
> It's odd that this function, now used from the xive-on-xive path as
> well as the xics-on-xive path references KVMPPC_XICS_ICS_SHIFT a few
> lines down from this change.

Yes. This is because of the definition of the struct kvmppc_xive_src_block.

We could introduce new defines for XIVE or a common set of defines for
XICS and XIVE.

>> @@ -1565,7 +1565,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
>>  	sb = kvmppc_xive_find_source(xive, irq, &idx);
>>  	if (!sb) {
>>  		pr_devel("No source, creating source block...\n");
>> -		sb = xive_create_src_block(xive, irq);
>> +		sb = kvmppc_xive_create_src_block(xive, irq);
>>  		if (!sb) {
>>  			pr_devel("Failed to create block...\n");
>>  			return -ENOMEM;
>> @@ -1789,7 +1789,7 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
>>  	xive_cleanup_irq_data(xd);
>>  }
>>  
>> -static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
>> +void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
>>  {
>>  	int i;
>>  
>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
>> index 1f3da47a4a6a..a9b2d2d9af99 100644
>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>> @@ -31,6 +31,29 @@
>>  
>>  #include "book3s_xive.h"
>>  
>> +/*
>> + * TODO: introduce a common template file with the XIVE native layer
>> + * and the XICS-on-XIVE glue for the utility functions
>> + */
>> +#define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_mmio))
>> +#define __x_trig_page(xd)	((void __iomem *)((xd)->trig_mmio))
>> +#define __x_readq	__raw_readq
>> +#define __x_writeq	__raw_writeq
>> +
>> +static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
>> +{
>> +	u64 val;
>> +
>> +	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
>> +		offset |= offset << 4;
>> +
>> +	val = __x_readq(__x_eoi_page(xd) + offset);
>> +#ifdef __LITTLE_ENDIAN__
>> +	val >>= 64-8;
>> +#endif
>> +	return (u8)val;
>> +}
>> +
>>  static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
>>  {
>>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>> @@ -153,12 +176,89 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>>  	return rc;
>>  }
>>  
>> +static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
>> +					 u64 addr)
>> +{
>> +	struct kvmppc_xive_src_block *sb;
>> +	struct kvmppc_xive_irq_state *state;
>> +	u64 __user *ubufp = (u64 __user *) addr;
>> +	u64 val;
>> +	u16 idx;
>> +
>> +	pr_devel("%s irq=0x%lx\n", __func__, irq);
>> +
>> +	if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
>> +		return -E2BIG;
>> +
>> +	sb = kvmppc_xive_find_source(xive, irq, &idx);
>> +	if (!sb) {
>> +		pr_debug("No source, creating source block...\n");
>> +		sb = kvmppc_xive_create_src_block(xive, irq);
>> +		if (!sb) {
>> +			pr_err("Failed to create block...\n");
>> +			return -ENOMEM;
>> +		}
>> +	}
>> +	state = &sb->irq_state[idx];
>> +
>> +	if (get_user(val, ubufp)) {
>> +		pr_err("fault getting user info !\n");
>> +		return -EFAULT;
>> +	}
> 
> You should validate the value loaded here to check it doesn't have any
> bits set we don't know about.

ok

> 
>> +
>> +	/*
>> +	 * If the source doesn't already have an IPI, allocate
>> +	 * one and get the corresponding data
>> +	 */
>> +	if (!state->ipi_number) {
>> +		state->ipi_number = xive_native_alloc_irq();
>> +		if (state->ipi_number == 0) {
>> +			pr_err("Failed to allocate IRQ !\n");
>> +			return -ENXIO;
>> +		}
>> +		xive_native_populate_irq_data(state->ipi_number,
>> +					      &state->ipi_data);
>> +		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
>> +			 state->ipi_number, irq);
>> +	}
>> +
>> +	arch_spin_lock(&sb->lock);
> 
> Why the direct call to arch_spin_lock() rather than just spin_lock()?

Paul answered this question but may be I should make the effort to
decouple both devices on this aspect. 

Thanks,

C. 

>> +
>> +	/* Restore LSI state */
>> +	if (val & KVM_XIVE_LEVEL_SENSITIVE) {
>> +		state->lsi = true;
>> +		if (val & KVM_XIVE_LEVEL_ASSERTED)
>> +			state->asserted = true;
>> +		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
>> +	}
>> +
>> +	/* Mask IRQ to start with */
>> +	state->act_server = 0;
>> +	state->act_priority = MASKED;
>> +	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
>> +	xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
>> +
>> +	/* Increment the number of valid sources and mark this one valid */
>> +	if (!state->valid)
>> +		xive->src_count++;
>> +	state->valid = true;
>> +
>> +	arch_spin_unlock(&sb->lock);
>> +
>> +	return 0;
>> +}
>> +
>>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>>  				       struct kvm_device_attr *attr)
>>  {
>> +	struct kvmppc_xive *xive = dev->private;
>> +
>>  	switch (attr->group) {
>>  	case KVM_DEV_XIVE_GRP_CTRL:
>>  		break;
>> +	case KVM_DEV_XIVE_GRP_SOURCE:
>> +		return kvmppc_xive_native_set_source(xive, attr->attr,
>> +						     attr->addr);
>>  	}
>>  	return -ENXIO;
>>  }
>> @@ -175,6 +275,11 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
>>  	switch (attr->group) {
>>  	case KVM_DEV_XIVE_GRP_CTRL:
>>  		break;
>> +	case KVM_DEV_XIVE_GRP_SOURCE:
>> +		if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
>> +		    attr->attr < KVMPPC_XIVE_NR_IRQS)
>> +			return 0;
>> +		break;
>>  	}
>>  	return -ENXIO;
>>  }
>> @@ -183,6 +288,7 @@ static void kvmppc_xive_native_free(struct kvm_device *dev)
>>  {
>>  	struct kvmppc_xive *xive = dev->private;
>>  	struct kvm *kvm = xive->kvm;
>> +	int i;
>>  
>>  	debugfs_remove(xive->dentry);
>>  
>> @@ -191,6 +297,14 @@ static void kvmppc_xive_native_free(struct kvm_device *dev)
>>  	if (kvm)
>>  		kvm->arch.xive = NULL;
>>  
>> +	/* Mask and free interrupts */
>> +	for (i = 0; i <= xive->max_sbid; i++) {
>> +		if (xive->src_blocks[i])
>> +			kvmppc_xive_free_sources(xive->src_blocks[i]);
>> +		kfree(xive->src_blocks[i]);
>> +		xive->src_blocks[i] = NULL;
>> +	}
>> +
>>  	if (xive->vp_base != XIVE_INVALID_VP)
>>  		xive_native_free_vp_block(xive->vp_base);
>>  
>> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
>> index fdbd2ff92a88..cd8bfc37b72e 100644
>> --- a/Documentation/virtual/kvm/devices/xive.txt
>> +++ b/Documentation/virtual/kvm/devices/xive.txt
>> @@ -17,3 +17,18 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>>  
>>    1. KVM_DEV_XIVE_GRP_CTRL
>>    Provides global controls on the device
>> +
>> +  2. KVM_DEV_XIVE_GRP_SOURCE (write only)
>> +  Initializes a new source in the XIVE device and mask it.
>> +  Attributes:
>> +    Interrupt source number  (64-bit)
>> +  The kvm_device_attr.addr points to a __u64 value:
>> +  bits:     | 63   ....  2 |   1   |   0
>> +  values:   |    unused    | level | type
>> +  - type:  0:MSI 1:LSI
>> +  - level: assertion level in case of an LSI.
>> +  Errors:
>> +    -E2BIG:  Interrupt source number is out of range
>> +    -ENOMEM: Could not create a new source block
>> +    -EFAULT: Invalid user pointer for attr->addr.
>> +    -ENXIO:  Could not allocate underlying HW interrupt
>
David Gibson March 14, 2019, 2:15 a.m. UTC | #6
On Tue, Mar 12, 2019 at 04:19:35PM +0100, Cédric Le Goater wrote:
> On 2/25/19 3:10 AM, David Gibson wrote:
> > On Fri, Feb 22, 2019 at 12:28:28PM +0100, Cédric Le Goater wrote:
> >> The associated HW interrupt source is simply allocated at the OPAL/HW
> >> level and then MASKED. KVM only needs to know about its type: LSI or
> >> MSI.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  arch/powerpc/include/uapi/asm/kvm.h        |   5 +
> >>  arch/powerpc/kvm/book3s_xive.h             |  10 ++
> >>  arch/powerpc/kvm/book3s_xive.c             |   8 +-
> >>  arch/powerpc/kvm/book3s_xive_native.c      | 114 +++++++++++++++++++++
> >>  Documentation/virtual/kvm/devices/xive.txt |  15 +++
> >>  5 files changed, 148 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> >> index b002c0c67787..a9ad99f2a11b 100644
> >> --- a/arch/powerpc/include/uapi/asm/kvm.h
> >> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> >> @@ -677,5 +677,10 @@ struct kvm_ppc_cpu_char {
> >>  
> >>  /* POWER9 XIVE Native Interrupt Controller */
> >>  #define KVM_DEV_XIVE_GRP_CTRL		1
> >> +#define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source attributes */
> >> +
> >> +/* Layout of 64-bit XIVE source attribute values */
> >> +#define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
> >> +#define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
> >>  
> >>  #endif /* __LINUX_KVM_POWERPC_H */
> >> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
> >> index bcb1bbcf0359..f22f2d46d0f0 100644
> >> --- a/arch/powerpc/kvm/book3s_xive.h
> >> +++ b/arch/powerpc/kvm/book3s_xive.h
> >> @@ -12,6 +12,13 @@
> >>  #ifdef CONFIG_KVM_XICS
> >>  #include "book3s_xics.h"
> >>  
> >> +/*
> >> + * The XIVE IRQ number space is aligned with the XICS IRQ number
> >> + * space, CPU IPIs being allocated in the first 4K.
> > 
> > We do align these in qemu, but I don't see that the kernel part
> > cares: as far as it's concerned only one of XICS or XIVE is active at
> > a time, and the irq numbers are chosen by userspace.
> 
> There is some relation with userspace nevertheless. The KVM device does 
> not remap the numbers to some other range today and the limits are fixed
> values. Checks are being done in the has_attr() and the set_attr().

Hrm.  I still think the comment needs to describe what the constraint
is from the point of view of the kernel, which this isn't.  Maybe, "we
allow irqs in the range X..Y (allowing userspace to do ...)"

> 
> >> + */
> >> +#define KVMPPC_XIVE_FIRST_IRQ	0
> >> +#define KVMPPC_XIVE_NR_IRQS	KVMPPC_XICS_NR_IRQS
> >> +
> >>  /*
> >>   * State for one guest irq source.
> >>   *
> >> @@ -253,6 +260,9 @@ extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
> >>   */
> >>  void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu);
> >>  int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu);
> >> +struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
> >> +	struct kvmppc_xive *xive, int irq);
> >> +void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);
> >>  
> >>  #endif /* CONFIG_KVM_XICS */
> >>  #endif /* _KVM_PPC_BOOK3S_XICS_H */
> >> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> >> index d1cc18a5b1c4..6f950ecb3592 100644
> >> --- a/arch/powerpc/kvm/book3s_xive.c
> >> +++ b/arch/powerpc/kvm/book3s_xive.c
> > 
> > I wonder if we should rename this book3s_xics_on_xive.c or something
> > at some point, I keep getting confused because I forget that this is
> > only dealing with host xive, not guest xive.
> 
> I am fine with renaming. Any objections ? book3s_xics_p9.c ? 

Hm, maybe?

> >> @@ -1485,8 +1485,8 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
> >>  	return 0;
> >>  }
> >>  
> >> -static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
> >> -							   int irq)
> >> +struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
> >> +	struct kvmppc_xive *xive, int irq)
> >>  {
> >>  	struct kvm *kvm = xive->kvm;
> >>  	struct kvmppc_xive_src_block *sb;
> > 
> > It's odd that this function, now used from the xive-on-xive path as
> > well as the xics-on-xive path references KVMPPC_XICS_ICS_SHIFT a few
> > lines down from this change.
> 
> Yes. This is because of the definition of the struct kvmppc_xive_src_block.
> 
> We could introduce new defines for XIVE or a common set of defines for
> XICS and XIVE.

I think making common definitions would be best, if possible.

> 
> >> @@ -1565,7 +1565,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
> >>  	sb = kvmppc_xive_find_source(xive, irq, &idx);
> >>  	if (!sb) {
> >>  		pr_devel("No source, creating source block...\n");
> >> -		sb = xive_create_src_block(xive, irq);
> >> +		sb = kvmppc_xive_create_src_block(xive, irq);
> >>  		if (!sb) {
> >>  			pr_devel("Failed to create block...\n");
> >>  			return -ENOMEM;
> >> @@ -1789,7 +1789,7 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
> >>  	xive_cleanup_irq_data(xd);
> >>  }
> >>  
> >> -static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
> >> +void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
> >>  {
> >>  	int i;
> >>  
> >> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> >> index 1f3da47a4a6a..a9b2d2d9af99 100644
> >> --- a/arch/powerpc/kvm/book3s_xive_native.c
> >> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> >> @@ -31,6 +31,29 @@
> >>  
> >>  #include "book3s_xive.h"
> >>  
> >> +/*
> >> + * TODO: introduce a common template file with the XIVE native layer
> >> + * and the XICS-on-XIVE glue for the utility functions
> >> + */
> >> +#define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_mmio))
> >> +#define __x_trig_page(xd)	((void __iomem *)((xd)->trig_mmio))
> >> +#define __x_readq	__raw_readq
> >> +#define __x_writeq	__raw_writeq
> >> +
> >> +static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
> >> +{
> >> +	u64 val;
> >> +
> >> +	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
> >> +		offset |= offset << 4;
> >> +
> >> +	val = __x_readq(__x_eoi_page(xd) + offset);
> >> +#ifdef __LITTLE_ENDIAN__
> >> +	val >>= 64-8;
> >> +#endif
> >> +	return (u8)val;
> >> +}
> >> +
> >>  static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
> >>  {
> >>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> >> @@ -153,12 +176,89 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
> >>  	return rc;
> >>  }
> >>  
> >> +static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
> >> +					 u64 addr)
> >> +{
> >> +	struct kvmppc_xive_src_block *sb;
> >> +	struct kvmppc_xive_irq_state *state;
> >> +	u64 __user *ubufp = (u64 __user *) addr;
> >> +	u64 val;
> >> +	u16 idx;
> >> +
> >> +	pr_devel("%s irq=0x%lx\n", __func__, irq);
> >> +
> >> +	if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
> >> +		return -E2BIG;
> >> +
> >> +	sb = kvmppc_xive_find_source(xive, irq, &idx);
> >> +	if (!sb) {
> >> +		pr_debug("No source, creating source block...\n");
> >> +		sb = kvmppc_xive_create_src_block(xive, irq);
> >> +		if (!sb) {
> >> +			pr_err("Failed to create block...\n");
> >> +			return -ENOMEM;
> >> +		}
> >> +	}
> >> +	state = &sb->irq_state[idx];
> >> +
> >> +	if (get_user(val, ubufp)) {
> >> +		pr_err("fault getting user info !\n");
> >> +		return -EFAULT;
> >> +	}
> > 
> > You should validate the value loaded here to check it doesn't have any
> > bits set we don't know about.
> 
> ok
> 
> > 
> >> +
> >> +	/*
> >> +	 * If the source doesn't already have an IPI, allocate
> >> +	 * one and get the corresponding data
> >> +	 */
> >> +	if (!state->ipi_number) {
> >> +		state->ipi_number = xive_native_alloc_irq();
> >> +		if (state->ipi_number == 0) {
> >> +			pr_err("Failed to allocate IRQ !\n");
> >> +			return -ENXIO;
> >> +		}
> >> +		xive_native_populate_irq_data(state->ipi_number,
> >> +					      &state->ipi_data);
> >> +		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
> >> +			 state->ipi_number, irq);
> >> +	}
> >> +
> >> +	arch_spin_lock(&sb->lock);
> > 
> > Why the direct call to arch_spin_lock() rather than just spin_lock()?
> 
> Paul answered this question but may be I should make the effort to
> decouple both devices on this aspect. 
> 
> Thanks,
> 
> C. 
> 
> >> +
> >> +	/* Restore LSI state */
> >> +	if (val & KVM_XIVE_LEVEL_SENSITIVE) {
> >> +		state->lsi = true;
> >> +		if (val & KVM_XIVE_LEVEL_ASSERTED)
> >> +			state->asserted = true;
> >> +		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
> >> +	}
> >> +
> >> +	/* Mask IRQ to start with */
> >> +	state->act_server = 0;
> >> +	state->act_priority = MASKED;
> >> +	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
> >> +	xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
> >> +
> >> +	/* Increment the number of valid sources and mark this one valid */
> >> +	if (!state->valid)
> >> +		xive->src_count++;
> >> +	state->valid = true;
> >> +
> >> +	arch_spin_unlock(&sb->lock);
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
> >>  				       struct kvm_device_attr *attr)
> >>  {
> >> +	struct kvmppc_xive *xive = dev->private;
> >> +
> >>  	switch (attr->group) {
> >>  	case KVM_DEV_XIVE_GRP_CTRL:
> >>  		break;
> >> +	case KVM_DEV_XIVE_GRP_SOURCE:
> >> +		return kvmppc_xive_native_set_source(xive, attr->attr,
> >> +						     attr->addr);
> >>  	}
> >>  	return -ENXIO;
> >>  }
> >> @@ -175,6 +275,11 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
> >>  	switch (attr->group) {
> >>  	case KVM_DEV_XIVE_GRP_CTRL:
> >>  		break;
> >> +	case KVM_DEV_XIVE_GRP_SOURCE:
> >> +		if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
> >> +		    attr->attr < KVMPPC_XIVE_NR_IRQS)
> >> +			return 0;
> >> +		break;
> >>  	}
> >>  	return -ENXIO;
> >>  }
> >> @@ -183,6 +288,7 @@ static void kvmppc_xive_native_free(struct kvm_device *dev)
> >>  {
> >>  	struct kvmppc_xive *xive = dev->private;
> >>  	struct kvm *kvm = xive->kvm;
> >> +	int i;
> >>  
> >>  	debugfs_remove(xive->dentry);
> >>  
> >> @@ -191,6 +297,14 @@ static void kvmppc_xive_native_free(struct kvm_device *dev)
> >>  	if (kvm)
> >>  		kvm->arch.xive = NULL;
> >>  
> >> +	/* Mask and free interrupts */
> >> +	for (i = 0; i <= xive->max_sbid; i++) {
> >> +		if (xive->src_blocks[i])
> >> +			kvmppc_xive_free_sources(xive->src_blocks[i]);
> >> +		kfree(xive->src_blocks[i]);
> >> +		xive->src_blocks[i] = NULL;
> >> +	}
> >> +
> >>  	if (xive->vp_base != XIVE_INVALID_VP)
> >>  		xive_native_free_vp_block(xive->vp_base);
> >>  
> >> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
> >> index fdbd2ff92a88..cd8bfc37b72e 100644
> >> --- a/Documentation/virtual/kvm/devices/xive.txt
> >> +++ b/Documentation/virtual/kvm/devices/xive.txt
> >> @@ -17,3 +17,18 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
> >>  
> >>    1. KVM_DEV_XIVE_GRP_CTRL
> >>    Provides global controls on the device
> >> +
> >> +  2. KVM_DEV_XIVE_GRP_SOURCE (write only)
> >> +  Initializes a new source in the XIVE device and mask it.
> >> +  Attributes:
> >> +    Interrupt source number  (64-bit)
> >> +  The kvm_device_attr.addr points to a __u64 value:
> >> +  bits:     | 63   ....  2 |   1   |   0
> >> +  values:   |    unused    | level | type
> >> +  - type:  0:MSI 1:LSI
> >> +  - level: assertion level in case of an LSI.
> >> +  Errors:
> >> +    -E2BIG:  Interrupt source number is out of range
> >> +    -ENOMEM: Could not create a new source block
> >> +    -EFAULT: Invalid user pointer for attr->addr.
> >> +    -ENXIO:  Could not allocate underlying HW interrupt
> > 
>
diff mbox series

Patch

diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index b002c0c67787..a9ad99f2a11b 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -677,5 +677,10 @@  struct kvm_ppc_cpu_char {
 
 /* POWER9 XIVE Native Interrupt Controller */
 #define KVM_DEV_XIVE_GRP_CTRL		1
+#define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source attributes */
+
+/* Layout of 64-bit XIVE source attribute values */
+#define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
+#define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
 
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index bcb1bbcf0359..f22f2d46d0f0 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -12,6 +12,13 @@ 
 #ifdef CONFIG_KVM_XICS
 #include "book3s_xics.h"
 
+/*
+ * The XIVE IRQ number space is aligned with the XICS IRQ number
+ * space, CPU IPIs being allocated in the first 4K.
+ */
+#define KVMPPC_XIVE_FIRST_IRQ	0
+#define KVMPPC_XIVE_NR_IRQS	KVMPPC_XICS_NR_IRQS
+
 /*
  * State for one guest irq source.
  *
@@ -253,6 +260,9 @@  extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
  */
 void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu);
 int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu);
+struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
+	struct kvmppc_xive *xive, int irq);
+void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);
 
 #endif /* CONFIG_KVM_XICS */
 #endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index d1cc18a5b1c4..6f950ecb3592 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1485,8 +1485,8 @@  static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
 	return 0;
 }
 
-static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
-							   int irq)
+struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
+	struct kvmppc_xive *xive, int irq)
 {
 	struct kvm *kvm = xive->kvm;
 	struct kvmppc_xive_src_block *sb;
@@ -1565,7 +1565,7 @@  static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
 	sb = kvmppc_xive_find_source(xive, irq, &idx);
 	if (!sb) {
 		pr_devel("No source, creating source block...\n");
-		sb = xive_create_src_block(xive, irq);
+		sb = kvmppc_xive_create_src_block(xive, irq);
 		if (!sb) {
 			pr_devel("Failed to create block...\n");
 			return -ENOMEM;
@@ -1789,7 +1789,7 @@  static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
 	xive_cleanup_irq_data(xd);
 }
 
-static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
+void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
 {
 	int i;
 
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 1f3da47a4a6a..a9b2d2d9af99 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -31,6 +31,29 @@ 
 
 #include "book3s_xive.h"
 
+/*
+ * TODO: introduce a common template file with the XIVE native layer
+ * and the XICS-on-XIVE glue for the utility functions
+ */
+#define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_mmio))
+#define __x_trig_page(xd)	((void __iomem *)((xd)->trig_mmio))
+#define __x_readq	__raw_readq
+#define __x_writeq	__raw_writeq
+
+static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
+{
+	u64 val;
+
+	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
+		offset |= offset << 4;
+
+	val = __x_readq(__x_eoi_page(xd) + offset);
+#ifdef __LITTLE_ENDIAN__
+	val >>= 64-8;
+#endif
+	return (u8)val;
+}
+
 static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
 {
 	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@@ -153,12 +176,89 @@  int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
 	return rc;
 }
 
+static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
+					 u64 addr)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u64 val;
+	u16 idx;
+
+	pr_devel("%s irq=0x%lx\n", __func__, irq);
+
+	if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
+		return -E2BIG;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb) {
+		pr_debug("No source, creating source block...\n");
+		sb = kvmppc_xive_create_src_block(xive, irq);
+		if (!sb) {
+			pr_err("Failed to create block...\n");
+			return -ENOMEM;
+		}
+	}
+	state = &sb->irq_state[idx];
+
+	if (get_user(val, ubufp)) {
+		pr_err("fault getting user info !\n");
+		return -EFAULT;
+	}
+
+	/*
+	 * If the source doesn't already have an IPI, allocate
+	 * one and get the corresponding data
+	 */
+	if (!state->ipi_number) {
+		state->ipi_number = xive_native_alloc_irq();
+		if (state->ipi_number == 0) {
+			pr_err("Failed to allocate IRQ !\n");
+			return -ENXIO;
+		}
+		xive_native_populate_irq_data(state->ipi_number,
+					      &state->ipi_data);
+		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
+			 state->ipi_number, irq);
+	}
+
+	arch_spin_lock(&sb->lock);
+
+	/* Restore LSI state */
+	if (val & KVM_XIVE_LEVEL_SENSITIVE) {
+		state->lsi = true;
+		if (val & KVM_XIVE_LEVEL_ASSERTED)
+			state->asserted = true;
+		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
+	}
+
+	/* Mask IRQ to start with */
+	state->act_server = 0;
+	state->act_priority = MASKED;
+	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+	xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
+
+	/* Increment the number of valid sources and mark this one valid */
+	if (!state->valid)
+		xive->src_count++;
+	state->valid = true;
+
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+
 static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
 				       struct kvm_device_attr *attr)
 {
+	struct kvmppc_xive *xive = dev->private;
+
 	switch (attr->group) {
 	case KVM_DEV_XIVE_GRP_CTRL:
 		break;
+	case KVM_DEV_XIVE_GRP_SOURCE:
+		return kvmppc_xive_native_set_source(xive, attr->attr,
+						     attr->addr);
 	}
 	return -ENXIO;
 }
@@ -175,6 +275,11 @@  static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
 	switch (attr->group) {
 	case KVM_DEV_XIVE_GRP_CTRL:
 		break;
+	case KVM_DEV_XIVE_GRP_SOURCE:
+		if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
+		    attr->attr < KVMPPC_XIVE_NR_IRQS)
+			return 0;
+		break;
 	}
 	return -ENXIO;
 }
@@ -183,6 +288,7 @@  static void kvmppc_xive_native_free(struct kvm_device *dev)
 {
 	struct kvmppc_xive *xive = dev->private;
 	struct kvm *kvm = xive->kvm;
+	int i;
 
 	debugfs_remove(xive->dentry);
 
@@ -191,6 +297,14 @@  static void kvmppc_xive_native_free(struct kvm_device *dev)
 	if (kvm)
 		kvm->arch.xive = NULL;
 
+	/* Mask and free interrupts */
+	for (i = 0; i <= xive->max_sbid; i++) {
+		if (xive->src_blocks[i])
+			kvmppc_xive_free_sources(xive->src_blocks[i]);
+		kfree(xive->src_blocks[i]);
+		xive->src_blocks[i] = NULL;
+	}
+
 	if (xive->vp_base != XIVE_INVALID_VP)
 		xive_native_free_vp_block(xive->vp_base);
 
diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
index fdbd2ff92a88..cd8bfc37b72e 100644
--- a/Documentation/virtual/kvm/devices/xive.txt
+++ b/Documentation/virtual/kvm/devices/xive.txt
@@ -17,3 +17,18 @@  the legacy interrupt mode, referred as XICS (POWER7/8).
 
   1. KVM_DEV_XIVE_GRP_CTRL
   Provides global controls on the device
+
+  2. KVM_DEV_XIVE_GRP_SOURCE (write only)
+  Initializes a new source in the XIVE device and mask it.
+  Attributes:
+    Interrupt source number  (64-bit)
+  The kvm_device_attr.addr points to a __u64 value:
+  bits:     | 63   ....  2 |   1   |   0
+  values:   |    unused    | level | type
+  - type:  0:MSI 1:LSI
+  - level: assertion level in case of an LSI.
+  Errors:
+    -E2BIG:  Interrupt source number is out of range
+    -ENOMEM: Could not create a new source block
+    -EFAULT: Invalid user pointer for attr->addr.
+    -ENXIO:  Could not allocate underlying HW interrupt