[06/19] KVM: PPC: Book3S HV: add a GET_ESB_FD control to the XIVE native device
diff mbox series

Message ID 20190107184331.8429-7-clg@kaod.org
State Changes Requested
Headers show
Series
  • KVM: PPC: Book3S HV: add XIVE native exploitation mode
Related show

Commit Message

Cédric Le Goater Jan. 7, 2019, 6:43 p.m. UTC
This will let the guest create a memory mapping to expose the ESB MMIO
regions used to control the interrupt sources, to trigger events, to
EOI or to turn off the sources.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
 arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)

Comments

Paul Mackerras Jan. 22, 2019, 5:09 a.m. UTC | #1
On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
> This will let the guest create a memory mapping to expose the ESB MMIO
> regions used to control the interrupt sources, to trigger events, to
> EOI or to turn off the sources.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
>  2 files changed, 101 insertions(+)
> 
> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> index 8c876c166ef2..6bb61ba141c2 100644
> --- a/arch/powerpc/include/uapi/asm/kvm.h
> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
>  #define  KVM_XICS_QUEUED		(1ULL << 44)
>  
> +/* POWER9 XIVE Native Interrupt Controller */
> +#define KVM_DEV_XIVE_GRP_CTRL		1
> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
> +
>  #endif /* __LINUX_KVM_POWERPC_H */
> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> index 115143e76c45..e20081f0c8d4 100644
> --- a/arch/powerpc/kvm/book3s_xive_native.c
> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> @@ -153,6 +153,85 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>  	return rc;
>  }
>  
> +static int xive_native_esb_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct kvmppc_xive *xive = vma->vm_file->private_data;
> +	struct kvmppc_xive_src_block *sb;
> +	struct kvmppc_xive_irq_state *state;
> +	struct xive_irq_data *xd;
> +	u32 hw_num;
> +	u16 src;
> +	u64 page;
> +	unsigned long irq;
> +
> +	/*
> +	 * Linux/KVM uses a two pages ESB setting, one for trigger and
> +	 * one for EOI
> +	 */
> +	irq = vmf->pgoff / 2;
> +
> +	sb = kvmppc_xive_find_source(xive, irq, &src);
> +	if (!sb) {
> +		pr_err("%s: source %lx not found !\n", __func__, irq);

In general it's a bad idea to have a printk that userspace can trigger
at will without any rate-limiting.  Is there a real reason why this
printk is needed (and can't be pr_devel)?

> +		return VM_FAULT_SIGBUS;
> +	}
> +
> +	state = &sb->irq_state[src];
> +	kvmppc_xive_select_irq(state, &hw_num, &xd);
> +
> +	arch_spin_lock(&sb->lock);
> +
> +	/*
> +	 * first/even page is for trigger
> +	 * second/odd page is for EOI and management.
> +	 */
> +	page = vmf->pgoff % 2 ? xd->eoi_page : xd->trig_page;
> +	arch_spin_unlock(&sb->lock);
> +
> +	if (!page) {
> +		pr_err("%s: acessing invalid ESB page for source %lx !\n",
> +		       __func__, irq);

Does this represent a exceptional condition that userspace can't just
trigger at will (i.e. it implies the presence of a kernel bug)?  If
not then the same comment as above applies.

Paul.
Cédric Le Goater Jan. 23, 2019, 4:48 p.m. UTC | #2
On 1/22/19 6:09 AM, Paul Mackerras wrote:
> On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
>> This will let the guest create a memory mapping to expose the ESB MMIO
>> regions used to control the interrupt sources, to trigger events, to
>> EOI or to turn off the sources.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
>>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
>>  2 files changed, 101 insertions(+)
>>
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>> index 8c876c166ef2..6bb61ba141c2 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
>>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
>>  #define  KVM_XICS_QUEUED		(1ULL << 44)
>>  
>> +/* POWER9 XIVE Native Interrupt Controller */
>> +#define KVM_DEV_XIVE_GRP_CTRL		1
>> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
>> +
>>  #endif /* __LINUX_KVM_POWERPC_H */
>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
>> index 115143e76c45..e20081f0c8d4 100644
>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>> @@ -153,6 +153,85 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>>  	return rc;
>>  }
>>  
>> +static int xive_native_esb_fault(struct vm_fault *vmf)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	struct kvmppc_xive *xive = vma->vm_file->private_data;
>> +	struct kvmppc_xive_src_block *sb;
>> +	struct kvmppc_xive_irq_state *state;
>> +	struct xive_irq_data *xd;
>> +	u32 hw_num;
>> +	u16 src;
>> +	u64 page;
>> +	unsigned long irq;
>> +
>> +	/*
>> +	 * Linux/KVM uses a two pages ESB setting, one for trigger and
>> +	 * one for EOI
>> +	 */
>> +	irq = vmf->pgoff / 2;
>> +
>> +	sb = kvmppc_xive_find_source(xive, irq, &src);
>> +	if (!sb) {
>> +		pr_err("%s: source %lx not found !\n", __func__, irq);
> 
> In general it's a bad idea to have a printk that userspace can trigger
> at will without any rate-limiting.  Is there a real reason why this
> printk is needed (and can't be pr_devel)?

yes. It should. The SIGBUS is enough to know what's going on. 

> 
>> +		return VM_FAULT_SIGBUS;
>> +	}
>> +
>> +	state = &sb->irq_state[src];
>> +	kvmppc_xive_select_irq(state, &hw_num, &xd);
>> +
>> +	arch_spin_lock(&sb->lock);
>> +
>> +	/*
>> +	 * first/even page is for trigger
>> +	 * second/odd page is for EOI and management.
>> +	 */
>> +	page = vmf->pgoff % 2 ? xd->eoi_page : xd->trig_page;
>> +	arch_spin_unlock(&sb->lock);
>> +
>> +	if (!page) {
>> +		pr_err("%s: acessing invalid ESB page for source %lx !\n",
>> +		       __func__, irq);
> 
> Does this represent a exceptional condition that userspace can't just
> trigger at will (i.e. it implies the presence of a kernel bug)?  If
> not then the same comment as above applies.

Not having an ESB page (trigger or EOI) implies that the xive_irq_data 
for the source is bogus. This probably deserves a WARN().

Thanks,

C.
David Gibson Feb. 4, 2019, 4:45 a.m. UTC | #3
On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
> This will let the guest create a memory mapping to expose the ESB MMIO
> regions used to control the interrupt sources, to trigger events, to
> EOI or to turn off the sources.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
>  2 files changed, 101 insertions(+)
> 
> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> index 8c876c166ef2..6bb61ba141c2 100644
> --- a/arch/powerpc/include/uapi/asm/kvm.h
> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
>  #define  KVM_XICS_QUEUED		(1ULL << 44)
>  
> +/* POWER9 XIVE Native Interrupt Controller */
> +#define KVM_DEV_XIVE_GRP_CTRL		1
> +#define   KVM_DEV_XIVE_GET_ESB_FD	1

Introducing a new FD for ESB and TIMA seems overkill.  Can't you get
to both with an mmap() directly on the xive device fd?  Using the
offset to distinguish which one to map, obviously.

>  #endif /* __LINUX_KVM_POWERPC_H */
> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> index 115143e76c45..e20081f0c8d4 100644
> --- a/arch/powerpc/kvm/book3s_xive_native.c
> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> @@ -153,6 +153,85 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>  	return rc;
>  }
>  
> +static int xive_native_esb_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct kvmppc_xive *xive = vma->vm_file->private_data;
> +	struct kvmppc_xive_src_block *sb;
> +	struct kvmppc_xive_irq_state *state;
> +	struct xive_irq_data *xd;
> +	u32 hw_num;
> +	u16 src;
> +	u64 page;
> +	unsigned long irq;
> +
> +	/*
> +	 * Linux/KVM uses a two pages ESB setting, one for trigger and
> +	 * one for EOI
> +	 */
> +	irq = vmf->pgoff / 2;
> +
> +	sb = kvmppc_xive_find_source(xive, irq, &src);
> +	if (!sb) {
> +		pr_err("%s: source %lx not found !\n", __func__, irq);
> +		return VM_FAULT_SIGBUS;
> +	}
> +
> +	state = &sb->irq_state[src];
> +	kvmppc_xive_select_irq(state, &hw_num, &xd);
> +
> +	arch_spin_lock(&sb->lock);
> +
> +	/*
> +	 * first/even page is for trigger
> +	 * second/odd page is for EOI and management.
> +	 */
> +	page = vmf->pgoff % 2 ? xd->eoi_page : xd->trig_page;
> +	arch_spin_unlock(&sb->lock);
> +
> +	if (!page) {
> +		pr_err("%s: acessing invalid ESB page for source %lx !\n",
> +		       __func__, irq);
> +		return VM_FAULT_SIGBUS;
> +	}
> +
> +	vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
> +	return VM_FAULT_NOPAGE;
> +}
> +
> +static const struct vm_operations_struct xive_native_esb_vmops = {
> +	.fault = xive_native_esb_fault,
> +};
> +
> +static int xive_native_esb_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	/* There are two ESB pages (trigger and EOI) per IRQ */
> +	if (vma_pages(vma) + vma->vm_pgoff > KVMPPC_XIVE_NR_IRQS * 2)
> +		return -EINVAL;
> +
> +	vma->vm_flags |= VM_IO | VM_PFNMAP;
> +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +	vma->vm_ops = &xive_native_esb_vmops;
> +	return 0;
> +}
> +
> +static const struct file_operations xive_native_esb_fops = {
> +	.mmap = xive_native_esb_mmap,
> +};
> +
> +static int kvmppc_xive_native_get_esb_fd(struct kvmppc_xive *xive, u64 addr)
> +{
> +	u64 __user *ubufp = (u64 __user *) addr;
> +	int ret;
> +
> +	ret = anon_inode_getfd("[xive-esb]", &xive_native_esb_fops, xive,
> +				O_RDWR | O_CLOEXEC);
> +	if (ret < 0)
> +		return ret;
> +
> +	return put_user(ret, ubufp);
> +}
> +
>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>  				       struct kvm_device_attr *attr)
>  {
> @@ -162,12 +241,30 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>  static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
>  				       struct kvm_device_attr *attr)
>  {
> +	struct kvmppc_xive *xive = dev->private;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_XIVE_GRP_CTRL:
> +		switch (attr->attr) {
> +		case KVM_DEV_XIVE_GET_ESB_FD:
> +			return kvmppc_xive_native_get_esb_fd(xive, attr->addr);
> +		}
> +		break;
> +	}
>  	return -ENXIO;
>  }
>  
>  static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
>  				       struct kvm_device_attr *attr)
>  {
> +	switch (attr->group) {
> +	case KVM_DEV_XIVE_GRP_CTRL:
> +		switch (attr->attr) {
> +		case KVM_DEV_XIVE_GET_ESB_FD:
> +			return 0;
> +		}
> +		break;
> +	}
>  	return -ENXIO;
>  }
>
Cédric Le Goater Feb. 4, 2019, 11:30 a.m. UTC | #4
On 2/4/19 5:45 AM, David Gibson wrote:
> On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
>> This will let the guest create a memory mapping to expose the ESB MMIO
>> regions used to control the interrupt sources, to trigger events, to
>> EOI or to turn off the sources.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
>>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
>>  2 files changed, 101 insertions(+)
>>
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>> index 8c876c166ef2..6bb61ba141c2 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
>>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
>>  #define  KVM_XICS_QUEUED		(1ULL << 44)
>>  
>> +/* POWER9 XIVE Native Interrupt Controller */
>> +#define KVM_DEV_XIVE_GRP_CTRL		1
>> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
> 
> Introducing a new FD for ESB and TIMA seems overkill.  Can't you get
> to both with an mmap() directly on the xive device fd?  Using the
> offset to distinguish which one to map, obviously.

The page offset would define some sort of user API. It seems feasible.
But I am not sure this would be practical in the future if we need to 
tune the length.

The TIMA has two pages that can be exposed at guest level for interrupt 
management : the OS and the USER page. That should be OK.

But we might want to map only portions of the interrupt ESB space, for 
PCI passthrough for instance as Paul proposed. I am still looking at that.

Thanks,

C.

>>  #endif /* __LINUX_KVM_POWERPC_H */
>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
>> index 115143e76c45..e20081f0c8d4 100644
>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>> @@ -153,6 +153,85 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>>  	return rc;
>>  }
>>  
>> +static int xive_native_esb_fault(struct vm_fault *vmf)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	struct kvmppc_xive *xive = vma->vm_file->private_data;
>> +	struct kvmppc_xive_src_block *sb;
>> +	struct kvmppc_xive_irq_state *state;
>> +	struct xive_irq_data *xd;
>> +	u32 hw_num;
>> +	u16 src;
>> +	u64 page;
>> +	unsigned long irq;
>> +
>> +	/*
>> +	 * Linux/KVM uses a two pages ESB setting, one for trigger and
>> +	 * one for EOI
>> +	 */
>> +	irq = vmf->pgoff / 2;
>> +
>> +	sb = kvmppc_xive_find_source(xive, irq, &src);
>> +	if (!sb) {
>> +		pr_err("%s: source %lx not found !\n", __func__, irq);
>> +		return VM_FAULT_SIGBUS;
>> +	}
>> +
>> +	state = &sb->irq_state[src];
>> +	kvmppc_xive_select_irq(state, &hw_num, &xd);
>> +
>> +	arch_spin_lock(&sb->lock);
>> +
>> +	/*
>> +	 * first/even page is for trigger
>> +	 * second/odd page is for EOI and management.
>> +	 */
>> +	page = vmf->pgoff % 2 ? xd->eoi_page : xd->trig_page;
>> +	arch_spin_unlock(&sb->lock);
>> +
>> +	if (!page) {
>> +		pr_err("%s: acessing invalid ESB page for source %lx !\n",
>> +		       __func__, irq);
>> +		return VM_FAULT_SIGBUS;
>> +	}
>> +
>> +	vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
>> +	return VM_FAULT_NOPAGE;
>> +}
>> +
>> +static const struct vm_operations_struct xive_native_esb_vmops = {
>> +	.fault = xive_native_esb_fault,
>> +};
>> +
>> +static int xive_native_esb_mmap(struct file *file, struct vm_area_struct *vma)
>> +{
>> +	/* There are two ESB pages (trigger and EOI) per IRQ */
>> +	if (vma_pages(vma) + vma->vm_pgoff > KVMPPC_XIVE_NR_IRQS * 2)
>> +		return -EINVAL;
>> +
>> +	vma->vm_flags |= VM_IO | VM_PFNMAP;
>> +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
>> +	vma->vm_ops = &xive_native_esb_vmops;
>> +	return 0;
>> +}
>> +
>> +static const struct file_operations xive_native_esb_fops = {
>> +	.mmap = xive_native_esb_mmap,
>> +};
>> +
>> +static int kvmppc_xive_native_get_esb_fd(struct kvmppc_xive *xive, u64 addr)
>> +{
>> +	u64 __user *ubufp = (u64 __user *) addr;
>> +	int ret;
>> +
>> +	ret = anon_inode_getfd("[xive-esb]", &xive_native_esb_fops, xive,
>> +				O_RDWR | O_CLOEXEC);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	return put_user(ret, ubufp);
>> +}
>> +
>>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>>  				       struct kvm_device_attr *attr)
>>  {
>> @@ -162,12 +241,30 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>>  static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
>>  				       struct kvm_device_attr *attr)
>>  {
>> +	struct kvmppc_xive *xive = dev->private;
>> +
>> +	switch (attr->group) {
>> +	case KVM_DEV_XIVE_GRP_CTRL:
>> +		switch (attr->attr) {
>> +		case KVM_DEV_XIVE_GET_ESB_FD:
>> +			return kvmppc_xive_native_get_esb_fd(xive, attr->addr);
>> +		}
>> +		break;
>> +	}
>>  	return -ENXIO;
>>  }
>>  
>>  static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
>>  				       struct kvm_device_attr *attr)
>>  {
>> +	switch (attr->group) {
>> +	case KVM_DEV_XIVE_GRP_CTRL:
>> +		switch (attr->attr) {
>> +		case KVM_DEV_XIVE_GET_ESB_FD:
>> +			return 0;
>> +		}
>> +		break;
>> +	}
>>  	return -ENXIO;
>>  }
>>  
>
David Gibson Feb. 5, 2019, 5:28 a.m. UTC | #5
On Mon, Feb 04, 2019 at 12:30:39PM +0100, Cédric Le Goater wrote:
> On 2/4/19 5:45 AM, David Gibson wrote:
> > On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
> >> This will let the guest create a memory mapping to expose the ESB MMIO
> >> regions used to control the interrupt sources, to trigger events, to
> >> EOI or to turn off the sources.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
> >>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
> >>  2 files changed, 101 insertions(+)
> >>
> >> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> >> index 8c876c166ef2..6bb61ba141c2 100644
> >> --- a/arch/powerpc/include/uapi/asm/kvm.h
> >> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> >> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
> >>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
> >>  #define  KVM_XICS_QUEUED		(1ULL << 44)
> >>  
> >> +/* POWER9 XIVE Native Interrupt Controller */
> >> +#define KVM_DEV_XIVE_GRP_CTRL		1
> >> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
> > 
> > Introducing a new FD for ESB and TIMA seems overkill.  Can't you get
> > to both with an mmap() directly on the xive device fd?  Using the
> > offset to distinguish which one to map, obviously.
> 
> The page offset would define some sort of user API. It seems feasible.
> But I am not sure this would be practical in the future if we need to 
> tune the length.

Um.. why not?  I mean, yes the XIVE supports rather a lot of
interrupts, but we have 64-bits of offset we can play with - we can
leave room for billions of ESB slots and still have room for billions
of VPs.

> The TIMA has two pages that can be exposed at guest level for interrupt 
> management : the OS and the USER page. That should be OK.
> 
> But we might want to map only portions of the interrupt ESB space, for 
> PCI passthrough for instance as Paul proposed. I am still looking at that.
> 
> Thanks,
> 
> C.
> 
> >>  #endif /* __LINUX_KVM_POWERPC_H */
> >> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> >> index 115143e76c45..e20081f0c8d4 100644
> >> --- a/arch/powerpc/kvm/book3s_xive_native.c
> >> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> >> @@ -153,6 +153,85 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
> >>  	return rc;
> >>  }
> >>  
> >> +static int xive_native_esb_fault(struct vm_fault *vmf)
> >> +{
> >> +	struct vm_area_struct *vma = vmf->vma;
> >> +	struct kvmppc_xive *xive = vma->vm_file->private_data;
> >> +	struct kvmppc_xive_src_block *sb;
> >> +	struct kvmppc_xive_irq_state *state;
> >> +	struct xive_irq_data *xd;
> >> +	u32 hw_num;
> >> +	u16 src;
> >> +	u64 page;
> >> +	unsigned long irq;
> >> +
> >> +	/*
> >> +	 * Linux/KVM uses a two pages ESB setting, one for trigger and
> >> +	 * one for EOI
> >> +	 */
> >> +	irq = vmf->pgoff / 2;
> >> +
> >> +	sb = kvmppc_xive_find_source(xive, irq, &src);
> >> +	if (!sb) {
> >> +		pr_err("%s: source %lx not found !\n", __func__, irq);
> >> +		return VM_FAULT_SIGBUS;
> >> +	}
> >> +
> >> +	state = &sb->irq_state[src];
> >> +	kvmppc_xive_select_irq(state, &hw_num, &xd);
> >> +
> >> +	arch_spin_lock(&sb->lock);
> >> +
> >> +	/*
> >> +	 * first/even page is for trigger
> >> +	 * second/odd page is for EOI and management.
> >> +	 */
> >> +	page = vmf->pgoff % 2 ? xd->eoi_page : xd->trig_page;
> >> +	arch_spin_unlock(&sb->lock);
> >> +
> >> +	if (!page) {
> >> +		pr_err("%s: acessing invalid ESB page for source %lx !\n",
> >> +		       __func__, irq);
> >> +		return VM_FAULT_SIGBUS;
> >> +	}
> >> +
> >> +	vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
> >> +	return VM_FAULT_NOPAGE;
> >> +}
> >> +
> >> +static const struct vm_operations_struct xive_native_esb_vmops = {
> >> +	.fault = xive_native_esb_fault,
> >> +};
> >> +
> >> +static int xive_native_esb_mmap(struct file *file, struct vm_area_struct *vma)
> >> +{
> >> +	/* There are two ESB pages (trigger and EOI) per IRQ */
> >> +	if (vma_pages(vma) + vma->vm_pgoff > KVMPPC_XIVE_NR_IRQS * 2)
> >> +		return -EINVAL;
> >> +
> >> +	vma->vm_flags |= VM_IO | VM_PFNMAP;
> >> +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> >> +	vma->vm_ops = &xive_native_esb_vmops;
> >> +	return 0;
> >> +}
> >> +
> >> +static const struct file_operations xive_native_esb_fops = {
> >> +	.mmap = xive_native_esb_mmap,
> >> +};
> >> +
> >> +static int kvmppc_xive_native_get_esb_fd(struct kvmppc_xive *xive, u64 addr)
> >> +{
> >> +	u64 __user *ubufp = (u64 __user *) addr;
> >> +	int ret;
> >> +
> >> +	ret = anon_inode_getfd("[xive-esb]", &xive_native_esb_fops, xive,
> >> +				O_RDWR | O_CLOEXEC);
> >> +	if (ret < 0)
> >> +		return ret;
> >> +
> >> +	return put_user(ret, ubufp);
> >> +}
> >> +
> >>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
> >>  				       struct kvm_device_attr *attr)
> >>  {
> >> @@ -162,12 +241,30 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
> >>  static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
> >>  				       struct kvm_device_attr *attr)
> >>  {
> >> +	struct kvmppc_xive *xive = dev->private;
> >> +
> >> +	switch (attr->group) {
> >> +	case KVM_DEV_XIVE_GRP_CTRL:
> >> +		switch (attr->attr) {
> >> +		case KVM_DEV_XIVE_GET_ESB_FD:
> >> +			return kvmppc_xive_native_get_esb_fd(xive, attr->addr);
> >> +		}
> >> +		break;
> >> +	}
> >>  	return -ENXIO;
> >>  }
> >>  
> >>  static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
> >>  				       struct kvm_device_attr *attr)
> >>  {
> >> +	switch (attr->group) {
> >> +	case KVM_DEV_XIVE_GRP_CTRL:
> >> +		switch (attr->attr) {
> >> +		case KVM_DEV_XIVE_GET_ESB_FD:
> >> +			return 0;
> >> +		}
> >> +		break;
> >> +	}
> >>  	return -ENXIO;
> >>  }
> >>  
> > 
>
Cédric Le Goater Feb. 5, 2019, 12:55 p.m. UTC | #6
On 2/5/19 6:28 AM, David Gibson wrote:
> On Mon, Feb 04, 2019 at 12:30:39PM +0100, Cédric Le Goater wrote:
>> On 2/4/19 5:45 AM, David Gibson wrote:
>>> On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
>>>> This will let the guest create a memory mapping to expose the ESB MMIO
>>>> regions used to control the interrupt sources, to trigger events, to
>>>> EOI or to turn off the sources.
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>> ---
>>>>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
>>>>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
>>>>  2 files changed, 101 insertions(+)
>>>>
>>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>>>> index 8c876c166ef2..6bb61ba141c2 100644
>>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>>>> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
>>>>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
>>>>  #define  KVM_XICS_QUEUED		(1ULL << 44)
>>>>  
>>>> +/* POWER9 XIVE Native Interrupt Controller */
>>>> +#define KVM_DEV_XIVE_GRP_CTRL		1
>>>> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
>>>
>>> Introducing a new FD for ESB and TIMA seems overkill.  Can't you get
>>> to both with an mmap() directly on the xive device fd?  Using the
>>> offset to distinguish which one to map, obviously.
>>
>> The page offset would define some sort of user API. It seems feasible.
>> But I am not sure this would be practical in the future if we need to 
>> tune the length.
> 
> Um.. why not?  I mean, yes the XIVE supports rather a lot of
> interrupts, but we have 64-bits of offset we can play with - we can
> leave room for billions of ESB slots and still have room for billions
> of VPs.

So the first 4 pages could be the TIMA pages and then would come  
the pages for the interrupt ESBs. I think that we can have different 
vm_fault handler for each mapping.
 
I wonder how this will work out with pass-through. As Paul said in 
a previous email, it would be better to let QEMU request a new 
mapping to handle the ESB pages of the device being passed through.
I guess this is not a special case, just another offset and length.

I will give it a try.

Thanks,

C.
David Gibson Feb. 6, 2019, 1:23 a.m. UTC | #7
On Tue, Feb 05, 2019 at 01:55:40PM +0100, Cédric Le Goater wrote:
> On 2/5/19 6:28 AM, David Gibson wrote:
> > On Mon, Feb 04, 2019 at 12:30:39PM +0100, Cédric Le Goater wrote:
> >> On 2/4/19 5:45 AM, David Gibson wrote:
> >>> On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
> >>>> This will let the guest create a memory mapping to expose the ESB MMIO
> >>>> regions used to control the interrupt sources, to trigger events, to
> >>>> EOI or to turn off the sources.
> >>>>
> >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>> ---
> >>>>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
> >>>>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
> >>>>  2 files changed, 101 insertions(+)
> >>>>
> >>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> >>>> index 8c876c166ef2..6bb61ba141c2 100644
> >>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
> >>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> >>>> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
> >>>>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
> >>>>  #define  KVM_XICS_QUEUED		(1ULL << 44)
> >>>>  
> >>>> +/* POWER9 XIVE Native Interrupt Controller */
> >>>> +#define KVM_DEV_XIVE_GRP_CTRL		1
> >>>> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
> >>>
> >>> Introducing a new FD for ESB and TIMA seems overkill.  Can't you get
> >>> to both with an mmap() directly on the xive device fd?  Using the
> >>> offset to distinguish which one to map, obviously.
> >>
> >> The page offset would define some sort of user API. It seems feasible.
> >> But I am not sure this would be practical in the future if we need to 
> >> tune the length.
> > 
> > Um.. why not?  I mean, yes the XIVE supports rather a lot of
> > interrupts, but we have 64-bits of offset we can play with - we can
> > leave room for billions of ESB slots and still have room for billions
> > of VPs.
> 
> So the first 4 pages could be the TIMA pages and then would come  
> the pages for the interrupt ESBs. I think that we can have different 
> vm_fault handler for each mapping.

Um.. no, I'm saying you don't need to tightly pack them.  So you could
have the ESB pages at 0, the TIMA at, say offset 2^60.

> I wonder how this will work out with pass-through. As Paul said in 
> a previous email, it would be better to let QEMU request a new 
> mapping to handle the ESB pages of the device being passed through.
> I guess this is not a special case, just another offset and length.

Right, if we need multiple "chunks" of ESB pages we can given them
each their own terabyte or several.  No need to be stingy with address
space.
Cédric Le Goater Feb. 6, 2019, 7:21 a.m. UTC | #8
On 2/6/19 2:23 AM, David Gibson wrote:
> On Tue, Feb 05, 2019 at 01:55:40PM +0100, Cédric Le Goater wrote:
>> On 2/5/19 6:28 AM, David Gibson wrote:
>>> On Mon, Feb 04, 2019 at 12:30:39PM +0100, Cédric Le Goater wrote:
>>>> On 2/4/19 5:45 AM, David Gibson wrote:
>>>>> On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
>>>>>> This will let the guest create a memory mapping to expose the ESB MMIO
>>>>>> regions used to control the interrupt sources, to trigger events, to
>>>>>> EOI or to turn off the sources.
>>>>>>
>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>> ---
>>>>>>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
>>>>>>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
>>>>>>  2 files changed, 101 insertions(+)
>>>>>>
>>>>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>>>>>> index 8c876c166ef2..6bb61ba141c2 100644
>>>>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>>>>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>>>>>> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
>>>>>>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
>>>>>>  #define  KVM_XICS_QUEUED		(1ULL << 44)
>>>>>>  
>>>>>> +/* POWER9 XIVE Native Interrupt Controller */
>>>>>> +#define KVM_DEV_XIVE_GRP_CTRL		1
>>>>>> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
>>>>>
>>>>> Introducing a new FD for ESB and TIMA seems overkill.  Can't you get
>>>>> to both with an mmap() directly on the xive device fd?  Using the
>>>>> offset to distinguish which one to map, obviously.
>>>>
>>>> The page offset would define some sort of user API. It seems feasible.
>>>> But I am not sure this would be practical in the future if we need to 
>>>> tune the length.
>>>
>>> Um.. why not?  I mean, yes the XIVE supports rather a lot of
>>> interrupts, but we have 64-bits of offset we can play with - we can
>>> leave room for billions of ESB slots and still have room for billions
>>> of VPs.
>>
>> So the first 4 pages could be the TIMA pages and then would come  
>> the pages for the interrupt ESBs. I think that we can have different 
>> vm_fault handler for each mapping.
> 
> Um.. no, I'm saying you don't need to tightly pack them.  So you could
> have the ESB pages at 0, the TIMA at, say offset 2^60.

Well, we know that the TIMA is 4 pages wide and is "directly" related
with the KVM interrupt device. So being at offset 0 seems a good idea.
While the ESB segment is of a variable size depending on the number
of IRQs and it can come after I think.

>> I wonder how this will work out with pass-through. As Paul said in 
>> a previous email, it would be better to let QEMU request a new 
>> mapping to handle the ESB pages of the device being passed through.
>> I guess this is not a special case, just another offset and length.
> 
> Right, if we need multiple "chunks" of ESB pages we can given them
> each their own terabyte or several.  No need to be stingy with address
> space.

You can not put them anywhere. They should map the same interrupt range
of ESB pages, overlapping with the underlying segment of IPI ESB pages. 

C.
David Gibson Feb. 7, 2019, 2:49 a.m. UTC | #9
On Wed, Feb 06, 2019 at 08:21:10AM +0100, Cédric Le Goater wrote:
> On 2/6/19 2:23 AM, David Gibson wrote:
> > On Tue, Feb 05, 2019 at 01:55:40PM +0100, Cédric Le Goater wrote:
> >> On 2/5/19 6:28 AM, David Gibson wrote:
> >>> On Mon, Feb 04, 2019 at 12:30:39PM +0100, Cédric Le Goater wrote:
> >>>> On 2/4/19 5:45 AM, David Gibson wrote:
> >>>>> On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
> >>>>>> This will let the guest create a memory mapping to expose the ESB MMIO
> >>>>>> regions used to control the interrupt sources, to trigger events, to
> >>>>>> EOI or to turn off the sources.
> >>>>>>
> >>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>>>> ---
> >>>>>>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
> >>>>>>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
> >>>>>>  2 files changed, 101 insertions(+)
> >>>>>>
> >>>>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> >>>>>> index 8c876c166ef2..6bb61ba141c2 100644
> >>>>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
> >>>>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> >>>>>> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
> >>>>>>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
> >>>>>>  #define  KVM_XICS_QUEUED		(1ULL << 44)
> >>>>>>  
> >>>>>> +/* POWER9 XIVE Native Interrupt Controller */
> >>>>>> +#define KVM_DEV_XIVE_GRP_CTRL		1
> >>>>>> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
> >>>>>
> >>>>> Introducing a new FD for ESB and TIMA seems overkill.  Can't you get
> >>>>> to both with an mmap() directly on the xive device fd?  Using the
> >>>>> offset to distinguish which one to map, obviously.
> >>>>
> >>>> The page offset would define some sort of user API. It seems feasible.
> >>>> But I am not sure this would be practical in the future if we need to 
> >>>> tune the length.
> >>>
> >>> Um.. why not?  I mean, yes the XIVE supports rather a lot of
> >>> interrupts, but we have 64-bits of offset we can play with - we can
> >>> leave room for billions of ESB slots and still have room for billions
> >>> of VPs.
> >>
> >> So the first 4 pages could be the TIMA pages and then would come  
> >> the pages for the interrupt ESBs. I think that we can have different 
> >> vm_fault handler for each mapping.
> > 
> > Um.. no, I'm saying you don't need to tightly pack them.  So you could
> > have the ESB pages at 0, the TIMA at, say offset 2^60.
> 
> Well, we know that the TIMA is 4 pages wide and is "directly" related
> with the KVM interrupt device. So being at offset 0 seems a good idea.
> While the ESB segment is of a variable size depending on the number
> of IRQs and it can come after I think.
> 
> >> I wonder how this will work out with pass-through. As Paul said in 
> >> a previous email, it would be better to let QEMU request a new 
> >> mapping to handle the ESB pages of the device being passed through.
> >> I guess this is not a special case, just another offset and length.
> > 
> > Right, if we need multiple "chunks" of ESB pages we can given them
> > each their own terabyte or several.  No need to be stingy with address
> > space.
> 
> You can not put them anywhere. They should map the same interrupt range
> of ESB pages, overlapping with the underlying segment of IPI ESB pages. 

I don't really follow what you're saying here.
Cédric Le Goater Feb. 7, 2019, 9:03 a.m. UTC | #10
On 2/7/19 3:49 AM, David Gibson wrote:
> On Wed, Feb 06, 2019 at 08:21:10AM +0100, Cédric Le Goater wrote:
>> On 2/6/19 2:23 AM, David Gibson wrote:
>>> On Tue, Feb 05, 2019 at 01:55:40PM +0100, Cédric Le Goater wrote:
>>>> On 2/5/19 6:28 AM, David Gibson wrote:
>>>>> On Mon, Feb 04, 2019 at 12:30:39PM +0100, Cédric Le Goater wrote:
>>>>>> On 2/4/19 5:45 AM, David Gibson wrote:
>>>>>>> On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
>>>>>>>> This will let the guest create a memory mapping to expose the ESB MMIO
>>>>>>>> regions used to control the interrupt sources, to trigger events, to
>>>>>>>> EOI or to turn off the sources.
>>>>>>>>
>>>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>>>> ---
>>>>>>>>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
>>>>>>>>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
>>>>>>>>  2 files changed, 101 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>>>>>>>> index 8c876c166ef2..6bb61ba141c2 100644
>>>>>>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>>>>>>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>>>>>>>> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
>>>>>>>>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
>>>>>>>>  #define  KVM_XICS_QUEUED		(1ULL << 44)
>>>>>>>>  
>>>>>>>> +/* POWER9 XIVE Native Interrupt Controller */
>>>>>>>> +#define KVM_DEV_XIVE_GRP_CTRL		1
>>>>>>>> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
>>>>>>>
>>>>>>> Introducing a new FD for ESB and TIMA seems overkill.  Can't you get
>>>>>>> to both with an mmap() directly on the xive device fd?  Using the
>>>>>>> offset to distinguish which one to map, obviously.
>>>>>>
>>>>>> The page offset would define some sort of user API. It seems feasible.
>>>>>> But I am not sure this would be practical in the future if we need to 
>>>>>> tune the length.
>>>>>
>>>>> Um.. why not?  I mean, yes the XIVE supports rather a lot of
>>>>> interrupts, but we have 64-bits of offset we can play with - we can
>>>>> leave room for billions of ESB slots and still have room for billions
>>>>> of VPs.
>>>>
>>>> So the first 4 pages could be the TIMA pages and then would come  
>>>> the pages for the interrupt ESBs. I think that we can have different 
>>>> vm_fault handler for each mapping.
>>>
>>> Um.. no, I'm saying you don't need to tightly pack them.  So you could
>>> have the ESB pages at 0, the TIMA at, say offset 2^60.
>>
>> Well, we know that the TIMA is 4 pages wide and is "directly" related
>> with the KVM interrupt device. So being at offset 0 seems a good idea.
>> While the ESB segment is of a variable size depending on the number
>> of IRQs and it can come after I think.
>>
>>>> I wonder how this will work out with pass-through. As Paul said in 
>>>> a previous email, it would be better to let QEMU request a new 
>>>> mapping to handle the ESB pages of the device being passed through.
>>>> I guess this is not a special case, just another offset and length.
>>>
>>> Right, if we need multiple "chunks" of ESB pages we can given them
>>> each their own terabyte or several.  No need to be stingy with address
>>> space.
>>
>> You can not put them anywhere. They should map the same interrupt range
>> of ESB pages, overlapping with the underlying segment of IPI ESB pages. 
> 
> I don't really follow what you're saying here.


What we want the guest to access in terms of ESB pages is something like 
below, VMA0 being the initial mapping done by QEMU at offset 0x0, the IPI 
ESB pages being populated on the demand with the loads and the stores from 
the guest :


                 0x0                   0x1100  0x1200    0x1300     
      
         ranges   |       CPU IPIs   .. |  VIO  | PCI LSI |  MSIs
       	  
                  +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
 VMA0    IPI ESB  | | | | | | |     | | | | | | | | | | | | | | | | | |
          pages   +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....



A device is passed through and the driver requests MSIs. 

We now want the guest to access the HW ESB pages for the requested IRQs 
but still the initial IPI ESB pages for the others. Something like below : 


                 0x0                   0x1100  0x1200    0x1300     
      
         ranges   |       CPU IPIs   .. |  VIO  | PCI LSI |  MSIs

                  +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
 VMA0    IPI ESB  | | | | | | |     | | | | | | | | | | | | | | | | | |
          pages   +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
                                                                  
 VMA1    PHB ESB                                          +-------+
          pages                                           | | | | | 
                                                          +-------+

The VMA1 is the result of a new mmap() being done at an offset depending on 
the first IRQ number requested by the driver. 

This is because the vm_fault handler uses the page offset to find the 
associated KVM IRQ struct containing the addresses of the EOI and trigger 
pages in the underlying hardware, which will be the PHB in case of a 
passthrough device.  

From there, the VMA1 mmap() pointer will be used to create a 'ram device'
memory region which will be mapped on top of the initial ESB memory region 
in QEMU. This will override the initial IPI ESB pages with the PHB ESB pages 
in the guest ESB address space. 

That's the plan I have in mind as suggested by Paul if I understood it well.
The mechanics are more complex than the patch zapping the PTEs from the VMA
but it's also safer. 


C.
David Gibson Feb. 8, 2019, 5:15 a.m. UTC | #11
On Thu, Feb 07, 2019 at 10:03:15AM +0100, Cédric Le Goater wrote:
> On 2/7/19 3:49 AM, David Gibson wrote:
> > On Wed, Feb 06, 2019 at 08:21:10AM +0100, Cédric Le Goater wrote:
> >> On 2/6/19 2:23 AM, David Gibson wrote:
> >>> On Tue, Feb 05, 2019 at 01:55:40PM +0100, Cédric Le Goater wrote:
> >>>> On 2/5/19 6:28 AM, David Gibson wrote:
> >>>>> On Mon, Feb 04, 2019 at 12:30:39PM +0100, Cédric Le Goater wrote:
> >>>>>> On 2/4/19 5:45 AM, David Gibson wrote:
> >>>>>>> On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
> >>>>>>>> This will let the guest create a memory mapping to expose the ESB MMIO
> >>>>>>>> regions used to control the interrupt sources, to trigger events, to
> >>>>>>>> EOI or to turn off the sources.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>>>>>> ---
> >>>>>>>>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
> >>>>>>>>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
> >>>>>>>>  2 files changed, 101 insertions(+)
> >>>>>>>>
> >>>>>>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> >>>>>>>> index 8c876c166ef2..6bb61ba141c2 100644
> >>>>>>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
> >>>>>>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> >>>>>>>> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
> >>>>>>>>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
> >>>>>>>>  #define  KVM_XICS_QUEUED		(1ULL << 44)
> >>>>>>>>  
> >>>>>>>> +/* POWER9 XIVE Native Interrupt Controller */
> >>>>>>>> +#define KVM_DEV_XIVE_GRP_CTRL		1
> >>>>>>>> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
> >>>>>>>
> >>>>>>> Introducing a new FD for ESB and TIMA seems overkill.  Can't you get
> >>>>>>> to both with an mmap() directly on the xive device fd?  Using the
> >>>>>>> offset to distinguish which one to map, obviously.
> >>>>>>
> >>>>>> The page offset would define some sort of user API. It seems feasible.
> >>>>>> But I am not sure this would be practical in the future if we need to 
> >>>>>> tune the length.
> >>>>>
> >>>>> Um.. why not?  I mean, yes the XIVE supports rather a lot of
> >>>>> interrupts, but we have 64-bits of offset we can play with - we can
> >>>>> leave room for billions of ESB slots and still have room for billions
> >>>>> of VPs.
> >>>>
> >>>> So the first 4 pages could be the TIMA pages and then would come  
> >>>> the pages for the interrupt ESBs. I think that we can have different 
> >>>> vm_fault handler for each mapping.
> >>>
> >>> Um.. no, I'm saying you don't need to tightly pack them.  So you could
> >>> have the ESB pages at 0, the TIMA at, say offset 2^60.
> >>
> >> Well, we know that the TIMA is 4 pages wide and is "directly" related
> >> with the KVM interrupt device. So being at offset 0 seems a good idea.
> >> While the ESB segment is of a variable size depending on the number
> >> of IRQs and it can come after I think.
> >>
> >>>> I wonder how this will work out with pass-through. As Paul said in 
> >>>> a previous email, it would be better to let QEMU request a new 
> >>>> mapping to handle the ESB pages of the device being passed through.
> >>>> I guess this is not a special case, just another offset and length.
> >>>
> >>> Right, if we need multiple "chunks" of ESB pages we can given them
> >>> each their own terabyte or several.  No need to be stingy with address
> >>> space.
> >>
> >> You can not put them anywhere. They should map the same interrupt range
> >> of ESB pages, overlapping with the underlying segment of IPI ESB pages. 
> > 
> > I don't really follow what you're saying here.
> 
> 
> What we want the guest to access in terms of ESB pages is something like 
> below, VMA0 being the initial mapping done by QEMU at offset 0x0, the IPI 
> ESB pages being populated on the demand with the loads and the stores from 
> the guest :
> 
> 
>                  0x0                   0x1100  0x1200    0x1300     
>       
>          ranges   |       CPU IPIs   .. |  VIO  | PCI LSI |  MSIs
>        	  
>                   +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
>  VMA0    IPI ESB  | | | | | | |     | | | | | | | | | | | | | | | | | |
>           pages   +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
> 
> 
> 
> A device is passed through and the driver requests MSIs. 
> 
> We now want the guest to access the HW ESB pages for the requested IRQs 
> but still the initial IPI ESB pages for the others. Something like below : 
> 
> 
>                  0x0                   0x1100  0x1200    0x1300     
>       
>          ranges   |       CPU IPIs   .. |  VIO  | PCI LSI |  MSIs
> 
>                   +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
>  VMA0    IPI ESB  | | | | | | |     | | | | | | | | | | | | | | | | | |
>           pages   +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
>                                                                   
>  VMA1    PHB ESB                                          +-------+
>           pages                                           | | | | | 
>                                                           +-------+

Right, except of course VMA0 will be split into two pieces by
performing the mmap() over it.

> The VMA1 is the result of a new mmap() being done at an offset depending on 
> the first IRQ number requested by the driver.

Right... that's one way we could do it.  But the irq numbers are all
dynamically allocated here, so could we instead just put the
passthrough MSIs in a separate range?  We'd still need a separate
mmap() for them, but we wouldn't have to deal with mapping over and
unmapping if the device is removed or whatever.

> This is because the vm_fault handler uses the page offset to find the 
> associated KVM IRQ struct containing the addresses of the EOI and trigger 
> pages in the underlying hardware, which will be the PHB in case of a 
> passthrough device.  
> 
> >From there, the VMA1 mmap() pointer will be used to create a 'ram device'
> memory region which will be mapped on top of the initial ESB memory region 
> in QEMU. This will override the initial IPI ESB pages with the PHB ESB pages 
> in the guest ESB address space.

Um.. what?  If that qemu memory range is already mapped into the guest
we don't need to create new RAM devices or anything for the
overmapping.  If we overmap in qemu that will just get carried into
the guest.

> That's the plan I have in mind as suggested by Paul if I understood it well.
> The mechanics are more complex than the patch zapping the PTEs from the VMA
> but it's also safer.

Well, yes, where "safer" means "has the possibility to be correct".
Cédric Le Goater Feb. 8, 2019, 7:58 a.m. UTC | #12
On 2/8/19 6:15 AM, David Gibson wrote:
> On Thu, Feb 07, 2019 at 10:03:15AM +0100, Cédric Le Goater wrote:
>> On 2/7/19 3:49 AM, David Gibson wrote:
>>> On Wed, Feb 06, 2019 at 08:21:10AM +0100, Cédric Le Goater wrote:
>>>> On 2/6/19 2:23 AM, David Gibson wrote:
>>>>> On Tue, Feb 05, 2019 at 01:55:40PM +0100, Cédric Le Goater wrote:
>>>>>> On 2/5/19 6:28 AM, David Gibson wrote:
>>>>>>> On Mon, Feb 04, 2019 at 12:30:39PM +0100, Cédric Le Goater wrote:
>>>>>>>> On 2/4/19 5:45 AM, David Gibson wrote:
>>>>>>>>> On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote:
>>>>>>>>>> This will let the guest create a memory mapping to expose the ESB MMIO
>>>>>>>>>> regions used to control the interrupt sources, to trigger events, to
>>>>>>>>>> EOI or to turn off the sources.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>>>>>> ---
>>>>>>>>>>  arch/powerpc/include/uapi/asm/kvm.h   |  4 ++
>>>>>>>>>>  arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++
>>>>>>>>>>  2 files changed, 101 insertions(+)
>>>>>>>>>>
>>>>>>>>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>>>>>>>>>> index 8c876c166ef2..6bb61ba141c2 100644
>>>>>>>>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>>>>>>>>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>>>>>>>>>> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char {
>>>>>>>>>>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
>>>>>>>>>>  #define  KVM_XICS_QUEUED		(1ULL << 44)
>>>>>>>>>>  
>>>>>>>>>> +/* POWER9 XIVE Native Interrupt Controller */
>>>>>>>>>> +#define KVM_DEV_XIVE_GRP_CTRL		1
>>>>>>>>>> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
>>>>>>>>>
>>>>>>>>> Introducing a new FD for ESB and TIMA seems overkill.  Can't you get
>>>>>>>>> to both with an mmap() directly on the xive device fd?  Using the
>>>>>>>>> offset to distinguish which one to map, obviously.
>>>>>>>>
>>>>>>>> The page offset would define some sort of user API. It seems feasible.
>>>>>>>> But I am not sure this would be practical in the future if we need to 
>>>>>>>> tune the length.
>>>>>>>
>>>>>>> Um.. why not?  I mean, yes the XIVE supports rather a lot of
>>>>>>> interrupts, but we have 64-bits of offset we can play with - we can
>>>>>>> leave room for billions of ESB slots and still have room for billions
>>>>>>> of VPs.
>>>>>>
>>>>>> So the first 4 pages could be the TIMA pages and then would come  
>>>>>> the pages for the interrupt ESBs. I think that we can have different 
>>>>>> vm_fault handler for each mapping.
>>>>>
>>>>> Um.. no, I'm saying you don't need to tightly pack them.  So you could
>>>>> have the ESB pages at 0, the TIMA at, say offset 2^60.
>>>>
>>>> Well, we know that the TIMA is 4 pages wide and is "directly" related
>>>> with the KVM interrupt device. So being at offset 0 seems a good idea.
>>>> While the ESB segment is of a variable size depending on the number
>>>> of IRQs and it can come after I think.
>>>>
>>>>>> I wonder how this will work out with pass-through. As Paul said in 
>>>>>> a previous email, it would be better to let QEMU request a new 
>>>>>> mapping to handle the ESB pages of the device being passed through.
>>>>>> I guess this is not a special case, just another offset and length.
>>>>>
>>>>> Right, if we need multiple "chunks" of ESB pages we can given them
>>>>> each their own terabyte or several.  No need to be stingy with address
>>>>> space.
>>>>
>>>> You can not put them anywhere. They should map the same interrupt range
>>>> of ESB pages, overlapping with the underlying segment of IPI ESB pages. 
>>>
>>> I don't really follow what you're saying here.
>>
>>
>> What we want the guest to access in terms of ESB pages is something like 
>> below, VMA0 being the initial mapping done by QEMU at offset 0x0, the IPI 
>> ESB pages being populated on the demand with the loads and the stores from 
>> the guest :
>>
>>
>>                  0x0                   0x1100  0x1200    0x1300     
>>       
>>          ranges   |       CPU IPIs   .. |  VIO  | PCI LSI |  MSIs
>>        	  
>>                   +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
>>  VMA0    IPI ESB  | | | | | | |     | | | | | | | | | | | | | | | | | |
>>           pages   +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
>>
>>
>>
>> A device is passed through and the driver requests MSIs. 
>>
>> We now want the guest to access the HW ESB pages for the requested IRQs 
>> but still the initial IPI ESB pages for the others. Something like below : 
>>
>>
>>                  0x0                   0x1100  0x1200    0x1300     
>>       
>>          ranges   |       CPU IPIs   .. |  VIO  | PCI LSI |  MSIs
>>
>>                   +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
>>  VMA0    IPI ESB  | | | | | | |     | | | | | | | | | | | | | | | | | |
>>           pages   +-+-+-+-+-+-+-...-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- ....
>>                                                                   
>>  VMA1    PHB ESB                                          +-------+
>>           pages                                           | | | | | 
>>                                                           +-------+
> 
> Right, except of course VMA0 will be split into two pieces by
> performing the mmap() over it.
> 
>> The VMA1 is the result of a new mmap() being done at an offset depending on 
>> the first IRQ number requested by the driver.
> 
> Right... that's one way we could do it.  But the irq numbers are all
> dynamically allocated here, so could we instead just put the
> passthrough MSIs in a separate range?  

Hmm, yes. These are still MSIs. I am not sure of the benefits. See below.

> We'd still need a separate
> mmap() for them, but we wouldn't have to deal with mapping over and
> unmapping if the device is removed or whatever.

How would we handle multiples devices being hot-plugged, hot-unplugged 
and hot-replugged ? The ESB pages would be populated the first time 
they are touched and might not be the correct ones if a new device is 
hot-plugged to the machine. 

>> This is because the vm_fault handler uses the page offset to find the 
>> associated KVM IRQ struct containing the addresses of the EOI and trigger 
>> pages in the underlying hardware, which will be the PHB in case of a 
>> passthrough device.  
>>
>> From there, the VMA1 mmap() pointer will be used to create a 'ram device'
>> memory region which will be mapped on top of the initial ESB memory region 
>> in QEMU. This will override the initial IPI ESB pages with the PHB ESB pages 
>> in the guest ESB address space.
> 
> Um.. what?  If that qemu memory range is already mapped into the guest
> we don't need to create new RAM devices or anything for the
> overmapping.  If we overmap in qemu that will just get carried into
> the guest.

yes, that's the goal. 

When the guest accesses the region, the vm_fault handler will be invoked 
and the VMA will be populated with the ESB pages of the device being 
passthrough. When the device is removed from the machine, we only need 
to delete the region from QEMU and munmap() the VMA to clear the mappings.
The underlying pages will be the ones for the XIVE IC IPIs. 

And the IRQ numbers can be safely recycled for another passthrough device.

>> That's the plan I have in mind as suggested by Paul if I understood it well.
>> The mechanics are more complex than the patch zapping the PTEs from the VMA
>> but it's also safer.
> 
> Well, yes, where "safer" means "has the possibility to be correct".

Well, the only problem with the kernel approach is keeping a pointer on 
the VMA. If we could call find_vma(), it would be perfectly safe and much 
more simpler.

C.
Paul Mackerras Feb. 8, 2019, 9:53 p.m. UTC | #13
On Fri, Feb 08, 2019 at 08:58:14AM +0100, Cédric Le Goater wrote:
> On 2/8/19 6:15 AM, David Gibson wrote:
> > On Thu, Feb 07, 2019 at 10:03:15AM +0100, Cédric Le Goater wrote:
> >> That's the plan I have in mind as suggested by Paul if I understood it well.
> >> The mechanics are more complex than the patch zapping the PTEs from the VMA
> >> but it's also safer.
> > 
> > Well, yes, where "safer" means "has the possibility to be correct".
> 
> Well, the only problem with the kernel approach is keeping a pointer on 
> the VMA. If we could call find_vma(), it would be perfectly safe and much 
> more simpler.

You seem to be assuming that the kernel can easily work out a single
virtual address which will be the only place where a given set of
interrupt pages are mapped.  But that is really not possible in the
general case, because userspace could have mapped the fd at many
different offsets in many different places.

QEMU doesn't do that; in QEMU, the mmaps are sufficiently limited that
it can work out a single virtual address that needs to be changed.
The way that QEMU should tell the kernel what that address is and what
the mapping should be changed to, is via the existing munmap()/mmap()
interface.

Paul.
Cédric Le Goater Feb. 9, 2019, 9:41 a.m. UTC | #14
On 2/8/19 10:53 PM, Paul Mackerras wrote:
> On Fri, Feb 08, 2019 at 08:58:14AM +0100, Cédric Le Goater wrote:
>> On 2/8/19 6:15 AM, David Gibson wrote:
>>> On Thu, Feb 07, 2019 at 10:03:15AM +0100, Cédric Le Goater wrote:
>>>> That's the plan I have in mind as suggested by Paul if I understood it well.
>>>> The mechanics are more complex than the patch zapping the PTEs from the VMA
>>>> but it's also safer.
>>>
>>> Well, yes, where "safer" means "has the possibility to be correct".
>>
>> Well, the only problem with the kernel approach is keeping a pointer on 
>> the VMA. If we could call find_vma(), it would be perfectly safe and much 
>> more simpler.
> 
> You seem to be assuming that the kernel can easily work out a single
> virtual address which will be the only place where a given set of
> interrupt pages are mapped.  But that is really not possible in the
> general case, because userspace could have mapped the fd at many
> different offsets in many different places.
> 
> QEMU doesn't do that; in QEMU, the mmaps are sufficiently limited that
> it can work out a single virtual address that needs to be changed.
> The way that QEMU should tell the kernel what that address is and what
> the mapping should be changed to, is via the existing munmap()/mmap()
> interface.

Yes. We agreed on that. QEMU should handle these mappings somewhere in 
VFIO. It's me grumbling, that's all.

The discussion has moved to the mmap() interface of the KVM device. The 
current proposal adds controls on the device creating fds to mmap() the 
TIMA pages and the ESB pages. David is proposing to use directly the fd 
of the KVM device to mmap() these pages with a different offset for each 
set. 

I think that should work pretty well, for passthrough also. The fault 
handler should take care of populating the VMA(s) with the appropriate 
pages. 

We might support END notification one day, so we should have room for 
these pages. And nested might require IRQ space extensions at L1. 
something to keep in mind.

C.
David Gibson Feb. 11, 2019, 2:38 a.m. UTC | #15
On Sat, Feb 09, 2019 at 10:41:38AM +0100, Cédric Le Goater wrote:
> On 2/8/19 10:53 PM, Paul Mackerras wrote:
> > On Fri, Feb 08, 2019 at 08:58:14AM +0100, Cédric Le Goater wrote:
> >> On 2/8/19 6:15 AM, David Gibson wrote:
> >>> On Thu, Feb 07, 2019 at 10:03:15AM +0100, Cédric Le Goater wrote:
> >>>> That's the plan I have in mind as suggested by Paul if I understood it well.
> >>>> The mechanics are more complex than the patch zapping the PTEs from the VMA
> >>>> but it's also safer.
> >>>
> >>> Well, yes, where "safer" means "has the possibility to be correct".
> >>
> >> Well, the only problem with the kernel approach is keeping a pointer on 
> >> the VMA. If we could call find_vma(), it would be perfectly safe and much 
> >> more simpler.
> > 
> > You seem to be assuming that the kernel can easily work out a single
> > virtual address which will be the only place where a given set of
> > interrupt pages are mapped.  But that is really not possible in the
> > general case, because userspace could have mapped the fd at many
> > different offsets in many different places.
> > 
> > QEMU doesn't do that; in QEMU, the mmaps are sufficiently limited that
> > it can work out a single virtual address that needs to be changed.
> > The way that QEMU should tell the kernel what that address is and what
> > the mapping should be changed to, is via the existing munmap()/mmap()
> > interface.
> 
> Yes. We agreed on that. QEMU should handle these mappings somewhere in 
> VFIO. It's me grumbling, that's all.
> 
> The discussion has moved to the mmap() interface of the KVM device. The 
> current proposal adds controls on the device creating fds to mmap() the 
> TIMA pages and the ESB pages. David is proposing to use directly the fd 
> of the KVM device to mmap() these pages with a different offset for each 
> set. 
> 
> I think that should work pretty well, for passthrough also. The fault 
> handler should take care of populating the VMA(s) with the appropriate 
> pages. 
> 
> We might support END notification one day, so we should have room for 
> these pages. And nested might require IRQ space extensions at L1. 
> something to keep in mind.

I had some more thoughts on this topic.  I think there's been some
confusion because there are more ways of tackling this than I
previously realized:

1) All in kernel

The offset always maps directly to guest irq number and the kernel
somehow binds it either to an IPI or a host irq as necessary.
Cédric's original code attempts this, but the mechanism of keeping a
pointer to the VMA can't work.

But.. remapping the irqs should be sufficiently infrequent that it
might be ok to consider simply stepping through all the hosting
process's VMAs to do this.

2) Remapped in qemu (using memory regions)

I _think_ (in hindsight) was Cédric's been discussing as the
alternative in more recent posts.

Qemu maps the IPI pages at one place and the passthrough IRQ pages
somewhere else.  The IPIs are mapped into the guest as one memory
region, then any passthrough IRQ pages are mapped over that using
overlapping memory regions.

I don't think this approach will work well, because it could require a
bunch of separate KVM memory slots, which are fairly scarce.

3) Remapped in qemu (using mmap())

This is the approach I (and I think Paul) have been suggested in
contrast to (1).

Qemu maps the IPI pages and maps those into the guest.  When we need
to set up a passthrough IRQ, qemu mmap()s its pages directly over the
IPI pages, and it remains mapped into the guest with the same memory
region / memslot as the IPIs are already using.  If the passthrough
device is removed we have to remap the IPI pages back into place.

4) Dedicated irq numbers

We never re-use regular guest irq numbers for passthrough irqs,
instead we put them somewhere else and keep those mapped to the
passthrough irq pages.

I was favouring this approach, but it does mean there will be a guest
visible difference between kernel_irqchip=on and off which isn't
great.


(1) is the most elegant _interface_, but as we've seen it's
problematic to implement.  Looking at the for_all_vmas() approach
could be interesting, but otherwise option (3) might be the most
practical.
Benjamin Herrenschmidt Feb. 11, 2019, 6:42 a.m. UTC | #16
On Mon, 2019-02-11 at 13:38 +1100, David Gibson wrote:
> 
> 1) All in kernel
> 
> The offset always maps directly to guest irq number and the kernel
> somehow binds it either to an IPI or a host irq as necessary.
> Cédric's original code attempts this, but the mechanism of keeping a
> pointer to the VMA can't work.

Why do you need a pointer to the VMA anyway ? unmap_mapping_range()
doesn't need a VMA for the unmap part, and faults/mmaps have the VMA.

> But.. remapping the irqs should be sufficiently infrequent that it
> might be ok to consider simply stepping through all the hosting
> process's VMAs to do this.

Which unmap_mapping_range() does for you as I explained previously. You
only need the address space. See how spufs does it (among others).

> 2) Remapped in qemu (using memory regions)
> 
> I _think_ (in hindsight) was Cédric's been discussing as the
> alternative in more recent posts.
> 
> Qemu maps the IPI pages at one place and the passthrough IRQ pages
> somewhere else.  The IPIs are mapped into the guest as one memory
> region, then any passthrough IRQ pages are mapped over that using
> overlapping memory regions.
> 
> I don't think this approach will work well, because it could require a
> bunch of separate KVM memory slots, which are fairly scarce.
> 
> 3) Remapped in qemu (using mmap())
> 
> This is the approach I (and I think Paul) have been suggested in
> contrast to (1).
> 
> Qemu maps the IPI pages and maps those into the guest.  When we need
> to set up a passthrough IRQ, qemu mmap()s its pages directly over the
> IPI pages, and it remains mapped into the guest with the same memory
> region / memslot as the IPIs are already using.  If the passthrough
> device is removed we have to remap the IPI pages back into place.
> 
> 4) Dedicated irq numbers
> 
> We never re-use regular guest irq numbers for passthrough irqs,
> instead we put them somewhere else and keep those mapped to the
> passthrough irq pages.
> 
> I was favouring this approach, but it does mean there will be a guest
> visible difference between kernel_irqchip=on and off which isn't
> great.
> 
> 
> (1) is the most elegant _interface_, but as we've seen it's
> problematic to implement.  Looking at the for_all_vmas() approach
> could be interesting, but otherwise option (3) might be the most
> practical.
> 
> --
Cédric Le Goater Feb. 12, 2019, 10:07 p.m. UTC | #17
On 2/11/19 7:42 AM, Benjamin Herrenschmidt wrote:
> On Mon, 2019-02-11 at 13:38 +1100, David Gibson wrote:
>>
>> 1) All in kernel
>>
>> The offset always maps directly to guest irq number and the kernel
>> somehow binds it either to an IPI or a host irq as necessary.
>> Cédric's original code attempts this, but the mechanism of keeping a
>> pointer to the VMA can't work.
> 
> Why do you need a pointer to the VMA anyway ? unmap_mapping_range()
> doesn't need a VMA for the unmap part, and faults/mmaps have the VMA.
> 
>> But.. remapping the irqs should be sufficiently infrequent that it
>> might be ok to consider simply stepping through all the hosting
>> process's VMAs to do this.
> 
> Which unmap_mapping_range() does for you as I explained previously. You
> only need the address space. See how spufs does it (among others).

and the different CAPI drivers. This is much better and it works fine.

On the same topic, the XIVE IC on P10 will use IPI ESB pages for the 
PHB interrupts sources. We will still need this kind of remapping but 
the pages will be from the same controller.

Thanks,

C.

Patch
diff mbox series

diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 8c876c166ef2..6bb61ba141c2 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -675,4 +675,8 @@  struct kvm_ppc_cpu_char {
 #define  KVM_XICS_PRESENTED		(1ULL << 43)
 #define  KVM_XICS_QUEUED		(1ULL << 44)
 
+/* POWER9 XIVE Native Interrupt Controller */
+#define KVM_DEV_XIVE_GRP_CTRL		1
+#define   KVM_DEV_XIVE_GET_ESB_FD	1
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 115143e76c45..e20081f0c8d4 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -153,6 +153,85 @@  int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
 	return rc;
 }
 
+static int xive_native_esb_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct kvmppc_xive *xive = vma->vm_file->private_data;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	struct xive_irq_data *xd;
+	u32 hw_num;
+	u16 src;
+	u64 page;
+	unsigned long irq;
+
+	/*
+	 * Linux/KVM uses a two pages ESB setting, one for trigger and
+	 * one for EOI
+	 */
+	irq = vmf->pgoff / 2;
+
+	sb = kvmppc_xive_find_source(xive, irq, &src);
+	if (!sb) {
+		pr_err("%s: source %lx not found !\n", __func__, irq);
+		return VM_FAULT_SIGBUS;
+	}
+
+	state = &sb->irq_state[src];
+	kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+	arch_spin_lock(&sb->lock);
+
+	/*
+	 * first/even page is for trigger
+	 * second/odd page is for EOI and management.
+	 */
+	page = vmf->pgoff % 2 ? xd->eoi_page : xd->trig_page;
+	arch_spin_unlock(&sb->lock);
+
+	if (!page) {
+		pr_err("%s: acessing invalid ESB page for source %lx !\n",
+		       __func__, irq);
+		return VM_FAULT_SIGBUS;
+	}
+
+	vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
+	return VM_FAULT_NOPAGE;
+}
+
+static const struct vm_operations_struct xive_native_esb_vmops = {
+	.fault = xive_native_esb_fault,
+};
+
+static int xive_native_esb_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* There are two ESB pages (trigger and EOI) per IRQ */
+	if (vma_pages(vma) + vma->vm_pgoff > KVMPPC_XIVE_NR_IRQS * 2)
+		return -EINVAL;
+
+	vma->vm_flags |= VM_IO | VM_PFNMAP;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_ops = &xive_native_esb_vmops;
+	return 0;
+}
+
+static const struct file_operations xive_native_esb_fops = {
+	.mmap = xive_native_esb_mmap,
+};
+
+static int kvmppc_xive_native_get_esb_fd(struct kvmppc_xive *xive, u64 addr)
+{
+	u64 __user *ubufp = (u64 __user *) addr;
+	int ret;
+
+	ret = anon_inode_getfd("[xive-esb]", &xive_native_esb_fops, xive,
+				O_RDWR | O_CLOEXEC);
+	if (ret < 0)
+		return ret;
+
+	return put_user(ret, ubufp);
+}
+
 static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
 				       struct kvm_device_attr *attr)
 {
@@ -162,12 +241,30 @@  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
 static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
 				       struct kvm_device_attr *attr)
 {
+	struct kvmppc_xive *xive = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XIVE_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_XIVE_GET_ESB_FD:
+			return kvmppc_xive_native_get_esb_fd(xive, attr->addr);
+		}
+		break;
+	}
 	return -ENXIO;
 }
 
 static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
 				       struct kvm_device_attr *attr)
 {
+	switch (attr->group) {
+	case KVM_DEV_XIVE_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_XIVE_GET_ESB_FD:
+			return 0;
+		}
+		break;
+	}
 	return -ENXIO;
 }