
[RFC,v2,02/18] irq/dev-msi: Add support for a new DEV_MSI irq domain

Message ID 159534734833.28840.10067945890695808535.stgit@djiang5-desk3.ch.intel.com
State New
Series Add VFIO mediated device support and DEV-MSI support for the idxd driver

Commit Message

Dave Jiang July 21, 2020, 4:02 p.m. UTC
From: Megha Dey <megha.dey@intel.com>

Add support for the creation of a new DEV_MSI irq domain. It creates a
new irq chip associated with the DEV_MSI domain and adds the necessary
domain operations to it.

Add a new config option DEV_MSI which must be enabled by any
driver that wants to support device-specific message-signaled-interrupts
outside of PCI-MSI(-X).

Lastly, add device specific mask/unmask callbacks in addition to a write
function to the platform_msi_ops.

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Megha Dey <megha.dey@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 arch/x86/include/asm/hw_irq.h |    5 ++
 drivers/base/Kconfig          |    7 +++
 drivers/base/Makefile         |    1 
 drivers/base/dev-msi.c        |   95 +++++++++++++++++++++++++++++++++++++++++
 drivers/base/platform-msi.c   |   45 +++++++++++++------
 drivers/base/platform-msi.h   |   23 ++++++++++
 include/linux/msi.h           |    8 +++
 7 files changed, 168 insertions(+), 16 deletions(-)
 create mode 100644 drivers/base/dev-msi.c
 create mode 100644 drivers/base/platform-msi.h

Comments

Jason Gunthorpe July 21, 2020, 4:13 p.m. UTC | #1
On Tue, Jul 21, 2020 at 09:02:28AM -0700, Dave Jiang wrote:
> From: Megha Dey <megha.dey@intel.com>
> 
> Add support for the creation of a new DEV_MSI irq domain. It creates a
> new irq chip associated with the DEV_MSI domain and adds the necessary
> domain operations to it.
> 
> Add a new config option DEV_MSI which must be enabled by any
> driver that wants to support device-specific message-signaled-interrupts
> outside of PCI-MSI(-X).
> 
> Lastly, add device specific mask/unmask callbacks in addition to a write
> function to the platform_msi_ops.
> 
> Reviewed-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Megha Dey <megha.dey@intel.com>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>  arch/x86/include/asm/hw_irq.h |    5 ++
>  drivers/base/Kconfig          |    7 +++
>  drivers/base/Makefile         |    1 
>  drivers/base/dev-msi.c        |   95 +++++++++++++++++++++++++++++++++++++++++
>  drivers/base/platform-msi.c   |   45 +++++++++++++------
>  drivers/base/platform-msi.h   |   23 ++++++++++
>  include/linux/msi.h           |    8 +++
>  7 files changed, 168 insertions(+), 16 deletions(-)
>  create mode 100644 drivers/base/dev-msi.c
>  create mode 100644 drivers/base/platform-msi.h
> 
> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
> index 74c12437401e..8ecd7570589d 100644
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -61,6 +61,11 @@ struct irq_alloc_info {
>  			irq_hw_number_t	msi_hwirq;
>  		};
>  #endif
> +#ifdef CONFIG_DEV_MSI
> +		struct {
> +			irq_hw_number_t hwirq;
> +		};
> +#endif

Why is this in this patch? I didn't see an obvious place where it is
used?
>  
> +static void __platform_msi_desc_mask_unmask_irq(struct msi_desc *desc, u32 mask)
> +{
> +	const struct platform_msi_ops *ops;
> +
> +	ops = desc->platform.msi_priv_data->ops;
> +	if (!ops)
> +		return;
> +
> +	if (mask) {
> +		if (ops->irq_mask)
> +			ops->irq_mask(desc);
> +	} else {
> +		if (ops->irq_unmask)
> +			ops->irq_unmask(desc);
> +	}
> +}
> +
> +void platform_msi_mask_irq(struct irq_data *data)
> +{
> +	__platform_msi_desc_mask_unmask_irq(irq_data_get_msi_desc(data), 1);
> +}
> +
> +void platform_msi_unmask_irq(struct irq_data *data)
> +{
> +	__platform_msi_desc_mask_unmask_irq(irq_data_get_msi_desc(data), 0);
> +}

This is a bit convoluted, just call the op directly:

void platform_msi_unmask_irq(struct irq_data *data)
{
	struct msi_desc *desc = irq_data_get_msi_desc(data);
	const struct platform_msi_ops *ops = desc->platform.msi_priv_data->ops;

	if (ops->irq_unmask)
		ops->irq_unmask(desc);
}

> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index 7f6a8eb51aca..1da97f905720 100644
> +++ b/include/linux/msi.h
> @@ -323,9 +323,13 @@ enum {
>  
>  /*
>   * platform_msi_ops - Callbacks for platform MSI ops
> + * @irq_mask:   mask an interrupt source
> + * @irq_unmask: unmask an interrupt source
>   * @write_msg:	write message content
>   */
>  struct platform_msi_ops {
> +	unsigned int            (*irq_mask)(struct msi_desc *desc);
> +	unsigned int            (*irq_unmask)(struct msi_desc *desc);

Why do these functions return things if the only call site throws it
away?

Jason
Dey, Megha July 22, 2020, 4:50 p.m. UTC | #2
Hi Jason,

On 7/21/2020 9:13 AM, Jason Gunthorpe wrote:
> On Tue, Jul 21, 2020 at 09:02:28AM -0700, Dave Jiang wrote:
>> From: Megha Dey <megha.dey@intel.com>
>>
>> Add support for the creation of a new DEV_MSI irq domain. It creates a
>> new irq chip associated with the DEV_MSI domain and adds the necessary
>> domain operations to it.
>>
>> Add a new config option DEV_MSI which must be enabled by any
>> driver that wants to support device-specific message-signaled-interrupts
>> outside of PCI-MSI(-X).
>>
>> Lastly, add device specific mask/unmask callbacks in addition to a write
>> function to the platform_msi_ops.
>>
>> Reviewed-by: Dan Williams <dan.j.williams@intel.com>
>> Signed-off-by: Megha Dey <megha.dey@intel.com>
>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>>   arch/x86/include/asm/hw_irq.h |    5 ++
>>   drivers/base/Kconfig          |    7 +++
>>   drivers/base/Makefile         |    1
>>   drivers/base/dev-msi.c        |   95 +++++++++++++++++++++++++++++++++++++++++
>>   drivers/base/platform-msi.c   |   45 +++++++++++++------
>>   drivers/base/platform-msi.h   |   23 ++++++++++
>>   include/linux/msi.h           |    8 +++
>>   7 files changed, 168 insertions(+), 16 deletions(-)
>>   create mode 100644 drivers/base/dev-msi.c
>>   create mode 100644 drivers/base/platform-msi.h
>>
>> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
>> index 74c12437401e..8ecd7570589d 100644
>> +++ b/arch/x86/include/asm/hw_irq.h
>> @@ -61,6 +61,11 @@ struct irq_alloc_info {
>>   			irq_hw_number_t	msi_hwirq;
>>   		};
>>   #endif
>> +#ifdef CONFIG_DEV_MSI
>> +		struct {
>> +			irq_hw_number_t hwirq;
>> +		};
>> +#endif
> 
> Why is this in this patch? I didn't see an obvious place where it is
> used?

Since I have introduced the DEV-MSI domain and related ops, this is
required by dev_msi_get_hwirq() and dev_msi_set_desc() in this patch.

>>   
>> +static void __platform_msi_desc_mask_unmask_irq(struct msi_desc *desc, u32 mask)
>> +{
>> +	const struct platform_msi_ops *ops;
>> +
>> +	ops = desc->platform.msi_priv_data->ops;
>> +	if (!ops)
>> +		return;
>> +
>> +	if (mask) {
>> +		if (ops->irq_mask)
>> +			ops->irq_mask(desc);
>> +	} else {
>> +		if (ops->irq_unmask)
>> +			ops->irq_unmask(desc);
>> +	}
>> +}
>> +
>> +void platform_msi_mask_irq(struct irq_data *data)
>> +{
>> +	__platform_msi_desc_mask_unmask_irq(irq_data_get_msi_desc(data), 1);
>> +}
>> +
>> +void platform_msi_unmask_irq(struct irq_data *data)
>> +{
>> +	__platform_msi_desc_mask_unmask_irq(irq_data_get_msi_desc(data), 0);
>> +}
> 
> This is a bit convoluted, just call the op directly:
> 
> void platform_msi_unmask_irq(struct irq_data *data)
> {
> 	struct msi_desc *desc = irq_data_get_msi_desc(data);
> 	const struct platform_msi_ops *ops = desc->platform.msi_priv_data->ops;
> 
> 	if (ops->irq_unmask)
> 		ops->irq_unmask(desc);
> }
>

Sure, I will update this.

>> diff --git a/include/linux/msi.h b/include/linux/msi.h
>> index 7f6a8eb51aca..1da97f905720 100644
>> +++ b/include/linux/msi.h
>> @@ -323,9 +323,13 @@ enum {
>>   
>>   /*
>>    * platform_msi_ops - Callbacks for platform MSI ops
>> + * @irq_mask:   mask an interrupt source
>> + * @irq_unmask: unmask an interrupt source
>>    * @write_msg:	write message content
>>    */
>>   struct platform_msi_ops {
>> +	unsigned int            (*irq_mask)(struct msi_desc *desc);
>> +	unsigned int            (*irq_unmask)(struct msi_desc *desc);
> 
> Why do these functions return things if the only call site throws it
> away?

Hmmm, fair enough, I will change it to void.
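
i.e. something like this in the next version (just a sketch; write_msg stays as it is today):

/*
 * platform_msi_ops - Callbacks for platform MSI ops
 * @irq_mask:   mask an interrupt source
 * @irq_unmask: unmask an interrupt source
 * @write_msg:  write message content
 */
struct platform_msi_ops {
	void			(*irq_mask)(struct msi_desc *desc);
	void			(*irq_unmask)(struct msi_desc *desc);
	irq_write_msi_msg_t	write_msg;
};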

> 
> Jason
>
Marc Zyngier July 22, 2020, 6:52 p.m. UTC | #3
On Tue, 21 Jul 2020 17:02:28 +0100,
Dave Jiang <dave.jiang@intel.com> wrote:
> 
> From: Megha Dey <megha.dey@intel.com>
> 
> Add support for the creation of a new DEV_MSI irq domain. It creates a
> new irq chip associated with the DEV_MSI domain and adds the necessary
> domain operations to it.
> 
> Add a new config option DEV_MSI which must be enabled by any
> driver that wants to support device-specific message-signaled-interrupts
> outside of PCI-MSI(-X).

Which is exactly what platform-MSI already does. Why do we need
something else?

> 
> Lastly, add device specific mask/unmask callbacks in addition to a write
> function to the platform_msi_ops.
> 
> Reviewed-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Megha Dey <megha.dey@intel.com>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  arch/x86/include/asm/hw_irq.h |    5 ++
>  drivers/base/Kconfig          |    7 +++
>  drivers/base/Makefile         |    1 
>  drivers/base/dev-msi.c        |   95 +++++++++++++++++++++++++++++++++++++++++
>  drivers/base/platform-msi.c   |   45 +++++++++++++------
>  drivers/base/platform-msi.h   |   23 ++++++++++
>  include/linux/msi.h           |    8 +++
>  7 files changed, 168 insertions(+), 16 deletions(-)
>  create mode 100644 drivers/base/dev-msi.c
>  create mode 100644 drivers/base/platform-msi.h
> 
> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
> index 74c12437401e..8ecd7570589d 100644
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -61,6 +61,11 @@ struct irq_alloc_info {
>  			irq_hw_number_t	msi_hwirq;
>  		};
>  #endif
> +#ifdef CONFIG_DEV_MSI
> +		struct {
> +			irq_hw_number_t hwirq;
> +		};
> +#endif
>  #ifdef	CONFIG_X86_IO_APIC
>  		struct {
>  			int		ioapic_id;
> diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
> index 8d7001712062..f00901bac056 100644
> --- a/drivers/base/Kconfig
> +++ b/drivers/base/Kconfig
> @@ -210,4 +210,11 @@ config GENERIC_ARCH_TOPOLOGY
>  	  appropriate scaling, sysfs interface for reading capacity values at
>  	  runtime.
>  
> +config DEV_MSI
> +	bool "Device Specific Interrupt Messages"
> +	select IRQ_DOMAIN_HIERARCHY
> +	select GENERIC_MSI_IRQ_DOMAIN
> +	help
> +	  Allow device drivers to generate device-specific interrupt messages
> +	  for devices independent of PCI MSI/-X.
>  endmenu
> diff --git a/drivers/base/Makefile b/drivers/base/Makefile
> index 157452080f3d..ca1e4d39164e 100644
> --- a/drivers/base/Makefile
> +++ b/drivers/base/Makefile
> @@ -21,6 +21,7 @@ obj-$(CONFIG_REGMAP)	+= regmap/
>  obj-$(CONFIG_SOC_BUS) += soc.o
>  obj-$(CONFIG_PINCTRL) += pinctrl.o
>  obj-$(CONFIG_DEV_COREDUMP) += devcoredump.o
> +obj-$(CONFIG_DEV_MSI) += dev-msi.o
>  obj-$(CONFIG_GENERIC_MSI_IRQ_DOMAIN) += platform-msi.o
>  obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += arch_topology.o
>  
> diff --git a/drivers/base/dev-msi.c b/drivers/base/dev-msi.c
> new file mode 100644
> index 000000000000..240ccc353933
> --- /dev/null
> +++ b/drivers/base/dev-msi.c
> @@ -0,0 +1,95 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright © 2020 Intel Corporation.
> + *
> + * Author: Megha Dey <megha.dey@intel.com>
> + */
> +
> +#include <linux/irq.h>
> +#include <linux/irqdomain.h>
> +#include <linux/msi.h>
> +#include "platform-msi.h"
> +
> +struct irq_domain *dev_msi_default_domain;
> +
> +static irq_hw_number_t dev_msi_get_hwirq(struct msi_domain_info *info,
> +					 msi_alloc_info_t *arg)
> +{
> +	return arg->hwirq;
> +}
> +
> +static irq_hw_number_t dev_msi_calc_hwirq(struct msi_desc *desc)
> +{
> +	u32 devid;
> +
> +	devid = desc->platform.msi_priv_data->devid;
> +
> +	return (devid << (32 - DEV_ID_SHIFT)) | desc->platform.msi_index;
> +}
> +
> +static void dev_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
> +{
> +	arg->hwirq = dev_msi_calc_hwirq(desc);
> +}
> +
> +static int dev_msi_prepare(struct irq_domain *domain, struct device *dev,
> +			   int nvec, msi_alloc_info_t *arg)
> +{
> +	memset(arg, 0, sizeof(*arg));
> +
> +	return 0;
> +}
> +
> +static struct msi_domain_ops dev_msi_domain_ops = {
> +	.get_hwirq      = dev_msi_get_hwirq,
> +	.set_desc       = dev_msi_set_desc,
> +	.msi_prepare	= dev_msi_prepare,
> +};
> +
> +static struct irq_chip dev_msi_controller = {
> +	.name                   = "DEV-MSI",
> +	.irq_unmask             = platform_msi_unmask_irq,
> +	.irq_mask               = platform_msi_mask_irq,

This seems pretty odd, see below.

> +	.irq_write_msi_msg      = platform_msi_write_msg,
> +	.irq_ack                = irq_chip_ack_parent,
> +	.irq_retrigger          = irq_chip_retrigger_hierarchy,
> +	.flags                  = IRQCHIP_SKIP_SET_WAKE,
> +};
> +
> +static struct msi_domain_info dev_msi_domain_info = {
> +	.flags          = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
> +	.ops            = &dev_msi_domain_ops,
> +	.chip           = &dev_msi_controller,
> +	.handler        = handle_edge_irq,
> +	.handler_name   = "edge",
> +};
> +
> +static int __init create_dev_msi_domain(void)
> +{
> +	struct irq_domain *parent = NULL;
> +	struct fwnode_handle *fn;
> +
> +	/*
> +	 * Modern code should never have to use irq_get_default_host. But since
> +	 * dev-msi is invisible to DT/ACPI, this is an exception case.
> +	 */
> +	parent = irq_get_default_host();

Really? How is it going to work once you have devices sending their
MSIs to two different downstream blocks? This looks rather short-sighted.

> +	if (!parent)
> +		return -ENXIO;
> +
> +	fn = irq_domain_alloc_named_fwnode("DEV_MSI");
> +	if (!fn)
> +		return -ENXIO;
> +
> +	dev_msi_default_domain = msi_create_irq_domain(fn, &dev_msi_domain_info, parent);
> +	if (!dev_msi_default_domain) {
> +		pr_warn("failed to initialize irqdomain for DEV-MSI.\n");
> +		return -ENXIO;
> +	}
> +
> +	irq_domain_update_bus_token(dev_msi_default_domain, DOMAIN_BUS_PLATFORM_MSI);
> +	irq_domain_free_fwnode(fn);
> +
> +	return 0;
> +}
> +device_initcall(create_dev_msi_domain);
> diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c
> index 9d94cd699468..5e1f210d65ee 100644
> --- a/drivers/base/platform-msi.c
> +++ b/drivers/base/platform-msi.c
> @@ -12,21 +12,7 @@
>  #include <linux/irqdomain.h>
>  #include <linux/msi.h>
>  #include <linux/slab.h>
> -
> -#define DEV_ID_SHIFT	21
> -#define MAX_DEV_MSIS	(1 << (32 - DEV_ID_SHIFT))
> -
> -/*
> - * Internal data structure containing a (made up, but unique) devid
> - * and the platform-msi ops
> - */
> -struct platform_msi_priv_data {
> -	struct device			*dev;
> -	void				*host_data;
> -	msi_alloc_info_t		arg;
> -	const struct platform_msi_ops	*ops;
> -	int				devid;
> -};
> +#include "platform-msi.h"
>  
>  /* The devid allocator */
>  static DEFINE_IDA(platform_msi_devid_ida);
> @@ -76,7 +62,7 @@ static void platform_msi_update_dom_ops(struct msi_domain_info *info)
>  		ops->set_desc = platform_msi_set_desc;
>  }
>  
> -static void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
> +void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg)

It really begs the question: Why are you inventing a whole new
"DEV-MSI" when this really is platform-MSI?

>  {
>  	struct msi_desc *desc = irq_data_get_msi_desc(data);
>  	struct platform_msi_priv_data *priv_data;
> @@ -86,6 +72,33 @@ static void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
>  	priv_data->ops->write_msg(desc, msg);
>  }
>  
> +static void __platform_msi_desc_mask_unmask_irq(struct msi_desc *desc, u32 mask)
> +{
> +	const struct platform_msi_ops *ops;
> +
> +	ops = desc->platform.msi_priv_data->ops;
> +	if (!ops)
> +		return;
> +
> +	if (mask) {
> +		if (ops->irq_mask)
> +			ops->irq_mask(desc);
> +	} else {
> +		if (ops->irq_unmask)
> +			ops->irq_unmask(desc);
> +	}
> +}
> +
> +void platform_msi_mask_irq(struct irq_data *data)
> +{
> +	__platform_msi_desc_mask_unmask_irq(irq_data_get_msi_desc(data), 1);
> +}
> +
> +void platform_msi_unmask_irq(struct irq_data *data)
> +{
> +	__platform_msi_desc_mask_unmask_irq(irq_data_get_msi_desc(data), 0);
> +}
> +

I don't immediately get why you have this code at the platform MSI
level. Until now, we only had the programming of the message into the
end-point, which is a device-specific action (and the whole reason why
this silly platform MSI exists).

On the other hand, masking an interrupt is an irqchip operation, and
only concerns the irqchip level. Here, you seem to be making it an
end-point operation, which doesn't really make sense to me. Or is this
device its own interrupt controller as well? That would be extremely
surprising, and I'd expect some block downstream of the device to be
able to control the masking of the interrupt.

>  static void platform_msi_update_chip_ops(struct msi_domain_info *info)
>  {
>  	struct irq_chip *chip = info->chip;
> diff --git a/drivers/base/platform-msi.h b/drivers/base/platform-msi.h
> new file mode 100644
> index 000000000000..1de8c2874218
> --- /dev/null
> +++ b/drivers/base/platform-msi.h
> @@ -0,0 +1,23 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright © 2020 Intel Corporation.
> + *
> + * Author: Megha Dey <megha.dey@intel.com>
> + */

Or not. You are merely moving existing code, not authoring it. Either
keep the original copyright attribution, or drop this mention
altogether.

> +
> +#include <linux/msi.h>
> +
> +#define DEV_ID_SHIFT    21
> +#define MAX_DEV_MSIS	(1 << (32 - DEV_ID_SHIFT))
> +
> +/*
> + * Data structure containing a (made up, but unique) devid
> + * and the platform-msi ops.
> + */
> +struct platform_msi_priv_data {
> +	struct device			*dev;
> +	void				*host_data;
> +	msi_alloc_info_t		arg;
> +	const struct platform_msi_ops	*ops;
> +	int				devid;
> +};
> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index 7f6a8eb51aca..1da97f905720 100644
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -323,9 +323,13 @@ enum {
>  
>  /*
>   * platform_msi_ops - Callbacks for platform MSI ops
> + * @irq_mask:   mask an interrupt source
> + * @irq_unmask: unmask an interrupt source
>   * @write_msg:	write message content
>   */
>  struct platform_msi_ops {
> +	unsigned int            (*irq_mask)(struct msi_desc *desc);
> +	unsigned int            (*irq_unmask)(struct msi_desc *desc);
>  	irq_write_msi_msg_t	write_msg;
>  };
>  
> @@ -370,6 +374,10 @@ int platform_msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
>  void platform_msi_domain_free(struct irq_domain *domain, unsigned int virq,
>  			      unsigned int nvec);
>  void *platform_msi_get_host_data(struct irq_domain *domain);
> +
> +void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg);
> +void platform_msi_unmask_irq(struct irq_data *data);
> +void platform_msi_mask_irq(struct irq_data *data);
>  #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
>  
>  #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
> 
> 

Thanks,

	M.
Jason Gunthorpe July 22, 2020, 7:59 p.m. UTC | #4
On Wed, Jul 22, 2020 at 07:52:33PM +0100, Marc Zyngier wrote:

> Which is exactly what platform-MSI already does. Why do we need
> something else?

It looks to me like all the code is around managing the
dev->msi_domain of the devices.

The intended use would have PCI drivers create children devices using
mdev or virtbus and those devices wouldn't have a msi_domain from the
platform. Looks like platform_msi_alloc_priv_data() fails immediately
because dev->msi_domain will be NULL for these kinds of devices.

Maybe that issue should be handled directly instead of wrappering
platform_msi_*?

For instance a trivial addition to the platform_msi API:

  platform_msi_assign_domain(struct_device *newly_created_virtual_device,
                             struct device *physical_device);

Which could set the msi_domain of new device using the topology of
physical_device to deduce the correct domain?
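
Roughly something like this (untested, and assuming the existing
dev_get_msi_domain()/dev_set_msi_domain() accessors are the right way
to do the assignment):

void platform_msi_assign_domain(struct device *virt_dev,
				struct device *phys_dev)
{
	/* Inherit the MSI irq_domain from the physical device's topology */
	dev_set_msi_domain(virt_dev, dev_get_msi_domain(phys_dev));
}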

Then the question is how to properly create a domain within the
hardware topology of physical_device with the correct parameters for
the platform. 

Why do we need a dummy msi_domain anyhow? Can this just use
physical_device->msi_domain directly? (I'm at my limit here of how
much of this I remember, sorry)

If you solve that it should solve the remapping problem too, as the
physical_device is already assigned by the platform to a remapping irq
domain if that is what the platform wants.

>> +	parent = irq_get_default_host();
> Really? How is it going to work once you have devices sending their
> MSIs to two different downstream blocks? This looks rather
> short-sighted.

.. and fix this too, the parent domain should be derived from the
topology of the physical_device which is originating the interrupt
messages.

> On the other hand, masking an interrupt is an irqchip operation, and
> only concerns the irqchip level. Here, you seem to be making it an
> end-point operation, which doesn't really make sense to me. Or is this
> device its own interrupt controller as well? That would be extremely
> surprising, and I'd expect some block downstream of the device to be
> able to control the masking of the interrupt.

These are message interrupts so they originate directly from the
device and generally travel directly to the CPU APIC. On the wire
there is no difference between a MSI, MSI-X and a device using the
dev-msi approach. 

IIRC on Intel/AMD at least once a MSI is launched it is not maskable.

So the model for MSI is always "mask at source". The closest mapping
to the Linux IRQ model is to say the end device has a irqchip that
encapsulates the ability of the device to generate the MSI in the
first place.

It looks like existing platform_msi drivers deal with "masking"
implicitly by halting the device interrupt generation before releasing
the interrupt and have no way for the generic irqchip layer to mask
the interrupt.

I suppose the motivation to make it explicit is related to vfio using
the generic mask/unmask functionality?

Explicit seems better, IMHO.

Jason
Marc Zyngier July 23, 2020, 8:51 a.m. UTC | #5
On 2020-07-22 20:59, Jason Gunthorpe wrote:
> On Wed, Jul 22, 2020 at 07:52:33PM +0100, Marc Zyngier wrote:
> 
>> Which is exactly what platform-MSI already does. Why do we need
>> something else?
> 
> It looks to me like all the code is around managing the
> dev->msi_domain of the devices.
> 
> The intended use would have PCI drivers create children devices using
> mdev or virtbus and those devices wouldn't have a msi_domain from the
> platform. Looks like platform_msi_alloc_priv_data() fails immediately
> because dev->msi_domain will be NULL for these kinds of devices.
> 
> Maybe that issue should be handled directly instead of wrappering
> platform_msi_*?
> 
> For instance a trivial addition to the platform_msi API:
> 
>   platform_msi_assign_domain(struct_device 
> *newly_created_virtual_device,
>                              struct device *physical_device);
> 
> Which could set the msi_domain of new device using the topology of
> physical_device to deduce the correct domain?

That would seem like a sensible course of action, as losing
the topology information will likely result in problems down
the line.

> Then the question is how to properly create a domain within the
> hardware topology of physical_device with the correct parameters for
> the platform.
> 
> Why do we need a dummy msi_domain anyhow? Can this just use
> physical_device->msi_domain directly? (I'm at my limit here of how
> much of this I remember, sorry)

The parent device would be a PCI device, if I follow you correctly.
It would thus expect to be able to program the MSI the PCI way,
which wouldn't work. So we end-up with this custom MSI domain
that knows about *this* particular family of devices.

> If you solve that it should solve the remapping problem too, as the
> physical_device is already assigned by the platform to a remapping irq
> domain if that is what the platform wants.
> 
>>> +	parent = irq_get_default_host();
>> Really? How is it going to work once you have devices sending their
>> MSIs to two different downstream blocks? This looks rather
>> short-sighted.
> 
> .. and fix this too, the parent domain should be derived from the
> topology of the physical_device which is originating the interrupt
> messages.
> 
>> On the other hand, masking an interrupt is an irqchip operation, and
>> only concerns the irqchip level. Here, you seem to be making it an
>> end-point operation, which doesn't really make sense to me. Or is this
>> device its own interrupt controller as well? That would be extremely
>> surprising, and I'd expect some block downstream of the device to be
>> able to control the masking of the interrupt.
> 
> These are message interrupts so they originate directly from the
> device and generally travel directly to the CPU APIC. On the wire
> there is no difference between a MSI, MSI-X and a device using the
> dev-msi approach.

I understand that.

> IIRC on Intel/AMD at least once a MSI is launched it is not maskable.

Really? So you can't shut a device with a screaming interrupt,
for example, should it become otherwise unresponsive?

> So the model for MSI is always "mask at source". The closest mapping
> to the Linux IRQ model is to say the end device has a irqchip that
> encapsulates the ability of the device to generate the MSI in the
> first place.

This is an x86'ism, I'm afraid. Systems I deal with can mask any
interrupt at the interrupt controller level, MSI or not.

> It looks like existing platform_msi drivers deal with "masking"
> implicitly by halting the device interrupt generation before releasing
> the interrupt and have no way for the generic irqchip layer to mask
> the interrupt.

No. As I said above, the interrupt controller is perfectly capable
of masking interrupts on its own, without touching the device.

> I suppose the motivation to make it explicit is related to vfio using
> the generic mask/unmask functionality?
> 
> Explicit seems better, IMHO.

If masking at the source is the only way to shut the device up,
and assuming that the device provides the expected semantics
(a MSI raised by the device while the interrupt is masked
isn't lost and gets sent when unmasked), that's fair enough.
It's just ugly.

Thanks,

         M.
Jason Gunthorpe July 24, 2020, 12:16 a.m. UTC | #6
On Thu, Jul 23, 2020 at 09:51:52AM +0100, Marc Zyngier wrote:

> > IIRC on Intel/AMD at least once a MSI is launched it is not maskable.
> 
> Really? So you can't shut a device with a screaming interrupt,
> for example, should it become otherwise unresponsive?

Well, it used to be like that in the APICv1 days. I suppose modern
interrupt remapping probably changes things.

> > So the model for MSI is always "mask at source". The closest mapping
> > to the Linux IRQ model is to say the end device has a irqchip that
> > encapsulates the ability of the device to generate the MSI in the
> > first place.
> 
> This is an x86'ism, I'm afraid. Systems I deal with can mask any
> interrupt at the interrupt controller level, MSI or not.

Sure. However it feels like a bad practice to leave the source
unmasked and potentially continuing to generate messages if the
intention was to disable the IRQ that was assigned to it - even if the
messages do not result in CPU interrupts they will still consume
system resources.

> > I suppose the motivation to make it explicit is related to vfio using
> > the generic mask/unmask functionality?
> > 
> > Explicit seems better, IMHO.
> 
> If masking at the source is the only way to shut the device up,
> and assuming that the device provides the expected semantics
> (a MSI raised by the device while the interrupt is masked
> isn't lost and gets sent when unmasked), that's fair enough.
> It's just ugly.

It makes sense that the masking should follow the same semantics for
PCI MSI masking.

Jason
Thomas Gleixner July 24, 2020, 12:36 a.m. UTC | #7
Jason Gunthorpe <jgg@mellanox.com> writes:
> On Thu, Jul 23, 2020 at 09:51:52AM +0100, Marc Zyngier wrote:
>> > IIRC on Intel/AMD at least once a MSI is launched it is not maskable.
>> 
>> Really? So you can't shut a device with a screaming interrupt,
>> for example, should it become otherwise unresponsive?
>
> Well, it used to be like that in the APICv1 days. I suppose modern
> interrupt remapping probably changes things.

The MSI side of affairs has nothing to do with Intel and neither with
APICv1. It's a trainwreck on the PCI side.

MSI interrupts do not have mandatory masking. For those which do not
implement it (and that's still the case with devices designed today,
especially CPU-internal peripherals) there are only a few options to
shut them up:

  1) Disable MSI, which has the problem that the interrupt gets
     redirected to the legacy PCI #A-#D interrupt unless the hardware
     supports disabling that redirection, which is another optional
     thing and a hopeless case

  2) Disable it at the IRQ remapping level which fortunately allows by
     design to do so.

  3) Disable it at the device level which is feasible for a device
     driver but impossible for the irq side

>> > So the model for MSI is always "mask at source". The closest mapping
>> > to the Linux IRQ model is to say the end device has a irqchip that
>> > encapsulates the ability of the device to generate the MSI in the
>> > first place.
>> 
>> This is an x86'ism, I'm afraid. Systems I deal with can mask any
>> interrupt at the interrupt controller level, MSI or not.

Yes, it's a pain, but reality.

> Sure. However it feels like a bad practice to leave the source
> unmasked and potentially continuing to generate messages if the
> intention was to disable the IRQ that was assigned to it - even if the
> messages do not result in CPU interrupts they will still consume
> system resources.

See above. You cannot reach out to the device driver to disable the
underlying interrupt source, which is the ultimate ratio if #1 or #2 are
not working or not there. That would be squaring the circle and
violating all rules of layering and locking at once.

The bad news is that we can't change the hardware. We have to deal with
it. And yes, I told HW people publicly and in private conversations that
unmaskable interrupts are broken by definition for more than a
decade. They still get designed that way ...

>> If masking at the source is the only way to shut the device up,
>> and assuming that the device provides the expected semantics
>> (a MSI raised by the device while the interrupt is masked
>> isn't lost and gets sent when unmasked), that's fair enough.
>> It's just ugly.
>
> It makes sense that the masking should follow the same semantics for
> PCI MSI masking.

Which semantics? The horrors of MSI or the halfway reasonable MSI-X
variant?

Thanks,

        tglx
Dey, Megha Aug. 5, 2020, 6:55 p.m. UTC | #8
Hi Marc,

> -----Original Message-----
> From: Marc Zyngier <maz@kernel.org>
> Sent: Wednesday, July 22, 2020 11:53 AM
> To: Jiang, Dave <dave.jiang@intel.com>
> Subject: Re: [PATCH RFC v2 02/18] irq/dev-msi: Add support for a new DEV_MSI
> irq domain
> 
> On Tue, 21 Jul 2020 17:02:28 +0100,
> Dave Jiang <dave.jiang@intel.com> wrote:
> >
> > From: Megha Dey <megha.dey@intel.com>
> >
> > Add support for the creation of a new DEV_MSI irq domain. It creates a
> > new irq chip associated with the DEV_MSI domain and adds the necessary
> > domain operations to it.
> >
> > Add a new config option DEV_MSI which must be enabled by any driver
> > that wants to support device-specific message-signaled-interrupts
> > outside of PCI-MSI(-X).
> 
> Which is exactly what platform-MSI already does. Why do we need something
> else?

True, dev-msi is a mere extension of platform-msi which, apart from providing a
custom write_msg, also lets the device provide custom mask/unmask callbacks.
Also, we introduce a new IRQ domain to be associated with these classes of devices.
There is nothing more to dev-msi than this currently.

> 
> >
> > Lastly, add device specific mask/unmask callbacks in addition to a
> > write function to the platform_msi_ops.
> >
> > Reviewed-by: Dan Williams <dan.j.williams@intel.com>
> > Signed-off-by: Megha Dey <megha.dey@intel.com>
> > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > ---
> >  arch/x86/include/asm/hw_irq.h |    5 ++
> >  drivers/base/Kconfig          |    7 +++
> >  drivers/base/Makefile         |    1
> >  drivers/base/dev-msi.c        |   95
> +++++++++++++++++++++++++++++++++++++++++
> >  drivers/base/platform-msi.c   |   45 +++++++++++++------
> >  drivers/base/platform-msi.h   |   23 ++++++++++
> >  include/linux/msi.h           |    8 +++
> >  7 files changed, 168 insertions(+), 16 deletions(-)  create mode
> > 100644 drivers/base/dev-msi.c  create mode 100644
> > drivers/base/platform-msi.h
> >
> > diff --git a/arch/x86/include/asm/hw_irq.h
> > b/arch/x86/include/asm/hw_irq.h index 74c12437401e..8ecd7570589d
> > 100644
> > --- a/arch/x86/include/asm/hw_irq.h
> > +++ b/arch/x86/include/asm/hw_irq.h
> > @@ -61,6 +61,11 @@ struct irq_alloc_info {
> >  			irq_hw_number_t	msi_hwirq;
> >  		};
> >  #endif
> > +#ifdef CONFIG_DEV_MSI
> > +		struct {
> > +			irq_hw_number_t hwirq;
> > +		};
> > +#endif
> >  #ifdef	CONFIG_X86_IO_APIC
> >  		struct {
> >  			int		ioapic_id;
> > diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index
> > 8d7001712062..f00901bac056 100644
> > --- a/drivers/base/Kconfig
> > +++ b/drivers/base/Kconfig
> > @@ -210,4 +210,11 @@ config GENERIC_ARCH_TOPOLOGY
> >  	  appropriate scaling, sysfs interface for reading capacity values at
> >  	  runtime.
> >
> > +config DEV_MSI
> > +	bool "Device Specific Interrupt Messages"
> > +	select IRQ_DOMAIN_HIERARCHY
> > +	select GENERIC_MSI_IRQ_DOMAIN
> > +	help
> > +	  Allow device drivers to generate device-specific interrupt messages
> > +	  for devices independent of PCI MSI/-X.
> >  endmenu
> > diff --git a/drivers/base/Makefile b/drivers/base/Makefile index
> > 157452080f3d..ca1e4d39164e 100644
> > --- a/drivers/base/Makefile
> > +++ b/drivers/base/Makefile
> > @@ -21,6 +21,7 @@ obj-$(CONFIG_REGMAP)	+= regmap/
> >  obj-$(CONFIG_SOC_BUS) += soc.o
> >  obj-$(CONFIG_PINCTRL) += pinctrl.o
> >  obj-$(CONFIG_DEV_COREDUMP) += devcoredump.o
> > +obj-$(CONFIG_DEV_MSI) += dev-msi.o
> >  obj-$(CONFIG_GENERIC_MSI_IRQ_DOMAIN) += platform-msi.o
> >  obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += arch_topology.o
> >
> > diff --git a/drivers/base/dev-msi.c b/drivers/base/dev-msi.c new file
> > mode 100644 index 000000000000..240ccc353933
> > --- /dev/null
> > +++ b/drivers/base/dev-msi.c
> > @@ -0,0 +1,95 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright © 2020 Intel Corporation.
> > + *
> > + * Author: Megha Dey <megha.dey@intel.com>  */
> > +
> > +#include <linux/irq.h>
> > +#include <linux/irqdomain.h>
> > +#include <linux/msi.h>
> > +#include "platform-msi.h"
> > +
> > +struct irq_domain *dev_msi_default_domain;
> > +
> > +static irq_hw_number_t dev_msi_get_hwirq(struct msi_domain_info *info,
> > +					 msi_alloc_info_t *arg)
> > +{
> > +	return arg->hwirq;
> > +}
> > +
> > +static irq_hw_number_t dev_msi_calc_hwirq(struct msi_desc *desc) {
> > +	u32 devid;
> > +
> > +	devid = desc->platform.msi_priv_data->devid;
> > +
> > +	return (devid << (32 - DEV_ID_SHIFT)) | desc->platform.msi_index; }
> > +
> > +static void dev_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc
> > +*desc) {
> > +	arg->hwirq = dev_msi_calc_hwirq(desc); }
> > +
> > +static int dev_msi_prepare(struct irq_domain *domain, struct device *dev,
> > +			   int nvec, msi_alloc_info_t *arg) {
> > +	memset(arg, 0, sizeof(*arg));
> > +
> > +	return 0;
> > +}
> > +
> > +static struct msi_domain_ops dev_msi_domain_ops = {
> > +	.get_hwirq      = dev_msi_get_hwirq,
> > +	.set_desc       = dev_msi_set_desc,
> > +	.msi_prepare	= dev_msi_prepare,
> > +};
> > +
> > +static struct irq_chip dev_msi_controller = {
> > +	.name                   = "DEV-MSI",
> > +	.irq_unmask             = platform_msi_unmask_irq,
> > +	.irq_mask               = platform_msi_mask_irq,
> 
> This seems pretty odd, see below.

Ok..
> 
> > +	.irq_write_msi_msg      = platform_msi_write_msg,
> > +	.irq_ack                = irq_chip_ack_parent,
> > +	.irq_retrigger          = irq_chip_retrigger_hierarchy,
> > +	.flags                  = IRQCHIP_SKIP_SET_WAKE,
> > +};
> > +
> > +static struct msi_domain_info dev_msi_domain_info = {
> > +	.flags          = MSI_FLAG_USE_DEF_DOM_OPS |
> MSI_FLAG_USE_DEF_CHIP_OPS,
> > +	.ops            = &dev_msi_domain_ops,
> > +	.chip           = &dev_msi_controller,
> > +	.handler        = handle_edge_irq,
> > +	.handler_name   = "edge",
> > +};
> > +
> > +static int __init create_dev_msi_domain(void) {
> > +	struct irq_domain *parent = NULL;
> > +	struct fwnode_handle *fn;
> > +
> > +	/*
> > +	 * Modern code should never have to use irq_get_default_host. But
> since
> > +	 * dev-msi is invisible to DT/ACPI, this is an exception case.
> > +	 */
> > +	parent = irq_get_default_host();
> 
> Really? How is it going to work once you have devices sending their MSIs to two
> different downstream blocks? This looks rather short-sighted.

So after some thought, I've realized that we don't need to introduce two IRQ domains
(one with and one without interrupt remapping enabled).
Hence, the above is dropped in the next version of the patches.
> 
> > +	if (!parent)
> > +		return -ENXIO;
> > +
> > +	fn = irq_domain_alloc_named_fwnode("DEV_MSI");
> > +	if (!fn)
> > +		return -ENXIO;
> > +
> > +	dev_msi_default_domain = msi_create_irq_domain(fn,
> &dev_msi_domain_info, parent);
> > +	if (!dev_msi_default_domain) {
> > +		pr_warn("failed to initialize irqdomain for DEV-MSI.\n");
> > +		return -ENXIO;
> > +	}
> > +
> > +	irq_domain_update_bus_token(dev_msi_default_domain,
> DOMAIN_BUS_PLATFORM_MSI);
> > +	irq_domain_free_fwnode(fn);
> > +
> > +	return 0;
> > +}
> > +device_initcall(create_dev_msi_domain);
> > diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c
> > index 9d94cd699468..5e1f210d65ee 100644
> > --- a/drivers/base/platform-msi.c
> > +++ b/drivers/base/platform-msi.c
> > @@ -12,21 +12,7 @@
> >  #include <linux/irqdomain.h>
> >  #include <linux/msi.h>
> >  #include <linux/slab.h>
> > -
> > -#define DEV_ID_SHIFT	21
> > -#define MAX_DEV_MSIS	(1 << (32 - DEV_ID_SHIFT))
> > -
> > -/*
> > - * Internal data structure containing a (made up, but unique) devid
> > - * and the platform-msi ops
> > - */
> > -struct platform_msi_priv_data {
> > -	struct device			*dev;
> > -	void				*host_data;
> > -	msi_alloc_info_t		arg;
> > -	const struct platform_msi_ops	*ops;
> > -	int				devid;
> > -};
> > +#include "platform-msi.h"
> >
> >  /* The devid allocator */
> >  static DEFINE_IDA(platform_msi_devid_ida);
> > @@ -76,7 +62,7 @@ static void platform_msi_update_dom_ops(struct
> msi_domain_info *info)
> >  		ops->set_desc = platform_msi_set_desc;  }
> >
> > -static void platform_msi_write_msg(struct irq_data *data, struct
> > msi_msg *msg)
> > +void platform_msi_write_msg(struct irq_data *data, struct msi_msg
> > +*msg)
> 
> It really begs the question: Why are you inventing a whole new "DEV-MSI" when
> this really is platform-MSI?

platform-msi is platform-custom but device-driver-opaque MSI setup/control. With dev-msi, we add the
following:
1. device-specific mask/unmask functions
2. a new dev-msi domain to set up/control MSI on these devices
3. explicitly denying PCI devices the use of the dev_msi alloc/free calls, something not currently in platform-msi (see the sketch below)

We are not really inventing anything new, but only extending platform-msi to cover new groups of devices.
We will be sending out the next version of the patches shortly; please let me know if you have any naming suggestions
for this extension.
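
For (3), the check itself would be trivial; roughly something like this (the function
name is only illustrative, and this assumes the ops-based
platform_msi_domain_alloc_irqs() variant from the first patch of the series):

static int dev_msi_alloc_irqs(struct device *dev, unsigned int nvec,
			      const struct platform_msi_ops *ops)
{
	/* dev-msi is only for message interrupts outside of PCI MSI/-X */
	if (dev_is_pci(dev))
		return -EINVAL;

	return platform_msi_domain_alloc_irqs(dev, nvec, ops);
}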

> 
> >  {
> >  	struct msi_desc *desc = irq_data_get_msi_desc(data);
> >  	struct platform_msi_priv_data *priv_data; @@ -86,6 +72,33 @@ static
> > void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
> >  	priv_data->ops->write_msg(desc, msg);  }
> >
> > +static void __platform_msi_desc_mask_unmask_irq(struct msi_desc
> > +*desc, u32 mask) {
> > +	const struct platform_msi_ops *ops;
> > +
> > +	ops = desc->platform.msi_priv_data->ops;
> > +	if (!ops)
> > +		return;
> > +
> > +	if (mask) {
> > +		if (ops->irq_mask)
> > +			ops->irq_mask(desc);
> > +	} else {
> > +		if (ops->irq_unmask)
> > +			ops->irq_unmask(desc);
> > +	}
> > +}
> > +
> > +void platform_msi_mask_irq(struct irq_data *data) {
> > +	__platform_msi_desc_mask_unmask_irq(irq_data_get_msi_desc(data),
> 1);
> > +}
> > +
> > +void platform_msi_unmask_irq(struct irq_data *data) {
> > +	__platform_msi_desc_mask_unmask_irq(irq_data_get_msi_desc(data),
> 0);
> > +}
> > +
> 
> I don't immediately get why you have this code at the platform MSI level. Until
> now, we only had the programming of the message into the end-point, which is
> a device-specific action (and the whole reason why this silly platform MSI exists)
> 
> On the other hand, masking an interrupt is an irqchip operation, and only
> concerns the irqchip level. Here, you seem to be making it an end-point
> operation, which doesn't really make sense to me. Or is this device its own
> interrupt controller as well? That would be extremely surprising, and I'd expect
> some block downstream of the device to be able to control the masking of the
> interrupt.

Hmmm, I don't fully understand this. Ultimately the mask/unmask is a device operation, right?
Some new devices may want the option to mask/unmask interrupts at a non-standard location.
These callbacks are a way for the device to tell the core how exactly interrupts can be masked/unmasked
on that device, no different from PCI mask/unmask, except that it happens at a device-custom location...
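
As a (completely made-up) example of what a driver would plug in here, using the
void returns discussed earlier; my_dev, MY_DEV_VEC_CTRL() and my_dev_write_msg()
below are all hypothetical:

/*
 * Hypothetical device: each vector has an enable bit in a
 * device-specific MMIO register (MY_DEV_VEC_CTRL() is made up).
 */
static void my_dev_irq_mask(struct msi_desc *desc)
{
	struct my_dev *md = dev_get_drvdata(msi_desc_to_dev(desc));

	iowrite32(0, md->mmio + MY_DEV_VEC_CTRL(desc->platform.msi_index));
}

static void my_dev_irq_unmask(struct msi_desc *desc)
{
	struct my_dev *md = dev_get_drvdata(msi_desc_to_dev(desc));

	iowrite32(1, md->mmio + MY_DEV_VEC_CTRL(desc->platform.msi_index));
}

static const struct platform_msi_ops my_dev_msi_ops = {
	.irq_mask	= my_dev_irq_mask,
	.irq_unmask	= my_dev_irq_unmask,
	.write_msg	= my_dev_write_msg,
};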

> 
> >  static void platform_msi_update_chip_ops(struct msi_domain_info
> > *info)  {
> >  	struct irq_chip *chip = info->chip;
> > diff --git a/drivers/base/platform-msi.h b/drivers/base/platform-msi.h
> > new file mode 100644 index 000000000000..1de8c2874218
> > --- /dev/null
> > +++ b/drivers/base/platform-msi.h
> > @@ -0,0 +1,23 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright © 2020 Intel Corporation.
> > + *
> > + * Author: Megha Dey <megha.dey@intel.com>  */
> 
> Or not. You are merely moving existing code, not authoring it. Either keep the
> original copyright attribution, or drop this mention altogether.

sure
> 
> > +
> > +#include <linux/msi.h>
> > +
> > +#define DEV_ID_SHIFT    21
> > +#define MAX_DEV_MSIS	(1 << (32 - DEV_ID_SHIFT))
> > +
> > +/*
> > + * Data structure containing a (made up, but unique) devid
> > + * and the platform-msi ops.
> > + */
> > +struct platform_msi_priv_data {
> > +	struct device			*dev;
> > +	void				*host_data;
> > +	msi_alloc_info_t		arg;
> > +	const struct platform_msi_ops	*ops;
> > +	int				devid;
> > +};
> > diff --git a/include/linux/msi.h b/include/linux/msi.h index
> > 7f6a8eb51aca..1da97f905720 100644
> > --- a/include/linux/msi.h
> > +++ b/include/linux/msi.h
> > @@ -323,9 +323,13 @@ enum {
> >
> >  /*
> >   * platform_msi_ops - Callbacks for platform MSI ops
> > + * @irq_mask:   mask an interrupt source
> > + * @irq_unmask: unmask an interrupt source
> >   * @write_msg:	write message content
> >   */
> >  struct platform_msi_ops {
> > +	unsigned int            (*irq_mask)(struct msi_desc *desc);
> > +	unsigned int            (*irq_unmask)(struct msi_desc *desc);
> >  	irq_write_msi_msg_t	write_msg;
> >  };
> >
> > @@ -370,6 +374,10 @@ int platform_msi_domain_alloc(struct irq_domain
> > *domain, unsigned int virq,  void platform_msi_domain_free(struct irq_domain
> *domain, unsigned int virq,
> >  			      unsigned int nvec);
> >  void *platform_msi_get_host_data(struct irq_domain *domain);
> > +
> > +void platform_msi_write_msg(struct irq_data *data, struct msi_msg
> > +*msg); void platform_msi_unmask_irq(struct irq_data *data); void
> > +platform_msi_mask_irq(struct irq_data *data);
> >  #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
> >
> >  #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
> >
> >
> 
> Thanks,
> 
> 	M.
> 
> --
> Without deviation from the norm, progress is not possible.
Dey, Megha Aug. 5, 2020, 7:18 p.m. UTC | #9
Hi Jason,

> -----Original Message-----
> From: Jason Gunthorpe <jgg@mellanox.com>
> Sent: Wednesday, July 22, 2020 12:59 PM
> To: Marc Zyngier <maz@kernel.org>
> Subject: Re: [PATCH RFC v2 02/18] irq/dev-msi: Add support for a new DEV_MSI
> irq domain
> 
> On Wed, Jul 22, 2020 at 07:52:33PM +0100, Marc Zyngier wrote:
> 
> > Which is exactly what platform-MSI already does. Why do we need
> > something else?
> 
> It looks to me like all the code is around managing the
> dev->msi_domain of the devices.
> 
> The intended use would have PCI drivers create children devices using mdev or
> virtbus and those devices wouldn't have a msi_domain from the platform. Looks
> like platform_msi_alloc_priv_data() fails immediately because dev->msi_domain
> will be NULL for these kinds of devices.
> 
> Maybe that issue should be handled directly instead of wrappering
> platform_msi_*?
> 
> For instance a trivial addition to the platform_msi API:
> 
>   platform_msi_assign_domain(struct_device *newly_created_virtual_device,
>                              struct device *physical_device);
> 
> Which could set the msi_domain of new device using the topology of
> physical_device to deduce the correct domain?
> 
> Then the question is how to properly create a domain within the hardware
> topology of physical_device with the correct parameters for the platform.
> 
> Why do we need a dummy msi_domain anyhow? Can this just use
> physical_device->msi_domain directly? (I'm at my limit here of how much of this
> I remember, sorry)
> 
> If you solve that it should solve the remapping problem too, as the
> physical_device is already assigned by the platform to a remapping irq domain if
> that is what the platform wants.

Yeah, most of what you said is right. For the most part, we are simply introducing a new IRQ domain
which provides specific domain info ops for the classes of devices that want to provide custom
mask/unmask callbacks.

Also, from your other comments, I've realized the same IRQ domain can be used whether interrupt
remapping is enabled or disabled.

Hence we will only have one create_dev_msi_domain which can be called by any device driver that
wants to use the dev-msi IRQ domain to alloc/free IRQs. It would be the responsibility of the device
driver to provide the correct device and update the dev->msi_domain.
  
> 
> >> +	parent = irq_get_default_host();
> > Really? How is it going to work once you have devices sending their
> > MSIs to two different downstream blocks? This looks rather
> > short-sighted.
> 
> .. and fix this too, the parent domain should be derived from the topology of the
> physical_device which is originating the interrupt messages.
> 
Yes

> > On the other hand, masking an interrupt is an irqchip operation, and
> > only concerns the irqchip level. Here, you seem to be making it an
> > end-point operation, which doesn't really make sense to me. Or is this
> > device its own interrupt controller as well? That would be extremely
> > surprising, and I'd expect some block downstream of the device to be
> > able to control the masking of the interrupt.
> 
> These are message interrupts so they originate directly from the device and
> generally travel directly to the CPU APIC. On the wire there is no difference
> between a MSI, MSI-X and a device using the dev-msi approach.
> 
> IIRC on Intel/AMD at least once a MSI is launched it is not maskable.
> 
> So the model for MSI is always "mask at source". The closest mapping to the
> Linux IRQ model is to say the end device has a irqchip that encapsulates the
> ability of the device to generate the MSI in the first place.
> 
> It looks like existing platform_msi drivers deal with "masking"
> implicitly by halting the device interrupt generation before releasing the
> interrupt and have no way for the generic irqchip layer to mask the interrupt.
> 
> I suppose the motivation to make it explicit is related to vfio using the generic
> mask/unmask functionality?
> 
> Explicit seems better, IMHO.

I don't think I understand this fully. I've still kept the device-specific mask/unmask calls in the next
patch series; please let me know if they need further modifications.
> 
> Jason
Jason Gunthorpe Aug. 5, 2020, 10:15 p.m. UTC | #10
On Wed, Aug 05, 2020 at 07:18:39PM +0000, Dey, Megha wrote:

> Hence we will only have one create_dev_msi_domain which can be
> called by any device driver that wants to use the dev-msi IRQ domain
> to alloc/free IRQs. It would be the responsibility of the device
> driver to provide the correct device and update the dev->msi_domain.

I'm not sure that sounds like a good idea, why should a device driver
touch dev->msi_domain?

There was a certain appeal to the api I suggested by having everything
related to setting up the new IRQs being in the core code.

Jason
Dey, Megha Aug. 5, 2020, 10:36 p.m. UTC | #11
Hi Jason,

> -----Original Message-----
> From: Jason Gunthorpe <jgg@mellanox.com>
> Sent: Wednesday, August 5, 2020 3:16 PM
> To: Dey, Megha <megha.dey@intel.com>
> Subject: Re: [PATCH RFC v2 02/18] irq/dev-msi: Add support for a new DEV_MSI
> irq domain
> 
> On Wed, Aug 05, 2020 at 07:18:39PM +0000, Dey, Megha wrote:
> 
> > Hence we will only have one create_dev_msi_domain which can be called
> > by any device driver that wants to use the dev-msi IRQ domain to
> > alloc/free IRQs. It would be the responsibility of the device driver
> > to provide the correct device and update the dev->msi_domain.
> 
> I'm not sure that sounds like a good idea, why should a device driver touch dev-
> >msi_domain?
> 
> There was a certain appeal to the api I suggested by having everything related to
> setting up the new IRQs being in the core code.

The basic API to create the dev_msi domain would be:

struct irq_domain *create_dev_msi_irq_domain(struct irq_domain *parent)

This can be called by devices according to their use case.

For example, in the DSA case, it is called from the IRQ remapping driver:
iommu->ir_dev_msi_domain = create_dev_msi_domain(iommu->ir_domain)

and from the dsa mdev driver:
p_dev = get_parent_pci_dev(dev);
iommu = device_to_iommu(p_dev);

dev->msi_domain = iommu->ir_dev_msi_domain;

So we are creating the dev-msi domain in the IRQ remapping driver, where it can be used by other devices that want the same parent IRQ domain and use the dev-msi APIs. In the device driver we are only updating that device's msi_domain to the already-created dev-msi domain.

Other devices (your RDMA driver, etc.) can create their own dev-msi domain by passing the appropriate parent IRQ domain.

We cannot have this in the core code since the parent domain cannot be the same?

Please let me know if you think otherwise..
> 
> Jason
Jason Gunthorpe Aug. 5, 2020, 10:53 p.m. UTC | #12
On Wed, Aug 05, 2020 at 10:36:23PM +0000, Dey, Megha wrote:
> Hi Jason,
> 
> > From: Jason Gunthorpe <jgg@mellanox.com>
> > Sent: Wednesday, August 5, 2020 3:16 PM
> > To: Dey, Megha <megha.dey@intel.com>
> > Subject: Re: [PATCH RFC v2 02/18] irq/dev-msi: Add support for a new DEV_MSI
> > irq domain
> > 
> > On Wed, Aug 05, 2020 at 07:18:39PM +0000, Dey, Megha wrote:
> > 
> > > Hence we will only have one create_dev_msi_domain which can be called
> > > by any device driver that wants to use the dev-msi IRQ domain to
> > > alloc/free IRQs. It would be the responsibility of the device driver
> > > to provide the correct device and update the dev->msi_domain.
> > 
> > I'm not sure that sounds like a good idea, why should a device driver touch dev-
> > >msi_domain?
> > 
> > There was a certain appeal to the api I suggested by having everything related to
> > setting up the new IRQs being in the core code.
> 
> The basic API to create the dev_msi domain would be :
> 
> struct irq_domain *create_dev_msi_irq_domain(struct irq_domain *parent)
> 
> This can be called by devices according to their use case.
> 
> For e.g. in dsa case, it is called from the irq remapping driver:
> iommu->ir_dev_msi_domain = create_dev_msi_domain(iommu->ir_domain)
> 
> and from the dsa mdev driver:
> p_dev = get_parent_pci_dev(dev);
> iommu = device_to_iommu(p_dev);
> 
> dev->msi_domain = iommu->ir_dev_msi_domain;
> 
> So we are creating the domain in the IRQ  remapping domain which can be used by other devices which want to have the same IRQ parent domain and use dev-msi APIs. We are only updating that device's msi_domain to the already created dev-msi domain in the driver. 
> 
> Other devices (your rdma driver etc) can create their own dev-msi domain by passing the appropriate parent IRq domain.
> 
> We cannot have this in the core code since the parent domain cannot
> be the same?

Well, I had suggested to pass in the parent struct device, but it
could certainly use an irq_domain instead:

  platform_msi_assign_domain(dev, device_to_iommu(p_dev)->ir_domain);

Or

  platform_msi_assign_domain(dev, pdev->msi_domain)

?

And maybe the natural expression is to add a version of
platform_msi_create_device_domain() that accepts a parent irq_domain()
and if the device doesn't already have a msi_domain then it creates
one. Might be too tricky to manage lifetime of the new irq_domain
though..
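
A rough sketch of what such a variant could look like (the _with_parent
name and the explicit parent argument are invented here, the remaining
arguments just mirror today's platform_msi_create_device_domain(), and
my_write_msg/my_domain_ops/drvdata stand in for whatever the consuming
driver provides):

  struct irq_domain *
  platform_msi_create_device_domain_with_parent(struct device *dev,
                                                struct irq_domain *parent,
                                                unsigned int nvec,
                                                irq_write_msi_msg_t write_msi_msg,
                                                const struct irq_domain_ops *ops,
                                                void *host_data);

  /* driver side, e.g. in probe: pick the parent explicitly */
  domain = platform_msi_create_device_domain_with_parent(dev,
                        dev_get_msi_domain(&pdev->dev), /* or the IR domain */
                        nvec, my_write_msg, &my_domain_ops, drvdata);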

It feels cleaner to me if everything related to this is contained in
the platform_msi and the driver using it. Not sure it makes sense to
involve the iommu?

Jason
Dey, Megha Aug. 6, 2020, 12:13 a.m. UTC | #13
Hi Jason,

> -----Original Message-----
> From: Jason Gunthorpe <jgg@mellanox.com>
> <SNIP>
> Subject: Re: [PATCH RFC v2 02/18] irq/dev-msi: Add support for a new DEV_MSI
> irq domain
> 
> On Wed, Aug 05, 2020 at 10:36:23PM +0000, Dey, Megha wrote:
> > Hi Jason,
> >
> > > From: Jason Gunthorpe <jgg@mellanox.com>
> > > <SNIP>
> > > Subject: Re: [PATCH RFC v2 02/18] irq/dev-msi: Add support for a new
> > > DEV_MSI irq domain
> > >
> > > On Wed, Aug 05, 2020 at 07:18:39PM +0000, Dey, Megha wrote:
> > >
> > > > Hence we will only have one create_dev_msi_domain which can be
> > > > called by any device driver that wants to use the dev-msi IRQ
> > > > domain to alloc/free IRQs. It would be the responsibility of the
> > > > device driver to provide the correct device and update the dev-
> >msi_domain.
> > >
> > > I'm not sure that sounds like a good idea, why should a device
> > > driver touch dev-
> > > >msi_domain?
> > >
> > > There was a certain appeal to the api I suggested by having
> > > everything related to setting up the new IRQs being in the core code.
> >
> > The basic API to create the dev_msi domain would be :
> >
> > struct irq_domain *create_dev_msi_irq_domain(struct irq_domain
> > *parent)
> >
> > This can be called by devices according to their use case.
> >
> > For e.g. in dsa case, it is called from the irq remapping driver:
> > iommu->ir_dev_msi_domain = create_dev_msi_domain(iommu->ir_domain)
> >
> > and from the dsa mdev driver:
> > p_dev = get_parent_pci_dev(dev);
> > iommu = device_to_iommu(p_dev);
> >
> > dev->msi_domain = iommu->ir_dev_msi_domain;
> >
> > So we are creating the domain in the IRQ  remapping domain which can be
> used by other devices which want to have the same IRQ parent domain and use
> dev-msi APIs. We are only updating that device's msi_domain to the already
> created dev-msi domain in the driver.
> >
> > Other devices (your rdma driver etc) can create their own dev-msi domain by
> passing the appropriate parent IRq domain.
> >
> > We cannot have this in the core code since the parent domain cannot be
> > the same?
> 
> Well, I had suggested to pass in the parent struct device, but it could certainly
> use an irq_domain instead:
> 
>   platform_msi_assign_domain(dev, device_to_iommu(p_dev)->ir_domain);
> 
> Or
> 
>   platform_msi_assign_domain(dev, pdev->msi_domain)
> 
> ?
> 
> Any maybe the natural expression is to add a version of
> platform_msi_create_device_domain() that accepts a parent irq_domain() and if
> the device doesn't already have a msi_domain then it creates one. Might be too
> tricky to manage lifetime of the new irq_domain though..
> 
> It feels cleaner to me if everything related to this is contained in the
> platform_msi and the driver using it. Not sure it makes sense to involve the
> iommu?

Well, yeah, something like this can be done, but the missing piece is where the IRQ domain actually gets created, i.e. where this new version of platform_msi_create_device_domain() is called. That is the only piece that is currently done in the IOMMU driver, and only for the DSA mdev. Not that all devices need to do it this way... do you have suggestions as to where you want this function to be called?
> 
> Jason
Jason Gunthorpe Aug. 6, 2020, 12:19 a.m. UTC | #14
On Thu, Aug 06, 2020 at 12:13:24AM +0000, Dey, Megha wrote:
> > Well, I had suggested to pass in the parent struct device, but it could certainly
> > use an irq_domain instead:
> > 
> >   platform_msi_assign_domain(dev, device_to_iommu(p_dev)->ir_domain);
> > 
> > Or
> > 
> >   platform_msi_assign_domain(dev, pdev->msi_domain)
> > 
> > ?
> > 
> > Any maybe the natural expression is to add a version of
> > platform_msi_create_device_domain() that accepts a parent irq_domain() and if
> > the device doesn't already have a msi_domain then it creates one. Might be too
> > tricky to manage lifetime of the new irq_domain though..
> > 
> > It feels cleaner to me if everything related to this is contained in the
> > platform_msi and the driver using it. Not sure it makes sense to involve the
> > iommu?
> 
> Well yeah something like this can be done, but what is the missing
> piece is where the IRQ domain actually gets created, i.e where this
> new version of platform_msi_create_device_domain() is called. That
> is the only piece that is currently done in the IOMMU driver only
> for DSA mdev. Not that all devices need to do it this way.. do you
> have suggestions as to where you want to call this function?

Oops, I was thinking of platform_msi_domain_alloc_irqs() not
create_device_domain()

ie call it in the device driver that wishes to consume the extra
MSIs. 

Is there a harm if each device driver creates a new irq_domain for its
use?

Jason
Dey, Megha Aug. 6, 2020, 12:32 a.m. UTC | #15
Hi Jason,

> -----Original Message-----
> From: Jason Gunthorpe <jgg@mellanox.com>
> <SNIP>
> Subject: Re: [PATCH RFC v2 02/18] irq/dev-msi: Add support for a new DEV_MSI
> irq domain
> 
> On Thu, Aug 06, 2020 at 12:13:24AM +0000, Dey, Megha wrote:
> > > Well, I had suggested to pass in the parent struct device, but it
> > > could certainly use an irq_domain instead:
> > >
> > >   platform_msi_assign_domain(dev,
> > > device_to_iommu(p_dev)->ir_domain);
> > >
> > > Or
> > >
> > >   platform_msi_assign_domain(dev, pdev->msi_domain)
> > >
> > > ?
> > >
> > > Any maybe the natural expression is to add a version of
> > > platform_msi_create_device_domain() that accepts a parent
> > > irq_domain() and if the device doesn't already have a msi_domain
> > > then it creates one. Might be too tricky to manage lifetime of the new
> irq_domain though..
> > >
> > > It feels cleaner to me if everything related to this is contained in
> > > the platform_msi and the driver using it. Not sure it makes sense to
> > > involve the iommu?
> >
> > Well yeah something like this can be done, but what is the missing
> > piece is where the IRQ domain actually gets created, i.e where this
> > new version of platform_msi_create_device_domain() is called. That is
> > the only piece that is currently done in the IOMMU driver only for DSA
> > mdev. Not that all devices need to do it this way.. do you have
> > suggestions as to where you want to call this function?
> 
> Oops, I was thinking of platform_msi_domain_alloc_irqs() not
> create_device_domain()
> 
> ie call it in the device driver that wishes to consume the extra MSIs.
> 
> Is there a harm if each device driver creates a new irq_domain for its use?

Well, the only harm is if we want to reuse the irq domain.

As of today, we only have the DSA mdev which uses the dev-msi domain. In the IRQ domain hierarchy,
we will have this:

vector -> intel-IR -> dev-msi

So tomorrow if we have a new device, which would also want to have the intel-IR as the parent and use the same domain ops, we will simply be creating a copy of this IRQ domain, which may not be very fruitful.

But apart from that, I don't think there are any issues..

What do you think is the best approach here?
> 
> Jason
Jason Gunthorpe Aug. 6, 2020, 12:46 a.m. UTC | #16
On Thu, Aug 06, 2020 at 12:32:31AM +0000, Dey, Megha wrote:
> > Oops, I was thinking of platform_msi_domain_alloc_irqs() not
> > create_device_domain()
> > 
> > ie call it in the device driver that wishes to consume the extra MSIs.
> > 
> > Is there a harm if each device driver creates a new irq_domain for its use?
> 
> Well, the only harm is if we want to reuse the irq domain.
> 
> As of today, we only have DSA mdev which uses the dev-msi domain. In the IRQ domain hierarchy,
> We will have this:
> 
> Vector-> intel-ir->dev-msi 
> 
> So tmrw if we have a new device, which would also want to have the
> intel-ir as the parent and use the same domain ops, we will simply
> be creating a copy of this IRQ domain, which may not be very
> fruitful.
>
> But apart from that, I don't think there are any issues..
> 
> What do you think is the best approach here?

I've surely forgotten these details, I can't advise if duplicate
irq_domains are very bad.

A single domain per parent irq_domain does seem more elegant, but I'm
not sure it is worth the extra work to do it?

In any event the API seems cleaner if it is all contained in the
platform_msi and strongly connected to the driver, not spread to the
iommu as well. If it had to create a single dev-msi domain per parent
irq_domain then it certainly could be done in a few ways.

Jason
Thomas Gleixner Aug. 6, 2020, 5:10 p.m. UTC | #17
Megha,

"Dey, Megha" <megha.dey@intel.com> writes:

>> -----Original Message-----
>> From: Jason Gunthorpe <jgg@mellanox.com>
<SNIP>
>> Subject: Re: [PATCH RFC v2 02/18] irq/dev-msi: Add support for a new DEV_MSI
>> irq domain

can you please fix your mail client not to copy the whole header of the
mail you are replying to into the mail body?

>> > > Well, I had suggested to pass in the parent struct device, but it
>> Oops, I was thinking of platform_msi_domain_alloc_irqs() not
>> create_device_domain()
>> 
>> ie call it in the device driver that wishes to consume the extra MSIs.
>> 
>> Is there a harm if each device driver creates a new irq_domain for its use?
>
> Well, the only harm is if we want to reuse the irq domain.

You cannot reuse the irq domain if you create a domain per driver. The
way how hierarchical domains work is:

vector --- DMAR-MSI
       |
       |-- ....
       |
       |-- IR-0 --- IO/APIC-0
       |        | 
       |        |-- IO/APIC-1
       |        |
       |        |-- PCI/MSI-0
       |        |
       |        |-- HPET/MSI-0
       |
       |-- IR-1 --- PCI/MSI-1
       |        |

The outermost domain is what the actual device driver uses. I.e. for
PCI-MSI it's the msi domain which is associated to the bus the device is
connected to. Each domain has its own interrupt chip instance and its
own data set.

Domains of the same type share the code, but neither the data nor the
interrupt chip instance.

Also there is a strict parent child relationship in terms of resources.
Let's look at PCI.

PCI/MSI-0 depends on IR-0 which depends on the vector domain. That's
reflecting both the flow of the interrupt and the steps required for
various tasks, e.g. allocation/deallocation and also interrupt chip
operations. In order to allocate a PCI/MSI interrupt in domain PCI/MSI-0
a slot in the remapping unit and a vector needs to be allocated.

If you disable interrupt remapping all the outermost domains in the
scheme above become children of the vector domain.

So if we look at DEV/MSI as an infrastructure domain then the scheme
looks like this:

vector --- DMAR-MSI
       |
       |-- ....
       |
       |-- IR-0 --- IO/APIC-0
       |        | 
       |        |-- IO/APIC-1
       |        |
       |        |-- PCI/MSI-0
       |        |
       |        |-- HPET/MSI-0
       |        |
       |        |-- DEV/MSI-0
       |
       |-- IR-1 --- PCI/MSI-1
       |        |
       |        |-- DEV/MSI-1


But if you make it per device then you have multiple DEV/MSI domains per
IR unit.

What's the right thing to do?

If the DEV/MSI domain has its own per IR unit resource management, then
you need one per IR unit.

If the resource management is solely per device then having a domain per
device is the right choice.

Thanks,

        tglx
Dey, Megha Aug. 6, 2020, 5:58 p.m. UTC | #18
Hi Thomas,

On 8/6/2020 10:10 AM, Thomas Gleixner wrote:
> Megha,
>
> "Dey, Megha" <megha.dey@intel.com> writes:
>
>>> -----Original Message-----
>>> From: Jason Gunthorpe <jgg@mellanox.com>
> <SNIP>
>>> Subject: Re: [PATCH RFC v2 02/18] irq/dev-msi: Add support for a new DEV_MSI
>>> irq domain
> can you please fix your mail client not to copy the whole header of the
> mail you are replying to into the mail body?

Oops, I hope I have fixed it now..

>
>>>>> Well, I had suggested to pass in the parent struct device, but it
>>> Oops, I was thinking of platform_msi_domain_alloc_irqs() not
>>> create_device_domain()
>>>
>>> ie call it in the device driver that wishes to consume the extra MSIs.
>>>
>>> Is there a harm if each device driver creates a new irq_domain for its use?
>> Well, the only harm is if we want to reuse the irq domain.
> You cannot reuse the irq domain if you create a domain per driver. The
> way how hierarchical domains work is:
>
> vector --- DMAR-MSI
>         |
>         |-- ....
>         |
>         |-- IR-0 --- IO/APIC-0
>         |        |
>         |        |-- IO/APIC-1
>         |        |
>         |        |-- PCI/MSI-0
>         |        |
>         |        |-- HPET/MSI-0
>         |
>         |-- IR-1 --- PCI/MSI-1
>         |        |
>
> The outermost domain is what the actual device driver uses. I.e. for
> PCI-MSI it's the msi domain which is associated to the bus the device is
> connected to. Each domain has its own interrupt chip instance and its
> own data set.
>
> Domains of the same type share the code, but neither the data nor the
> interrupt chip instance.
>
> Also there is a strict parent child relationship in terms of resources.
> Let's look at PCI.
>
> PCI/MSI-0 depends on IR-0 which depends on the vector domain. That's
> reflecting both the flow of the interrupt and the steps required for
> various tasks, e.g. allocation/deallocation and also interrupt chip
> operations. In order to allocate a PCI/MSI interrupt in domain PCI/MSI-0
> a slot in the remapping unit and a vector needs to be allocated.
>
> If you disable interrupt remapping all the outermost domains in the
> scheme above become childs of the vector domain.
>
> So if we look at DEV/MSI as a infrastructure domain then the scheme
> looks like this:
>
> vector --- DMAR-MSI
>         |
>         |-- ....
>         |
>         |-- IR-0 --- IO/APIC-0
>         |        |
>         |        |-- IO/APIC-1
>         |        |
>         |        |-- PCI/MSI-0
>         |        |
>         |        |-- HPET/MSI-0
>         |        |
>         |        |-- DEV/MSI-0
>         |
>         |-- IR-1 --- PCI/MSI-1
>         |        |
>         |        |-- DEV/MSI-1
>
>
> But if you make it per device then you have multiple DEV/MSI domains per
> IR unit.
>
> What's the right thing to do?
>
> If the DEV/MSI domain has it's own per IR unit resource management, then
> you need one per IR unit.
>
> If the resource management is solely per device then having a domain per
> device is the right choice.

Thanks a lot Thomas for this detailed explanation!!

The dev-msi domain can be used by other devices if they too want to follow the
vector -> intel-IR -> dev-msi IRQ hierarchy. I do create one dev-msi IRQ domain
instance per IR unit. So I guess for this case it makes most sense to have a
dev-msi IRQ domain per IR unit as opposed to creating one per individual driver.
>
> Thanks,
>
>          tglx
Thomas Gleixner Aug. 6, 2020, 8:21 p.m. UTC | #19
Megha,

"Dey, Megha" <megha.dey@intel.com> writes:
> On 8/6/2020 10:10 AM, Thomas Gleixner wrote:
>> If the DEV/MSI domain has it's own per IR unit resource management, then
>> you need one per IR unit.
>>
>> If the resource management is solely per device then having a domain per
>> device is the right choice.
>
> The dev-msi domain can be used by other devices if they too would want
> to follow the vector->intel IR->dev-msi IRQ hierarchy.  I do create
> one dev-msi IRQ domain instance per IR unit. So I guess for this case,
> it makes most sense to have a dev-msi IRQ domain per IR unit as
> opposed to create one per individual driver..

I'm not really convinced. I looked at the idxd driver and that has its
own interrupt related resource management for the IMS slots and provides
the mask,unmask callbacks for the interrupt chip via this crude platform
data indirection.

So I don't see the value of the dev-msi domain per IR unit. The domain
itself does not provide much functionality other than indirections and
you clearly need per device interrupt resource management on the side
and a customized irq chip. I rather see it as a plain layering
violation.

The point is that your IDXD driver manages the per device IMS slots
which is an interrupt related resource. The story would be different if
the IMS slots were managed by some central or per IR unit entity,
but in that case you'd need IMS specific domain(s).

So the obvious consequence of the hierarchical irq design is:

   vector -> IR -> IDXD

which makes the control flow of allocating an interrupt for a subdevice
straight forward following the irq hierarchy rules.

This still wants to inherit the existing msi domain functionality, but
the amount of code required is small and removes all these pointless
indirections and integrates the slot management naturally.

If you expect or know that there are other devices coming up with IMS
integrated then most of that code can be made a common library. But for
this to make sense, you really want to make sure that these other
devices do not require yet another horrible layer of indirection.

A side note: I just read back on the specification and stumbled over
the following gem:

 "IMS may also optionally support per-message masking and pending bit
  status, similar to the per-vector mask and pending bit array in the
  PCI Express MSI-X capability."

Optionally? Please tell the hardware folks to make this mandatory. We
have enough pain with non maskable MSI interrupts already so introducing
yet another non maskable interrupt trainwreck is not an option.

It's more than a decade now that I tell HW people not to repeat the
non-maskable MSI failure, but obviously they still think that
non-maskable interrupts are a brilliant idea. I know that HW folks
believe that everything they omit can be fixed in software, but they
have to finally understand that this particular issue _cannot_ be fixed
at all.

Thanks,

        tglx
Dey, Megha Aug. 6, 2020, 10:27 p.m. UTC | #20
Hi Thomas,

On 8/6/2020 1:21 PM, Thomas Gleixner wrote:
> Megha,
>
> "Dey, Megha" <megha.dey@intel.com> writes:
>> On 8/6/2020 10:10 AM, Thomas Gleixner wrote:
>>> If the DEV/MSI domain has it's own per IR unit resource management, then
>>> you need one per IR unit.
>>>
>>> If the resource management is solely per device then having a domain per
>>> device is the right choice.
>> The dev-msi domain can be used by other devices if they too would want
>> to follow the vector->intel IR->dev-msi IRQ hierarchy.  I do create
>> one dev-msi IRQ domain instance per IR unit. So I guess for this case,
>> it makes most sense to have a dev-msi IRQ domain per IR unit as
>> opposed to create one per individual driver..
> I'm not really convinced. I looked at the idxd driver and that has it's
> own interrupt related resource management for the IMS slots and provides
> the mask,unmask callbacks for the interrupt chip via this crude platform
> data indirection.
>
> So I don't see the value of the dev-msi domain per IR unit. The domain
> itself does not provide much functionality other than indirections and
> you clearly need per device interrupt resource management on the side
> and a customized irq chip. I rather see it as a plain layering
> violation.
>
> The point is that your IDXD driver manages the per device IMS slots
> which is a interrupt related resource. The story would be different if
> the IMS slots would be managed by some central or per IR unit entity,
> but in that case you'd need IMS specific domain(s).
>
> So the obvious consequence of the hierarchical irq design is:
>
>     vector -> IR -> IDXD
>
> which makes the control flow of allocating an interrupt for a subdevice
> straight forward following the irq hierarchy rules.
>
> This still wants to inherit the existing msi domain functionality, but
> the amount of code required is small and removes all these pointless
> indirections and integrates the slot management naturally.
>
> If you expect or know that there are other devices coming up with IMS
> integrated then most of that code can be made a common library. But for
> this to make sense, you really want to make sure that these other
> devices do not require yet another horrible layer of indirection.
Yes Thomas, for now this may look odd since there is only one device using this
IRQ domain. But there will be other devices following suit, hence I have added
all the IRQ chip/domain bits in a separate file in drivers/irqchip in the next
version of patches. I'll submit the patches shortly and it would be great if I
could get more feedback on them.
> A side note: I just read back on the specification and stumbled over
> the following gem:
>
>   "IMS may also optionally support per-message masking and pending bit
>    status, similar to the per-vector mask and pending bit array in the
>    PCI Express MSI-X capability."
>
> Optionally? Please tell the hardware folks to make this mandatory. We
> have enough pain with non maskable MSI interrupts already so introducing
> yet another non maskable interrupt trainwreck is not an option.
>
> It's more than a decade now that I tell HW people not to repeat the
> non-maskable MSI failure, but obviously they still think that
> non-maskable interrupts are a brilliant idea. I know that HW folks
> believe that everything they omit can be fixed in software, but they
> have to finally understand that this particular issue _cannot_ be fixed
> at all.
Hmm, I asked the hardware folks and they have informed me that all IMS devices
will support per-vector masking/pending bits. This will be updated in the next
SIOV spec, which will be published soon.
>
> Thanks,
>
>          tglx
Thomas Gleixner Aug. 7, 2020, 8:48 a.m. UTC | #21
Megha,

"Dey, Megha" <megha.dey@intel.com> writes:
> On 8/6/2020 1:21 PM, Thomas Gleixner wrote:
>> If you expect or know that there are other devices coming up with IMS
>> integrated then most of that code can be made a common library. But for
>> this to make sense, you really want to make sure that these other
>> devices do not require yet another horrible layer of indirection.
>
> Yes Thomas, for now this may look odd since there is only one device
> using this IRQ domain. But there will be other devices following suit,
> hence I have added all the IRQ chip/domain bits in a separate file in
> drivers/irqchip in the next version of patches. I'll submit the
> patches shortly and it will be great if I can get more feedback on it.

Again. The common domain only makes sense if it provides actual
functionality and resource management at the domain level. The IMS slot
management CANNOT happen at the common domain level simply because IMS
is strictly per device. So your "common" domain is just a shim layer
which pretends to be common and requires warts at the side to do the IMS
management at the device level.

Let's see what you came up with this time :)

>> A side note: I just read back on the specification and stumbled over
>> the following gem:
>>
>>   "IMS may also optionally support per-message masking and pending bit
>>    status, similar to the per-vector mask and pending bit array in the
>>    PCI Express MSI-X capability."
>>
>> Optionally? Please tell the hardware folks to make this mandatory. We
>> have enough pain with non maskable MSI interrupts already so introducing
>> yet another non maskable interrupt trainwreck is not an option.
>>
>> It's more than a decade now that I tell HW people not to repeat the
>> non-maskable MSI failure, but obviously they still think that
>> non-maskable interrupts are a brilliant idea. I know that HW folks
>> believe that everything they omit can be fixed in software, but they
>> have to finally understand that this particular issue _cannot_ be fixed
>> at all.
>
> hmm, I asked the hardware folks and they have informed me that all IMS
> devices will support per vector masking/pending bit. This will be
> updated in the next SIOV spec which will be published soon.

I seriously hope so...

Thanks,

        tglx
Jason Gunthorpe Aug. 7, 2020, 12:06 p.m. UTC | #22
On Thu, Aug 06, 2020 at 10:21:11PM +0200, Thomas Gleixner wrote:

> Optionally? Please tell the hardware folks to make this mandatory. We
> have enough pain with non maskable MSI interrupts already so introducing
> yet another non maskable interrupt trainwreck is not an option.

Can you elaborate on the flows where Linux will need to trigger
masking?

I expect that masking will be available in our NIC HW too - but it
will require a spin loop if masking has to be done in an atomic
context.

> It's more than a decade now that I tell HW people not to repeat the
> non-maskable MSI failure, but obviously they still think that
> non-maskable interrupts are a brilliant idea. I know that HW folks
> believe that everything they omit can be fixed in software, but they
> have to finally understand that this particular issue _cannot_ be fixed
> at all.

Sure, the CPU should always be able to shut off an interrupt!

Maybe explaining the goals would help understand the HW perspective.

Today HW can process > 100k queues of work at once. Interrupt delivery
works by having a MSI index in each queue's metadata and the interrupt
indirects through a MSI-X table on-chip which has the
addr/data/mask/etc.

What IMS proposes is that the interrupt data can move into the queue
meta data (which is not required to be on-chip), eg along side the
producer/consumer pointers, and the central MSI-X table is not
needed. This is necessary because the PCI spec has very harsh design
requirements for a MSI-X table that make scaling it prohibitive.

So an IRQ can be silenced by deleting or stopping the queue(s)
triggering it. It can be masked by including masking in the queue
metadata. We can detect pending by checking the producer/consumer
values.
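
To make that concrete, the per queue metadata could carry something like
this (a purely illustrative layout, not any particular device's format):

  struct queue_meta {
          u64     producer_idx;
          u64     consumer_idx;

          /* IMS: the interrupt message lives with the queue, possibly in
           * system memory instead of a central on-die MSI-X table.
           */
          u64     msi_addr;
          u32     msi_data;
          u32     flags;          /* e.g. a QUEUE_IRQ_MASKED bit the HW honours */
  };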

However synchronizing all the HW and all the state is now more
complicated than just writing a mask bit via MMIO to an on-die memory.

Jason
Greg Kroah-Hartman Aug. 7, 2020, 12:38 p.m. UTC | #23
On Fri, Aug 07, 2020 at 09:06:50AM -0300, Jason Gunthorpe wrote:
> On Thu, Aug 06, 2020 at 10:21:11PM +0200, Thomas Gleixner wrote:
> 
> > Optionally? Please tell the hardware folks to make this mandatory. We
> > have enough pain with non maskable MSI interrupts already so introducing
> > yet another non maskable interrupt trainwreck is not an option.
> 
> Can you elaborate on the flows where Linux will need to trigger
> masking?
> 
> I expect that masking will be available in our NIC HW too - but it
> will require a spin loop if masking has to be done in an atomic
> context.
> 
> > It's more than a decade now that I tell HW people not to repeat the
> > non-maskable MSI failure, but obviously they still think that
> > non-maskable interrupts are a brilliant idea. I know that HW folks
> > believe that everything they omit can be fixed in software, but they
> > have to finally understand that this particular issue _cannot_ be fixed
> > at all.
> 
> Sure, the CPU should always be able to shut off an interrupt!
> 
> Maybe explaining the goals would help understand the HW perspective.
> 
> Today HW can process > 100k queues of work at once. Interrupt delivery
> works by having a MSI index in each queue's metadata and the interrupt
> indirects through a MSI-X table on-chip which has the
> addr/data/mask/etc.
> 
> What IMS proposes is that the interrupt data can move into the queue
> meta data (which is not required to be on-chip), eg along side the
> producer/consumer pointers, and the central MSI-X table is not
> needed. This is necessary because the PCI spec has very harsh design
> requirements for a MSI-X table that make scaling it prohibitive.
> 
> So an IRQ can be silenced by deleting or stopping the queue(s)
> triggering it. It can be masked by including masking in the queue
> metadata. We can detect pending by checking the producer/consumer
> values.
> 
> However synchronizing all the HW and all the state is now more
> complicated than just writing a mask bit via MMIO to an on-die memory.

Because doing all of the work that used to be done in HW in software is
so much faster and scalable?  Feels really wrong to me :(

Do you all have a pointer to the spec for this newly proposed stuff
anywhere to try to figure out how the HW wants this to all work?

thanks,

greg k-h
Jason Gunthorpe Aug. 7, 2020, 1:34 p.m. UTC | #24
On Fri, Aug 07, 2020 at 02:38:31PM +0200, gregkh@linuxfoundation.org wrote:
> On Fri, Aug 07, 2020 at 09:06:50AM -0300, Jason Gunthorpe wrote:
> > On Thu, Aug 06, 2020 at 10:21:11PM +0200, Thomas Gleixner wrote:
> > 
> > > Optionally? Please tell the hardware folks to make this mandatory. We
> > > have enough pain with non maskable MSI interrupts already so introducing
> > > yet another non maskable interrupt trainwreck is not an option.
> > 
> > Can you elaborate on the flows where Linux will need to trigger
> > masking?
> > 
> > I expect that masking will be available in our NIC HW too - but it
> > will require a spin loop if masking has to be done in an atomic
> > context.
> > 
> > > It's more than a decade now that I tell HW people not to repeat the
> > > non-maskable MSI failure, but obviously they still think that
> > > non-maskable interrupts are a brilliant idea. I know that HW folks
> > > believe that everything they omit can be fixed in software, but they
> > > have to finally understand that this particular issue _cannot_ be fixed
> > > at all.
> > 
> > Sure, the CPU should always be able to shut off an interrupt!
> > 
> > Maybe explaining the goals would help understand the HW perspective.
> > 
> > Today HW can process > 100k queues of work at once. Interrupt delivery
> > works by having a MSI index in each queue's metadata and the interrupt
> > indirects through a MSI-X table on-chip which has the
> > addr/data/mask/etc.
> > 
> > What IMS proposes is that the interrupt data can move into the queue
> > meta data (which is not required to be on-chip), eg along side the
> > producer/consumer pointers, and the central MSI-X table is not
> > needed. This is necessary because the PCI spec has very harsh design
> > requirements for a MSI-X table that make scaling it prohibitive.
> > 
> > So an IRQ can be silenced by deleting or stopping the queue(s)
> > triggering it. It can be masked by including masking in the queue
> > metadata. We can detect pending by checking the producer/consumer
> > values.
> > 
> > However synchronizing all the HW and all the state is now more
> > complicated than just writing a mask bit via MMIO to an on-die memory.
> 
> Because doing all of the work that used to be done in HW in software is
> so much faster and scalable?  Feels really wrong to me :(

Yes, it is more scalable. The problem with MSI-X is you need actual
physical silicon for each and every vector. This really limits the
number of vectors.

Placing the vector metadata with the queue means it can potentially
live in system memory which is significantly more scalable.

setup/mask/unmask will be slower. The driver might have more
complexity. But they are not on the performance path, right?

I don't think it is wrong or right. IMHO the current design where the
addr/data is hidden inside the platform is an artifact of x86's
compatibility legacy back when there was no such thing as message
interrupts. 

If you were starting from a green field I don't think a design would
include the IOAPIC/MSI/MSI-X indirection tables.

> Do you all have a pointer to the spec for this newly proposed stuff
> anywhere to try to figure out how the HW wants this to all work?

Intel's SIOV document is an interesting place to start:

https://software.intel.com/content/www/us/en/develop/download/intel-scalable-io-virtualization-technical-specification.html

Though it is more of a rational and a cookbook on how to combine
existing technology pieces. (eg PASID, platform_msi, etc)

The basic approach of SIOV's IMS is that there is no longer a generic
interrupt indirection from numbers to addr/data pairs like
IOAPIC/MSI/MSI-X owned by the common OS code.

Instead the driver itself is responsible to set the addr/data pair
into the device in a device specific way, deal with masking, etc.

This lets the device use an implementation that is not limited by the
harsh MSI-X semantics.

In Linux we already have 'IMS' it is called platform_msi and a few
embedded drivers already work like this. The idea here is to bring it
to PCI.
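
For reference, the flow an existing platform_msi user follows today is
roughly this; the write callback body is entirely device specific and the
mydev_* names and MYDEV_IMS_ENTRY_SIZE below are just placeholders:

  static void mydev_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
  {
          struct mydev *md = dev_get_drvdata(msi_desc_to_dev(desc));
          void __iomem *slot = md->ims_base +
                               desc->platform.msi_index * MYDEV_IMS_ENTRY_SIZE;

          /* program addr/data into the device's own message store */
          writel(msg->address_lo, slot + 0x0);
          writel(msg->address_hi, slot + 0x4);
          writel(msg->data, slot + 0x8);
  }

  /* in probe: */
  ret = platform_msi_domain_alloc_irqs(dev, nvec, mydev_write_msi_msg);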

Jason
Thomas Gleixner Aug. 7, 2020, 3:22 p.m. UTC | #25
Jason,

Jason Gunthorpe <jgg@nvidia.com> writes:
> On Thu, Aug 06, 2020 at 10:21:11PM +0200, Thomas Gleixner wrote:
>
>> Optionally? Please tell the hardware folks to make this mandatory. We
>> have enough pain with non maskable MSI interrupts already so introducing
>> yet another non maskable interrupt trainwreck is not an option.
>
> Can you elaborate on the flows where Linux will need to trigger
> masking?

1) disable/enable_irq() obviously needs masking
   
2) Affinity changes are preferably done with masking to avoid a
   boatload of nasty side effects. We have a "fix" for 32bit addressing
   mode which works by chance due to the layout but it would fail
   miserably with 64bit addressing mode. 64bit addressing mode is only
   relevant for more than 256 CPUs which requires X2APIC which in turn
   requires interrupt remapping. Interrupt remapping saves us here
   because the interrupt can be disabled at the remapping level.

3) The ability to shutdown an irq at the interrupt level in case of
   malfunction. Of course that's pure paranoia because devices are
   perfect and never misbehave :)

So it's nowhere in the hot path of interrupt handling itself.

> I expect that masking will be available in our NIC HW too - but it
> will require a spin loop if masking has to be done in an atomic
> context.

Yes, it's all in atomic context.

We have functionality in the interrupt core to do #1 and #2 from task
context (requires the caller to be in task context as well). #3 not so
much.

>> It's more than a decade now that I tell HW people not to repeat the
>> non-maskable MSI failure, but obviously they still think that
>> non-maskable interrupts are a brilliant idea. I know that HW folks
>> believe that everything they omit can be fixed in software, but they
>> have to finally understand that this particular issue _cannot_ be fixed
>> at all.
>
> Sure, the CPU should always be able to shut off an interrupt!

Oh yes!

> Maybe explaining the goals would help understand the HW perspective.
>
> Today HW can process > 100k queues of work at once. Interrupt delivery
> works by having a MSI index in each queue's metadata and the interrupt
> indirects through a MSI-X table on-chip which has the
> addr/data/mask/etc.
>
> What IMS proposes is that the interrupt data can move into the queue
> meta data (which is not required to be on-chip), eg along side the
> producer/consumer pointers, and the central MSI-X table is not
> needed. This is necessary because the PCI spec has very harsh design
> requirements for a MSI-X table that make scaling it prohibitive.

I know.

> So an IRQ can be silenced by deleting or stopping the queue(s)
> triggering it.

We cannot do that from the interrupt layer without squaring the
circle and violating all locking and layering rules in one go.

> It can be masked by including masking in the queue metadata. We can
> detect pending by checking the producer/consumer values.
>
> However synchronizing all the HW and all the state is now more
> complicated than just writing a mask bit via MMIO to an on-die memory.

That's one of the reasons why I think that the IMS handling has to be a
per device irqdomain with its own interrupt chip, because the way
IMS is managed is completely device specific.

There is certainly opportunity for sharing some of the functionality and
code, but not by creating a pseudo-shared entity which is customized per
device with indirections and magic storage plus device specific IMS slot
management glued at it as a wart. Such concepts fall apart in no time or
end up in a completely unmaintainable mess.

Coming back to mask/unmask. We could lift that requirement if and only
if irq remapping is mandatory to make use of those magic devices because
the remapping unit allows us to do the masking. That still would not
justify the pseudo-shared irqdomain because the IMS slot management
still stays per device.

Thanks,

        tglx
Thomas Gleixner Aug. 7, 2020, 4:47 p.m. UTC | #26
Jason Gunthorpe <jgg@nvidia.com> writes:
> Though it is more of a rational and a cookbook on how to combine
> existing technology pieces. (eg PASID, platform_msi, etc)
>
> The basic approach of SIOV's IMS is that there is no longer a generic
> interrupt indirection from numbers to addr/data pairs like
> IOAPIC/MSI/MSI-X owned by the common OS code.
>
> Instead the driver itself is responsible to set the addr/data pair
> into the device in a device specific way, deal with masking, etc.
>
> This lets the device use an implementation that is not limited by the
> harsh MSI-X semantics.
>
> In Linux we already have 'IMS' it is called platform_msi and a few
> embedded drivers already work like this. The idea here is to bring it
> to PCI.

platform_msi as it exists today is a crutch and in hindsight I should
have paid more attention back then and shot it down before it got
merged.

IMS can be somehow mapped to platform MSI but the proposed approach to
extend platform MSI with the extra bolts for IMS (valid for one
particular incarnation) is just going into the wrong direction.

We've been there and the main reason why hierarchical irq domains exist
is that we needed to make a clear cut between the involved hardware
pieces and their drivers. The pre hierarchy model was a maze of stuff
calling back and forth between layers with lots of duct tape added to
make it "work". This finally fell apart when Intel tried to support
I/O-APIC hotplug. The ARM people had similar issues with all the special
irq related SoC specific IP blocks which are placed between the CPU
level interrupt controller and the device.

The hierarchy strictly separates the per layer resource management and
each layer can work mostly independently of the actual available parent
layer.

Now looking at IMS. It's a subsystem inside a physical device. It has
slot management (where to place the Message) and mask/unmask. Resource
management at that level is what irq domains are for and mask/unmask is
what an irq chip handles.

So the right thing to do is to create shared infrastructure which is
utilized by the device drivers by providing a few bog standard data
structures and the handful of device specific domain and irq functions.

That keeps the functionality common, but avoids that we end up with

  - msi_desc becoming a dumping ground for random driver data

  - a zoo of platform callbacks
  
  - glued on driver specific resource management

and all the great hacks which it requires to work on hundreds of
different devices which all implement IMS differently.

I'm all for sharing code and making the life of driver writers simple
because that makes my life simple as well, but not by creating a layer
at the wrong level and then hacking it into submission until it finally
collapses.

Designing the infrastructure following the clear layering rules of
hierarchical domains so it works for IMS and also replaces the platform
MSI hack is the only sane way to go forward, not the other way round.

Thanks,

        tglx
Dey, Megha Aug. 7, 2020, 5:54 p.m. UTC | #27
Hi Thomas,

On 8/7/2020 9:47 AM, Thomas Gleixner wrote:
> Jason Gunthorpe <jgg@nvidia.com> writes:
>> Though it is more of a rational and a cookbook on how to combine
>> existing technology pieces. (eg PASID, platform_msi, etc)
>>
>> The basic approach of SIOV's IMS is that there is no longer a generic
>> interrupt indirection from numbers to addr/data pairs like
>> IOAPIC/MSI/MSI-X owned by the common OS code.
>>
>> Instead the driver itself is responsible to set the addr/data pair
>> into the device in a device specific way, deal with masking, etc.
>>
>> This lets the device use an implementation that is not limited by the
>> harsh MSI-X semantics.
>>
>> In Linux we already have 'IMS' it is called platform_msi and a few
>> embedded drivers already work like this. The idea here is to bring it
>> to PCI.
> platform_msi as it exists today is a crutch and in hindsight I should
> have payed more attention back then and shoot it down before it got
> merged.
>
> IMS can be somehow mapped to platform MSI but the proposed approach to
> extend platform MSI with the extra bolts for IMS (valid for one
> particular incarnation) is just going into the wrong direction.
>
> We've been there and the main reason why hierarchical irq domains exist
> is that we needed to make a clear cut between the involved hardware
> pieces and their drivers. The pre hierarchy model was a maze of stuff
> calling back and forth between layers with lots of duct tape added to
> make it "work". This finally fell apart when Intel tried to support
> I/O-APIC hotplug. The ARM people had similar issues with all the special
> irq related SoC specific IP blocks which are placed between the CPU
> level interrupt controller and the device.
>
> The hierarchy strictly seperates the per layer resource management and
> each layer can work mostly independent of the actual available parent
> layer.
>
> Now looking at IMS. It's a subsystem inside a physical device. It has
> slot management (where to place the Message) and mask/unmask. Resource
> management at that level is what irq domains are for and mask/unmask is
> what a irq chip handles.
>
> So the right thing to do is to create shared infrastructure which is
> utilized by the device drivers by providing a few bog standard data
> structures and the handful of device specific domain and irq functions.
>
> That keeps the functionality common, but avoids that we end up with
>
>    - msi_desc becoming a dump ground for random driver data
>
>    - a zoo of platform callbacks
>    
>    - glued on driver specific resource management
>
> and all the great hacks which it requires to work on hundreds of
> different devices which all implement IMS differently.
>
> I'm all for sharing code and making the life of driver writers simple
> because that makes my life simple as well, but not by creating a layer
> at the wrong level and then hacking it into submission until it finally
> collapses.
>
> Designing the infrastructure following the clear layering rules of
> hierarchical domains so it works for IMS and also replaces the platform
> MSI hack is the only sane way to go forward, not the other way round.
From what I've gathered, I need to:

1. Get rid of the mantra that "IMS" is an extension of platform-msi.
2. Make this new infra devoid of any platform-msi references.
3. Come up with a ground-up approach which adheres to the layering
   constraints of the IRQ subsystem.
4. Have common code (drivers/irqchip maybe??) where we put in all the
   generic IMS-specific bits for the IRQ chip and domain, which can be
   used by all device drivers belonging to this "IMS" class.
5. Have the device driver do the rest:
     create the chip/domain (one chip/domain per device?)
     provide device specific callbacks for masking, unmasking, write message

So from the hierarchical domain standpoint, we will have:
- For the DSA device: vector -> intel-IR -> IDXD
- For Jason's device: root domain -> domain A -> Jason's device's IRQ domain
- For any other Intel IMS device in the future which
     does not require interrupt remapping: vector -> new device IRQ domain
     requires interrupt remapping: vector -> intel-IR -> new device IRQ
     domain (i.e. create a new domain even though IDXD is already present?)

Please let me know if my understanding is correct.

What I still don't understand fully is: if all the IMS devices need the
same domain ops and chip callbacks, we will be creating various instances
of the same IRQ chip and domain, right? Is that ok?
Currently the creation of the IRQ domain happens at the IR level so that
we can reuse the same domain, but if it is advisable to have a per device
interrupt domain, I will shift this to the device driver.
>
> Thanks,
>
>          tglx
Jason Gunthorpe Aug. 7, 2020, 6:39 p.m. UTC | #28
On Fri, Aug 07, 2020 at 10:54:51AM -0700, Dey, Megha wrote:

> So from the hierarchical domain standpoint, we will have:
> - For DSA device: vector->intel-IR->IDXD
> - For Jason's device: root domain-> domain A-> Jason's device's IRQ domain
> - For any other intel IMS device in the future which
>     does not require interrupt remapping: vector->new device IRQ domain
>     requires interrupt remapping: vector->intel-IR->new device IRQ domain

I think you need a better classification than Jason's device or
Intel's device :)

Shouldn't the two cases be either you take the parent domain from the
IOMMU or you take the parent domain from the pci device?

What other choices could a PCI driver make?

Jason
Dey, Megha Aug. 7, 2020, 8:31 p.m. UTC | #29
On 8/7/2020 11:39 AM, Jason Gunthorpe wrote:
> On Fri, Aug 07, 2020 at 10:54:51AM -0700, Dey, Megha wrote:
>
>> So from the hierarchical domain standpoint, we will have:
>> - For DSA device: vector->intel-IR->IDXD
>> - For Jason's device: root domain-> domain A-> Jason's device's IRQ domain
>> - For any other intel IMS device in the future which
>>      does not require interrupt remapping: vector->new device IRQ domain
>>      requires interrupt remapping: vector->intel-IR->new device IRQ domain
> I think you need a better classification than Jason's device or
> Intel's device :)
hehe yeah, for sure, just wanted to get my point across :)
>
> Shouldn't the two cases be either you take the parent domain from the
> IOMMU or you take the parent domain from the pci device?

Hmm yeah this makes sense..

Although in the case of DSA, we find the iommu corresponding to the parent PCI device.

>
> What other choices could a PCI driver make?
Currently, based on the devices we have, I don't think there are any others.
>
> Jason
Thomas Gleixner Aug. 8, 2020, 7:47 p.m. UTC | #30
Megha,

"Dey, Megha" <megha.dey@intel.com> writes:
> On 8/7/2020 9:47 AM, Thomas Gleixner wrote:
>> I'm all for sharing code and making the life of driver writers simple
>> because that makes my life simple as well, but not by creating a layer
>> at the wrong level and then hacking it into submission until it finally
>> collapses.
>>
>> Designing the infrastructure following the clear layering rules of
>> hierarchical domains so it works for IMS and also replaces the platform
>> MSI hack is the only sane way to go forward, not the other way round.
>  From what I've gathered, I need to:
>
> 1. Get rid of the mantra that "IMS" is an extension of platform-msi.
> 2. Make this new infra devoid of any platform-msi references

See below.

> 3. Come up with a ground up approach which adheres to the layering 
> constraints of the IRQ subsystem

Yes. It's something which can be used by all devices which have:

   1) A device specific irq chip implementation including a msi write function
   2) Device specific resource management (slots in the IMS case)

The infrastructure you need is basically a wrapper around the core MSI
domain (similar to PCI, platform-MSI, etc.) which provides the specific
functionality to handle the above.

> 4. Have common code (drivers/irqchip maybe??) where we put in all the 
> generic ims-specific bits for the IRQ chip and domain
> which can be used by all device drivers belonging to this "IMS"class.

Yes, you can provide a common implementation for devices which share the
same irq chip and domain (slot management functionality)

> 5. Have the device driver do the rest:
>      create the chip/domain (one chip/domain per device?)
>      provide device specific callbacks for masking, unmasking, write
>  message

Correct, but you don't need any magic new data structures for that, the
existing msi_domain_info/msi_domain_ops and related structures are
either sufficient or can be extended when necessary.

So for the IDXD case you need:

  1) An irq chip with mask/unmask callbacks and a write msg function
  2) A slot allocation or association function and their 'free'
     counterpart (irq_domain_ops)

The function and struct pointers go into the appropriate
msi_info/msi_ops structures along with the correct flags to set up the
whole thing and then the infrastructure creates your domain, fills in
the shared functions and sets the whole thing up.

That's all a device driver needs to provide, i.e. stick the device
specific functionality into the right data structures and let the common
infrastructure deal with it. The rest just works and the device specific
functions are invoked from the right places when required.
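
In terms of the existing structures that boils down to something like
this (a sketch only: the idxd_ims_* callbacks, fwnode and parent are
placeholders for the device specific pieces, not the actual driver):

  static struct irq_chip idxd_ims_irq_chip = {
          .name                   = "idxd-ims",
          .irq_mask               = idxd_ims_mask_irq,
          .irq_unmask             = idxd_ims_unmask_irq,
          .irq_write_msi_msg      = idxd_ims_write_msg,
  };

  static struct msi_domain_ops idxd_ims_domain_ops = {
          .msi_prepare    = idxd_ims_prepare,     /* IMS slot allocation */
          .set_desc       = idxd_ims_set_desc,
  };

  static struct msi_domain_info idxd_ims_domain_info = {
          .flags  = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
          .ops    = &idxd_ims_domain_ops,
          .chip   = &idxd_ims_irq_chip,
  };

  /* the infrastructure then creates the per device domain below the IR
   * unit (or the vector domain without remapping), e.g. via:
   */
  domain = msi_create_irq_domain(fwnode, &idxd_ims_domain_info, parent);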

> So from the hierarchical domain standpoint, we will have:
> - For DSA device: vector->intel-IR->IDXD
> - For Jason's device: root domain-> domain A-> Jason's device's IRQ domain
> - For any other intel IMS device in the future which
>      does not require interrupt remapping: vector->new device IRQ domain
>      requires interrupt remapping: vector->intel-IR->new device IRQ 
> domain (i.e. create a new domain even though IDXD is already present?)

What's special about IDXD? It's just one specific implementation of IMS
and any other device implementing IMS is completely independent and as
documented in the specification the IMS slot management and therefore
the mask/unmask functionality can and will be completely different. IDXD
has a storage array with slots, Jason's hardware puts the IMS slot into
the queue storage.

It does not matter whether a device comes from Intel or any other vendor,
nor does it matter whether the device works with direct vector
delivery or interrupt remapping.

IDXD is not any different from any other IMS capable device when you
look at it from the interrupt hierarchy. It's either:

     vector -> IR -> device
or
     vector -> device

The only point where this is differentiated is when the irq domain is
created. Anything else just falls into place.

To answer Jason's question: No, the parent is never the PCI/MSI irq
domain because that sits at the same level as that device
domain. Remember the scheme:

   vector --- DMAR-MSI
	  |
	  |-- ....
	  |
	  |-- IR-0 --- IO/APIC-0
	  |        | 
	  |        |-- IO/APIC-1
	  |        |
	  |        |-- PCI/MSI-0
	  |        |
	  |        |-- HPET/MSI-0
	  |        |
	  |        |-- DEV-A/MSI-0
	  |        |-- DEV-A/MSI-1
	  |        |-- DEV-B/MSI-2
	  |
	  |-- IR-1 --- PCI/MSI-1
	  |        |
	  |        |-- DEV-C/MSI-3

The PCI/MSI domain(s) are dealing solely with PCI standard compliant
MSI/MSI-X. IMS or similar (platform-MSI being one variant) sit at the
same level as the PCI/MSI domains.

Why? It's how the hardware operates.

The PCI/MSI "irq chip" is configured by the PCI/MSI domain level and it
sends its message to the interrupt parent in the hierarchy, i.e. either
to the Interrupt Remap unit or to the configured vector of the target
CPU.

IMS does not send it to some magic PCI layer first at least not at the
conceptual level. The fact that the message is transported by PCIe does
not change that at all. PCIe in that case is solely the transport, but
the "irq chip" at the PCI/MSI level of the device is not involved at
all. If it were that would be a different story.

So now you might ask why we have a single PCI/MSI domain per IR unit and
why I want separate IMS domains.

The answer is in the hardware again. PCI/MSI is uniform across devices
so the irq chip and all of the domain functionality can be shared. But
then we have two PCI/MSI domains in the above example because again the
hardware has one connected to IR unit 0 and the other to IR unit 1.
IR 0 and IR 1 manage different resources (remap tables) so PCI/MSI-0
depends on IR-0 and PCI/MSI-1 on IR-1 which is reflected in the
parent/child relationship of the domains.

There is another reason why we can spawn a single PCI/MSI domain per
root complex / IR unit. The PCI/MSI domains are not doing any resource
management at all. The resulting message is created from the allocated
vector (direct CPU delivery) or from the allocated Interrupt remapping
slot information. The domain just deals with the logic required to
handle PCI/MSI(X) and the necessary resources are provided by the parent
interrupt layers.

IMS is different. It needs device specific resource management to
allocate an IMS slot which is clearly part of the "irq chip" management
layer, aka. irq domain. If the IMS slot management would happen in a
global or per IR unit table and as a consequence the management, layout,
mask/unmask operations would be uniform then an IMS domain per system or
IR unit would be the right choice, but that's not how the hardware is
specified and implemented.

Now coming back to platform MSI. The way it looks is:

 CPU --- (IR) ---- PLATFORM-MSI  --- PLATFORM-DEVICE-MSI-0
                                 |-- PLATFORM-DEVICE-MSI-1
                                 |...

PLATFORM-MSI is a common resource management which also provides a
shared interrupt chip which operates at the PLATFORM-MSI level with one
exception:

  The irq_msi_write_msg() callback has an indirection so the actual
  devices can provide their device specific msi_write_msg() function.

That's a borderline abuse of the hierarchy, but it makes sense to some
extent as the actual PLATFORM-MSI domain is a truly shared resource and
the only device specific functionality required is the message
write. But that message write is not something which has its own
resource management; it's just a non-uniform storage accessor. IOW, the
underlying PLATFORM-MSI domain does all resource management including
message creation, and the quirk merely allows the message to be written
in the device specific way. Not that I love it, but ...
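
Condensed, the indirection in question is just this; it mirrors the
platform_msi_write_msg() hunk in the patch at the bottom of this page:

/* Shared platform-MSI chip callback; the device hook hides behind priv_data->ops. */
void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
	struct msi_desc *desc = irq_data_get_msi_desc(data);
	struct platform_msi_priv_data *priv_data;

	priv_data = desc->platform.msi_priv_data;

	/* The only device specific piece: how the message gets written. */
	priv_data->ops->write_msg(desc, msg);
}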

That is the main difference between platform MSI and IMS. IMS is
completely non-uniform and the devices do not share any common resource
or chip functionality. Each device has its own message store management,
slot allocation/assignment, and device specific interrupt chip
functionality which goes way beyond the nasty write msg quirk.

> What I still don't understand fully is what if all the IMS devices
> need the same domain ops and chip callbacks, we will be creating
> various instances of the same IRQ chip and domain right? Is that ok?

Why would it not be ok? Are you really worried about the few hundred
bytes of memory required for this?

Sharing an instance only makes sense if the instance handles a shared or
uniform resource space, which is clearly not the case with IMS.

We create several PCI/MSI domains and several IO/APIC domains on larger
systems. They all share the code, but they are dealing with separate
resources so they have separate storage.

> Currently the creation of the IRQ domain happens at the IR level so that 
> we can reuse the same domain but if it advisable to have a per device 
> interrupt domain, I will shift this to the device driver.

Again. Look at the layering. What you created now is a pseudo shared
domain which needs

   1) An indirection layer for providing device specific functions

   2) An extra allocation layer in the device specific driver to assign
      IMS slots completely outside of the domain allocation mechanism.

   In other words you try to make things which are neither uniform nor
   share a resource space look the same way. That's the "all I have is a
   hammer so everything is a nail" approach. That never worked out well.

With a per device domain/chip approach you get one consistent domain
per device which provides

   1) The device specific resource management (i.e. slot allocation
      becomes part of the irq domain operations)

   2) The device specific irq chip functions at the correct point in the
      layering without the horrid indirections

   3) Consolidated data storage at the device level where the actual
      data is managed.

   This is of course sharing as much code as possible with the MSI core
   implementation.

   As a side effect any extension of this be it on the domain or the irq
   chip side is just a matter of adding the functionality to that
   particular incarnation and not by having yet another indirection
   logic at the wrong place.

The price you pay is a bit of memory but you get a clean layering and
separation of functionality as a reward. The amount of code in the
actual IMS device driver is not going to be much more than with the
approach you have now.

The infrastructure itself is not more than a thin wrapper around the
existing msi domain infrastructure and might even share code with
platform-msi.

Thanks,

        tglx
Thomas Gleixner Aug. 10, 2020, 9:46 p.m. UTC | #31
Thomas Gleixner <tglx@linutronix.de> writes:
> The infrastructure itself is not more than a thin wrapper around the
> existing msi domain infrastructure and might even share code with
> platform-msi.

And the annoying fact that you need XEN support which opens another can
of worms...
Thomas Gleixner Aug. 11, 2020, 9:53 a.m. UTC | #32
Thomas Gleixner <tglx@linutronix.de> writes:

CC+: XEN folks

> Thomas Gleixner <tglx@linutronix.de> writes:
>> The infrastructure itself is not more than a thin wrapper around the
>> existing msi domain infrastructure and might even share code with
>> platform-msi.
>
> And the annoying fact that you need XEN support which opens another can
> of worms...

which needs some real cleanup first.

x86 still does not associate the irq domain with devices at device
discovery time, i.e. the device::msi_domain pointer is never populated.

So to support this new fangled device MSI stuff we'd need yet more
x86/xen specific arch_*msi_irqs() indirection and hackery, which is not
going to happen.

The right thing to do is to convert XEN MSI support over to proper irq
domains. This allows device::msi_domain to be populated, which makes a lot of
things simpler and also more consistent.

Thanks,

        tglx
Dey, Megha Aug. 11, 2020, 6:39 p.m. UTC | #33
Hi Thomas,

On 8/8/2020 12:47 PM, Thomas Gleixner wrote:
> Megha,
>
> "Dey, Megha" <megha.dey@intel.com> writes:
>> On 8/7/2020 9:47 AM, Thomas Gleixner wrote:
>>> I'm all for sharing code and making the life of driver writers simple
>>> because that makes my life simple as well, but not by creating a layer
>>> at the wrong level and then hacking it into submission until it finally
>>> collapses.
>>>
>>> Designing the infrastructure following the clear layering rules of
>>> hierarchical domains so it works for IMS and also replaces the platform
>>> MSI hack is the only sane way to go forward, not the other way round.
>>   From what I've gathered, I need to:
>>
>> 1. Get rid of the mantra that "IMS" is an extension of platform-msi.
>> 2. Make this new infra devoid of any platform-msi references
> See below.
ok..
>
>> 3. Come up with a ground up approach which adheres to the layering
>> constraints of the IRQ subsystem
> Yes. It's something which can be used by all devices which have:
>
>     1) A device specific irq chip implementation including a msi write function
>     2) Device specific resource management (slots in the IMS case)
>
> The infrastructure you need is basically a wrapper around the core MSI
> domain (similar to PCI, platform-MSI etc,) which provides the specific
> functionality to handle the above.

ok, I will create a per device irq chip which will directly have the
device specific callbacks instead of another layer of redirection.

This way i will get rid of the 'platform_msi_ops' data structure.

I am not sure what you mean by device specific resource management, are 
you referring to dev_msi_alloc/free_irqs?

>> 4. Have common code (drivers/irqchip maybe??) where we put in all the
>> generic ims-specific bits for the IRQ chip and domain
>> which can be used by all device drivers belonging to this "IMS"class.
> Yes, you can provide a common implementation for devices which share the
> same irq chip and domain (slot management functionality)
yeah, I think most of the msi_domain_ops (msi_prepare, set_desc etc) are
common and can be moved into a common file.
>
>> 5. Have the device driver do the rest:
>>       create the chip/domain (one chip/domain per device?)
>>       provide device specific callbacks for masking, unmasking, write
>>   message
> Correct, but you don't need any magic new data structures for that, the
> existing msi_domain_info/msi_domain_ops and related structures are
> either sufficient or can be extended when necessary.
>
> So for the IDXD case you need:
>
>    1) An irq chip with mask/unmask callbacks and a write msg function
>    2) A slot allocation or association function and their 'free'
>       counterpart (irq_domain_ops)

This is one part I didn't understand.

Currently my dev_msi_alloc_irqs is simply a wrapper over 
platform_msi_domain_alloc_irqs which again mostly calls 
msi_domain_alloc_irqs.

When you say add a .alloc, .free, does this mean we should add a device 
specific alloc/free and not use the default 
msi_domain_alloc/msi_domain_free?

I don't see anything device specific to be done for IDXD at least, can
you please let me know?

>
> The function and struct pointers go into the appropriate
> msi_info/msi_ops structures along with the correct flags to set up the
> whole thing and then the infrastructure creates your domain, fills in
> the shared functions and sets the whole thing up.
>
> That's all what a device driver needs to provide, i.e. stick the device
> specific functionality into right data structures and let the common
> infrastructure deal with it. The rest just works and the device specific
> functions are invoked from the right places when required.
yeah. makes sense..
>
>> So from the hierarchical domain standpoint, we will have:
>> - For DSA device: vector->intel-IR->IDXD
>> - For Jason's device: root domain-> domain A-> Jason's device's IRQ domain
>> - For any other intel IMS device in the future which
>>       does not require interrupt remapping: vector->new device IRQ domain
>>       requires interrupt remapping: vector->intel-IR->new device IRQ
>> domain (i.e. create a new domain even though IDXD is already present?)
> What's special about IDXD? It's just one specific implementation of IMS
> and any other device implementing IMS is completely independent and as
> documented in the specification the IMS slot management and therefore
> the mask/unmask functionality can and will be completely different. IDXD
> has a storage array with slots, Jason's hardware puts the IMS slot into
> the queue storage.
>
> It does not matter whether a device comes from Intel or any other vendor,
> it does neither matter whether the device works with direct vector
> delivery or interrupt remapping.
>
> IDXD is not any different from any other IMS capable device when you
> look at it from the interrupt hierarchy. It's either:
>
>       vector -> IR -> device
> or
>       vector -> device
>
> The only point where this is differentiated is when the irq domain is
> created. Anything else just falls into place.
yeah, so I will create the IRQ domain in the IDXD driver with INTEL-IR 
as the parent, instead of creating a common per IR unit domain
>
> To answer Jason's question: No, the parent is never the PCI/MSI irq
> domain because that sits at the same level as that device
> domain. Remember the scheme:
>
>     vector --- DMAR-MSI
> 	  |
> 	  |-- ....
> 	  |
> 	  |-- IR-0 --- IO/APIC-0
> 	  |        |
> 	  |        |-- IO/APIC-1
> 	  |        |
> 	  |        |-- PCI/MSI-0
> 	  |        |
> 	  |        |-- HPET/MSI-0
> 	  |        |
> 	  |        |-- DEV-A/MSI-0
> 	  |        |-- DEV-A/MSI-1
> 	  |        |-- DEV-B/MSI-2
> 	  |
> 	  |-- IR-1 --- PCI/MSI-1
> 	  |        |
> 	  |        |-- DEV-C/MSI-3
>
> The PCI/MSI domain(s) are dealing solely with PCI standard compliant
> MSI/MSI-X. IMS or similar (platform-MSI being one variant) sit at the
> same level as the PCI/MSI domains.
>
> Why? It's how the hardware operates.
>
> The PCI/MSI "irq chip" is configured by the PCI/MSI domain level and it
> sends its message to the interrupt parent in the hierarchy, i.e. either
> to the Interrupt Remap unit or to the configured vector of the target
> CPU.
>
> IMS does not send it to some magic PCI layer first at least not at the
> conceptual level. The fact that the message is transported by PCIe does
> not change that at all. PCIe in that case is solely the transport, but
> the "irq chip" at the PCI/MSI level of the device is not involved at
> all. If it were that would be a different story.
>
> So now you might ask why we have a single PCI/MSI domain per IR unit and
> why I want seperate IMS domains.
>
> The answer is in the hardware again. PCI/MSI is uniform accross devices
> so the irq chip and all of the domain functionality can be shared. But
> then we have two PCI/MSI domains in the above example because again the
> hardware has one connected to IR unit 0 and the other to IR unit 1.
> IR 0 and IR 1 manage different resources (remap tables) so PCI/MSI-0
> depends on IR-0 and PCI/MSI-1 on IR-1 which is reflected in the
> parent/child relation ship of the domains.
>
> There is another reason why we can spawn a single PCI/MSI domain per
> root complex / IR unit. The PCI/MSI domains are not doing any resource
> management at all. The resulting message is created from the allocated
> vector (direct CPU delivery) or from the allocated Interrupt remapping
> slot information. The domain just deals with the logic required to
> handle PCI/MSI(X) and the necessary resources are provided by the parent
> interrupt layers.
>
> IMS is different. It needs device specific resource management to
> allocate an IMS slot which is clearly part of the "irq chip" management
> layer, aka. irq domain. If the IMS slot management would happen in a
> global or per IR unit table and as a consequence the management, layout,
> mask/unmask operations would be uniform then an IMS domain per system or
> IR unit would be the right choice, but that's not how the hardware is
> specified and implemented.
>
> Now coming back to platform MSI. The way it looks is:
>
>   CPU --- (IR) ---- PLATFORM-MSI  --- PLATFORM-DEVICE-MSI-0
>                                   |-- PLATFORM-DEVICE-MSI-1
>                                   |...
>
> PLATFORM-MSI is a common resource management which also provides a
> shared interrupt chip which operates at the PLATFORM-MSI level with one
> exception:
>
>    The irq_msi_write_msg() callback has an indirection so the actual
>    devices can provide their device specific msi_write_msg() function.
>
> That's a borderline abuse of the hierarchy, but it makes sense to some
> extent as the actual PLATFORM-MSI domain is a truly shared resource and
> the only device specific functionality required is the message
> write. But that message write is not something which has it's own
> resource management, it's just a non uniform storage accessor. IOW, the
> underlying PLATFORM-MSI domain does all resource management including
> message creation and the quirk allows to write the message in the device
> specific way. Not that I love it, but ...
>
> That is the main difference between platform MSI and IMS. IMS is
> completely non uniform and the devices do not share any common resource
> or chip functionality. Each device has its own message store management,
> slot allocation/assignment and a device specifc interrupt chip
> functionality which goes way beyond the nasty write msg quirk.
Thanks for giving such a detailed explanation! really helps :)
>
>> What I still don't understand fully is what if all the IMS devices
>> need the same domain ops and chip callbacks, we will be creating
>> various instances of the same IRQ chip and domain right? Is that ok?
> Why would it be not ok? Are you really worried about a few hundred bytes
> of memory required for this?
>
> Sharing an instance only makes sense if the instance handles a shared or
> uniform resource space, which is clearly not the case with IMS.
>
> We create several PCI/MSI domains and several IO/APIC domains on larger
> systems. They all share the code, but they are dealing with seperate
> resources so they have seperate storage.
ok, got it ..
>
>> Currently the creation of the IRQ domain happens at the IR level so that
>> we can reuse the same domain but if it advisable to have a per device
>> interrupt domain, I will shift this to the device driver.
> Again. Look at the layering. What you created now is a pseudo shared
> domain which needs
>
>     1) An indirection layer for providing device specific functions
>
>     2) An extra allocation layer in the device specific driver to assign
>        IMS slots completely outside of the domain allocation mechanism.
hmmm, again I am not sure which extra allocation layer you are
referring to..
>
>     In other words you try to make things which are neither uniform nor
>     share a resource space look the same way. That's the "all I have is a
>     hammer so everything is a nail" approach. That never worked out well.
>
> With a per device domain/chip approach you get one consistent domain
> per device which provides
>
>     1) The device specific resource management (i.e. slot allocation
>        becomes part of the irq domain operations)
>
>     2) The device specific irq chip functions at the correct point in the
>        layering without the horrid indirections
>
>     3) Consolidated data storage at the device level where the actual
>        data is managed.
>
>     This is of course sharing as much code as possible with the MSI core
>     implementation.
>
>     As a side effect any extension of this be it on the domain or the irq
>     chip side is just a matter of adding the functionality to that
>     particular incarnation and not by having yet another indirection
>     logic at the wrong place.
>
> The price you pay is a bit of memory but you get a clean layering and
> seperation of functionality as a reward. The amount of code in the
> actual IMS device driver is not going to be much more than with the
> approach you have now.
>
> The infrastructure itself is not more than a thin wrapper around the
> existing msi domain infrastructure and might even share code with
> platform-msi.

From your explanation:

In the device driver:

static const struct irq_domain_ops idxd_irq_domain_ops = {
	.alloc	= idxd_domain_alloc, // not sure what this should do
	.free	= idxd_domain_free,
};

struct irq_chip idxd_irq_chip = {
	.name			= "idxd",
	.irq_mask		= idxd_irq_mask,
	.irq_unmask		= idxd_irq_unmask,
	.irq_write_msi_msg	= idxd_irq_write_msg,
	.irq_ack		= irq_chip_ack_parent,
	.irq_retrigger		= irq_chip_retrigger_hierarchy,
	.flags			= IRQCHIP_SKIP_SET_WAKE,
};

struct msi_domain_info idxd_domain_info = {
	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
	.ops		= &dev_msi_domain_ops,	// can be common
	.chip		= &idxd_irq_chip,	// per device
	.handler	= handle_edge_irq,
	.handler_name	= "edge",
};

dev->msi_domain = dev_msi_create_irq_domain(iommu->ir_domain,
					    &idxd_domain_info,
					    &idxd_irq_domain_ops);

msi_domain_alloc_irqs(dev->msi_domain, dev, nvec);

Common code:

struct irq_domain *dev_msi_create_irq_domain(struct irq_domain *parent,
					     struct msi_domain_info *dev_msi_domain_info,
					     const struct irq_domain_ops *dev_msi_irq_domain_ops)
{
	struct irq_domain *domain;

	.......

	domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
					     NULL, dev_msi_irq_domain_ops,
					     dev_msi_domain_info);

	.......

	return domain;
}

static struct msi_domain_ops dev_msi_domain_ops = {
	.set_desc	= dev_msi_set_desc,
	.msi_prepare	= dev_msi_prepare,
	.get_hwirq	= dev_msi_get_hwirq,
}; // can re-use platform-msi data structures


Except for the alloc/free irq_domain_ops, does this look fine to you?


-Megha

>
> Thanks,
>
>          tglx
Dey, Megha Aug. 11, 2020, 6:46 p.m. UTC | #34
Hi Thomas,

On 8/11/2020 2:53 AM, Thomas Gleixner wrote:
> Thomas Gleixner <tglx@linutronix.de> writes:
>
> CC+: XEN folks
>
>> Thomas Gleixner <tglx@linutronix.de> writes:
>>> The infrastructure itself is not more than a thin wrapper around the
>>> existing msi domain infrastructure and might even share code with
>>> platform-msi.
>> And the annoying fact that you need XEN support which opens another can
>> of worms...

hmm I am not sure why we need Xen support... are you referring to idxd 
using xen?

> which needs some real cleanup first.
>
> x86 still does not associate the irq domain to devices at device
> discovery time, i.e. the device::msi_domain pointer is never populated.
>
> So to support this new fangled device MSI stuff we'd need yet more
> x86/xen specific arch_*msi_irqs() indirection and hackery, which is not
> going to happen.
>
> The right thing to do is to convert XEN MSI support over to proper irq
> domains. This allows to populate device::msi_domain which makes a lot of
> things simpler and also more consistent.

do you think this cleanup is meant to be a precursor to my patches? I
could look into it but I am not familiar with the background of Xen and
this stuff. Can you please provide further guidance on where to look?

> Thanks,
>
>          tglx
Thomas Gleixner Aug. 11, 2020, 9:25 p.m. UTC | #35
"Dey, Megha" <megha.dey@intel.com> writes:
> On 8/11/2020 2:53 AM, Thomas Gleixner wrote:
>>> And the annoying fact that you need XEN support which opens another can
>>> of worms...
>
> hmm I am not sure why we need Xen support... are you referring to idxd 
> using xen?

What about using IDXD when you are running on XEN? I might be missing
something and IDXD/IMS is hypervisor only, but that still does not solve
this problem on bare metal:

>> x86 still does not associate the irq domain to devices at device
>> discovery time, i.e. the device::msi_domain pointer is never
>> populated.

We can't do that right now due to the way X86 PCI/MSI allocation
works, and being able to do so would make things consistent and way
simpler, even for your stuff.

>> The right thing to do is to convert XEN MSI support over to proper irq
>> domains. This allows to populate device::msi_domain which makes a lot of
>> things simpler and also more consistent.
>
> do you think this cleanup is to be a precursor to my patches? I could 
> look into it but I am not familiar with the background of Xen
>
> and this stuff. Can you please provide further guidance on where to
> look

As I said:

>> So to support this new fangled device MSI stuff we'd need yet more
>> x86/xen specific arch_*msi_irqs() indirection and hackery, which is not
>> going to happen.

  git grep arch_.*msi_irq arch/x86

This indirection prevents storing the irq_domain pointer in the device
at probe/detection time. Native code already uses irq domains for
PCI/MSI but we can't exploit the full potential because then
pci_msi_setup_msi_irqs() would never end up in arch_setup_msi_irqs()
which breaks XEN.

I was reminded of that nastiness when I was looking at sensible ways to
integrate this device MSI maze properly.

From a conceptual POV this stuff, which is not restricted to IDXD at all,
looks like this:

           ]-------------------------------------------|
PCI BUS -- | PCI device                                |
           ]-------------------|                       |
           | Physical function |                       |
           ]-------------------|                       |
           ]-------------------|----------|            |
           | Control block for subdevices |            |
           ]------------------------------|            |
           |            | <- "Subdevice BUS"           |
           |            |                              |
           |            |-- Subdevice 0                | 
           |            |-- Subdevice 1                | 
           |            |-- ...                        | 
           |            |-- Subdevice N                | 
           ]-------------------------------------------|

It does not matter whether this is IDXD with its magic devices or a
network card with a gazillion queues. Conceptually we need to look at
them as individual subdevices.

And obviously the above picture gives you the topology. The physical
function device belongs to PCI in all aspects including the MSI
interrupt control. The control block is part of the PCI device as well
and it can even have regular PCI/MSI interrupts for its own
purposes. There might be devices where the Physical function device does
not exist at all and the only true PCI functionality is the control
block to manage subdevices. That does not matter and does not change the
concept.

Now the subdevices belong topology wise NOT to the PCI part. PCI is just
the transport they utilize. And their irq domain is distinct from the
PCI/MSI domain for reasons I explained before.

So looking at it from a Linux perspective:

  pci-bus -> PCI device (managed by PCI/MSI domain)
               - PF device
               - CB device (hosts DEVMSI domain)
                    | "Subdevice bus"
                    | - subdevice
                    | - subdevice
                    | - subdevice

Now you would assume that figuring out which irq domain the DEVMSI
domain serving the subdevices on the subdevice bus should take as its
parent is pretty trivial when looking at the topology, right?

The CB device's parent is the PCI device and we know that PCI device
MSI is handled by the PCI/MSI domain, which is either system wide or
per IR unit.

So getting the relevant PCI/MSI irq domain is as simple as doing:

   pcimsi_domain = pcidevice->device->msi_domain;

and then because we know that this is a hierarchy the parent domain of
pcimsi_domain is the one which is the parent of our DEVMSI domain, i.e.:

   parent = pcimsi_domain->parent;

Obvious, right?

What's not so obvious is that pcidevice->device->msi_domain is not
populated on x86 and trying to get the parent from there is a NULL
pointer dereference which does not work well.

So you surely can hack up some workaround for this, but that's just
proliferating crap. We want this to be consistent and there is
absolutely no reason why that network card with the MSI storage in the
queue data should not work on any other architecture.

We do the correct association already for IOMMU and whatever topological
stuff is attached to (PCI) devices at probe/detection time, so making it
consistent for irq domains is just a logical consequence and a matter of
consistency.

Back in the days when x86 was converted to hierarchical irq domains in
order to support I/O APIC hotplug this workaround was accepted to make
progress and it was meant as a transitional step. Of course after the
goal was achieved nobody @Intel cared anymore and so far this did not
cause big problems. But now it does and we really want to make this
consistent first.

And no we are not making an exception for IDXD either just because
that's Intel only. Intel is not special and not exempt from cleaning
stuff up before adding new features, especially not when the stuff to
clean up is a leftover from Intel itself. IOW, we are not adding more
crap on top of crap which should not exist anymore.

It's not rocket science to fix this. All it needs is to let XEN create
irq domains and populate them during init.

On device detection/probe the proper domain needs to be determined,
which is trivial, and then stored in device->msi_domain. That makes
arch_.*_msi_irq() go away and makes a lot of code just simpler.
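
A hand-wavy sketch of that direction, not a working patch; the
x86_pci_msi_domain_for() lookup helper is hypothetical, only
dev_set_msi_domain() exists today, and the real work is teaching XEN to
provide such a domain instead of the arch_*msi_irqs() hooks:

#include <linux/irqdomain.h>
#include <linux/pci.h>

static void x86_pci_assign_msi_domain(struct pci_dev *pdev)
{
	struct irq_domain *d;

	/* Native: the per IR unit or global PCI/MSI domain. XEN: its own domain. */
	d = x86_pci_msi_domain_for(pdev);	/* hypothetical lookup */
	dev_set_msi_domain(&pdev->dev, d);
}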

Thanks,

        tglx
Thomas Gleixner Aug. 11, 2020, 10:39 p.m. UTC | #36
Megha.

"Dey, Megha" <megha.dey@intel.com> writes:
> On 8/8/2020 12:47 PM, Thomas Gleixner wrote:
>>> 3. Come up with a ground up approach which adheres to the layering
>>> constraints of the IRQ subsystem
>> Yes. It's something which can be used by all devices which have:
>>
>>     1) A device specific irq chip implementation including a msi write function
>>     2) Device specific resource management (slots in the IMS case)
>>
>> The infrastructure you need is basically a wrapper around the core MSI
>> domain (similar to PCI, platform-MSI etc,) which provides the specific
>> functionality to handle the above.
>
> ok, i will create a per device irq chip which will directly have the 
> device specific callbacks instead of another layer of redirection.
>
> This way i will get rid of the 'platform_msi_ops' data structure.
>
> I am not sure what you mean by device specific resource management, are 
> you referring to dev_msi_alloc/free_irqs?

I think I gave you a hint:

>>     2) Device specific resource management (slots in the IMS case)

The IMS storage is an array with slots in the IDXD case and these slots
are assigned at interrupt allocation time, right? In other cases where
the IMS storage is in some other place, e.g. queue memory, then this
still needs to be associated with the interrupt at allocation time.

But of course because you create some disconnected irqdomain you have to
do that assignment separately on the side and then stick this
information into msi_desc after the fact.

And as this is device specific every device driver which utilizes IMS
has to do that which is bonkers at best.

>>    2) A slot allocation or association function and their 'free'
>>       counterpart (irq_domain_ops)
>
> This is one part I didn't understand.
>
> Currently my dev_msi_alloc_irqs is simply a wrapper over 
> platform_msi_domain_alloc_irqs which again mostly calls 
> msi_domain_alloc_irqs.
>
> When you say add a .alloc, .free, does this mean we should add a device 
> specific alloc/free and not use the default 
> msi_domain_alloc/msi_domain_free?
>
> I don't see anything device specific to be done for IDXD atleast, can 
> you please let me know?

Each and every time I mentioned this, I explicitly mentioned "slot
allocation (array) or slot association (IMS store is not in an
array)".

But you keep asking over and over what's device specific and where the
resource management is?

The storage slot array is a resource which needs to be managed and it is
device specific by specification. And it's part of the interrupt
allocation obviously because without a place to store the MSI message
the whole thing would not work. This slot does not come out of thin air,
right?

https://github.com/intel/idxd-driver/commit/fb9a2f4e36525a1f18d9e654d472aa87a9adcb30

int vidxd_setup_ims_entries(struct vdcm_idxd *vidxd)
{
	struct idxd_device *idxd = vidxd->idxd;
	struct ims_irq_entry *irq_entry;
	struct mdev_device *mdev = vidxd->vdev.mdev;
	struct device *dev = mdev_dev(mdev);
	struct msi_desc *desc;
	int err, i = 0;
	int index;

	/*
	 * MSIX vec 0 is emulated by the vdcm and does not take up an IMS. The total MSIX vecs used
	 * by the mdev will be total IMS + 1. vec 0 is used for misc interrupts such as command
	 * completion, error notification, PMU, etc. The other vectors are used for descriptor
	 * completion. Thus only the number of IMS vectors need to be allocated, which is
	 * VIDXD_MAX_MSIX_VECS - 1.
	 */
	err = dev_msi_domain_alloc_irqs(dev, VIDXD_MAX_MSIX_VECS - 1, &idxd_ims_ops);
	if (err < 0) {
		dev_dbg(dev, "Enabling IMS entry! %d\n", err);
		return err;
	}

	i = 0;
	for_each_msi_entry(desc, dev) {
		index = idxd_alloc_ims_index(idxd);
		if (index < 0) {
			err = index;
			break;
		}
		vidxd->ims_index[i] = index;

		irq_entry = &vidxd->irq_entries[i];
		irq_entry->vidxd = vidxd;
		irq_entry->int_src = i;
		irq_entry->irq = desc->irq;
		i++;
	}

	if (err)
		vidxd_free_ims_entries(vidxd);

	return 0;
}

idxd_alloc_ims_index() is an allocation, right? And the above, aside
from having 3 redundant levels of storage for exactly the same
information, is just a violation of all layering concepts at once.

I just wish I had never seen that code.

>> Again. Look at the layering. What you created now is a pseudo shared
>> domain which needs
>>
>>     1) An indirection layer for providing device specific functions
>>
>>     2) An extra allocation layer in the device specific driver to assign
>>        IMS slots completely outside of the domain allocation mechanism.
> hmmm, again I am not sure of which extra allocation layer you are 
> referring to..

See above.

>> The infrastructure itself is not more than a thin wrapper around the
>> existing msi domain infrastructure and might even share code with
>> platform-msi.
>
>  From your explanation:
>
> In the device driver:
>
> static const struct irq_domain_ops idxd_irq_domain_ops = {
>
> .alloc= idxd_domain_alloc, //not sure what this should do

You might know by now. Also it's not necessarily the .alloc callback
which needs to be implemented. As I said we can add ops if necessary and
if it makes sense. This needs some thought to provide proper layering
and to share as much code as possible.

> except the alloc/free irq_domain_ops, does this look fine to you?

It's at least heading in the right direction.

But before we talk about the details at this level, the
device::msi_domain pointer issue wants to be resolved. It's part of the
solution to share code at various levels and to make utilization of this
technology as simple as possible for driver writers.

We need to think about infrastructure which can be used by various types
of IMS devices, e.g. those with storage arrays and those with storage in
random places, like the network card Jason was talking about. And to get
there we need to do the house cleaning first.

Also if you do a proper infrastructure then you need exactly ONE
implementation of an irqdomain and an irqchip for devices which have a
IMS slot storage array. Every driver for a device which has this kind of
storage can reuse that even with different array sizes.

If done right then your IDXD driver needs:

   idxd->domain = create_ims_array_domain(...., basepointer, max_slots);

in the init function of the control block. create_ims_array_domain() is
not part of IDXD, it's a common irq domain/irq chip implementation which
deals with IMS slot storage arrays of arbitrary size. 

And then when creating a subdevice you do:

    subdevice->msi_domain = idxd->domain;

and to allocate the interrupts you just do:

    device_msi_alloc_irqs(subdevice, nrirqs);

and device_msi_alloc_irqs() is shared infrastructure which has nothing
to do with idxd or the ims array domain.
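
Rough sketch of that shared factory; create_ims_array_domain(), the
ims_array_* callbacks and ims_array_data are the hypothetical names used
above, not an existing kernel API, while the msi_domain_info/irq_chip
plumbing and msi_create_irq_domain() are existing infrastructure:

#include <linux/bitmap.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/slab.h>

struct ims_array_data {			/* per device slot bookkeeping */
	void __iomem	*slots;
	unsigned int	nr_slots;
	unsigned long	*in_use;
};

/* Shared chip callbacks: mask/unmask/write operate on one slot of the array. */
void ims_array_mask_irq(struct irq_data *data);
void ims_array_unmask_irq(struct irq_data *data);
void ims_array_write_msi_msg(struct irq_data *data, struct msi_msg *msg);

static struct irq_chip ims_array_irq_chip = {
	.name			= "IMS-ARRAY",
	.irq_mask		= ims_array_mask_irq,
	.irq_unmask		= ims_array_unmask_irq,
	.irq_write_msi_msg	= ims_array_write_msi_msg,
	.irq_ack		= irq_chip_ack_parent,
	.irq_retrigger		= irq_chip_retrigger_hierarchy,
	.flags			= IRQCHIP_SKIP_SET_WAKE,
};

static struct msi_domain_ops ims_array_domain_ops = {
	/* shared set_desc/msi_prepare plus the slot alloc/free hooks, not shown */
};

static const struct msi_domain_info ims_array_domain_template = {
	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
	.ops		= &ims_array_domain_ops,
	.chip		= &ims_array_irq_chip,
	.handler	= handle_edge_irq,
	.handler_name	= "edge",
};

struct irq_domain *create_ims_array_domain(struct irq_domain *parent,
					   void __iomem *slots,
					   unsigned int nr_slots)
{
	struct msi_domain_info *info;
	struct ims_array_data *ims;
	struct fwnode_handle *fn;

	/* Per device copies: the slot bookkeeping hangs off info->data. */
	info = kmemdup(&ims_array_domain_template, sizeof(*info), GFP_KERNEL);
	ims = kzalloc(sizeof(*ims), GFP_KERNEL);
	if (!info || !ims)
		goto fail;

	ims->slots = slots;
	ims->nr_slots = nr_slots;
	ims->in_use = bitmap_zalloc(nr_slots, GFP_KERNEL);
	if (!ims->in_use)
		goto fail;
	info->data = ims;

	fn = irq_domain_alloc_named_fwnode("IMS-ARRAY");
	if (!fn)
		goto fail;

	return msi_create_irq_domain(fn, info, parent);

fail:
	if (ims)
		bitmap_free(ims->in_use);
	kfree(ims);
	kfree(info);
	return NULL;
}

With something like this in place, the per-driver part really does
shrink to the single create_ims_array_domain() call and the
device_msi_alloc_irqs() calls shown above.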

The same can be done for devices which have their storage embedded into
whatever other data structure on the device, e.g. queue memory, and
share the same message storage layout.

And we need to put thought into the shared infrastructure upfront
because all of this can also be used on bare metal.

The next thing you completely missed is to think about the ability to
support managed interrupts which we have in PCI/MSIX today. It's just a
matter of time until an IMS device comes along which wants its subdevice
interrupts managed properly when running on bare metal.

Can we please just go back to proper engineering and figure out how to
create something which is not just yet another half-baked works-for-IDXD
"solution"?

This means we need a proper description of possible IMS usage scenarios
and the foreseeable storage scenarios (arrays, queue data, ....), along
with requirements like managed interrupts etc. I'm sure quite some of
this information is scattered over a wide range of mail threads, but
it's not my job to hunt it down.

Without consistent information, at least to the extent available today,
this is going to end up as a major tinkering train wreck. I have zero
interest in dealing with those, especially when the major pain can be
avoided by doing proper analysis and design upfront.

Thanks,

        tglx
diff mbox series

Patch

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 74c12437401e..8ecd7570589d 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -61,6 +61,11 @@  struct irq_alloc_info {
 			irq_hw_number_t	msi_hwirq;
 		};
 #endif
+#ifdef CONFIG_DEV_MSI
+		struct {
+			irq_hw_number_t hwirq;
+		};
+#endif
 #ifdef	CONFIG_X86_IO_APIC
 		struct {
 			int		ioapic_id;
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 8d7001712062..f00901bac056 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -210,4 +210,11 @@  config GENERIC_ARCH_TOPOLOGY
 	  appropriate scaling, sysfs interface for reading capacity values at
 	  runtime.
 
+config DEV_MSI
+	bool "Device Specific Interrupt Messages"
+	select IRQ_DOMAIN_HIERARCHY
+	select GENERIC_MSI_IRQ_DOMAIN
+	help
+	  Allow device drivers to generate device-specific interrupt messages
+	  for devices independent of PCI MSI/-X.
 endmenu
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 157452080f3d..ca1e4d39164e 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -21,6 +21,7 @@  obj-$(CONFIG_REGMAP)	+= regmap/
 obj-$(CONFIG_SOC_BUS) += soc.o
 obj-$(CONFIG_PINCTRL) += pinctrl.o
 obj-$(CONFIG_DEV_COREDUMP) += devcoredump.o
+obj-$(CONFIG_DEV_MSI) += dev-msi.o
 obj-$(CONFIG_GENERIC_MSI_IRQ_DOMAIN) += platform-msi.o
 obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += arch_topology.o
 
diff --git a/drivers/base/dev-msi.c b/drivers/base/dev-msi.c
new file mode 100644
index 000000000000..240ccc353933
--- /dev/null
+++ b/drivers/base/dev-msi.c
@@ -0,0 +1,95 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2020 Intel Corporation.
+ *
+ * Author: Megha Dey <megha.dey@intel.com>
+ */
+
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/msi.h>
+#include "platform-msi.h"
+
+struct irq_domain *dev_msi_default_domain;
+
+static irq_hw_number_t dev_msi_get_hwirq(struct msi_domain_info *info,
+					 msi_alloc_info_t *arg)
+{
+	return arg->hwirq;
+}
+
+static irq_hw_number_t dev_msi_calc_hwirq(struct msi_desc *desc)
+{
+	u32 devid;
+
+	devid = desc->platform.msi_priv_data->devid;
+
+	return (devid << (32 - DEV_ID_SHIFT)) | desc->platform.msi_index;
+}
+
+static void dev_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+	arg->hwirq = dev_msi_calc_hwirq(desc);
+}
+
+static int dev_msi_prepare(struct irq_domain *domain, struct device *dev,
+			   int nvec, msi_alloc_info_t *arg)
+{
+	memset(arg, 0, sizeof(*arg));
+
+	return 0;
+}
+
+static struct msi_domain_ops dev_msi_domain_ops = {
+	.get_hwirq      = dev_msi_get_hwirq,
+	.set_desc       = dev_msi_set_desc,
+	.msi_prepare	= dev_msi_prepare,
+};
+
+static struct irq_chip dev_msi_controller = {
+	.name                   = "DEV-MSI",
+	.irq_unmask             = platform_msi_unmask_irq,
+	.irq_mask               = platform_msi_mask_irq,
+	.irq_write_msi_msg      = platform_msi_write_msg,
+	.irq_ack                = irq_chip_ack_parent,
+	.irq_retrigger          = irq_chip_retrigger_hierarchy,
+	.flags                  = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct msi_domain_info dev_msi_domain_info = {
+	.flags          = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
+	.ops            = &dev_msi_domain_ops,
+	.chip           = &dev_msi_controller,
+	.handler        = handle_edge_irq,
+	.handler_name   = "edge",
+};
+
+static int __init create_dev_msi_domain(void)
+{
+	struct irq_domain *parent = NULL;
+	struct fwnode_handle *fn;
+
+	/*
+	 * Modern code should never have to use irq_get_default_host. But since
+	 * dev-msi is invisible to DT/ACPI, this is an exception case.
+	 */
+	parent = irq_get_default_host();
+	if (!parent)
+		return -ENXIO;
+
+	fn = irq_domain_alloc_named_fwnode("DEV_MSI");
+	if (!fn)
+		return -ENXIO;
+
+	dev_msi_default_domain = msi_create_irq_domain(fn, &dev_msi_domain_info, parent);
+	if (!dev_msi_default_domain) {
+		pr_warn("failed to initialize irqdomain for DEV-MSI.\n");
+		return -ENXIO;
+	}
+
+	irq_domain_update_bus_token(dev_msi_default_domain, DOMAIN_BUS_PLATFORM_MSI);
+	irq_domain_free_fwnode(fn);
+
+	return 0;
+}
+device_initcall(create_dev_msi_domain);
diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c
index 9d94cd699468..5e1f210d65ee 100644
--- a/drivers/base/platform-msi.c
+++ b/drivers/base/platform-msi.c
@@ -12,21 +12,7 @@ 
 #include <linux/irqdomain.h>
 #include <linux/msi.h>
 #include <linux/slab.h>
-
-#define DEV_ID_SHIFT	21
-#define MAX_DEV_MSIS	(1 << (32 - DEV_ID_SHIFT))
-
-/*
- * Internal data structure containing a (made up, but unique) devid
- * and the platform-msi ops
- */
-struct platform_msi_priv_data {
-	struct device			*dev;
-	void				*host_data;
-	msi_alloc_info_t		arg;
-	const struct platform_msi_ops	*ops;
-	int				devid;
-};
+#include "platform-msi.h"
 
 /* The devid allocator */
 static DEFINE_IDA(platform_msi_devid_ida);
@@ -76,7 +62,7 @@  static void platform_msi_update_dom_ops(struct msi_domain_info *info)
 		ops->set_desc = platform_msi_set_desc;
 }
 
-static void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
 {
 	struct msi_desc *desc = irq_data_get_msi_desc(data);
 	struct platform_msi_priv_data *priv_data;
@@ -86,6 +72,33 @@  static void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
 	priv_data->ops->write_msg(desc, msg);
 }
 
+static void __platform_msi_desc_mask_unmask_irq(struct msi_desc *desc, u32 mask)
+{
+	const struct platform_msi_ops *ops;
+
+	ops = desc->platform.msi_priv_data->ops;
+	if (!ops)
+		return;
+
+	if (mask) {
+		if (ops->irq_mask)
+			ops->irq_mask(desc);
+	} else {
+		if (ops->irq_unmask)
+			ops->irq_unmask(desc);
+	}
+}
+
+void platform_msi_mask_irq(struct irq_data *data)
+{
+	__platform_msi_desc_mask_unmask_irq(irq_data_get_msi_desc(data), 1);
+}
+
+void platform_msi_unmask_irq(struct irq_data *data)
+{
+	__platform_msi_desc_mask_unmask_irq(irq_data_get_msi_desc(data), 0);
+}
+
 static void platform_msi_update_chip_ops(struct msi_domain_info *info)
 {
 	struct irq_chip *chip = info->chip;
diff --git a/drivers/base/platform-msi.h b/drivers/base/platform-msi.h
new file mode 100644
index 000000000000..1de8c2874218
--- /dev/null
+++ b/drivers/base/platform-msi.h
@@ -0,0 +1,23 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright © 2020 Intel Corporation.
+ *
+ * Author: Megha Dey <megha.dey@intel.com>
+ */
+
+#include <linux/msi.h>
+
+#define DEV_ID_SHIFT    21
+#define MAX_DEV_MSIS	(1 << (32 - DEV_ID_SHIFT))
+
+/*
+ * Data structure containing a (made up, but unique) devid
+ * and the platform-msi ops.
+ */
+struct platform_msi_priv_data {
+	struct device			*dev;
+	void				*host_data;
+	msi_alloc_info_t		arg;
+	const struct platform_msi_ops	*ops;
+	int				devid;
+};
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 7f6a8eb51aca..1da97f905720 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -323,9 +323,13 @@  enum {
 
 /*
  * platform_msi_ops - Callbacks for platform MSI ops
+ * @irq_mask:   mask an interrupt source
+ * @irq_unmask: unmask an interrupt source
  * @write_msg:	write message content
  */
 struct platform_msi_ops {
+	unsigned int            (*irq_mask)(struct msi_desc *desc);
+	unsigned int            (*irq_unmask)(struct msi_desc *desc);
 	irq_write_msi_msg_t	write_msg;
 };
 
@@ -370,6 +374,10 @@  int platform_msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
 void platform_msi_domain_free(struct irq_domain *domain, unsigned int virq,
 			      unsigned int nvec);
 void *platform_msi_get_host_data(struct irq_domain *domain);
+
+void platform_msi_write_msg(struct irq_data *data, struct msi_msg *msg);
+void platform_msi_unmask_irq(struct irq_data *data);
+void platform_msi_mask_irq(struct irq_data *data);
 #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
 
 #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN