
[Part2,v3,15/24] x86, MSI: Use hierarchy irqdomain to manage MSI interrupts

Message ID: 1414484803-10311-16-git-send-email-jiang.liu@linux.intel.com
State: Not Applicable

Commit Message

Jiang Liu Oct. 28, 2014, 8:26 a.m. UTC
Enhance the MSI code to support hierarchical irqdomains; this helps to
make the architecture clearer.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
---
 arch/x86/include/asm/hw_irq.h        |    9 +-
 arch/x86/include/asm/irq_remapping.h |    6 +-
 arch/x86/kernel/apic/msi.c           |  237 ++++++++++++++++++++++++++++------
 arch/x86/kernel/apic/vector.c        |    2 +
 drivers/iommu/irq_remapping.c        |    1 -
 5 files changed, 209 insertions(+), 46 deletions(-)

Comments

Thomas Gleixner Oct. 28, 2014, 9:37 p.m. UTC | #1
On Tue, 28 Oct 2014, Jiang Liu wrote:
> +static int msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
> +			    bool force)
> +{
> +	struct irq_data *parent = data->parent_data;
> +	int ret;
>  
> -	msg.data &= ~MSI_DATA_VECTOR_MASK;
> -	msg.data |= MSI_DATA_VECTOR(cfg->vector);
> -	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
> -	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
> +	ret = parent->chip->irq_set_affinity(parent, mask, force);
> +	/* No need to reprogram MSI registers if interrupt is remapped */
> +	if (ret >= 0 && !msi_irq_remapped(data)) {
> +		struct msi_msg msg;
>  
> -	__write_msi_msg(data->msi_desc, &msg);
> +		__get_cached_msi_msg(data->msi_desc, &msg);
> +		msi_update_msg(&msg, data);
> +		__write_msi_msg(data->msi_desc, &msg);
> +	}

I'm not too happy about the msi_irq_remapped() conditional here. It
violates the whole concept of domain stacking somewhat.

A better separation would be to add a callback to the irq chip:

  	void (*irq_write_msi_msg)(struct irq_data *data, struct msi_desc *msi_desc, bool cached);

and change this code to:

    	if (ret >= 0)
	   	parent->chip->irq_write_msi_msg(parent, data->msi_desc, true);
  
> -	return IRQ_SET_MASK_OK_NOCOPY;
> +	return ret;
>  }

And do the same here:

> +static int msi_domain_activate(struct irq_domain *domain,
> +			       struct irq_data *irq_data)
> +{
> +	struct msi_msg msg;
> +	struct irq_cfg *cfg = irqd_cfg(irq_data);
> +
> +	/*
> +	 * irq_data->chip_data is MSI/MSIx offset.
> +	 * MSI-X message is written per-IRQ, the offset is always 0.
> +	 * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
> +	 */
> +	if (irq_data->chip_data)
> +		return 0;

        parent->chip->irq_write_msi_msg(parent, data->msi_desc, false);  		

> +	if (msi_irq_remapped(irq_data))
> +		irq_remapping_get_msi_entry(irq_data->parent_data, &msg);
> +	else
> +		native_compose_msi_msg(NULL, irq_data->irq, cfg->dest_apicid,
> +				       &msg, 0);
> +	write_msi_msg(irq_data->irq, &msg);
> +
> +	return 0;
> +}

And here:

> +static int msi_domain_deactivate(struct irq_domain *domain,
> +				 struct irq_data *irq_data)
> +{
> +	struct msi_msg msg;
> +
> +	if (irq_data->chip_data)
> +		return 0;
> +
> +	memset(&msg, 0, sizeof(msg));
> +	write_msi_msg(irq_data->irq, &msg);

  	parent->chip->irq_write_msi_msg(parent, NULL, false);

> +	return 0;
> +}

And let the vector and the remapping domain deal with it in their callbacks.
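
For illustration, a rough sketch of how the vector domain's chip could
implement such an irq_write_msi_msg() callback. The function name and the
exact handling of the cached/NULL cases here are assumptions, not part of
the posted series; the remapping domain's version would update its IRTE
instead of touching the MSI registers on affinity changes:

static void x86_vector_write_msi_msg(struct irq_data *data,
				     struct msi_desc *msi_desc, bool cached)
{
	struct irq_cfg *cfg = irqd_cfg(data);
	struct msi_msg msg;

	if (!msi_desc) {
		/* deactivate: quiesce the entry with an all-zero message */
		memset(&msg, 0, sizeof(msg));
		msi_desc = irq_get_msi_desc(data->irq);
	} else if (cached) {
		/* affinity change: patch vector/destination into cached msg */
		__get_cached_msi_msg(msi_desc, &msg);
		msg.data &= ~MSI_DATA_VECTOR_MASK;
		msg.data |= MSI_DATA_VECTOR(cfg->vector);
		msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
		msg.address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid);
	} else {
		/* activation: compose a fresh message for this vector */
		native_compose_msi_msg(NULL, data->irq, cfg->dest_apicid,
				       &msg, 0);
	}

	__write_msi_msg(msi_desc, &msg);
}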

> @@ -166,25 +264,59 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
>  
>  int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
>  {
> -	struct msi_desc *msidesc;
> -	int irq, ret;
> +	int irq, cnt, nvec_pow2;
> +	struct irq_domain *domain;
> +	struct msi_desc *msidesc, *iter;
> +	struct irq_alloc_info info;
> +	int node = dev_to_node(&dev->dev);
>  
> -	/* Multiple MSI vectors only supported with interrupt remapping */
> -	if (type == PCI_CAP_ID_MSI && nvec > 1)
> -		return 1;
> +	if (disable_apic)
> +		return -ENOSYS;
>  
> -	list_for_each_entry(msidesc, &dev->msi_list, list) {
> -		irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL);
> +	init_irq_alloc_info(&info, NULL);
> +	info.msi_dev = dev;
> +	if (type == PCI_CAP_ID_MSI) {
> +		msidesc = list_first_entry(&dev->msi_list,
> +					   struct msi_desc, list);
> +		WARN_ON(!list_is_singular(&dev->msi_list));
> +		WARN_ON(msidesc->irq);
> +		WARN_ON(msidesc->msi_attrib.multiple);
> +		WARN_ON(msidesc->nvec_used);
> +		info.type = X86_IRQ_ALLOC_TYPE_MSI;
> +		cnt = nvec;
> +	} else {
> +		info.type = X86_IRQ_ALLOC_TYPE_MSIX;
> +		cnt = 1;
> +	}

We have a similar issue here.

> +	domain = irq_remapping_get_irq_domain(&info);

We add domain specific knowledge to the MSI implementation. Not
necessary at all.

Again MSI is not an x86 problem and we really can move most of that to
the core code. The above sanity checks and the distinction between MSI
and MSIX can be handled in the core code. And every domain involved in
the MSI chain would need an alloc_msi() callback.

So native_setup_msi_irqs() would boil down to:
+ {
+	if (disable_apic)
+		return -ENOSYS;
+ 
+	return irq_domain_alloc_msi(msi_domain, dev, nvec, type);   
+ }

That core function then performs the sanity checks for the MSI case. In
fact it should not proceed when a warning condition is detected. Not an
x86 issue at all, it's true for every MSI implementation.

Then it calls down the domain allocation chain. x86_msi_domain would
simply hand down to the parent domain. That would either be the remap
domain or the vector domain.
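
For illustration, a rough sketch of what that core helper might look
like. The name irq_domain_alloc_msi() is the one proposed above; the
body (turning the WARN_ONs into hard failures, the per-descriptor
allocation loop, and passing the msi_desc as the allocation argument)
is an assumption, not part of the posted series:

int irq_domain_alloc_msi(struct irq_domain *domain, struct pci_dev *dev,
			 int nvec, int type)
{
	struct msi_desc *desc;
	int virq, cnt = (type == PCI_CAP_ID_MSI) ? nvec : 1;

	if (type == PCI_CAP_ID_MSI) {
		desc = list_first_entry(&dev->msi_list,
					struct msi_desc, list);
		/* Refuse to proceed instead of merely warning */
		if (WARN_ON(!list_is_singular(&dev->msi_list)) ||
		    WARN_ON(desc->irq) ||
		    WARN_ON(desc->msi_attrib.multiple) ||
		    WARN_ON(desc->nvec_used))
			return -EINVAL;
	}

	list_for_each_entry(desc, &dev->msi_list, list) {
		virq = irq_domain_alloc_irqs(domain, cnt,
					     dev_to_node(&dev->dev), desc);
		if (virq <= 0)
			return -ENOSPC;
	}

	return 0;
}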

The rejection of multi-MSI would only be implemented in the vector
domain callback, while the remap domain can handle it. Once we gain
support for allocating consecutive vectors for multi-MSI in the vector
domain we would not have to change any of the MSI code at all.
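
Purely as a sketch, the vector-domain side of that check could then look
like this. The X86_IRQ_ALLOC_CONTIGOUS_VECTORS flag is the one used in
the follow-up below; the callback name and everything else here is an
assumption:

static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
				 unsigned int nr_irqs, void *arg)
{
	struct irq_alloc_info *info = arg;

	/*
	 * The vector allocator cannot (yet) hand out a contiguous block
	 * of vectors, so reject multi-MSI here; a remapping parent can
	 * grant multi-MSI without needing contiguous vectors.
	 */
	if (info && (info->flags & X86_IRQ_ALLOC_CONTIGOUS_VECTORS) &&
	    nr_irqs > 1)
		return -ENOSYS;

	/* ... existing per-interrupt vector allocation continues here ... */
	return 0;
}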

Thoughts?

Thanks,

	tglx

Jiang Liu Oct. 31, 2014, 12:04 p.m. UTC | #2
On 2014/10/29 5:37, Thomas Gleixner wrote:
> On Tue, 28 Oct 2014, Jiang Liu wrote:
>> @@ -166,25 +264,59 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
>>  
>>  int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
>>  {
>> -	struct msi_desc *msidesc;
>> -	int irq, ret;
>> +	int irq, cnt, nvec_pow2;
>> +	struct irq_domain *domain;
>> +	struct msi_desc *msidesc, *iter;
>> +	struct irq_alloc_info info;
>> +	int node = dev_to_node(&dev->dev);
>>  
>> -	/* Multiple MSI vectors only supported with interrupt remapping */
>> -	if (type == PCI_CAP_ID_MSI && nvec > 1)
>> -		return 1;
>> +	if (disable_apic)
>> +		return -ENOSYS;
>>  
>> -	list_for_each_entry(msidesc, &dev->msi_list, list) {
>> -		irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL);
>> +	init_irq_alloc_info(&info, NULL);
>> +	info.msi_dev = dev;
>> +	if (type == PCI_CAP_ID_MSI) {
>> +		msidesc = list_first_entry(&dev->msi_list,
>> +					   struct msi_desc, list);
>> +		WARN_ON(!list_is_singular(&dev->msi_list));
>> +		WARN_ON(msidesc->irq);
>> +		WARN_ON(msidesc->msi_attrib.multiple);
>> +		WARN_ON(msidesc->nvec_used);
>> +		info.type = X86_IRQ_ALLOC_TYPE_MSI;
>> +		cnt = nvec;
>> +	} else {
>> +		info.type = X86_IRQ_ALLOC_TYPE_MSIX;
>> +		cnt = 1;
>> +	}
> 
> We have a similar issue here.
> 
>> +	domain = irq_remapping_get_irq_domain(&info);
> 
> We add domain specific knowledge to the MSI implementation. Not
> necessary at all.
> 
> Again MSI is not an x86 problem and we really can move most of that to
> the core code. The above sanity checks and the distinction between MSI
> and MSIX can be handled in the core code. And every domain involved in
> the MSI chain would need an alloc_msi() callback.
Adding an alloc_msi() callback to irq_domain_ops seems a bit heavy-handed.
I think the main idea is to make as much of the MSI code common as
possible, so I have found another solution that moves the MSI irqdomain
code into drivers/pci/msi.c; only the following code remains platform
dependent. How about this solution?
------------------------------------------------------------------------
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
        struct irq_domain *domain;
        struct irq_alloc_info info;

        init_irq_alloc_info(&info, NULL);
        info.msi_dev = dev;
        if (type == PCI_CAP_ID_MSI) {
                info.type = X86_IRQ_ALLOC_TYPE_MSI;
                info.flags |= X86_IRQ_ALLOC_CONTIGOUS_VECTORS;
        } else {
                info.type = X86_IRQ_ALLOC_TYPE_MSIX;
        }

        domain = irq_remapping_get_irq_domain(&info);
        if (domain == NULL)
                domain = msi_default_domain;
        if (domain == NULL)
                return -ENOSYS;

        return msi_irq_domain_alloc_irqs(domain, type, dev, &info);
}

void native_teardown_msi_irq(unsigned int irq)
{
        irq_domain_free_irqs(irq, 1);
}

irq_hw_number_t arch_msi_irq_domain_get_hwirq(void *arg)
{
        struct irq_alloc_info *info = arg;

        return info->msi_hwirq;
}

void arch_msi_irq_domain_set_hwirq(void *arg, irq_hw_number_t hwirq)
{
        struct irq_alloc_info *info = arg;

        info->msi_hwirq = hwirq;
}

void arch_init_msi_domain(struct irq_domain *parent)
{
        if (disable_apic)
                return;

        msi_default_domain = msi_create_irq_domain(parent);
        if (!msi_default_domain)
                pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
}

#ifdef CONFIG_IRQ_REMAP
struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent)
{
        return msi_create_irq_domain(parent);
}
#endif
-----------------------------------------------------------------

> 
> So native_setup_msi_irqs() would boil down to:
> + {
> +	if (disable_apic)
> +		return -ENOSYS;
> + 
> +	return irq_domain_alloc_msi(msi_domain, dev, nvec, type);   
> + }
> 
> That core function then performs the sanity checks for the MSI case.
> In fact it should not proceed when a warning condition is detected.
> Not an x86 issue at all, it's true for every MSI implementation.
> 
> Then it calls down the domain allocation chain. x86_msi_domain would
> simply hand down to the parent domain. That would either be the remap
> domain or the vector domain.
The issue here is that the hierarchical irqdomain code maintains a tree
topology and every irqdomain supports only one parent.

In the case of irq remapping, we need to build one irqdomain for each
IOMMU unit to support hotplug and to simplify the implementation. So we
also need one MSI irqdomain per IOMMU unit instead of a common MSI
irqdomain.

The current design is: a common MSI irqdomain supports all MSIs when
irq remapping is disabled, and there is one MSI irqdomain per IOMMU
unit when irq remapping is enabled.

So we have the code below to choose the correct irqdomain for MSI.
        domain = irq_remapping_get_irq_domain(&info);
        if (domain == NULL)
                domain = msi_default_domain;
        if (domain == NULL)
                return -ENOSYS;
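
For illustration, the resulting domain stacking then looks roughly like
this (one remap domain plus one MSI irqdomain per IOMMU unit when
remapping is enabled, a single default MSI irqdomain parented directly
to the vector domain otherwise):

                        x86_vector_domain
                       /                 \
        msi_default_domain         remap domain (one per IOMMU)
        (remapping disabled)                |
                                   MSI irqdomain (one per IOMMU)
                                   (remapping enabled)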

> 
> The rejection of multi-MSI would only be implemented in the vector
> domain callback, while the remap domain can handle it. Once we gain
> support for allocating consecutive vectors for multi-MSI in the vector
> domain we would not have to change any of the MSI code at all. 
I have worked out a working patch that makes the decision in the vector
domain, as you suggested.

> 
> Thoughts?
> 
> Thanks,
> 
> 	tglx
> 
Thomas Gleixner Oct. 31, 2014, 2 p.m. UTC | #3
On Fri, 31 Oct 2014, Jiang Liu wrote:
> On 2014/10/29 5:37, Thomas Gleixner wrote:
> > Then it calls down the domain allocation chain. x86_msi_domain would
> > simply hand down to the parent domain. That would either be the remap
> > domain or the vector domain.
> The issue here is that the hierarchical irqdomain code maintains a
> tree topology and every irqdomain supports only one parent.
> 
> In the case of irq remapping, we need to build one irqdomain for each
> IOMMU unit to support hotplug and to simplify the implementation. So
> we also need one MSI irqdomain per IOMMU unit instead of a common MSI
> irqdomain.

That indeed makes a difference.
 
> The current design is: a common MSI irqdomain supports all MSIs when
> irq remapping is disabled, and there is one MSI irqdomain per IOMMU
> unit when irq remapping is enabled.

> So we have the code below to choose the correct irqdomain for MSI.
>         domain = irq_remapping_get_irq_domain(&info);
>         if (domain == NULL)
>                 domain = msi_default_domain;
>         if (domain == NULL)
>                 return -ENOSYS;

Right. I guess we need to keep it that way for now.

But looking at the code makes me wonder why we actually need to call
into the remap code and do a list walk to figure the domain out. The
association of device and iommu should be known at startup/hotplug
time already. That's out of the scope of this work, but should be
fixed eventually.
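
A purely hypothetical sketch of that eventual fix; neither the
msi_domain field nor these helpers exist in the tree this series is
against. The idea is that the IOMMU driver records the MSI irqdomain
when it claims the device, so native_setup_msi_irqs() can read it back
without walking the remapping units:

static void pci_dev_set_msi_domain(struct pci_dev *dev,
				   struct irq_domain *domain)
{
	/* hypothetical per-device field, filled in at add_device() time */
	dev->dev.archdata.msi_domain = domain;
}

static struct irq_domain *pci_dev_get_msi_domain(struct pci_dev *dev)
{
	return dev->dev.archdata.msi_domain ? : msi_default_domain;
}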

Thanks,

	tglx





Patch

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 545460d470bd..1ff7a7f61bf9 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -110,9 +110,10 @@  struct irq_2_irte {
 };
 #endif	/* CONFIG_IRQ_REMAP */
 
+struct irq_domain;
+
 #ifdef	CONFIG_X86_LOCAL_APIC
 struct irq_data;
-struct irq_domain;
 struct pci_dev;
 struct msi_desc;
 
@@ -200,6 +201,12 @@  static inline void lock_vector_lock(void) {}
 static inline void unlock_vector_lock(void) {}
 #endif	/* CONFIG_X86_LOCAL_APIC */
 
+#ifdef	CONFIG_PCI_MSI
+extern void arch_init_msi_domain(struct irq_domain *domain);
+#else
+static inline void arch_init_msi_domain(struct irq_domain *domain) { }
+#endif
+
 /* Statistics */
 extern atomic_t irq_err_count;
 extern atomic_t irq_mis_count;
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f5f624529386..ea71f86423ee 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -75,11 +75,7 @@  extern void irq_remapping_print_chip(struct irq_data *data, struct seq_file *p);
  * Create MSI/MSIx irqdomain for interrupt remapping device, use @parent as
  * parent irqdomain.
  */
-static inline struct irq_domain *
-arch_create_msi_irq_domain(struct irq_domain *parent)
-{
-	return NULL;
-}
+extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent);
 
 /* Get parent irqdomain for interrupt remapping irqdomain */
 static inline struct irq_domain *arch_get_ir_parent_domain(void)
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 47d3a24793ce..354abd7ef2ad 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -3,6 +3,8 @@ 
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Add support of hierarchy irqdomain
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -21,6 +23,8 @@ 
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 
+static struct irq_domain *msi_default_domain;
+
 static void msi_reset_irq_data_and_handler(struct irq_domain *domain, int virq)
 {
 	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
@@ -96,28 +100,28 @@  static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 	return 0;
 }
 
-static int
-msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+static bool msi_irq_remapped(struct irq_data *irq_data)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	struct msi_msg msg;
-	unsigned int dest;
-	int ret;
-
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
+	return irq_remapping_domain_is_remapped(irq_data->domain);
+}
 
-	__get_cached_msi_msg(data->msi_desc, &msg);
+static int msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+			    bool force)
+{
+	struct irq_data *parent = data->parent_data;
+	int ret;
 
-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	/* No need to reprogram MSI registers if interrupt is remapped */
+	if (ret >= 0 && !msi_irq_remapped(data)) {
+		struct msi_msg msg;
 
-	__write_msi_msg(data->msi_desc, &msg);
+		__get_cached_msi_msg(data->msi_desc, &msg);
+		msi_update_msg(&msg, data);
+		__write_msi_msg(data->msi_desc, &msg);
+	}
 
-	return IRQ_SET_MASK_OK_NOCOPY;
+	return ret;
 }
 
 /*
@@ -128,12 +132,106 @@  static struct irq_chip msi_chip = {
 	.name			= "PCI-MSI",
 	.irq_unmask		= unmask_msi_irq,
 	.irq_mask		= mask_msi_irq,
-	.irq_ack		= apic_ack_edge,
+	.irq_ack		= irq_chip_ack_parent,
 	.irq_set_affinity	= msi_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_print_chip		= irq_remapping_print_chip,
  	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
+static inline irq_hw_number_t
+get_hwirq_from_pcidev(struct pci_dev *pdev, struct msi_desc *msidesc)
+{
+	return (irq_hw_number_t)msidesc->msi_attrib.entry_nr |
+		PCI_DEVID(pdev->bus->number, pdev->devfn) << 11 |
+		(pci_domain_nr(pdev->bus) & 0xFFFFFFFF) << 27;
+}
+
+static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+			    unsigned int nr_irqs, void *arg)
+{
+	int i, ret;
+	irq_hw_number_t hwirq;
+	struct irq_alloc_info *info = arg;
+
+	hwirq = get_hwirq_from_pcidev(info->msi_dev, info->msi_desc);
+	if (irq_find_mapping(domain, hwirq) > 0)
+		return -EEXIST;
+
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_set_msi_desc_off(virq, i, info->msi_desc);
+		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+					      &msi_chip, (void *)(long)i);
+		__irq_set_handler(virq + i, handle_edge_irq, 0, "edge");
+		dev_dbg(&info->msi_dev->dev, "irq %d for MSI/MSI-X\n",
+			virq + i);
+	}
+
+	return ret;
+}
+
+static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
+			    unsigned int nr_irqs)
+{
+	int i;
+	struct msi_desc *msidesc = irq_get_msi_desc(virq);
+
+	if (msidesc)
+		msidesc->irq = 0;
+	for (i = 0; i < nr_irqs; i++)
+		msi_reset_irq_data_and_handler(domain, virq + i);
+	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+static int msi_domain_activate(struct irq_domain *domain,
+			       struct irq_data *irq_data)
+{
+	struct msi_msg msg;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
+
+	/*
+	 * irq_data->chip_data is MSI/MSIx offset.
+	 * MSI-X message is written per-IRQ, the offset is always 0.
+	 * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
+	 */
+	if (irq_data->chip_data)
+		return 0;
+
+	if (msi_irq_remapped(irq_data))
+		irq_remapping_get_msi_entry(irq_data->parent_data, &msg);
+	else
+		native_compose_msi_msg(NULL, irq_data->irq, cfg->dest_apicid,
+				       &msg, 0);
+	write_msi_msg(irq_data->irq, &msg);
+
+	return 0;
+}
+
+static int msi_domain_deactivate(struct irq_domain *domain,
+				 struct irq_data *irq_data)
+{
+	struct msi_msg msg;
+
+	if (irq_data->chip_data)
+		return 0;
+
+	memset(&msg, 0, sizeof(msg));
+	write_msi_msg(irq_data->irq, &msg);
+
+	return 0;
+}
+
+static struct irq_domain_ops msi_domain_ops = {
+	.alloc = msi_domain_alloc,
+	.free = msi_domain_free,
+	.activate = msi_domain_activate,
+	.deactivate = msi_domain_deactivate,
+};
+
 int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
 		  unsigned int irq_base, unsigned int irq_offset)
 {
@@ -166,25 +264,59 @@  int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
 
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	struct msi_desc *msidesc;
-	int irq, ret;
+	int irq, cnt, nvec_pow2;
+	struct irq_domain *domain;
+	struct msi_desc *msidesc, *iter;
+	struct irq_alloc_info info;
+	int node = dev_to_node(&dev->dev);
 
-	/* Multiple MSI vectors only supported with interrupt remapping */
-	if (type == PCI_CAP_ID_MSI && nvec > 1)
-		return 1;
+	if (disable_apic)
+		return -ENOSYS;
 
-	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL);
+	init_irq_alloc_info(&info, NULL);
+	info.msi_dev = dev;
+	if (type == PCI_CAP_ID_MSI) {
+		msidesc = list_first_entry(&dev->msi_list,
+					   struct msi_desc, list);
+		WARN_ON(!list_is_singular(&dev->msi_list));
+		WARN_ON(msidesc->irq);
+		WARN_ON(msidesc->msi_attrib.multiple);
+		WARN_ON(msidesc->nvec_used);
+		info.type = X86_IRQ_ALLOC_TYPE_MSI;
+		cnt = nvec;
+	} else {
+		info.type = X86_IRQ_ALLOC_TYPE_MSIX;
+		cnt = 1;
+	}
+
+	domain = irq_remapping_get_irq_domain(&info);
+	if (domain == NULL) {
+		/*
+		 * Multiple MSI vectors only supported with interrupt
+		 * remapping
+		 */
+		if (type == PCI_CAP_ID_MSI && nvec > 1)
+			return 1;
+		domain = msi_default_domain;
+	}
+	if (domain == NULL)
+		return -ENOSYS;
+
+	list_for_each_entry(iter, &dev->msi_list, list) {
+		info.msi_desc = iter;
+		irq = irq_domain_alloc_irqs(domain, cnt, node, &info);
 		if (irq <= 0)
 			return -ENOSPC;
+	}
 
-		ret = setup_msi_irq(dev, msidesc, irq, 0);
-		if (ret < 0) {
-			irq_domain_free_irqs(irq, 1);
-			return ret;
-		}
-
+	if (type == PCI_CAP_ID_MSI) {
+		nvec_pow2 = __roundup_pow_of_two(nvec);
+		msidesc = list_first_entry(&dev->msi_list,
+					   struct msi_desc, list);
+		msidesc->msi_attrib.multiple = ilog2(nvec_pow2);
+		msidesc->nvec_used = nvec;
 	}
+
 	return 0;
 }
 
@@ -193,6 +325,38 @@  void native_teardown_msi_irq(unsigned int irq)
 	irq_domain_free_irqs(irq, 1);
 }
 
+static struct irq_domain *msi_create_domain(struct irq_domain *parent,
+					    bool remapped)
+{
+	struct irq_domain *domain;
+
+	domain = irq_domain_add_tree(NULL, &msi_domain_ops, NULL);
+	if (domain) {
+		domain->parent = parent;
+		if (remapped)
+			irq_remapping_domain_set_remapped(domain);
+	}
+
+	return domain;
+}
+
+void arch_init_msi_domain(struct irq_domain *parent)
+{
+	if (disable_apic)
+		return;
+
+	msi_default_domain = msi_create_domain(parent, false);
+	if (!msi_default_domain)
+		pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+}
+
+#ifdef CONFIG_IRQ_REMAP
+struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent)
+{
+	return msi_create_domain(parent, true);
+}
+#endif
+
 #ifdef CONFIG_DMAR_TABLE
 static int
 dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
@@ -264,11 +428,6 @@  static inline int hpet_dev_id(struct irq_domain *domain)
 	return (int)(long)domain->host_data;
 }
 
-static inline bool hpet_irq_remapped(struct irq_data *irq_data)
-{
-	return irq_remapping_domain_is_remapped(irq_data->domain);
-}
-
 static int hpet_msi_set_affinity(struct irq_data *data,
 				 const struct cpumask *mask, bool force)
 {
@@ -278,7 +437,7 @@  static int hpet_msi_set_affinity(struct irq_data *data,
 
 	ret = parent->chip->irq_set_affinity(parent, mask, force);
 	/* No need to rewrite HPET registers if interrupt is remapped */
-	if (ret >= 0 && !hpet_irq_remapped(data)) {
+	if (ret >= 0 && !msi_irq_remapped(data)) {
 		hpet_msi_read(data->handler_data, &msg);
 		msi_update_msg(&msg, data);
 		hpet_msi_write(data->handler_data, &msg);
@@ -356,7 +515,7 @@  static int hpet_domain_activate(struct irq_domain *domain,
 	struct msi_msg msg;
 	struct irq_cfg *cfg = irqd_cfg(irq_data);
 
-	if (hpet_irq_remapped(irq_data))
+	if (msi_irq_remapped(irq_data))
 		irq_remapping_get_msi_entry(irq_data->parent_data, &msg);
 	else
 		native_compose_msi_msg(NULL, irq_data->irq, cfg->dest_apicid,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index c97848125895..61c55d0aeaec 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -359,6 +359,8 @@  int __init arch_early_irq_init(void)
 	BUG_ON(x86_vector_domain == NULL);
 	irq_set_default_host(x86_vector_domain);
 
+	arch_init_msi_domain(x86_vector_domain);
+
 	return arch_early_ioapic_init();
 }
 
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 3a7f62c8becb..f92d49110f83 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -179,7 +179,6 @@  static void __init irq_remapping_modify_x86_ops(void)
 	x86_io_apic_ops.set_affinity	= set_remapped_irq_affinity;
 	x86_io_apic_ops.setup_entry	= setup_ioapic_remapped_entry;
 	x86_io_apic_ops.eoi_ioapic_pin	= eoi_ioapic_pin_remapped;
-	x86_msi.setup_msi_irqs		= irq_remapping_setup_msi_irqs;
 	x86_msi.setup_hpet_msi		= setup_hpet_msi_remapped;
 	x86_msi.compose_msi_msg		= compose_remapped_msi_msg;
 }