diff mbox

[v4,1/2] PCI: Add tango MSI controller support

Message ID 1f6b50c7-4887-838e-8c7d-c014d82b6d8e@sigmadesigns.com
State Changes Requested
Headers show

Commit Message

Marc Gonzalez April 20, 2017, 2:28 p.m. UTC
The MSI controller in Tango supports 256 message-signaled interrupts,
and a single doorbell address.

Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
---
 drivers/pci/host/pcie-tango.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)

Comments

Marc Gonzalez May 17, 2017, 2:56 p.m. UTC | #1
On 20/04/2017 16:28, Marc Gonzalez wrote:

> +static int tango_set_affinity(struct irq_data *data,
> +		const struct cpumask *mask, bool force)
> +{
> +	return -EINVAL;
> +}
> +
> +static struct irq_chip tango_chip = {
> +	.irq_ack		= tango_ack,
> +	.irq_mask		= tango_mask,
> +	.irq_unmask		= tango_unmask,
> +	.irq_set_affinity	= tango_set_affinity,
> +	.irq_compose_msi_msg	= tango_compose_msi_msg,
> +};

Hmmm... I'm wondering why .irq_set_affinity is required.

static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
first calls __irq_can_set_affinity() to check whether
desc->irq_data.chip->irq_set_affinity) exists.

then calls irq_do_set_affinity(&desc->irq_data, mask, false);
which calls chip->irq_set_affinity(data, mask, force);
= msi_domain_set_affinity()
which calls parent->chip->irq_set_affinity() unconditionally.

Would it make sense to test that the callback is implemented
before calling it?


[    0.723895] Unable to handle kernel NULL pointer dereference at virtual address 00000000
...
[    1.135809] [<c0160eb4>] (msi_domain_set_affinity) from [<c015a864>] (irq_do_set_affinity+0x18/0x48)
[    1.144990] [<c015a864>] (irq_do_set_affinity) from [<c015a918>] (setup_affinity+0x84/0xd4)
[    1.153384] [<c015a918>] (setup_affinity) from [<c015b20c>] (__setup_irq+0x40c/0x5d4)
[    1.161254] [<c015b20c>] (__setup_irq) from [<c015b57c>] (request_threaded_irq+0xe4/0x184)
[    1.169569] [<c015b57c>] (request_threaded_irq) from [<c03102e0>] (aer_probe+0x9c/0x218)
[    1.177704] [<c03102e0>] (aer_probe) from [<c030e4bc>] (pcie_port_probe_service+0x34/0x70)
[    1.186017] [<c030e4bc>] (pcie_port_probe_service) from [<c0351cd0>] (really_probe+0x1c4/0x250)
[    1.194763] [<c0351cd0>] (really_probe) from [<c0351ec8>] (__device_attach_driver+0xa4/0xe0)
[    1.203245] [<c0351ec8>] (__device_attach_driver) from [<c0350274>] (bus_for_each_drv+0x60/0x94)
[    1.212076] [<c0350274>] (bus_for_each_drv) from [<c0351abc>] (__device_attach+0x9c/0xdc)
[    1.220296] [<c0351abc>] (__device_attach) from [<c0351ff0>] (device_initial_probe+0xc/0x10)
[    1.228777] [<c0351ff0>] (device_initial_probe) from [<c0351090>] (bus_probe_device+0x84/0x8c)
[    1.237433] [<c0351090>] (bus_probe_device) from [<c034f484>] (device_add+0x3cc/0x548)
[    1.245390] [<c034f484>] (device_add) from [<c034f614>] (device_register+0x14/0x18)
[    1.253086] [<c034f614>] (device_register) from [<c030e8b8>] (pcie_port_device_register+0x3b0/0x450)
[    1.262267] [<c030e8b8>] (pcie_port_device_register) from [<c030ecfc>] (pcie_portdrv_probe+0x2c/0x50)
[    1.271539] [<c030ecfc>] (pcie_portdrv_probe) from [<c0302ba8>] (pci_device_probe+0x7c/0xc8)
[    1.280022] [<c0302ba8>] (pci_device_probe) from [<c0351c84>] (really_probe+0x178/0x250)
[    1.288154] [<c0351c84>] (really_probe) from [<c0351ec8>] (__device_attach_driver+0xa4/0xe0)
[    1.296635] [<c0351ec8>] (__device_attach_driver) from [<c0350274>] (bus_for_each_drv+0x60/0x94)
[    1.305465] [<c0350274>] (bus_for_each_drv) from [<c0351abc>] (__device_attach+0x9c/0xdc)
[    1.313684] [<c0351abc>] (__device_attach) from [<c0351b08>] (device_attach+0xc/0x10)
[    1.321559] [<c0351b08>] (device_attach) from [<c02f9a20>] (pci_bus_add_device+0x44/0x90)
[    1.329778] [<c02f9a20>] (pci_bus_add_device) from [<c02f9aa8>] (pci_bus_add_devices+0x3c/0x80)
[    1.338523] [<c02f9aa8>] (pci_bus_add_devices) from [<c0312dfc>] (pci_host_common_probe+0x100/0x314)
[    1.347703] [<c0312dfc>] (pci_host_common_probe) from [<c0313550>] (tango_pcie_probe+0x138/0x340)
[    1.356624] [<c0313550>] (tango_pcie_probe) from [<c03532c4>] (platform_drv_probe+0x34/0x6c)
Bjorn Helgaas May 23, 2017, 5:03 p.m. UTC | #2
On Wed, May 17, 2017 at 04:56:08PM +0200, Marc Gonzalez wrote:
> On 20/04/2017 16:28, Marc Gonzalez wrote:
> 
> > +static int tango_set_affinity(struct irq_data *data,
> > +		const struct cpumask *mask, bool force)
> > +{
> > +	return -EINVAL;
> > +}
> > +
> > +static struct irq_chip tango_chip = {
> > +	.irq_ack		= tango_ack,
> > +	.irq_mask		= tango_mask,
> > +	.irq_unmask		= tango_unmask,
> > +	.irq_set_affinity	= tango_set_affinity,
> > +	.irq_compose_msi_msg	= tango_compose_msi_msg,
> > +};
> 
> Hmmm... I'm wondering why .irq_set_affinity is required.
> 
> static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
> first calls __irq_can_set_affinity() to check whether
> desc->irq_data.chip->irq_set_affinity) exists.
> 
> then calls irq_do_set_affinity(&desc->irq_data, mask, false);
> which calls chip->irq_set_affinity(data, mask, force);
> = msi_domain_set_affinity()
> which calls parent->chip->irq_set_affinity() unconditionally.
> 
> Would it make sense to test that the callback is implemented
> before calling it?
> 
> 
> [    0.723895] Unable to handle kernel NULL pointer dereference at virtual address 00000000

I'm not sure what you're asking.

Is this a bug report for the v4 tango driver?

Or are you asking whether msi_domain_set_affinity() should be changed
to check whether parent->chip->irq_set_affinity is implemented?

msi_domain_set_affinity() has called parent->chip->irq_set_affinity()
without checking since it was added in 2014 by f3cf8bb0d6c3 ("genirq: Add
generic msi irq domain support"), so if there's a problem here, it's most
likely in the tango code.

> ...
> [    1.135809] [<c0160eb4>] (msi_domain_set_affinity) from [<c015a864>] (irq_do_set_affinity+0x18/0x48)
> [    1.144990] [<c015a864>] (irq_do_set_affinity) from [<c015a918>] (setup_affinity+0x84/0xd4)
> [    1.153384] [<c015a918>] (setup_affinity) from [<c015b20c>] (__setup_irq+0x40c/0x5d4)
> [    1.161254] [<c015b20c>] (__setup_irq) from [<c015b57c>] (request_threaded_irq+0xe4/0x184)
> [    1.169569] [<c015b57c>] (request_threaded_irq) from [<c03102e0>] (aer_probe+0x9c/0x218)
> [    1.177704] [<c03102e0>] (aer_probe) from [<c030e4bc>] (pcie_port_probe_service+0x34/0x70)
> [    1.186017] [<c030e4bc>] (pcie_port_probe_service) from [<c0351cd0>] (really_probe+0x1c4/0x250)
> [    1.194763] [<c0351cd0>] (really_probe) from [<c0351ec8>] (__device_attach_driver+0xa4/0xe0)
> [    1.203245] [<c0351ec8>] (__device_attach_driver) from [<c0350274>] (bus_for_each_drv+0x60/0x94)
> [    1.212076] [<c0350274>] (bus_for_each_drv) from [<c0351abc>] (__device_attach+0x9c/0xdc)
> [    1.220296] [<c0351abc>] (__device_attach) from [<c0351ff0>] (device_initial_probe+0xc/0x10)
> [    1.228777] [<c0351ff0>] (device_initial_probe) from [<c0351090>] (bus_probe_device+0x84/0x8c)
> [    1.237433] [<c0351090>] (bus_probe_device) from [<c034f484>] (device_add+0x3cc/0x548)
> [    1.245390] [<c034f484>] (device_add) from [<c034f614>] (device_register+0x14/0x18)
> [    1.253086] [<c034f614>] (device_register) from [<c030e8b8>] (pcie_port_device_register+0x3b0/0x450)
> [    1.262267] [<c030e8b8>] (pcie_port_device_register) from [<c030ecfc>] (pcie_portdrv_probe+0x2c/0x50)
> [    1.271539] [<c030ecfc>] (pcie_portdrv_probe) from [<c0302ba8>] (pci_device_probe+0x7c/0xc8)
> [    1.280022] [<c0302ba8>] (pci_device_probe) from [<c0351c84>] (really_probe+0x178/0x250)
> [    1.288154] [<c0351c84>] (really_probe) from [<c0351ec8>] (__device_attach_driver+0xa4/0xe0)
> [    1.296635] [<c0351ec8>] (__device_attach_driver) from [<c0350274>] (bus_for_each_drv+0x60/0x94)
> [    1.305465] [<c0350274>] (bus_for_each_drv) from [<c0351abc>] (__device_attach+0x9c/0xdc)
> [    1.313684] [<c0351abc>] (__device_attach) from [<c0351b08>] (device_attach+0xc/0x10)
> [    1.321559] [<c0351b08>] (device_attach) from [<c02f9a20>] (pci_bus_add_device+0x44/0x90)
> [    1.329778] [<c02f9a20>] (pci_bus_add_device) from [<c02f9aa8>] (pci_bus_add_devices+0x3c/0x80)
> [    1.338523] [<c02f9aa8>] (pci_bus_add_devices) from [<c0312dfc>] (pci_host_common_probe+0x100/0x314)
> [    1.347703] [<c0312dfc>] (pci_host_common_probe) from [<c0313550>] (tango_pcie_probe+0x138/0x340)
> [    1.356624] [<c0313550>] (tango_pcie_probe) from [<c03532c4>] (platform_drv_probe+0x34/0x6c)
>
Mason May 23, 2017, 5:54 p.m. UTC | #3
On 23/05/2017 19:03, Bjorn Helgaas wrote:
> On Wed, May 17, 2017 at 04:56:08PM +0200, Marc Gonzalez wrote:
>> On 20/04/2017 16:28, Marc Gonzalez wrote:
>>
>>> +static int tango_set_affinity(struct irq_data *data,
>>> +		const struct cpumask *mask, bool force)
>>> +{
>>> +	return -EINVAL;
>>> +}
>>> +
>>> +static struct irq_chip tango_chip = {
>>> +	.irq_ack		= tango_ack,
>>> +	.irq_mask		= tango_mask,
>>> +	.irq_unmask		= tango_unmask,
>>> +	.irq_set_affinity	= tango_set_affinity,
>>> +	.irq_compose_msi_msg	= tango_compose_msi_msg,
>>> +};
>>
>> Hmmm... I'm wondering why .irq_set_affinity is required.
>>
>> static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
>> first calls __irq_can_set_affinity() to check whether
>> desc->irq_data.chip->irq_set_affinity) exists.
>>
>> then calls irq_do_set_affinity(&desc->irq_data, mask, false);
>> which calls chip->irq_set_affinity(data, mask, force);
>> = msi_domain_set_affinity()
>> which calls parent->chip->irq_set_affinity() unconditionally.
>>
>> Would it make sense to test that the callback is implemented
>> before calling it?
>>
>>
>> [    0.723895] Unable to handle kernel NULL pointer dereference at virtual address 00000000
> 
> I'm not sure what you're asking.
> 
> Is this a bug report for the v4 tango driver?

No.

> Or are you asking whether msi_domain_set_affinity() should be changed
> to check whether parent->chip->irq_set_affinity is implemented?

Yes. The way things are implemented now, drivers are forced
to define an irq_set_affinity callback, even if it just returns
an error, otherwise, the kernel crashes, because of the
unconditional function pointer deref. 

> msi_domain_set_affinity() has called parent->chip->irq_set_affinity()
> without checking since it was added in 2014 by f3cf8bb0d6c3 ("genirq: Add
> generic msi irq domain support"), so if there's a problem here, it's most
> likely in the tango code.

The issue is having to define an "empty" function.
(Unnecessary code bloat and maintenance.)

I'll send a patch illustrating exactly what I intended.

Regards.
Robin Murphy May 23, 2017, 6:03 p.m. UTC | #4
On 23/05/17 18:54, Mason wrote:
> On 23/05/2017 19:03, Bjorn Helgaas wrote:
>> On Wed, May 17, 2017 at 04:56:08PM +0200, Marc Gonzalez wrote:
>>> On 20/04/2017 16:28, Marc Gonzalez wrote:
>>>
>>>> +static int tango_set_affinity(struct irq_data *data,
>>>> +		const struct cpumask *mask, bool force)
>>>> +{
>>>> +	return -EINVAL;
>>>> +}
>>>> +
>>>> +static struct irq_chip tango_chip = {
>>>> +	.irq_ack		= tango_ack,
>>>> +	.irq_mask		= tango_mask,
>>>> +	.irq_unmask		= tango_unmask,
>>>> +	.irq_set_affinity	= tango_set_affinity,
>>>> +	.irq_compose_msi_msg	= tango_compose_msi_msg,
>>>> +};
>>>
>>> Hmmm... I'm wondering why .irq_set_affinity is required.
>>>
>>> static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
>>> first calls __irq_can_set_affinity() to check whether
>>> desc->irq_data.chip->irq_set_affinity) exists.
>>>
>>> then calls irq_do_set_affinity(&desc->irq_data, mask, false);
>>> which calls chip->irq_set_affinity(data, mask, force);
>>> = msi_domain_set_affinity()
>>> which calls parent->chip->irq_set_affinity() unconditionally.
>>>
>>> Would it make sense to test that the callback is implemented
>>> before calling it?
>>>
>>>
>>> [    0.723895] Unable to handle kernel NULL pointer dereference at virtual address 00000000
>>
>> I'm not sure what you're asking.
>>
>> Is this a bug report for the v4 tango driver?
> 
> No.
> 
>> Or are you asking whether msi_domain_set_affinity() should be changed
>> to check whether parent->chip->irq_set_affinity is implemented?
> 
> Yes. The way things are implemented now, drivers are forced
> to define an irq_set_affinity callback, even if it just returns
> an error, otherwise, the kernel crashes, because of the
> unconditional function pointer deref. 
> 
>> msi_domain_set_affinity() has called parent->chip->irq_set_affinity()
>> without checking since it was added in 2014 by f3cf8bb0d6c3 ("genirq: Add
>> generic msi irq domain support"), so if there's a problem here, it's most
>> likely in the tango code.
> 
> The issue is having to define an "empty" function.
> (Unnecessary code bloat and maintenance.)

AFAICS, only one driver (other than this one) implements a "do nothing"
set_affinity callback - everyone else who doesn't do anything hardware
specific just passes it along via irq_chip_set_affinity_parent().

Robin.

> 
> I'll send a patch illustrating exactly what I intended.
> 
> Regards.
>
Mason May 23, 2017, 7:15 p.m. UTC | #5
On 23/05/2017 20:03, Robin Murphy wrote:
> On 23/05/17 18:54, Mason wrote:
>> On 23/05/2017 19:03, Bjorn Helgaas wrote:
>>> On Wed, May 17, 2017 at 04:56:08PM +0200, Marc Gonzalez wrote:
>>>> On 20/04/2017 16:28, Marc Gonzalez wrote:
>>>>
>>>>> +static int tango_set_affinity(struct irq_data *data,
>>>>> +		const struct cpumask *mask, bool force)
>>>>> +{
>>>>> +	return -EINVAL;
>>>>> +}
>>>>> +
>>>>> +static struct irq_chip tango_chip = {
>>>>> +	.irq_ack		= tango_ack,
>>>>> +	.irq_mask		= tango_mask,
>>>>> +	.irq_unmask		= tango_unmask,
>>>>> +	.irq_set_affinity	= tango_set_affinity,
>>>>> +	.irq_compose_msi_msg	= tango_compose_msi_msg,
>>>>> +};
>>>>
>>>> Hmmm... I'm wondering why .irq_set_affinity is required.
>>>>
>>>> static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
>>>> first calls __irq_can_set_affinity() to check whether
>>>> desc->irq_data.chip->irq_set_affinity) exists.
>>>>
>>>> then calls irq_do_set_affinity(&desc->irq_data, mask, false);
>>>> which calls chip->irq_set_affinity(data, mask, force);
>>>> = msi_domain_set_affinity()
>>>> which calls parent->chip->irq_set_affinity() unconditionally.
>>>>
>>>> Would it make sense to test that the callback is implemented
>>>> before calling it?
>>>>
>>>>
>>>> [    0.723895] Unable to handle kernel NULL pointer dereference at virtual address 00000000
>>>
>>> I'm not sure what you're asking.
>>>
>>> Is this a bug report for the v4 tango driver?
>>
>> No.
>>
>>> Or are you asking whether msi_domain_set_affinity() should be changed
>>> to check whether parent->chip->irq_set_affinity is implemented?
>>
>> Yes. The way things are implemented now, drivers are forced
>> to define an irq_set_affinity callback, even if it just returns
>> an error, otherwise, the kernel crashes, because of the
>> unconditional function pointer deref. 
>>
>>> msi_domain_set_affinity() has called parent->chip->irq_set_affinity()
>>> without checking since it was added in 2014 by f3cf8bb0d6c3 ("genirq: Add
>>> generic msi irq domain support"), so if there's a problem here, it's most
>>> likely in the tango code.
>>
>> The issue is having to define an "empty" function.
>> (Unnecessary code bloat and maintenance.)
> 
> AFAICS, only one driver (other than this one) implements a "do nothing"
> set_affinity callback - everyone else who doesn't do anything hardware
> specific just passes it along via irq_chip_set_affinity_parent().

I counted 4. Where did I mess up?

advk_msi_set_affinity
altera_msi_set_affinity
nwl_msi_set_affinity
vmd_irq_set_affinity
tango_set_affinity

Regards.
Robin Murphy May 24, 2017, 10 a.m. UTC | #6
On 23/05/17 20:15, Mason wrote:
> On 23/05/2017 20:03, Robin Murphy wrote:
>> On 23/05/17 18:54, Mason wrote:
>>> On 23/05/2017 19:03, Bjorn Helgaas wrote:
>>>> On Wed, May 17, 2017 at 04:56:08PM +0200, Marc Gonzalez wrote:
>>>>> On 20/04/2017 16:28, Marc Gonzalez wrote:
>>>>>
>>>>>> +static int tango_set_affinity(struct irq_data *data,
>>>>>> +		const struct cpumask *mask, bool force)
>>>>>> +{
>>>>>> +	return -EINVAL;
>>>>>> +}
>>>>>> +
>>>>>> +static struct irq_chip tango_chip = {
>>>>>> +	.irq_ack		= tango_ack,
>>>>>> +	.irq_mask		= tango_mask,
>>>>>> +	.irq_unmask		= tango_unmask,
>>>>>> +	.irq_set_affinity	= tango_set_affinity,
>>>>>> +	.irq_compose_msi_msg	= tango_compose_msi_msg,
>>>>>> +};
>>>>>
>>>>> Hmmm... I'm wondering why .irq_set_affinity is required.
>>>>>
>>>>> static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
>>>>> first calls __irq_can_set_affinity() to check whether
>>>>> desc->irq_data.chip->irq_set_affinity) exists.
>>>>>
>>>>> then calls irq_do_set_affinity(&desc->irq_data, mask, false);
>>>>> which calls chip->irq_set_affinity(data, mask, force);
>>>>> = msi_domain_set_affinity()
>>>>> which calls parent->chip->irq_set_affinity() unconditionally.
>>>>>
>>>>> Would it make sense to test that the callback is implemented
>>>>> before calling it?
>>>>>
>>>>>
>>>>> [    0.723895] Unable to handle kernel NULL pointer dereference at virtual address 00000000
>>>>
>>>> I'm not sure what you're asking.
>>>>
>>>> Is this a bug report for the v4 tango driver?
>>>
>>> No.
>>>
>>>> Or are you asking whether msi_domain_set_affinity() should be changed
>>>> to check whether parent->chip->irq_set_affinity is implemented?
>>>
>>> Yes. The way things are implemented now, drivers are forced
>>> to define an irq_set_affinity callback, even if it just returns
>>> an error, otherwise, the kernel crashes, because of the
>>> unconditional function pointer deref. 
>>>
>>>> msi_domain_set_affinity() has called parent->chip->irq_set_affinity()
>>>> without checking since it was added in 2014 by f3cf8bb0d6c3 ("genirq: Add
>>>> generic msi irq domain support"), so if there's a problem here, it's most
>>>> likely in the tango code.
>>>
>>> The issue is having to define an "empty" function.
>>> (Unnecessary code bloat and maintenance.)
>>
>> AFAICS, only one driver (other than this one) implements a "do nothing"
>> set_affinity callback - everyone else who doesn't do anything hardware
>> specific just passes it along via irq_chip_set_affinity_parent().
> 
> I counted 4. Where did I mess up?
> 
> advk_msi_set_affinity
> altera_msi_set_affinity
> nwl_msi_set_affinity
> vmd_irq_set_affinity
> tango_set_affinity

Fair point - I went through drivers/irqchip and the various
arch-specific instances and found ls_scfg_msi_set_affinity(), but
somehow skipped over drivers/pci. Anyway, I think the question stands of
why are these handful of drivers *not* using irq_chip_set_affinity_parent()?

As an outsider, it naively seems logical that the affinity of an MSI
which just gets translated to a wired interrupt should propagate through
to the affinity of that wired interrupt, but maybe there are reasons not
to; I really don't know.

Robin.
Marc Zyngier May 24, 2017, 10:22 a.m. UTC | #7
On 24/05/17 11:00, Robin Murphy wrote:
> On 23/05/17 20:15, Mason wrote:
>> On 23/05/2017 20:03, Robin Murphy wrote:
>>> On 23/05/17 18:54, Mason wrote:
>>>> On 23/05/2017 19:03, Bjorn Helgaas wrote:
>>>>> On Wed, May 17, 2017 at 04:56:08PM +0200, Marc Gonzalez wrote:
>>>>>> On 20/04/2017 16:28, Marc Gonzalez wrote:
>>>>>>
>>>>>>> +static int tango_set_affinity(struct irq_data *data,
>>>>>>> +		const struct cpumask *mask, bool force)
>>>>>>> +{
>>>>>>> +	return -EINVAL;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static struct irq_chip tango_chip = {
>>>>>>> +	.irq_ack		= tango_ack,
>>>>>>> +	.irq_mask		= tango_mask,
>>>>>>> +	.irq_unmask		= tango_unmask,
>>>>>>> +	.irq_set_affinity	= tango_set_affinity,
>>>>>>> +	.irq_compose_msi_msg	= tango_compose_msi_msg,
>>>>>>> +};
>>>>>>
>>>>>> Hmmm... I'm wondering why .irq_set_affinity is required.
>>>>>>
>>>>>> static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
>>>>>> first calls __irq_can_set_affinity() to check whether
>>>>>> desc->irq_data.chip->irq_set_affinity) exists.
>>>>>>
>>>>>> then calls irq_do_set_affinity(&desc->irq_data, mask, false);
>>>>>> which calls chip->irq_set_affinity(data, mask, force);
>>>>>> = msi_domain_set_affinity()
>>>>>> which calls parent->chip->irq_set_affinity() unconditionally.
>>>>>>
>>>>>> Would it make sense to test that the callback is implemented
>>>>>> before calling it?
>>>>>>
>>>>>>
>>>>>> [    0.723895] Unable to handle kernel NULL pointer dereference at virtual address 00000000
>>>>>
>>>>> I'm not sure what you're asking.
>>>>>
>>>>> Is this a bug report for the v4 tango driver?
>>>>
>>>> No.
>>>>
>>>>> Or are you asking whether msi_domain_set_affinity() should be changed
>>>>> to check whether parent->chip->irq_set_affinity is implemented?
>>>>
>>>> Yes. The way things are implemented now, drivers are forced
>>>> to define an irq_set_affinity callback, even if it just returns
>>>> an error, otherwise, the kernel crashes, because of the
>>>> unconditional function pointer deref. 
>>>>
>>>>> msi_domain_set_affinity() has called parent->chip->irq_set_affinity()
>>>>> without checking since it was added in 2014 by f3cf8bb0d6c3 ("genirq: Add
>>>>> generic msi irq domain support"), so if there's a problem here, it's most
>>>>> likely in the tango code.
>>>>
>>>> The issue is having to define an "empty" function.
>>>> (Unnecessary code bloat and maintenance.)
>>>
>>> AFAICS, only one driver (other than this one) implements a "do nothing"
>>> set_affinity callback - everyone else who doesn't do anything hardware
>>> specific just passes it along via irq_chip_set_affinity_parent().
>>
>> I counted 4. Where did I mess up?
>>
>> advk_msi_set_affinity
>> altera_msi_set_affinity
>> nwl_msi_set_affinity
>> vmd_irq_set_affinity
>> tango_set_affinity
> 
> Fair point - I went through drivers/irqchip and the various
> arch-specific instances and found ls_scfg_msi_set_affinity(), but
> somehow skipped over drivers/pci. Anyway, I think the question stands of
> why are these handful of drivers *not* using irq_chip_set_affinity_parent()?

Probably because they don't have a parent, in the hierarchical sense.
All they have is a chained interrupt (*puke*). Which implies that
changing the affinity of one MSI would affect all of them, completely
confusing unsuspecting userspace such as irqbalance.

> As an outsider, it naively seems logical that the affinity of an MSI
> which just gets translated to a wired interrupt should propagate through
> to the affinity of that wired interrupt, but maybe there are reasons not
> to; I really don't know.

See above. The main issue is that HW folks haven't understood the actual
use of MSIs, and persist in implementing them as an afterthought, based
on some cascading interrupt controller design.

Sad. So sad.

	M.
Marc Zyngier May 25, 2017, 8:37 a.m. UTC | #8
On 20/04/17 15:28, Marc Gonzalez wrote:
> The MSI controller in Tango supports 256 message-signaled interrupts,
> and a single doorbell address.
> 
> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
> ---
>  drivers/pci/host/pcie-tango.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 232 insertions(+)
> 
> diff --git a/drivers/pci/host/pcie-tango.c b/drivers/pci/host/pcie-tango.c
> new file mode 100644
> index 000000000000..ada8d35066ab
> --- /dev/null
> +++ b/drivers/pci/host/pcie-tango.c

As Bjorn mentioned elsewhere, the ordering of the patch is backward.
Please have the PCIe host controller driver in the first patch, then the
MSI stuff. Otherwise, it is impossible to see how the various bits are
wired.

And please have a separate patch the the DT binding, which needs
separate Acks from the DT folks.

> @@ -0,0 +1,232 @@
> +#include <linux/irqchip/chained_irq.h>
> +#include <linux/irqdomain.h>
> +#include <linux/pci-ecam.h>
> +#include <linux/delay.h>
> +#include <linux/msi.h>
> +
> +#define MSI_MAX 256
> +
> +struct tango_pcie {
> +	DECLARE_BITMAP(bitmap, MSI_MAX);
> +	spinlock_t lock;
> +	void __iomem *mux;
> +	void __iomem *msi_status;
> +	void __iomem *msi_enable;
> +	phys_addr_t msi_doorbell;

Init of these three fields should be in this patch.

> +	struct irq_domain *irq_dom;
> +	struct irq_domain *msi_dom;
> +	int irq;
> +};
> +
> +/*** MSI CONTROLLER SUPPORT ***/
> +
> +static void dispatch(struct tango_pcie *pcie, unsigned long status, int base)
> +{
> +	unsigned int pos, virq;
> +
> +	for_each_set_bit(pos, &status, 32) {
> +		virq = irq_find_mapping(pcie->irq_dom, base + pos);
> +		generic_handle_irq(virq);
> +	}
> +}
> +
> +static void tango_msi_isr(struct irq_desc *desc)
> +{
> +	struct irq_chip *chip = irq_desc_get_chip(desc);
> +	struct tango_pcie *pcie = irq_desc_get_handler_data(desc);
> +	unsigned int status, base, pos = 0;
> +
> +	chained_irq_enter(chip, desc);
> +
> +	while ((pos = find_next_bit(pcie->bitmap, MSI_MAX, pos)) < MSI_MAX) {
> +		base = round_down(pos, 32);
> +		status = readl_relaxed(pcie->msi_status + base / 8);
> +		dispatch(pcie, status, base);

Just inline dispatch here.

> +		pos = base + 32;
> +	}
> +
> +	chained_irq_exit(chip, desc);
> +}
> +
> +static void tango_ack(struct irq_data *data)
> +{
> +	u32 bit = BIT(data->hwirq);

How does this work when hwirq is >= 32 (from what I gather, it can range
from 0 to 255...).

> +	struct tango_pcie *pcie = irq_data_get_irq_chip_data(data);
> +
> +	writel_relaxed(bit, pcie->msi_status);
> +}
> +
> +static void update_msi_enable(struct irq_data *data, bool unmask)
> +{
> +	unsigned long flags;
> +	u32 val, bit = BIT(data->hwirq % 32);
> +	int byte_offset = (data->hwirq / 32) * 4;
> +	struct tango_pcie *pcie = data->chip_data;
> +
> +	spin_lock_irqsave(&pcie->lock, flags);
> +	val = readl_relaxed(pcie->msi_enable + byte_offset);

As I already mentioned in one of the previous series, this is a fairly
pointless MMIO access. You could maintain a SW version of the enable
register, update that copy and write it.

But that's only a performance optimization, and it won't affect the
behaviour. Your call.

> +	val = unmask ? val | bit : val & ~bit;
> +	writel_relaxed(val, pcie->msi_enable + byte_offset);
> +	spin_unlock_irqrestore(&pcie->lock, flags);
> +}
> +
> +static void tango_mask(struct irq_data *data)
> +{
> +	update_msi_enable(data, false);
> +}
> +
> +static void tango_unmask(struct irq_data *data)
> +{
> +	update_msi_enable(data, true);
> +}
> +
> +static int tango_set_affinity(struct irq_data *data,
> +		const struct cpumask *mask, bool force)
> +{
> +	return -EINVAL;
> +}
> +
> +static void tango_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> +{
> +	struct tango_pcie *pcie = irq_data_get_irq_chip_data(data);
> +
> +	msg->address_lo = lower_32_bits(pcie->msi_doorbell);
> +	msg->address_hi = upper_32_bits(pcie->msi_doorbell);
> +	msg->data = data->hwirq;
> +}
> +
> +static struct irq_chip tango_chip = {
> +	.irq_ack		= tango_ack,
> +	.irq_mask		= tango_mask,
> +	.irq_unmask		= tango_unmask,
> +	.irq_set_affinity	= tango_set_affinity,
> +	.irq_compose_msi_msg	= tango_compose_msi_msg,
> +};
> +
> +static void msi_ack(struct irq_data *data)
> +{
> +	irq_chip_ack_parent(data);
> +}
> +
> +static void msi_mask(struct irq_data *data)
> +{
> +	pci_msi_mask_irq(data);
> +	irq_chip_mask_parent(data);
> +}
> +
> +static void msi_unmask(struct irq_data *data)
> +{
> +	pci_msi_unmask_irq(data);
> +	irq_chip_unmask_parent(data);
> +}
> +
> +static struct irq_chip msi_chip = {
> +	.name = "MSI",
> +	.irq_ack = msi_ack,
> +	.irq_mask = msi_mask,
> +	.irq_unmask = msi_unmask,
> +};
> +
> +static struct msi_domain_info msi_dom_info = {
> +	.flags	= MSI_FLAG_PCI_MSIX
> +		| MSI_FLAG_USE_DEF_DOM_OPS
> +		| MSI_FLAG_USE_DEF_CHIP_OPS,
> +	.chip	= &msi_chip,
> +};
> +
> +static int find_free_msi(struct irq_domain *dom, unsigned int virq)
> +{
> +	unsigned int pos;
> +	struct tango_pcie *pcie = dom->host_data;
> +
> +	pos = find_first_zero_bit(pcie->bitmap, MSI_MAX);
> +	if (pos >= MSI_MAX)
> +		return -ENOSPC;
> +	__set_bit(pos, pcie->bitmap);
> +
> +	irq_domain_set_info(dom, virq, pos, &tango_chip, pcie,
> +			handle_edge_irq, NULL, NULL);
> +
> +	return 0;
> +}
> +
> +static int tango_irq_domain_alloc(struct irq_domain *dom,
> +		unsigned int virq, unsigned int nr_irqs, void *args)
> +{
> +	int err;
> +	unsigned long flags;
> +	struct tango_pcie *pcie = dom->host_data;
> +
> +	spin_lock_irqsave(&pcie->lock, flags);
> +	err = find_free_msi(dom, virq);

Just inline find_free_msi here. it is not called from anywhere else, and
seeing the locking close to the use of the bitmap is definitely better.

> +	spin_unlock_irqrestore(&pcie->lock, flags);
> +
> +	return err;
> +}
> +
> +static void tango_irq_domain_free(struct irq_domain *dom,
> +		unsigned int virq, unsigned int nr_irqs)
> +{
> +	unsigned long flags;
> +	struct irq_data *data = irq_domain_get_irq_data(dom, virq);
> +	struct tango_pcie *pcie = irq_data_get_irq_chip_data(data);
> +
> +	spin_lock_irqsave(&pcie->lock, flags);
> +	__clear_bit(data->hwirq, pcie->bitmap);
> +	spin_unlock_irqrestore(&pcie->lock, flags);
> +}
> +
> +static const struct irq_domain_ops irq_dom_ops = {
> +	.alloc	= tango_irq_domain_alloc,
> +	.free	= tango_irq_domain_free,
> +};
> +
> +static int tango_msi_remove(struct platform_device *pdev)
> +{
> +	struct tango_pcie *pcie = platform_get_drvdata(pdev);
> +
> +	irq_set_chained_handler_and_data(pcie->irq, NULL, NULL);
> +	irq_domain_remove(pcie->msi_dom);
> +	irq_domain_remove(pcie->irq_dom);
> +
> +	return 0;
> +}
> +
> +static int tango_msi_probe(struct platform_device *pdev, struct tango_pcie *pcie)
> +{
> +	int i, virq;
> +	struct irq_domain *msi_dom, *irq_dom;
> +	struct fwnode_handle *fwnode = of_node_to_fwnode(pdev->dev.of_node);
> +
> +	spin_lock_init(&pcie->lock);
> +	for (i = 0; i < MSI_MAX / 32; ++i)
> +		writel_relaxed(0, pcie->msi_enable + i * 4);
> +
> +	irq_dom = irq_domain_create_linear(fwnode, MSI_MAX, &irq_dom_ops, pcie);
> +	if (!irq_dom) {
> +		pr_err("Failed to create IRQ domain\n");
> +		return -ENOMEM;
> +	}
> +
> +	msi_dom = pci_msi_create_irq_domain(fwnode, &msi_dom_info, irq_dom);
> +	if (!msi_dom) {
> +		pr_err("Failed to create MSI domain\n");
> +		irq_domain_remove(irq_dom);
> +		return -ENOMEM;
> +	}
> +
> +	virq = platform_get_irq(pdev, 1);
> +	if (virq <= 0) {
> +		pr_err("Failed to map IRQ\n");
> +		irq_domain_remove(msi_dom);
> +		irq_domain_remove(irq_dom);
> +		return -ENXIO;
> +	}

Maybe start the probe with this. No need to start allocating data
structures if the DT is wrong.

> +
> +	pcie->irq_dom = irq_dom;
> +	pcie->msi_dom = msi_dom;
> +	pcie->irq = virq;
> +	irq_set_chained_handler_and_data(virq, tango_msi_isr, pcie);
> +
> +	return 0;
> +}
> 

Thanks,

	M.
Marc Gonzalez May 31, 2017, 7:32 a.m. UTC | #9
On 25/05/2017 10:37, Marc Zyngier wrote:

> On 20/04/17 15:28, Marc Gonzalez wrote:
>
>> The MSI controller in Tango supports 256 message-signaled interrupts,
>> and a single doorbell address.
>>
>> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
>> ---
>>  drivers/pci/host/pcie-tango.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 232 insertions(+)
>>
>> diff --git a/drivers/pci/host/pcie-tango.c b/drivers/pci/host/pcie-tango.c
>> new file mode 100644
>> index 000000000000..ada8d35066ab
>> --- /dev/null
>> +++ b/drivers/pci/host/pcie-tango.c
> 
> As Bjorn mentioned elsewhere, the ordering of the patch is backward.
> Please have the PCIe host controller driver in the first patch, then the
> MSI stuff. Otherwise, it is impossible to see how the various bits are
> wired.

Done.

> And please have a separate patch the the DT binding, which needs
> separate Acks from the DT folks.

Done.

>> @@ -0,0 +1,232 @@
>> +#include <linux/irqchip/chained_irq.h>
>> +#include <linux/irqdomain.h>
>> +#include <linux/pci-ecam.h>
>> +#include <linux/delay.h>
>> +#include <linux/msi.h>
>> +
>> +#define MSI_MAX 256
>> +
>> +struct tango_pcie {
>> +	DECLARE_BITMAP(bitmap, MSI_MAX);
>> +	spinlock_t lock;
>> +	void __iomem *mux;
>> +	void __iomem *msi_status;
>> +	void __iomem *msi_enable;
>> +	phys_addr_t msi_doorbell;
> 
> Init of these three fields should be in this patch.

Done.

>> +	struct irq_domain *irq_dom;
>> +	struct irq_domain *msi_dom;
>> +	int irq;
>> +};
>> +
>> +/*** MSI CONTROLLER SUPPORT ***/
>> +
>> +static void dispatch(struct tango_pcie *pcie, unsigned long status, int base)
>> +{
>> +	unsigned int pos, virq;
>> +
>> +	for_each_set_bit(pos, &status, 32) {
>> +		virq = irq_find_mapping(pcie->irq_dom, base + pos);
>> +		generic_handle_irq(virq);
>> +	}
>> +}
>> +
>> +static void tango_msi_isr(struct irq_desc *desc)
>> +{
>> +	struct irq_chip *chip = irq_desc_get_chip(desc);
>> +	struct tango_pcie *pcie = irq_desc_get_handler_data(desc);
>> +	unsigned int status, base, pos = 0;
>> +
>> +	chained_irq_enter(chip, desc);
>> +
>> +	while ((pos = find_next_bit(pcie->bitmap, MSI_MAX, pos)) < MSI_MAX) {
>> +		base = round_down(pos, 32);
>> +		status = readl_relaxed(pcie->msi_status + base / 8);
>> +		dispatch(pcie, status, base);
> 
> Just inline dispatch here.

Done.

>> +		pos = base + 32;
>> +	}
>> +
>> +	chained_irq_exit(chip, desc);
>> +}
>> +
>> +static void tango_ack(struct irq_data *data)
>> +{
>> +	u32 bit = BIT(data->hwirq);
> 
> How does this work when hwirq is >= 32 (from what I gather, it can range
> from 0 to 255...).

It blows up. Fixed now.

>> +	struct tango_pcie *pcie = irq_data_get_irq_chip_data(data);
>> +
>> +	writel_relaxed(bit, pcie->msi_status);
>> +}
>> +
>> +static void update_msi_enable(struct irq_data *data, bool unmask)
>> +{
>> +	unsigned long flags;
>> +	u32 val, bit = BIT(data->hwirq % 32);
>> +	int byte_offset = (data->hwirq / 32) * 4;
>> +	struct tango_pcie *pcie = data->chip_data;
>> +
>> +	spin_lock_irqsave(&pcie->lock, flags);
>> +	val = readl_relaxed(pcie->msi_enable + byte_offset);
> 
> As I already mentioned in one of the previous series, this is a fairly
> pointless MMIO access. You could maintain a SW version of the enable
> register, update that copy and write it.
> 
> But that's only a performance optimization, and it won't affect the
> behaviour. Your call.

Thanks for pointing it out.
I'll implement the optimization in a future patch.

>> +	val = unmask ? val | bit : val & ~bit;
>> +	writel_relaxed(val, pcie->msi_enable + byte_offset);
>> +	spin_unlock_irqrestore(&pcie->lock, flags);
>> +}
>> +
>> +static void tango_mask(struct irq_data *data)
>> +{
>> +	update_msi_enable(data, false);
>> +}
>> +
>> +static void tango_unmask(struct irq_data *data)
>> +{
>> +	update_msi_enable(data, true);
>> +}
>> +
>> +static int tango_set_affinity(struct irq_data *data,
>> +		const struct cpumask *mask, bool force)
>> +{
>> +	return -EINVAL;
>> +}
>> +
>> +static void tango_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>> +{
>> +	struct tango_pcie *pcie = irq_data_get_irq_chip_data(data);
>> +
>> +	msg->address_lo = lower_32_bits(pcie->msi_doorbell);
>> +	msg->address_hi = upper_32_bits(pcie->msi_doorbell);
>> +	msg->data = data->hwirq;
>> +}
>> +
>> +static struct irq_chip tango_chip = {
>> +	.irq_ack		= tango_ack,
>> +	.irq_mask		= tango_mask,
>> +	.irq_unmask		= tango_unmask,
>> +	.irq_set_affinity	= tango_set_affinity,
>> +	.irq_compose_msi_msg	= tango_compose_msi_msg,
>> +};
>> +
>> +static void msi_ack(struct irq_data *data)
>> +{
>> +	irq_chip_ack_parent(data);
>> +}
>> +
>> +static void msi_mask(struct irq_data *data)
>> +{
>> +	pci_msi_mask_irq(data);
>> +	irq_chip_mask_parent(data);
>> +}
>> +
>> +static void msi_unmask(struct irq_data *data)
>> +{
>> +	pci_msi_unmask_irq(data);
>> +	irq_chip_unmask_parent(data);
>> +}
>> +
>> +static struct irq_chip msi_chip = {
>> +	.name = "MSI",
>> +	.irq_ack = msi_ack,
>> +	.irq_mask = msi_mask,
>> +	.irq_unmask = msi_unmask,
>> +};
>> +
>> +static struct msi_domain_info msi_dom_info = {
>> +	.flags	= MSI_FLAG_PCI_MSIX
>> +		| MSI_FLAG_USE_DEF_DOM_OPS
>> +		| MSI_FLAG_USE_DEF_CHIP_OPS,
>> +	.chip	= &msi_chip,
>> +};
>> +
>> +static int find_free_msi(struct irq_domain *dom, unsigned int virq)
>> +{
>> +	unsigned int pos;
>> +	struct tango_pcie *pcie = dom->host_data;
>> +
>> +	pos = find_first_zero_bit(pcie->bitmap, MSI_MAX);
>> +	if (pos >= MSI_MAX)
>> +		return -ENOSPC;
>> +	__set_bit(pos, pcie->bitmap);
>> +
>> +	irq_domain_set_info(dom, virq, pos, &tango_chip, pcie,
>> +			handle_edge_irq, NULL, NULL);
>> +
>> +	return 0;
>> +}
>> +
>> +static int tango_irq_domain_alloc(struct irq_domain *dom,
>> +		unsigned int virq, unsigned int nr_irqs, void *args)
>> +{
>> +	int err;
>> +	unsigned long flags;
>> +	struct tango_pcie *pcie = dom->host_data;
>> +
>> +	spin_lock_irqsave(&pcie->lock, flags);
>> +	err = find_free_msi(dom, virq);
> 
> Just inline find_free_msi here. it is not called from anywhere else, and
> seeing the locking close to the use of the bitmap is definitely better.

Done.

>> +	spin_unlock_irqrestore(&pcie->lock, flags);
>> +
>> +	return err;
>> +}
>> +
>> +static void tango_irq_domain_free(struct irq_domain *dom,
>> +		unsigned int virq, unsigned int nr_irqs)
>> +{
>> +	unsigned long flags;
>> +	struct irq_data *data = irq_domain_get_irq_data(dom, virq);
>> +	struct tango_pcie *pcie = irq_data_get_irq_chip_data(data);
>> +
>> +	spin_lock_irqsave(&pcie->lock, flags);
>> +	__clear_bit(data->hwirq, pcie->bitmap);
>> +	spin_unlock_irqrestore(&pcie->lock, flags);
>> +}
>> +
>> +static const struct irq_domain_ops irq_dom_ops = {
>> +	.alloc	= tango_irq_domain_alloc,
>> +	.free	= tango_irq_domain_free,
>> +};
>> +
>> +static int tango_msi_remove(struct platform_device *pdev)
>> +{
>> +	struct tango_pcie *pcie = platform_get_drvdata(pdev);
>> +
>> +	irq_set_chained_handler_and_data(pcie->irq, NULL, NULL);
>> +	irq_domain_remove(pcie->msi_dom);
>> +	irq_domain_remove(pcie->irq_dom);
>> +
>> +	return 0;
>> +}
>> +
>> +static int tango_msi_probe(struct platform_device *pdev, struct tango_pcie *pcie)
>> +{
>> +	int i, virq;
>> +	struct irq_domain *msi_dom, *irq_dom;
>> +	struct fwnode_handle *fwnode = of_node_to_fwnode(pdev->dev.of_node);
>> +
>> +	spin_lock_init(&pcie->lock);
>> +	for (i = 0; i < MSI_MAX / 32; ++i)
>> +		writel_relaxed(0, pcie->msi_enable + i * 4);
>> +
>> +	irq_dom = irq_domain_create_linear(fwnode, MSI_MAX, &irq_dom_ops, pcie);
>> +	if (!irq_dom) {
>> +		pr_err("Failed to create IRQ domain\n");
>> +		return -ENOMEM;
>> +	}
>> +
>> +	msi_dom = pci_msi_create_irq_domain(fwnode, &msi_dom_info, irq_dom);
>> +	if (!msi_dom) {
>> +		pr_err("Failed to create MSI domain\n");
>> +		irq_domain_remove(irq_dom);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	virq = platform_get_irq(pdev, 1);
>> +	if (virq <= 0) {
>> +		pr_err("Failed to map IRQ\n");
>> +		irq_domain_remove(msi_dom);
>> +		irq_domain_remove(irq_dom);
>> +		return -ENXIO;
>> +	}
> 
> Maybe start the probe with this. No need to start allocating data
> structures if the DT is wrong.

Done.
diff mbox

Patch

diff --git a/drivers/pci/host/pcie-tango.c b/drivers/pci/host/pcie-tango.c
new file mode 100644
index 000000000000..ada8d35066ab
--- /dev/null
+++ b/drivers/pci/host/pcie-tango.c
@@ -0,0 +1,232 @@ 
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/pci-ecam.h>
+#include <linux/delay.h>
+#include <linux/msi.h>
+
+#define MSI_MAX 256
+
+struct tango_pcie {
+	DECLARE_BITMAP(bitmap, MSI_MAX);
+	spinlock_t lock;
+	void __iomem *mux;
+	void __iomem *msi_status;
+	void __iomem *msi_enable;
+	phys_addr_t msi_doorbell;
+	struct irq_domain *irq_dom;
+	struct irq_domain *msi_dom;
+	int irq;
+};
+
+/*** MSI CONTROLLER SUPPORT ***/
+
+static void dispatch(struct tango_pcie *pcie, unsigned long status, int base)
+{
+	unsigned int pos, virq;
+
+	for_each_set_bit(pos, &status, 32) {
+		virq = irq_find_mapping(pcie->irq_dom, base + pos);
+		generic_handle_irq(virq);
+	}
+}
+
+static void tango_msi_isr(struct irq_desc *desc)
+{
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	struct tango_pcie *pcie = irq_desc_get_handler_data(desc);
+	unsigned int status, base, pos = 0;
+
+	chained_irq_enter(chip, desc);
+
+	while ((pos = find_next_bit(pcie->bitmap, MSI_MAX, pos)) < MSI_MAX) {
+		base = round_down(pos, 32);
+		status = readl_relaxed(pcie->msi_status + base / 8);
+		dispatch(pcie, status, base);
+		pos = base + 32;
+	}
+
+	chained_irq_exit(chip, desc);
+}
+
+static void tango_ack(struct irq_data *data)
+{
+	u32 bit = BIT(data->hwirq);
+	struct tango_pcie *pcie = irq_data_get_irq_chip_data(data);
+
+	writel_relaxed(bit, pcie->msi_status);
+}
+
+static void update_msi_enable(struct irq_data *data, bool unmask)
+{
+	unsigned long flags;
+	u32 val, bit = BIT(data->hwirq % 32);
+	int byte_offset = (data->hwirq / 32) * 4;
+	struct tango_pcie *pcie = data->chip_data;
+
+	spin_lock_irqsave(&pcie->lock, flags);
+	val = readl_relaxed(pcie->msi_enable + byte_offset);
+	val = unmask ? val | bit : val & ~bit;
+	writel_relaxed(val, pcie->msi_enable + byte_offset);
+	spin_unlock_irqrestore(&pcie->lock, flags);
+}
+
+static void tango_mask(struct irq_data *data)
+{
+	update_msi_enable(data, false);
+}
+
+static void tango_unmask(struct irq_data *data)
+{
+	update_msi_enable(data, true);
+}
+
+static int tango_set_affinity(struct irq_data *data,
+		const struct cpumask *mask, bool force)
+{
+	return -EINVAL;
+}
+
+static void tango_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	struct tango_pcie *pcie = irq_data_get_irq_chip_data(data);
+
+	msg->address_lo = lower_32_bits(pcie->msi_doorbell);
+	msg->address_hi = upper_32_bits(pcie->msi_doorbell);
+	msg->data = data->hwirq;
+}
+
+static struct irq_chip tango_chip = {
+	.irq_ack		= tango_ack,
+	.irq_mask		= tango_mask,
+	.irq_unmask		= tango_unmask,
+	.irq_set_affinity	= tango_set_affinity,
+	.irq_compose_msi_msg	= tango_compose_msi_msg,
+};
+
+static void msi_ack(struct irq_data *data)
+{
+	irq_chip_ack_parent(data);
+}
+
+static void msi_mask(struct irq_data *data)
+{
+	pci_msi_mask_irq(data);
+	irq_chip_mask_parent(data);
+}
+
+static void msi_unmask(struct irq_data *data)
+{
+	pci_msi_unmask_irq(data);
+	irq_chip_unmask_parent(data);
+}
+
+static struct irq_chip msi_chip = {
+	.name = "MSI",
+	.irq_ack = msi_ack,
+	.irq_mask = msi_mask,
+	.irq_unmask = msi_unmask,
+};
+
+static struct msi_domain_info msi_dom_info = {
+	.flags	= MSI_FLAG_PCI_MSIX
+		| MSI_FLAG_USE_DEF_DOM_OPS
+		| MSI_FLAG_USE_DEF_CHIP_OPS,
+	.chip	= &msi_chip,
+};
+
+static int find_free_msi(struct irq_domain *dom, unsigned int virq)
+{
+	unsigned int pos;
+	struct tango_pcie *pcie = dom->host_data;
+
+	pos = find_first_zero_bit(pcie->bitmap, MSI_MAX);
+	if (pos >= MSI_MAX)
+		return -ENOSPC;
+	__set_bit(pos, pcie->bitmap);
+
+	irq_domain_set_info(dom, virq, pos, &tango_chip, pcie,
+			handle_edge_irq, NULL, NULL);
+
+	return 0;
+}
+
+static int tango_irq_domain_alloc(struct irq_domain *dom,
+		unsigned int virq, unsigned int nr_irqs, void *args)
+{
+	int err;
+	unsigned long flags;
+	struct tango_pcie *pcie = dom->host_data;
+
+	spin_lock_irqsave(&pcie->lock, flags);
+	err = find_free_msi(dom, virq);
+	spin_unlock_irqrestore(&pcie->lock, flags);
+
+	return err;
+}
+
+static void tango_irq_domain_free(struct irq_domain *dom,
+		unsigned int virq, unsigned int nr_irqs)
+{
+	unsigned long flags;
+	struct irq_data *data = irq_domain_get_irq_data(dom, virq);
+	struct tango_pcie *pcie = irq_data_get_irq_chip_data(data);
+
+	spin_lock_irqsave(&pcie->lock, flags);
+	__clear_bit(data->hwirq, pcie->bitmap);
+	spin_unlock_irqrestore(&pcie->lock, flags);
+}
+
+static const struct irq_domain_ops irq_dom_ops = {
+	.alloc	= tango_irq_domain_alloc,
+	.free	= tango_irq_domain_free,
+};
+
+static int tango_msi_remove(struct platform_device *pdev)
+{
+	struct tango_pcie *pcie = platform_get_drvdata(pdev);
+
+	irq_set_chained_handler_and_data(pcie->irq, NULL, NULL);
+	irq_domain_remove(pcie->msi_dom);
+	irq_domain_remove(pcie->irq_dom);
+
+	return 0;
+}
+
+static int tango_msi_probe(struct platform_device *pdev, struct tango_pcie *pcie)
+{
+	int i, virq;
+	struct irq_domain *msi_dom, *irq_dom;
+	struct fwnode_handle *fwnode = of_node_to_fwnode(pdev->dev.of_node);
+
+	spin_lock_init(&pcie->lock);
+	for (i = 0; i < MSI_MAX / 32; ++i)
+		writel_relaxed(0, pcie->msi_enable + i * 4);
+
+	irq_dom = irq_domain_create_linear(fwnode, MSI_MAX, &irq_dom_ops, pcie);
+	if (!irq_dom) {
+		pr_err("Failed to create IRQ domain\n");
+		return -ENOMEM;
+	}
+
+	msi_dom = pci_msi_create_irq_domain(fwnode, &msi_dom_info, irq_dom);
+	if (!msi_dom) {
+		pr_err("Failed to create MSI domain\n");
+		irq_domain_remove(irq_dom);
+		return -ENOMEM;
+	}
+
+	virq = platform_get_irq(pdev, 1);
+	if (virq <= 0) {
+		pr_err("Failed to map IRQ\n");
+		irq_domain_remove(msi_dom);
+		irq_domain_remove(irq_dom);
+		return -ENXIO;
+	}
+
+	pcie->irq_dom = irq_dom;
+	pcie->msi_dom = msi_dom;
+	pcie->irq = virq;
+	irq_set_chained_handler_and_data(virq, tango_msi_isr, pcie);
+
+	return 0;
+}