diff mbox

[RFC] arm: imx: Workaround i.MX6 PMU interrupts muxed to one SPI

Message ID 1416483757-24165-1-git-send-email-daniel.thompson@linaro.org
State New
Headers show

Commit Message

Daniel Thompson Nov. 20, 2014, 11:42 a.m. UTC
All PMU interrupts on multi-core i.MX6 devices are muxed onto a single SPI.
Should the PMU of any core except 0 (the default affinity for the
interrupt) trigger the interrupt then it cannot be serviced and,
eventually, the spurious irq detection will forcefully disable the
interrupt.

This can be worked around by getting the interrupt handler to change its
own affinity if it cannot figure out why it has been triggered. This patch
adds logic to rotate the affinity to i.MX6.

Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---

Notes:
    This patch adopts the approach used on the u8500 (db8500_pmu_handler)
    but the logic has been generalized for any number of CPUs, mostly
    because the i.MX6 has both dual and quad core variants.
    
    However it might be better to include the generalized logic in the main
    armpmu code. I think the logic could be deployed automatically on SMP
    systems with only a single not-percpu IRQ, replacing the
    plat->handle_irq dance we currently do to hook up this code.
    
    Thoughts? (or is there already shared logic to do this that I
    overlooked)
    

 arch/arm/mach-imx/mach-imx6q.c | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

--
1.9.3

Comments

Arnd Bergmann Nov. 20, 2014, 11:48 a.m. UTC | #1
On Thursday 20 November 2014 11:42:37 Daniel Thompson wrote:
> Notes:
>     This patch adopts the approach used on the u8500 (db8500_pmu_handler)
>     but the logic has been generalized for any number of CPUs, mostly
>     because the i.MX6 has a both dual and quad core variants.
>     
>     However it might be better to include the generalized logic in the main
>     armpmu code. I think the logic could be deployed automatically on SMP
>     systems with only a single not-percpu IRQ, replacing the
>     plat->handle_irq dance we currently do to hook up this code.
>     
>     Thoughts? (or is there already shared logic to do this that I
>     overlooked)
> 

I would definitely prefer to have this handled in the armpmu implementation,
given that there are at least two platforms that need the exact same
hack.

	Arnd
Lucas Stach Nov. 20, 2014, 11:52 a.m. UTC | #2
Am Donnerstag, den 20.11.2014, 11:42 +0000 schrieb Daniel Thompson:
> All PMU interrupts on multi-core i.MX6 devices are muxed onto a single SPI.
> Should the PMU of any core except 0 (the default affinity for the
> interrupt) trigger the interrupt then it cannot be serviced and,
> eventually, the spurious irq detection will forcefully disable the
> interrupt.
> 
> This can be worked around by getting the interrupt handler to change its
> own affinity if it cannot figure out why it has been triggered. This patch
> adds logic to rotate the affinity to i.MX6.
> 
> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>

I've sent almost the same patch a while ago. At this time it was shot
down due to fears of the measurements being too flaky to be useful with
all that IRQ dance. While I don't think this is true (I did some
measurements on SOLO and QUAD variants of the i.MX6 with the same
workload, that were only minimally apart), I believe the IRQ affinity
dance isn't the best way to handle this.

I had planned to look into smp_call_function() to distribute the IRQ to
all CPUs at the same time, but have not got around to it yet. Maybe you
could investigate whether this is a viable solution to the problem at
hand?

Regards,
Lucas
> ---
> 
> Notes:
>     This patch adopts the approach used on the u8500 (db8500_pmu_handler)
>     but the logic has been generalized for any number of CPUs, mostly
>     because the i.MX6 has a both dual and quad core variants.
>     
>     However it might be better to include the generalized logic in the main
>     armpmu code. I think the logic could be deployed automatically on SMP
>     systems with only a single not-percpu IRQ, replacing the
>     plat->handle_irq dance we currently do to hook up this code.
>     
>     Thoughts? (or is there already shared logic to do this that I
>     overlooked)
>     
> 
>  arch/arm/mach-imx/mach-imx6q.c | 39 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 38 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c
> index d51c6e99a2e9..c056b7b97eaa 100644
> --- a/arch/arm/mach-imx/mach-imx6q.c
> +++ b/arch/arm/mach-imx/mach-imx6q.c
> @@ -16,6 +16,7 @@
>  #include <linux/delay.h>
>  #include <linux/export.h>
>  #include <linux/init.h>
> +#include <linux/interrupt.h>
>  #include <linux/io.h>
>  #include <linux/irq.h>
>  #include <linux/irqchip.h>
> @@ -33,6 +34,7 @@
>  #include <linux/mfd/syscon/imx6q-iomuxc-gpr.h>
>  #include <asm/mach/arch.h>
>  #include <asm/mach/map.h>
> +#include <asm/pmu.h>
>  #include <asm/system_misc.h>
> 
>  #include "common.h"
> @@ -261,6 +263,40 @@ static void __init imx6q_axi_init(void)
>  	}
>  }
> 
> +/*
> + * The PMU IRQ lines of all cores are muxed onto a single interrupt.
> + * Rotate the interrupt around the cores if the current CPU cannot
> + * figure out why the interrupt has been triggered.
> + */
> +static irqreturn_t imx6q_pmu_handler(int irq, void *dev, irq_handler_t handler)
> +{
> +	irqreturn_t ret = handler(irq, dev);
> +	int next;
> +
> +	if (ret == IRQ_NONE && num_online_cpus() > 1) {
> +		next = cpumask_next(smp_processor_id(), cpu_online_mask);
> +		if (next > nr_cpu_ids)
> +			next = cpumask_next(-1, cpu_online_mask);
> +		irq_set_affinity(irq, cpumask_of(next));
> +	}
> +
> +	/*
> +	 * We should be able to get away with the amount of IRQ_NONEs we give,
> +	 * while still having the spurious IRQ detection code kick in if the
> +	 * interrupt really starts hitting spuriously.
> +	 */
> +	return ret;
> +}
> +
> +static struct arm_pmu_platdata imx6q_pmu_platdata = {
> +	.handle_irq = imx6q_pmu_handler,
> +};
> +
> +static struct of_dev_auxdata imx6q_auxdata_lookup[] __initdata = {
> +	OF_DEV_AUXDATA("arm,cortex-a9-pmu", 0, "arm-pmu", &imx6q_pmu_platdata),
> +	{},
> +};
> +
>  static void __init imx6q_init_machine(void)
>  {
>  	struct device *parent;
> @@ -276,7 +312,8 @@ static void __init imx6q_init_machine(void)
> 
>  	imx6q_enet_phy_init();
> 
> -	of_platform_populate(NULL, of_default_bus_match_table, NULL, parent);
> +	of_platform_populate(NULL, of_default_bus_match_table,
> +			     imx6q_auxdata_lookup, parent);
> 
>  	imx_anatop_init();
>  	cpu_is_imx6q() ?  imx6q_pm_init() : imx6dl_pm_init();
> --
> 1.9.3
>
Daniel Thompson Nov. 20, 2014, 2:24 p.m. UTC | #3
On 20/11/14 11:52, Lucas Stach wrote:
> Am Donnerstag, den 20.11.2014, 11:42 +0000 schrieb Daniel Thompson:
>> All PMU interrupts on multi-core i.MX6 devices are muxed onto a single SPI.
>> Should the PMU of any core except 0 (the default affinity for the
>> interrupt) trigger the interrupt then it cannot be serviced and,
>> eventually, the spurious irq detection will forcefully disable the
>> interrupt.
>>
>> This can be worked around by getting the interrupt handler to change its
>> own affinity if it cannot figure out why it has been triggered. This patch
>> adds logic to rotate the affinity to i.MX6.
>>
>> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
> 
> I've sent almost the same patch a while ago. At this time it was shot
> down due to fears of the measurements being too flaky to be useful with
> all that IRQ dance. While I don't think this is true (I did some
> measurements on a SOLO and a QUAD variants of the i.MX6 with the same
> workload, that were only minimally apart), I believe the IRQ affinity
> dance isn't the best way to handle this.

Cumulative statistics and time based sampling profilers should be fine
either way since a delay before the interrupt is asserted on the
affected core should have a low impact here.

A use case where the measurement would be flaky might be a profile
trying to highlight which functions (or opcodes) of a process are most
likely to miss the cache. In such a case delay before the interrupt is
asserted would result in the cache miss being attributed to the wrong
opcode.

Note also that for a single threaded processes, or any other workload
where one core's PMU raises interrupts much more frequently than any
other, then the dance remains a good approach since it leaves the
affinity set correctly for next time.


> I had planned to look into smp_call_function() to distribute the IRQ to
> all CPUs at the same time, but have not got around to it yet. Maybe you
> could investigate whether this is a viable solution to the problem at
> hand?

I'm a little sceptical, not least because smp_call_function() and its
friends can't be called from interrupt.

This means the steps to propagate the interrupt become rather complex
and therefore on systems with a small(ish) numbers of cores it is not
obvious that the delay before the interrupt is asserted on the affected
core will improve very much.

Hopefully systems with a large number of cores won't exhibit this
problem (because whatever the workaround we adopt there would be
significant problems).




> 
> Regards,
> Lucas
>> ---
>>
>> Notes:
>>     This patch adopts the approach used on the u8500 (db8500_pmu_handler)
>>     but the logic has been generalized for any number of CPUs, mostly
>>     because the i.MX6 has a both dual and quad core variants.
>>     
>>     However it might be better to include the generalized logic in the main
>>     armpmu code. I think the logic could be deployed automatically on SMP
>>     systems with only a single not-percpu IRQ, replacing the
>>     plat->handle_irq dance we currently do to hook up this code.
>>     
>>     Thoughts? (or is there already shared logic to do this that I
>>     overlooked)
>>     
>>
>>  arch/arm/mach-imx/mach-imx6q.c | 39 ++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 38 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c
>> index d51c6e99a2e9..c056b7b97eaa 100644
>> --- a/arch/arm/mach-imx/mach-imx6q.c
>> +++ b/arch/arm/mach-imx/mach-imx6q.c
>> @@ -16,6 +16,7 @@
>>  #include <linux/delay.h>
>>  #include <linux/export.h>
>>  #include <linux/init.h>
>> +#include <linux/interrupt.h>
>>  #include <linux/io.h>
>>  #include <linux/irq.h>
>>  #include <linux/irqchip.h>
>> @@ -33,6 +34,7 @@
>>  #include <linux/mfd/syscon/imx6q-iomuxc-gpr.h>
>>  #include <asm/mach/arch.h>
>>  #include <asm/mach/map.h>
>> +#include <asm/pmu.h>
>>  #include <asm/system_misc.h>
>>
>>  #include "common.h"
>> @@ -261,6 +263,40 @@ static void __init imx6q_axi_init(void)
>>  	}
>>  }
>>
>> +/*
>> + * The PMU IRQ lines of all cores are muxed onto a single interrupt.
>> + * Rotate the interrupt around the cores if the current CPU cannot
>> + * figure out why the interrupt has been triggered.
>> + */
>> +static irqreturn_t imx6q_pmu_handler(int irq, void *dev, irq_handler_t handler)
>> +{
>> +	irqreturn_t ret = handler(irq, dev);
>> +	int next;
>> +
>> +	if (ret == IRQ_NONE && num_online_cpus() > 1) {
>> +		next = cpumask_next(smp_processor_id(), cpu_online_mask);
>> +		if (next > nr_cpu_ids)
>> +			next = cpumask_next(-1, cpu_online_mask);
>> +		irq_set_affinity(irq, cpumask_of(next));
>> +	}
>> +
>> +	/*
>> +	 * We should be able to get away with the amount of IRQ_NONEs we give,
>> +	 * while still having the spurious IRQ detection code kick in if the
>> +	 * interrupt really starts hitting spuriously.
>> +	 */
>> +	return ret;
>> +}
>> +
>> +static struct arm_pmu_platdata imx6q_pmu_platdata = {
>> +	.handle_irq = imx6q_pmu_handler,
>> +};
>> +
>> +static struct of_dev_auxdata imx6q_auxdata_lookup[] __initdata = {
>> +	OF_DEV_AUXDATA("arm,cortex-a9-pmu", 0, "arm-pmu", &imx6q_pmu_platdata),
>> +	{},
>> +};
>> +
>>  static void __init imx6q_init_machine(void)
>>  {
>>  	struct device *parent;
>> @@ -276,7 +312,8 @@ static void __init imx6q_init_machine(void)
>>
>>  	imx6q_enet_phy_init();
>>
>> -	of_platform_populate(NULL, of_default_bus_match_table, NULL, parent);
>> +	of_platform_populate(NULL, of_default_bus_match_table,
>> +			     imx6q_auxdata_lookup, parent);
>>
>>  	imx_anatop_init();
>>  	cpu_is_imx6q() ?  imx6q_pm_init() : imx6dl_pm_init();
>> --
>> 1.9.3
>>
>
Russell King - ARM Linux Nov. 20, 2014, 4:48 p.m. UTC | #4
On Thu, Nov 20, 2014 at 02:24:43PM +0000, Daniel Thompson wrote:
> On 20/11/14 11:52, Lucas Stach wrote:
> > I've sent almost the same patch a while ago. At this time it was shot
> > down due to fears of the measurements being too flaky to be useful with
> > all that IRQ dance. While I don't think this is true (I did some
> > measurements on a SOLO and a QUAD variants of the i.MX6 with the same
> > workload, that were only minimally apart), I believe the IRQ affinity
> > dance isn't the best way to handle this.
> 
> Cumulative statistics and time based sampling profilers should be fine
> either way since a delay before the interrupt the asserted on the
> affected core should have a low impact here.

One thing you're missing is that the interrupt latency for this can be
horrific.

Firstly, remember that Linux processes one interrupt (per core) at a time.
What this means is that if we have two cores running interrupts (eg, CPU 2
and CPU 3), and we raise a PMU interrupt on CPU 1 which is supposed to be
for CPU 0, then we'll process the interrupt on CPU 1, and forward it to
CPU 2.  CPU 2 will then have it pending, but has to wait for the interrupt
handler to complete before it can service it, where upon it forwards it to
CPU 3.  CPU 3 then goes through the same before forwarding it to CPU 0.

I also wonder how this works when you use perf record -a (from all CPUs.)
If the sampling rate is high enough, will the interrupt be forwarded to
the other CPUs?  Has perf record -a been tested?
Daniel Thompson Nov. 20, 2014, 6:46 p.m. UTC | #5
On 20/11/14 16:48, Russell King - ARM Linux wrote:
> On Thu, Nov 20, 2014 at 02:24:43PM +0000, Daniel Thompson wrote:
>> On 20/11/14 11:52, Lucas Stach wrote:
>>> I've sent almost the same patch a while ago. At this time it was shot
>>> down due to fears of the measurements being too flaky to be useful with
>>> all that IRQ dance. While I don't think this is true (I did some
>>> measurements on a SOLO and a QUAD variants of the i.MX6 with the same
>>> workload, that were only minimally apart), I believe the IRQ affinity
>>> dance isn't the best way to handle this.
>>
>> Cumulative statistics and time based sampling profilers should be fine
>> either way since a delay before the interrupt the asserted on the
>> affected core should have a low impact here.
> 
> One thing you're missing is that the interrupt latency for this can be
> horrific.
>
> Firstly, remember that Linux processes one interrupt (per core) at a time.
> What this means is that if we have two cores running interrupts (eg, CPU 2
> and CPU 3), and we raise a PMU interrupt on CPU 1 which is supposed to be
> for CPU 0, then we'll process the interrupt on CPU 1, and forward it to
> CPU 2.  CPU 2 will then have it pending, but has to wait for the interrupt
> handler to complete before it can service it, where upon it forwards it to
> CPU 3.  CPU 3 then goes through the same before forwarding it to CPU 0.

Agreed. Rotating the affinity is an obviously linear approach so
naturally the worst case interrupt latency grows linearly with the
number of cores.

However unpredictable interrupt responses times should not prevent the
results of a time based sampling profiler from being useful.

As mentioned before, such latencies are certainly of significant concern
when we profile multiple cores at once and we are reacting to specific
events within the core rather than simply the passing of time.


> I also wonder how this works when you use perf record -a (from all CPUs.)
> If the sampling rate is high enough, will the interrupt be forwarded to
> the other CPUs?  Has perf record -a been tested?

It has now... (mostly I've been using perf top since it's easier to decide
if the profile "feels" right given the workload).

Anyhow I ran three CPU burn programs (two in C, one in shell) alongside
"watch cat /proc/interrupts" and stopped the test when all the CPUs had
taken a million PMU interrupts.

At the end we have recorded ~3288483 samples and the relevant line in
/proc/interrupts looks like this:
           CPU0       CPU1       CPU2       CPU3
126:    1283127    1025955    1328177    1328159       GIC 126

Perhaps I'm reading it wrong but I was quite pleasantly surprised by
that. The sum of all PMU interrupts taken is 4965418 and that means ~66%
of the interrupts did useful work *without* rotating the affinity. With
four cores sharing an interrupt I was expecting much worse than that.

BTW *without* the patch "perf record -a" causes CPU #0 to immediately
lock up handling interrupts. If we are lucky the spurious IRQ logic
triggers and disables the interrupt but in most cases the volume of
"good" PMU interrupts is sufficient to prevent the spurious IRQ detector
from firing at all so leaving the system dead in the water.
Thomas Gleixner Nov. 20, 2014, 11:30 p.m. UTC | #6
On Thu, 20 Nov 2014, Daniel Thompson wrote:
> +/*
> + * The PMU IRQ lines of all cores are muxed onto a single interrupt.
> + * Rotate the interrupt around the cores if the current CPU cannot
> + * figure out why the interrupt has been triggered.
> + */
> +static irqreturn_t imx6q_pmu_handler(int irq, void *dev, irq_handler_t handler)
> +{
> +	irqreturn_t ret = handler(irq, dev);
> +	int next;
> +
> +	if (ret == IRQ_NONE && num_online_cpus() > 1) {

What guarantees that ret == IRQ_HANDLED is a sign for 'this is only
for this particular core' interrupt ?

> +		next = cpumask_next(smp_processor_id(), cpu_online_mask);
> +		if (next > nr_cpu_ids)
> +			next = cpumask_next(-1, cpu_online_mask);
> +		irq_set_affinity(irq, cpumask_of(next));
> +	}

Aside of the fact, that the hardware designers who came up with such a
brainfart should be put on sane drugs, this is just silly.

Rotating that thing around will introduce arbitrary latencies and
dependencies on other interrupts to be handled.

So if there is really no way to figure out which of the cores is the
actual target of the PMU interrupt then you should simply broadcast
that interrupt to a designated per cpu vector async from the one which
handles it in the first place and be done with it. That's the only
sane option you have.

Until now I assumed that on the core components side only timers and
interrupt controllers are supposed to be designed by janitors, but
there seems to be some more evil efforts underway.

Thanks,

	tglx
Daniel Thompson Nov. 21, 2014, 9:50 a.m. UTC | #7
On 20/11/14 23:30, Thomas Gleixner wrote:
> On Thu, 20 Nov 2014, Daniel Thompson wrote:
>> +/*
>> + * The PMU IRQ lines of all cores are muxed onto a single interrupt.
>> + * Rotate the interrupt around the cores if the current CPU cannot
>> + * figure out why the interrupt has been triggered.
>> + */
>> +static irqreturn_t imx6q_pmu_handler(int irq, void *dev, irq_handler_t handler)
>> +{
>> +	irqreturn_t ret = handler(irq, dev);
>> +	int next;
>> +
>> +	if (ret == IRQ_NONE && num_online_cpus() > 1) {
> 
> What guarantees that ret == IRQ_HANDLED is a sign for 'this is only
> for this particular core' interrupt ?

It isn't guaranteed. We rely on re-entering the interrupt handler if
more than one PMU is raising the interrupt simultaneously.

> 
>> +		next = cpumask_next(smp_processor_id(), cpu_online_mask);
>> +		if (next > nr_cpu_ids)
>> +			next = cpumask_next(-1, cpu_online_mask);
>> +		irq_set_affinity(irq, cpumask_of(next));
>> +	}
> 
> Aside of the fact, that the hardware designers who came up with such a
> brainfart should be put on sane drugs, this is just silly.
> 
> Rotating that thing around will introduce arbitrary latencies and
> dependencies on other interrupts to be handled.

To be honest I viewed the only real merits of the rotation workaround to
be that it is simple and minimally invasive. I am in total agreement
that there are profiling use cases that it will handle badly (although
there are a useful set which this workaround is sufficient to support).

> So if there is really no way to figure out which of the cores is the
> actual target of the PMU interrupt

PMU is only accessible via the bus if you are a external debugger
(signals external to the cluster control the register windowing). From
the kernel we have to use the co-processor interface and can only see
our own PMU.

> then you should simply broadcast
> that interrupt to a designated per cpu vector async from the one which
> handles it in the first place and be done with it. That's the only
> sane option you have.

As it happens I was planning to do some work on rebroadcasting next
anyway regardless of this discussion because I can't call
irq_set_affinity() from a FIQ handler...

Options I considered to rebroadcast are either direct use of an (new and
ARM specific?) IPI or use of smp_call_function() from a tasklet. I was
inclined to rule out the tasklet because it has the potential for far
greater timing jitter than rotating the affinity (doesn't it?).

> Until now I assumed that on the core components side only timers and
> interrupt controllers are supposed to be designed by janitors, but
> there seems to be some more evil efforts underway.

For now I have been assuming that problems like that are early teething
troubles for the SoC integrators rather than a class of problem that
will keep re-emerging over and over again. We've seen newer designs that
use PPIs to hook up the PMU and systems integrated that way can never
exhibit this problem.

Anyhow on that basis I'm hopeful (but not certain) that we won't see 8-
or 16- core systems that exhibit this issue.
Thomas Gleixner Nov. 21, 2014, 10:41 a.m. UTC | #8
On Fri, 21 Nov 2014, Daniel Thompson wrote:
> On 20/11/14 23:30, Thomas Gleixner wrote:
> > On Thu, 20 Nov 2014, Daniel Thompson wrote:
> >> +/*
> >> + * The PMU IRQ lines of all cores are muxed onto a single interrupt.
> >> + * Rotate the interrupt around the cores if the current CPU cannot
> >> + * figure out why the interrupt has been triggered.
> >> + */
> >> +static irqreturn_t imx6q_pmu_handler(int irq, void *dev, irq_handler_t handler)
> >> +{
> >> +	irqreturn_t ret = handler(irq, dev);
> >> +	int next;
> >> +
> >> +	if (ret == IRQ_NONE && num_online_cpus() > 1) {
> > 
> > What guarantees that ret == IRQ_HANDLED is a sign for 'this is only
> > for this particular core' interrupt ?
> 
> It isn't guaranteed. We rely on re-entering the interrupt handler if
> more than one PMU is raising the interrupt simultaneously.
> 
> > 
> >> +		next = cpumask_next(smp_processor_id(), cpu_online_mask);
> >> +		if (next > nr_cpu_ids)
> >> +			next = cpumask_next(-1, cpu_online_mask);
> >> +		irq_set_affinity(irq, cpumask_of(next));
> >> +	}
> > 
> > Aside of the fact, that the hardware designers who came up with such a
> > brainfart should be put on sane drugs, this is just silly.
> > 
> > Rotating that thing around will introduce arbitrary latencies and
> > dependencies on other interrupts to be handled.
> 
> To be honest I viewed the only real merits of the rotation workaround to
> be that it is simple and minimally invasive. I am in total agreement
> that there are profiling use cases that it will handle badly (although
> there are a useful set which this workaround is sufficient to support).
> 
> > So if there is really no way to figure out which of the cores is the
> > actual target of the PMU interrupt
> 
> PMU is only accessible via the bus if you are a external debugger
> (signals external to the cluster control the register windowing). From
> the kernel we have to use the co-processor interface and can only see
> our own PMU.
> 
> > then you should simply broadcast
> > that interrupt to a designated per cpu vector async from the one which
> > handles it in the first place and be done with it. That's the only
> > sane option you have.
> 
> As it happens I was planning to do some work on rebroadcasting next
> anyway regardless of this discussion because I can't call
> irq_set_affinity() from a FIQ handler...
> 
> Options I considered to rebroadcast are either direct use of an (new and
> ARM specific?) IPI or use of smp_call_function() from a tasklet. I was

> inclined to rule out the tasklet because it has the potential for far
> greater timing jitter than rotating the affinity (doesn't it?).

You cannot schedule a tasklet from FIQ. The only options you have are:

    - Async IPI (direct call into the architecture code). I have no
      idea whether that's possible on ARM

    - irq_work

Thanks,

	tglx
Daniel Thompson Nov. 21, 2014, 11:45 a.m. UTC | #9
On 21/11/14 10:41, Thomas Gleixner wrote:
> On Fri, 21 Nov 2014, Daniel Thompson wrote:
>> On 20/11/14 23:30, Thomas Gleixner wrote:
>>> On Thu, 20 Nov 2014, Daniel Thompson wrote:
>>>> +/*
>>>> + * The PMU IRQ lines of all cores are muxed onto a single interrupt.
>>>> + * Rotate the interrupt around the cores if the current CPU cannot
>>>> + * figure out why the interrupt has been triggered.
>>>> + */
>>>> +static irqreturn_t imx6q_pmu_handler(int irq, void *dev, irq_handler_t handler)
>>>> +{
>>>> +	irqreturn_t ret = handler(irq, dev);
>>>> +	int next;
>>>> +
>>>> +	if (ret == IRQ_NONE && num_online_cpus() > 1) {
>>>
>>> What guarantees that ret == IRQ_HANDLED is a sign for 'this is only
>>> for this particular core' interrupt ?
>>
>> It isn't guaranteed. We rely on re-entering the interrupt handler if
>> more than one PMU is raising the interrupt simultaneously.
>>
>>>
>>>> +		next = cpumask_next(smp_processor_id(), cpu_online_mask);
>>>> +		if (next > nr_cpu_ids)
>>>> +			next = cpumask_next(-1, cpu_online_mask);
>>>> +		irq_set_affinity(irq, cpumask_of(next));
>>>> +	}
>>>
>>> Aside of the fact, that the hardware designers who came up with such a
>>> brainfart should be put on sane drugs, this is just silly.
>>>
>>> Rotating that thing around will introduce arbitrary latencies and
>>> dependencies on other interrupts to be handled.
>>
>> To be honest I viewed the only real merits of the rotation workaround to
>> be that it is simple and minimally invasive. I am in total agreement
>> that there are profiling use cases that it will handle badly (although
>> there are a useful set which this workaround is sufficient to support).
>>
>>> So if there is really no way to figure out which of the cores is the
>>> actual target of the PMU interrupt
>>
>> PMU is only accessible via the bus if you are a external debugger
>> (signals external to the cluster control the register windowing). From
>> the kernel we have to use the co-processor interface and can only see
>> our own PMU.
>>
>>> then you should simply broadcast
>>> that interrupt to a designated per cpu vector async from the one which
>>> handles it in the first place and be done with it. That's the only
>>> sane option you have.
>>
>> As it happens I was planning to do some work on rebroadcasting next
>> anyway regardless of this discussion because I can't call
>> irq_set_affinity() from a FIQ handler...
>>
>> Options I considered to rebroadcast are either direct use of an (new and
>> ARM specific?) IPI or use of smp_call_function() from a tasklet. I was
> 
>> inclined to rule out the tasklet because it has the potential for far
>> greater timing jitter than rotating the affinity (doesn't it?).
> 
> You cannot schedule a tasklet from FIQ.

Indeed, that was another black mark against it. However I would prefer
to be able to convince people to rule it out for performance reasons.
My FIQ work for perf isn't mainlined at this point so using FIQ to show
why I prefer one solution (for mainline) over another isn't entirely
reasonable.

> The only options you have are:
> 
>     - Async IPI (direct call into the architecture code). I have no
>       idea whether thats possible on ARM

I think this may be where I have to go eventually (sigh). If I can get
to the stage where we can deliver PMU events via FIQ then ideally the
workaround would also have to broadcast via FIQ.

It's a real shame that one of the best chips to do FIQ work suffers from
such an annoying hardware bug.

>     - irq_work

I'm currently trying this approach. It makes sense today without FIQ
and, fortunately for me, the workaround should mostly still work if it
were deployed from a FIQ handler.

BTW thanks very much for the listing options here; I was a little
worried I might have overlooked some really obvious way to deploy the
workaround.


Daniel.
diff mbox

Patch

diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c
index d51c6e99a2e9..c056b7b97eaa 100644
--- a/arch/arm/mach-imx/mach-imx6q.c
+++ b/arch/arm/mach-imx/mach-imx6q.c
@@ -16,6 +16,7 @@ 
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/irq.h>
 #include <linux/irqchip.h>
@@ -33,6 +34,7 @@ 
 #include <linux/mfd/syscon/imx6q-iomuxc-gpr.h>
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
+#include <asm/pmu.h>
 #include <asm/system_misc.h>

 #include "common.h"
@@ -261,6 +263,40 @@  static void __init imx6q_axi_init(void)
 	}
 }

+/*
+ * The PMU IRQ lines of all cores are muxed onto a single interrupt.
+ * Rotate the interrupt around the cores if the current CPU cannot
+ * figure out why the interrupt has been triggered.
+ */
+static irqreturn_t imx6q_pmu_handler(int irq, void *dev, irq_handler_t handler)
+{
+	irqreturn_t ret = handler(irq, dev);
+	int next;
+
+	if (ret == IRQ_NONE && num_online_cpus() > 1) {
+		next = cpumask_next(smp_processor_id(), cpu_online_mask);
+		if (next > nr_cpu_ids)
+			next = cpumask_next(-1, cpu_online_mask);
+		irq_set_affinity(irq, cpumask_of(next));
+	}
+
+	/*
+	 * We should be able to get away with the amount of IRQ_NONEs we give,
+	 * while still having the spurious IRQ detection code kick in if the
+	 * interrupt really starts hitting spuriously.
+	 */
+	return ret;
+}
+
+static struct arm_pmu_platdata imx6q_pmu_platdata = {
+	.handle_irq = imx6q_pmu_handler,
+};
+
+static struct of_dev_auxdata imx6q_auxdata_lookup[] __initdata = {
+	OF_DEV_AUXDATA("arm,cortex-a9-pmu", 0, "arm-pmu", &imx6q_pmu_platdata),
+	{},
+};
+
 static void __init imx6q_init_machine(void)
 {
 	struct device *parent;
@@ -276,7 +312,8 @@  static void __init imx6q_init_machine(void)

 	imx6q_enet_phy_init();

-	of_platform_populate(NULL, of_default_bus_match_table, NULL, parent);
+	of_platform_populate(NULL, of_default_bus_match_table,
+			     imx6q_auxdata_lookup, parent);

 	imx_anatop_init();
 	cpu_is_imx6q() ?  imx6q_pm_init() : imx6dl_pm_init();