diff mbox

soc/tegra: pmc: Fix "scheduling while atomic"

Message ID 1460900051-3065-1-git-send-email-digetx@gmail.com
State Deferred
Delegated to: Jon Hunter
Headers show

Commit Message

Dmitry Osipenko April 17, 2016, 1:34 p.m. UTC
clk_get_rate() takes a mutex, hence cannot be used while IRQ's been
disabled. Replace it with a locked version.

[    3.430853] [<c0850fcc>] (dump_stack) from [<c00411f8>] (__schedule_bug+0x50/0x64)
[    3.431079] [<c00411f8>] (__schedule_bug) from [<c08553a8>] (__schedule+0x5c8/0x688)
[    3.431453] [<c08553a8>] (__schedule) from [<c08558f4>] (schedule_preempt_disabled+0x24/0x34)
[    3.431835] [<c08558f4>] (schedule_preempt_disabled) from [<c0856f24>] (__mutex_lock_slowpath+0xbc/0x170)
[    3.432204] [<c0856f24>] (__mutex_lock_slowpath) from [<c0857024>] (mutex_lock+0x4c/0x50)
[    3.432427] [<c0857024>] (mutex_lock) from [<c0610368>] (clk_prepare_lock+0x88/0xfc)
[    3.432800] [<c0610368>] (clk_prepare_lock) from [<c0611034>] (clk_get_rate+0xc/0x60)
[    3.433177] [<c0611034>] (clk_get_rate) from [<c034f10c>] (tegra_pmc_enter_suspend_mode+0x188/0x20c)
[    3.433580] [<c034f10c>] (tegra_pmc_enter_suspend_mode) from [<c0020d48>] (tegra_idle_lp2_last+0xc/0x40)
[    3.433795] [<c0020d48>] (tegra_idle_lp2_last) from [<c0021e1c>] (tegra20_idle_lp2_coupled+0x118/0x1fc)
[    3.434171] [<c0021e1c>] (tegra20_idle_lp2_coupled) from [<c055ec24>] (cpuidle_enter_state+0x3c/0x160)
[    3.434551] [<c055ec24>] (cpuidle_enter_state) from [<c0560ce8>] (cpuidle_enter_state_coupled+0x3dc/0x3f4)
[    3.434959] [<c0560ce8>] (cpuidle_enter_state_coupled) from [<c0055f1c>] (cpu_startup_entry+0x240/0x288)
[    3.435340] [<c0055f1c>] (cpu_startup_entry) from [<c0b29c84>] (start_kernel+0x3b4/0x3c0)
[    3.435557] [<c0b29c84>] (start_kernel) from [<00008074>] (0x8074)

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/clk/tegra/clk-tegra-pmc.c | 9 +++++++++
 drivers/soc/tegra/pmc.c           | 2 +-
 include/linux/clk/tegra.h         | 2 ++
 3 files changed, 12 insertions(+), 1 deletion(-)

Comments

Dmitry Osipenko May 5, 2016, 11:45 a.m. UTC | #1
I forgot to mention that this is observed on 3.18 android kernel fork under a 
quite specific condition: EMC scaling enabled and set to the lowest freq, CPU 
max freq reduced. This may cause higher latency on CPU idle enter/exit.

I couldn't reproduce this issue with the upstream kernel, guess some specific 
workload is also required. Any comments? =)
Jon Hunter May 5, 2016, 1:17 p.m. UTC | #2
On 17/04/16 14:34, Dmitry Osipenko wrote:
> clk_get_rate() takes a mutex, hence cannot be used while IRQ's been
> disabled. Replace it with a locked version.
> 
> [    3.430853] [<c0850fcc>] (dump_stack) from [<c00411f8>] (__schedule_bug+0x50/0x64)
> [    3.431079] [<c00411f8>] (__schedule_bug) from [<c08553a8>] (__schedule+0x5c8/0x688)
> [    3.431453] [<c08553a8>] (__schedule) from [<c08558f4>] (schedule_preempt_disabled+0x24/0x34)
> [    3.431835] [<c08558f4>] (schedule_preempt_disabled) from [<c0856f24>] (__mutex_lock_slowpath+0xbc/0x170)
> [    3.432204] [<c0856f24>] (__mutex_lock_slowpath) from [<c0857024>] (mutex_lock+0x4c/0x50)
> [    3.432427] [<c0857024>] (mutex_lock) from [<c0610368>] (clk_prepare_lock+0x88/0xfc)
> [    3.432800] [<c0610368>] (clk_prepare_lock) from [<c0611034>] (clk_get_rate+0xc/0x60)
> [    3.433177] [<c0611034>] (clk_get_rate) from [<c034f10c>] (tegra_pmc_enter_suspend_mode+0x188/0x20c)
> [    3.433580] [<c034f10c>] (tegra_pmc_enter_suspend_mode) from [<c0020d48>] (tegra_idle_lp2_last+0xc/0x40)
> [    3.433795] [<c0020d48>] (tegra_idle_lp2_last) from [<c0021e1c>] (tegra20_idle_lp2_coupled+0x118/0x1fc)
> [    3.434171] [<c0021e1c>] (tegra20_idle_lp2_coupled) from [<c055ec24>] (cpuidle_enter_state+0x3c/0x160)
> [    3.434551] [<c055ec24>] (cpuidle_enter_state) from [<c0560ce8>] (cpuidle_enter_state_coupled+0x3dc/0x3f4)
> [    3.434959] [<c0560ce8>] (cpuidle_enter_state_coupled) from [<c0055f1c>] (cpu_startup_entry+0x240/0x288)
> [    3.435340] [<c0055f1c>] (cpu_startup_entry) from [<c0b29c84>] (start_kernel+0x3b4/0x3c0)
> [    3.435557] [<c0b29c84>] (start_kernel) from [<00008074>] (0x8074)
> 
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>

Thanks for the report. I have been unable to reproduce this, but then I
don't see my tegra20 entering LP2 during cpuidle. I did force my tegra20
into LP2 during suspend which will exercise the same code but I did not
trigger this either. However, from looking at the code it does appear
that we could hit this.

> ---
>  drivers/clk/tegra/clk-tegra-pmc.c | 9 +++++++++
>  drivers/soc/tegra/pmc.c           | 2 +-
>  include/linux/clk/tegra.h         | 2 ++
>  3 files changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/clk/tegra/clk-tegra-pmc.c b/drivers/clk/tegra/clk-tegra-pmc.c
> index 91377ab..1ccf414 100644
> --- a/drivers/clk/tegra/clk-tegra-pmc.c
> +++ b/drivers/clk/tegra/clk-tegra-pmc.c
> @@ -78,6 +78,8 @@ static struct pmc_clk_init_data pmc_clks[] = {
>  	PMC_CLK(3, 22, 18),
>  };
>  
> +static struct clk_hw *pclk_hw;
> +
>  void __init tegra_pmc_clk_init(void __iomem *pmc_base,
>  				struct tegra_clk *tegra_clks)
>  {
> @@ -112,6 +114,9 @@ void __init tegra_pmc_clk_init(void __iomem *pmc_base,
>  		clk_register_clkdev(clk, data->dev_name, data->gate_name);
>  	}
>  
> +	dt_clk = tegra_lookup_dt_id(tegra_clk_pclk, tegra_clks);
> +	pclk_hw = __clk_get_hw(*dt_clk);
> +
>  	/* blink */
>  	writel_relaxed(0, pmc_base + PMC_BLINK_TIMER);
>  	clk = clk_register_gate(NULL, "blink_override", "clk_32k", 0,
> @@ -129,3 +134,7 @@ void __init tegra_pmc_clk_init(void __iomem *pmc_base,
>  	*dt_clk = clk;
>  }
>  
> +unsigned long tegra_pmc_get_pclk_rate(void)
> +{
> +	return clk_hw_get_rate(pclk_hw);
> +}

Ideally, it would be great if we did not need to add another custom API
for this, but I did not find anything in the CCF that would allow us to
avoid but that was only a quick look. However, we could ask the CCF folks.

What I plan to do next is to understand if the pclk is likely to change.
I know that it comes from one of the plls but I am not sure if we ever
change the rate. If not we may be able to move this to probe time and
avoid this.

Cheers
Jon
--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitry Osipenko May 5, 2016, 2:24 p.m. UTC | #3
Hello, Jon!

On 05.05.2016 16:17, Jon Hunter wrote:
>
> Thanks for the report. I have been unable to reproduce this, but then I
> don't see my tegra20 entering LP2 during cpuidle. I did force my tegra20
> into LP2 during suspend which will exercise the same code but I did not
> trigger this either. However, from looking at the code it does appear
> that we could hit this.
>

As I wrote before, it's quite difficult to reproduce.

>
> Ideally, it would be great if we did not need to add another custom API
> for this, but I did not find anything in the CCF that would allow us to
> avoid but that was only a quick look. However, we could ask the CCF folks.
>

Yes, CCF doesn't expose locked API. Code should be designed to avoid it.

> What I plan to do next is to understand if the pclk is likely to change.
> I know that it comes from one of the plls but I am not sure if we ever
> change the rate. If not we may be able to move this to probe time and
> avoid this.
>

That's reasonable, I'd also take a look at it. Thanks for the comment!
Jon Hunter May 25, 2016, 3:09 p.m. UTC | #4
On 05/05/16 15:24, Dmitry Osipenko wrote:
> Hello, Jon!
> 
> On 05.05.2016 16:17, Jon Hunter wrote:
>>
>> Thanks for the report. I have been unable to reproduce this, but then I
>> don't see my tegra20 entering LP2 during cpuidle. I did force my tegra20
>> into LP2 during suspend which will exercise the same code but I did not
>> trigger this either. However, from looking at the code it does appear
>> that we could hit this.
>>
> 
> As I wrote before, it's quite difficult to reproduce.

So far I have been unable to reproduce this. I did noticed that in the
upstream kernel we disable LP2 on Tegra20 if PCIE is enabled (see
arch/arm/mach-tegra/cpuidle-tegra20.c) ...

 /*
  * Tegra20 HW appears to have a bug such that PCIe device interrupts, whether
  * they are legacy IRQs or MSI, are lost when LP2 is enabled. To work around
  * this, simply disable LP2 if the PCI driver and DT node are both enabled.
  */
 void tegra20_cpuidle_pcie_irqs_in_use(void)
 {
         pr_info_once(
                 "Disabling cpuidle LP2 state, since PCIe IRQs are in use\n");
         tegra_idle_driver.states[1].disabled = true;
 }

Even if I remove this and verify that I can enter LP2, I have been unable
to reproduce this. I know that you said that it is difficult to reproduce
and there needs to be a specific workload, however, from looking at the 
code I am trying to understand the situation that would trigger this. 

Your backtrace shows ...

[    3.430853] [<c0850fcc>] (dump_stack) from [<c00411f8>] (__schedule_bug+0x50/0x64)
[    3.431079] [<c00411f8>] (__schedule_bug) from [<c08553a8>] (__schedule+0x5c8/0x688)
[    3.431453] [<c08553a8>] (__schedule) from [<c08558f4>] (schedule_preempt_disabled+0x24/0x34)
[    3.431835] [<c08558f4>] (schedule_preempt_disabled) from [<c0856f24>] (__mutex_lock_slowpath+0xbc/0x170)
[    3.432204] [<c0856f24>] (__mutex_lock_slowpath) from [<c0857024>] (mutex_lock+0x4c/0x50)
[    3.432427] [<c0857024>] (mutex_lock) from [<c0610368>] (clk_prepare_lock+0x88/0xfc)
[    3.432800] [<c0610368>] (clk_prepare_lock) from [<c0611034>] (clk_get_rate+0xc/0x60)
[    3.433177] [<c0611034>] (clk_get_rate) from [<c034f10c>] (tegra_pmc_enter_suspend_mode+0x188/0x20c)
[    3.433580] [<c034f10c>] (tegra_pmc_enter_suspend_mode) from [<c0020d48>] (tegra_idle_lp2_last+0xc/0x40)
[    3.433795] [<c0020d48>] (tegra_idle_lp2_last) from [<c0021e1c>] (tegra20_idle_lp2_coupled+0x118/0x1fc)
[    3.434171] [<c0021e1c>] (tegra20_idle_lp2_coupled) from [<c055ec24>] (cpuidle_enter_state+0x3c/0x160)
[    3.434551] [<c055ec24>] (cpuidle_enter_state) from [<c0560ce8>] (cpuidle_enter_state_coupled+0x3dc/0x3f4)
[    3.434959] [<c0560ce8>] (cpuidle_enter_state_coupled) from [<c0055f1c>] (cpu_startup_entry+0x240/0x288)
[    3.435340] [<c0055f1c>] (cpu_startup_entry) from [<c0b29c84>] (start_kernel+0x3b4/0x3c0)
[    3.435557] [<c0b29c84>] (start_kernel) from [<00008074>] (0x8074)

... however, when we call tegra_idle_lp2_last(), CPU1 should be down and
so I would not expect that the call to mutex_trylock() in clk_get_rate() 
would fail (ie. return 0 for contention) at this point and cause us to
call mutex_lock and sleep. Therefore, I am wondering if there could be
another bug in the v3.18 kernel that you are using that could be
triggering this. 

If you are able to reproduce this on v3.18, then it would be good if you
could trace the CCF calls around this WARNING to see what is causing the 
contention.

Cheers
Jon
Dmitry Osipenko May 25, 2016, 6:51 p.m. UTC | #5
On 25.05.2016 18:09, Jon Hunter wrote:
>
> On 05/05/16 15:24, Dmitry Osipenko wrote:
>> Hello, Jon!
>>
>> On 05.05.2016 16:17, Jon Hunter wrote:
>>>
>>> Thanks for the report. I have been unable to reproduce this, but then I
>>> don't see my tegra20 entering LP2 during cpuidle. I did force my tegra20
>>> into LP2 during suspend which will exercise the same code but I did not
>>> trigger this either. However, from looking at the code it does appear
>>> that we could hit this.
>>>
>>
>> As I wrote before, it's quite difficult to reproduce.
>
> So far I have been unable to reproduce this. I did noticed that in the
> upstream kernel we disable LP2 on Tegra20 if PCIE is enabled (see
> arch/arm/mach-tegra/cpuidle-tegra20.c) ...
>
>  /*
>   * Tegra20 HW appears to have a bug such that PCIe device interrupts, whether
>   * they are legacy IRQs or MSI, are lost when LP2 is enabled. To work around
>   * this, simply disable LP2 if the PCI driver and DT node are both enabled.
>   */
>  void tegra20_cpuidle_pcie_irqs_in_use(void)
>  {
>          pr_info_once(
>                  "Disabling cpuidle LP2 state, since PCIe IRQs are in use\n");
>          tegra_idle_driver.states[1].disabled = true;
>  }
>
> Even if I remove this and verify that I can enter LP2, I have been unable
> to reproduce this. I know that you said that it is difficult to reproduce
> and there needs to be a specific workload, however, from looking at the
> code I am trying to understand the situation that would trigger this.
>
> Your backtrace shows ...
>
> [    3.430853] [<c0850fcc>] (dump_stack) from [<c00411f8>] (__schedule_bug+0x50/0x64)
> [    3.431079] [<c00411f8>] (__schedule_bug) from [<c08553a8>] (__schedule+0x5c8/0x688)
> [    3.431453] [<c08553a8>] (__schedule) from [<c08558f4>] (schedule_preempt_disabled+0x24/0x34)
> [    3.431835] [<c08558f4>] (schedule_preempt_disabled) from [<c0856f24>] (__mutex_lock_slowpath+0xbc/0x170)
> [    3.432204] [<c0856f24>] (__mutex_lock_slowpath) from [<c0857024>] (mutex_lock+0x4c/0x50)
> [    3.432427] [<c0857024>] (mutex_lock) from [<c0610368>] (clk_prepare_lock+0x88/0xfc)
> [    3.432800] [<c0610368>] (clk_prepare_lock) from [<c0611034>] (clk_get_rate+0xc/0x60)
> [    3.433177] [<c0611034>] (clk_get_rate) from [<c034f10c>] (tegra_pmc_enter_suspend_mode+0x188/0x20c)
> [    3.433580] [<c034f10c>] (tegra_pmc_enter_suspend_mode) from [<c0020d48>] (tegra_idle_lp2_last+0xc/0x40)
> [    3.433795] [<c0020d48>] (tegra_idle_lp2_last) from [<c0021e1c>] (tegra20_idle_lp2_coupled+0x118/0x1fc)
> [    3.434171] [<c0021e1c>] (tegra20_idle_lp2_coupled) from [<c055ec24>] (cpuidle_enter_state+0x3c/0x160)
> [    3.434551] [<c055ec24>] (cpuidle_enter_state) from [<c0560ce8>] (cpuidle_enter_state_coupled+0x3dc/0x3f4)
> [    3.434959] [<c0560ce8>] (cpuidle_enter_state_coupled) from [<c0055f1c>] (cpu_startup_entry+0x240/0x288)
> [    3.435340] [<c0055f1c>] (cpu_startup_entry) from [<c0b29c84>] (start_kernel+0x3b4/0x3c0)
> [    3.435557] [<c0b29c84>] (start_kernel) from [<00008074>] (0x8074)
>
> ... however, when we call tegra_idle_lp2_last(), CPU1 should be down and
> so I would not expect that the call to mutex_trylock() in clk_get_rate()
> would fail (ie. return 0 for contention) at this point and cause us to
> call mutex_lock and sleep. Therefore, I am wondering if there could be
> another bug in the v3.18 kernel that you are using that could be
> triggering this.
>
> If you are able to reproduce this on v3.18, then it would be good if you
> could trace the CCF calls around this WARNING to see what is causing the
> contention.
>

I managed to reproduce it with some CCF "tracing".
Full kmsg log is here: https://bpaste.net/show/d8ab7b7534b7

Looks like CPU freq governor thread yields during clk_set_rate() and then CPU 
idle kicks in, taking the same mutex.

However, cpufreq_interactive governor is android specific governor and isn't in 
upstream kernel yet. Quick googling shows that recent "upstreaming" patch uses 
same cpufreq_interactive_speedchange_task: https://lkml.org/lkml/2016/5/20/41

I'm not aware of other possibility to reproduce this issue, it needs some CCF 
interaction from a separate task. So the current upstream kernel shouldn't be 
affected, I guess.

[snip]
[    3.923019] clk_set_rate: cpu_emc CPU: 0 +
[    3.923151] clk_get_rate: cpu_emc CPU: 0 +
[    3.923287] clk_get_rate: cpu_emc CPU: 0 -
[    3.923423] clk_set_rate: emc CPU: 0 +
[    3.923664] clk_get_rate: emc CPU: 0 +
[    3.923799] clk_get_rate: emc CPU: 0 -
[    3.925405] clk_get_rate: pclk CPU: 0 +
[    3.925776] BUG: scheduling while atomic: swapper/0/0/0x00000002
[    3.925975] Modules linked in:
[    3.926377] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 
3.18.31-digetx-thor-01336-g63f7b8f-dirty #1184
[    3.926860] [<c010d3e8>] (unwind_backtrace) from [<c0109b28>] 
(show_stack+0x10/0x14)
[    3.927252] [<c0109b28>] (show_stack) from [<c0941cd4>] (dump_stack+0x94/0xa8)
[    3.927488] [<c0941cd4>] (dump_stack) from [<c01390c8>] 
(__schedule_bug+0x50/0x64)
[    3.927865] [<c01390c8>] (__schedule_bug) from [<c09460b0>] 
(__schedule+0x5c8/0x688)
[    3.928235] [<c09460b0>] (__schedule) from [<c09465fc>] 
(schedule_preempt_disabled+0x24/0x34)
[    3.928608] [<c09465fc>] (schedule_preempt_disabled) from [<c0947c2c>] 
(__mutex_lock_slowpath+0xbc/0x170)
[    3.928820] [<c0947c2c>] (__mutex_lock_slowpath) from [<c0947d2c>] 
(mutex_lock+0x4c/0x50)
[    3.929230] [<c0947d2c>] (mutex_lock) from [<c0700d54>] 
(clk_prepare_lock+0x88/0xfc)
[    3.929602] [<c0700d54>] (clk_prepare_lock) from [<c0701aa0>] 
(clk_get_rate+0x30/0xa4)
[    3.930005] [<c0701aa0>] (clk_get_rate) from [<c04511cc>] 
(tegra_pmc_enter_suspend_mode+0x188/0x20c)
[    3.930259] [<c04511cc>] (tegra_pmc_enter_suspend_mode) from [<c0118c08>] 
(tegra_idle_lp2_last+0xc/0x40)
[    3.930627] [<c0118c08>] (tegra_idle_lp2_last) from [<c0119c9c>] 
(tegra20_idle_lp2_coupled+0x118/0x1fc)
[    3.931051] [<c0119c9c>] (tegra20_idle_lp2_coupled) from [<c065007c>] 
(cpuidle_enter_state+0x3c/0x160)
[    3.931437] [<c065007c>] (cpuidle_enter_state) from [<c0652140>] 
(cpuidle_enter_state_coupled+0x3dc/0x3f4)
[    3.931846] [<c0652140>] (cpuidle_enter_state_coupled) from [<c014ddec>] 
(cpu_startup_entry+0x240/0x288)
[    3.932247] [<c014ddec>] (cpu_startup_entry) from [<c0d00c84>] 
(start_kernel+0x3b4/0x3c0)
[    3.932472] [<c0d00c84>] (start_kernel) from [<00008074>] (0x8074)
[    3.932881] bad: scheduling from the idle thread!

[snip]

[    4.131017] [<c013b754>] (ttwu_do_activate.constprop.15) from [<c013dc04>] 
(try_to_wake_up+0x264/0x308)
[    4.131239] [<c013dc04>] (try_to_wake_up) from [<c0947d64>] 
(__mutex_unlock_slowpath+0x34/0x40)
[    4.131612] [<c0947d64>] (__mutex_unlock_slowpath) from [<c0702d0c>] 
(clk_set_rate+0xac/0xd8)
[    4.131991] [<c0702d0c>] (clk_set_rate) from [<c064fa24>] 
(tegra_target+0x74/0x134)
[    4.132401] [<c064fa24>] (tegra_target) from [<c0647720>] 
(__cpufreq_driver_target+0x164/0x294)
[    4.132619] [<c0647720>] (__cpufreq_driver_target) from [<c064eba4>] 
(cpufreq_interactive_speedchange_task+0x23c/0x28c)
[    4.133010] [<c064eba4>] (cpufreq_interactive_speedchange_task) from 
[<c0134fbc>] (kthread+0xd0/0xe8)
[    4.133385] [<c0134fbc>] (kthread) from [<c0106460>] (ret_from_fork+0x14/0x34)
[    4.133748] Code: bad PC value
[    4.133955] ---[ end trace e93befbb37372e27 ]---
[    4.134162] note: cfinteractive[53] exited with preempt_count 3
Jon Hunter May 26, 2016, 8:42 a.m. UTC | #6
On 25/05/16 19:51, Dmitry Osipenko wrote:
> On 25.05.2016 18:09, Jon Hunter wrote:

...

>> If you are able to reproduce this on v3.18, then it would be good if you
>> could trace the CCF calls around this WARNING to see what is causing the
>> contention.
> 
> I managed to reproduce it with some CCF "tracing".
> Full kmsg log is here: https://bpaste.net/show/d8ab7b7534b7
> 
> Looks like CPU freq governor thread yields during clk_set_rate() and
> then CPU idle kicks in, taking the same mutex.

On the surface that sounds odd to me, but without understanding the
details, I guess I don't know if this is a valid thing to be doing or
even how that actually works!

> However, cpufreq_interactive governor is android specific governor and
> isn't in upstream kernel yet. Quick googling shows that recent
> "upstreaming" patch uses same cpufreq_interactive_speedchange_task:
> https://lkml.org/lkml/2016/5/20/41

Do you know if this version they are upstreaming could also yield during
the clk_set_rate()?

> I'm not aware of other possibility to reproduce this issue, it needs
> some CCF interaction from a separate task. So the current upstream
> kernel shouldn't be affected, I guess.

What still does not make sense to me is why any frequency changes have
not completed before we attempt to enter the LP2 state?

OK, well may be we will hold off on this change for the moment.

Cheers
Jon
Dmitry Osipenko May 26, 2016, 11:42 a.m. UTC | #7
On 26.05.2016 11:42, Jon Hunter wrote:
>
> On 25/05/16 19:51, Dmitry Osipenko wrote:
>> On 25.05.2016 18:09, Jon Hunter wrote:
>
> ...
>
>>> If you are able to reproduce this on v3.18, then it would be good if you
>>> could trace the CCF calls around this WARNING to see what is causing the
>>> contention.
>>
>> I managed to reproduce it with some CCF "tracing".
>> Full kmsg log is here: https://bpaste.net/show/d8ab7b7534b7
>>
>> Looks like CPU freq governor thread yields during clk_set_rate() and
>> then CPU idle kicks in, taking the same mutex.
>
> On the surface that sounds odd to me, but without understanding the
> details, I guess I don't know if this is a valid thing to be doing or
> even how that actually works!
>

The reason of that happening should be that I'm using clk PRE/POST rate change 
notifiers in my DVFS driver that takes other mutexes and they could be locked, 
causing schedule. I haven't mentioned it before, sorry.

 From drivers/clk/clk.c:

static struct task_struct *prepare_owner;

...

/***           locking             ***/
static void clk_prepare_lock(void)
{
	if (!mutex_trylock(&prepare_lock)) {
		if (prepare_owner == current) {
			prepare_refcnt++;
			return;
		}
		mutex_lock(&prepare_lock);
	}

You can see that it would lock the mutex if prepare_owner != current, in my case 
it's idle thread != interactive gov. thread.

>> However, cpufreq_interactive governor is android specific governor and
>> isn't in upstream kernel yet. Quick googling shows that recent
>> "upstreaming" patch uses same cpufreq_interactive_speedchange_task:
>> https://lkml.org/lkml/2016/5/20/41
>
> Do you know if this version they are upstreaming could also yield during
> the clk_set_rate()?
>

I think it should be assumed that any clk_set_rate() potentially could. Please 
correct me if I'm wrong.

>> I'm not aware of other possibility to reproduce this issue, it needs
>> some CCF interaction from a separate task. So the current upstream
>> kernel shouldn't be affected, I guess.
>
> What still does not make sense to me is why any frequency changes have
> not completed before we attempt to enter the LP2 state?
>

Why not? I don't see any CPUIDLE <-> CPUFREQ interlocking. Do you think it could 
be harmful somehow?

> OK, well may be we will hold off on this change for the moment.
Jon Hunter May 26, 2016, 2:32 p.m. UTC | #8
On 26/05/16 12:42, Dmitry Osipenko wrote:
> On 26.05.2016 11:42, Jon Hunter wrote:
>>
>> On 25/05/16 19:51, Dmitry Osipenko wrote:
>>> On 25.05.2016 18:09, Jon Hunter wrote:
>>
>> ...
>>
>>>> If you are able to reproduce this on v3.18, then it would be good if
>>>> you
>>>> could trace the CCF calls around this WARNING to see what is causing
>>>> the
>>>> contention.
>>>
>>> I managed to reproduce it with some CCF "tracing".
>>> Full kmsg log is here: https://bpaste.net/show/d8ab7b7534b7
>>>
>>> Looks like CPU freq governor thread yields during clk_set_rate() and
>>> then CPU idle kicks in, taking the same mutex.
>>
>> On the surface that sounds odd to me, but without understanding the
>> details, I guess I don't know if this is a valid thing to be doing or
>> even how that actually works!
>>
> 
> The reason of that happening should be that I'm using clk PRE/POST rate
> change notifiers in my DVFS driver that takes other mutexes and they
> could be locked, causing schedule. I haven't mentioned it before, sorry.

OK, but I am not sure how these "other mutexes" would be relevant here
without any more details.

> From drivers/clk/clk.c:
> 
> static struct task_struct *prepare_owner;
> 
> ...
> 
> /***           locking             ***/
> static void clk_prepare_lock(void)
> {
>     if (!mutex_trylock(&prepare_lock)) {
>         if (prepare_owner == current) {
>             prepare_refcnt++;
>             return;
>         }
>         mutex_lock(&prepare_lock);
>     }
> 
> You can see that it would lock the mutex if prepare_owner != current, in
> my case it's idle thread != interactive gov. thread.

Right, but that would imply that someone else is actively doing
something with a clock. However, if we are entering LP2, then that
implies that all CPUs are idle and so I still don't understand the
scenario where this would be locked in that case. May be there is
something I am overlooking here?

>>> However, cpufreq_interactive governor is android specific governor and
>>> isn't in upstream kernel yet. Quick googling shows that recent
>>> "upstreaming" patch uses same cpufreq_interactive_speedchange_task:
>>> https://lkml.org/lkml/2016/5/20/41
>>
>> Do you know if this version they are upstreaming could also yield during
>> the clk_set_rate()?
>>
> 
> I think it should be assumed that any clk_set_rate() potentially could.
> Please correct me if I'm wrong.
> 
>>> I'm not aware of other possibility to reproduce this issue, it needs
>>> some CCF interaction from a separate task. So the current upstream
>>> kernel shouldn't be affected, I guess.
>>
>> What still does not make sense to me is why any frequency changes have
>> not completed before we attempt to enter the LP2 state?
>>
> Why not? I don't see any CPUIDLE <-> CPUFREQ interlocking. Do you think
> it could be harmful somehow?

Like I said before, I still don't understand that scenario that is
causing this and without being able to fully understand it, I have no
idea what the exact problem we are trying to fix here is.

Cheers
Jon
Dmitry Osipenko May 26, 2016, 2:57 p.m. UTC | #9
On 26.05.2016 17:32, Jon Hunter wrote:
>
> On 26/05/16 12:42, Dmitry Osipenko wrote:
>> On 26.05.2016 11:42, Jon Hunter wrote:
>>>
>>> On 25/05/16 19:51, Dmitry Osipenko wrote:
>>>> On 25.05.2016 18:09, Jon Hunter wrote:
>>>
>>> ...
>>>
>>>>> If you are able to reproduce this on v3.18, then it would be good if
>>>>> you
>>>>> could trace the CCF calls around this WARNING to see what is causing
>>>>> the
>>>>> contention.
>>>>
>>>> I managed to reproduce it with some CCF "tracing".
>>>> Full kmsg log is here: https://bpaste.net/show/d8ab7b7534b7
>>>>
>>>> Looks like CPU freq governor thread yields during clk_set_rate() and
>>>> then CPU idle kicks in, taking the same mutex.
>>>
>>> On the surface that sounds odd to me, but without understanding the
>>> details, I guess I don't know if this is a valid thing to be doing or
>>> even how that actually works!
>>>
>>
>> The reason of that happening should be that I'm using clk PRE/POST rate
>> change notifiers in my DVFS driver that takes other mutexes and they
>> could be locked, causing schedule. I haven't mentioned it before, sorry.
>
> OK, but I am not sure how these "other mutexes" would be relevant here
> without any more details.
>
>> From drivers/clk/clk.c:
>>
>> static struct task_struct *prepare_owner;
>>
>> ...
>>
>> /***           locking             ***/
>> static void clk_prepare_lock(void)
>> {
>>     if (!mutex_trylock(&prepare_lock)) {
>>         if (prepare_owner == current) {
>>             prepare_refcnt++;
>>             return;
>>         }
>>         mutex_lock(&prepare_lock);
>>     }
>>
>> You can see that it would lock the mutex if prepare_owner != current, in
>> my case it's idle thread != interactive gov. thread.
>
> Right, but that would imply that someone else is actively doing
> something with a clock. However, if we are entering LP2, then that
> implies that all CPUs are idle and so I still don't understand the
> scenario where this would be locked in that case. May be there is
> something I am overlooking here?
>
>>>> However, cpufreq_interactive governor is android specific governor and
>>>> isn't in upstream kernel yet. Quick googling shows that recent
>>>> "upstreaming" patch uses same cpufreq_interactive_speedchange_task:
>>>> https://lkml.org/lkml/2016/5/20/41
>>>
>>> Do you know if this version they are upstreaming could also yield during
>>> the clk_set_rate()?
>>>
>>
>> I think it should be assumed that any clk_set_rate() potentially could.
>> Please correct me if I'm wrong.
>>
>>>> I'm not aware of other possibility to reproduce this issue, it needs
>>>> some CCF interaction from a separate task. So the current upstream
>>>> kernel shouldn't be affected, I guess.
>>>
>>> What still does not make sense to me is why any frequency changes have
>>> not completed before we attempt to enter the LP2 state?
>>>
>> Why not? I don't see any CPUIDLE <-> CPUFREQ interlocking. Do you think
>> it could be harmful somehow?
>
> Like I said before, I still don't understand that scenario that is
> causing this and without being able to fully understand it, I have no
> idea what the exact problem we are trying to fix here is.
>

That's how I see it:

+----------------------------------------------+
|                    CPU 0                     |
+-------------------+--------------------------+
|    Idle thread    | Interactive gov. thread  |
+----------------------------------------------+
|     inactive      |                          |
|                   |                          |
|                   |   CPU freq. change       |
|                   |                          |
|                   |   clk_set_rate()         |
|                   |                          |
|       ...         |   clk_prepare_lock()     |
|                   |                          |
|                   |   PRE rate notifier call |
|                   |                          |
|                   |   schedule               |
|                   |                          |
| irqs_disable()    |                          |
|                   |                          |
| enter CPU idle    |                          |
|                   |                          |
| clk_get_rate(pclk)|                          |
|                   |                          |
| clk_prepare_lock()|                          |
|                   |                          |
| schedule bug()    |                          |
|                   |                          |
+-------------------+--------------------------+
Jon Hunter May 26, 2016, 3:27 p.m. UTC | #10
On 26/05/16 15:57, Dmitry Osipenko wrote:
> On 26.05.2016 17:32, Jon Hunter wrote:
>>
>> On 26/05/16 12:42, Dmitry Osipenko wrote:
>>> On 26.05.2016 11:42, Jon Hunter wrote:
>>>>
>>>> On 25/05/16 19:51, Dmitry Osipenko wrote:
>>>>> On 25.05.2016 18:09, Jon Hunter wrote:
>>>>
>>>> ...
>>>>
>>>>>> If you are able to reproduce this on v3.18, then it would be good if
>>>>>> you
>>>>>> could trace the CCF calls around this WARNING to see what is causing
>>>>>> the
>>>>>> contention.
>>>>>
>>>>> I managed to reproduce it with some CCF "tracing".
>>>>> Full kmsg log is here: https://bpaste.net/show/d8ab7b7534b7
>>>>>
>>>>> Looks like CPU freq governor thread yields during clk_set_rate() and
>>>>> then CPU idle kicks in, taking the same mutex.
>>>>
>>>> On the surface that sounds odd to me, but without understanding the
>>>> details, I guess I don't know if this is a valid thing to be doing or
>>>> even how that actually works!
>>>>
>>>
>>> The reason of that happening should be that I'm using clk PRE/POST rate
>>> change notifiers in my DVFS driver that takes other mutexes and they
>>> could be locked, causing schedule. I haven't mentioned it before, sorry.
>>
>> OK, but I am not sure how these "other mutexes" would be relevant here
>> without any more details.
>>
>>> From drivers/clk/clk.c:
>>>
>>> static struct task_struct *prepare_owner;
>>>
>>> ...
>>>
>>> /***           locking             ***/
>>> static void clk_prepare_lock(void)
>>> {
>>>     if (!mutex_trylock(&prepare_lock)) {
>>>         if (prepare_owner == current) {
>>>             prepare_refcnt++;
>>>             return;
>>>         }
>>>         mutex_lock(&prepare_lock);
>>>     }
>>>
>>> You can see that it would lock the mutex if prepare_owner != current, in
>>> my case it's idle thread != interactive gov. thread.
>>
>> Right, but that would imply that someone else is actively doing
>> something with a clock. However, if we are entering LP2, then that
>> implies that all CPUs are idle and so I still don't understand the
>> scenario where this would be locked in that case. May be there is
>> something I am overlooking here?
>>
>>>>> However, cpufreq_interactive governor is android specific governor and
>>>>> isn't in upstream kernel yet. Quick googling shows that recent
>>>>> "upstreaming" patch uses same cpufreq_interactive_speedchange_task:
>>>>> https://lkml.org/lkml/2016/5/20/41
>>>>
>>>> Do you know if this version they are upstreaming could also yield
>>>> during
>>>> the clk_set_rate()?
>>>>
>>>
>>> I think it should be assumed that any clk_set_rate() potentially could.
>>> Please correct me if I'm wrong.
>>>
>>>>> I'm not aware of other possibility to reproduce this issue, it needs
>>>>> some CCF interaction from a separate task. So the current upstream
>>>>> kernel shouldn't be affected, I guess.
>>>>
>>>> What still does not make sense to me is why any frequency changes have
>>>> not completed before we attempt to enter the LP2 state?
>>>>
>>> Why not? I don't see any CPUIDLE <-> CPUFREQ interlocking. Do you think
>>> it could be harmful somehow?
>>
>> Like I said before, I still don't understand that scenario that is
>> causing this and without being able to fully understand it, I have no
>> idea what the exact problem we are trying to fix here is.
>>
> 
> That's how I see it:
> 
> +----------------------------------------------+
> |                    CPU 0                     |
> +-------------------+--------------------------+
> |    Idle thread    | Interactive gov. thread  |
> +----------------------------------------------+
> |     inactive      |                          |
> |                   |                          |
> |                   |   CPU freq. change       |
> |                   |                          |
> |                   |   clk_set_rate()         |
> |                   |                          |
> |       ...         |   clk_prepare_lock()     |
> |                   |                          |
> |                   |   PRE rate notifier call |
> |                   |                          |
> |                   |   schedule               |

What is this notifier doing? Is there some sort of hardware activity
that it is waiting for to complete?

Cheers
Jon
Dmitry Osipenko May 26, 2016, 5:01 p.m. UTC | #11
On 26.05.2016 18:27, Jon Hunter wrote:
>
> On 26/05/16 15:57, Dmitry Osipenko wrote:
>> On 26.05.2016 17:32, Jon Hunter wrote:
>>>
>>> On 26/05/16 12:42, Dmitry Osipenko wrote:
>>>> On 26.05.2016 11:42, Jon Hunter wrote:
>>>>>
>>>>> On 25/05/16 19:51, Dmitry Osipenko wrote:
>>>>>> On 25.05.2016 18:09, Jon Hunter wrote:
>>>>>
>>>>> ...
>>>>>
>>>>>>> If you are able to reproduce this on v3.18, then it would be good if
>>>>>>> you
>>>>>>> could trace the CCF calls around this WARNING to see what is causing
>>>>>>> the
>>>>>>> contention.
>>>>>>
>>>>>> I managed to reproduce it with some CCF "tracing".
>>>>>> Full kmsg log is here: https://bpaste.net/show/d8ab7b7534b7
>>>>>>
>>>>>> Looks like CPU freq governor thread yields during clk_set_rate() and
>>>>>> then CPU idle kicks in, taking the same mutex.
>>>>>
>>>>> On the surface that sounds odd to me, but without understanding the
>>>>> details, I guess I don't know if this is a valid thing to be doing or
>>>>> even how that actually works!
>>>>>
>>>>
>>>> The reason of that happening should be that I'm using clk PRE/POST rate
>>>> change notifiers in my DVFS driver that takes other mutexes and they
>>>> could be locked, causing schedule. I haven't mentioned it before, sorry.
>>>
>>> OK, but I am not sure how these "other mutexes" would be relevant here
>>> without any more details.
>>>
>>>> From drivers/clk/clk.c:
>>>>
>>>> static struct task_struct *prepare_owner;
>>>>
>>>> ...
>>>>
>>>> /***           locking             ***/
>>>> static void clk_prepare_lock(void)
>>>> {
>>>>     if (!mutex_trylock(&prepare_lock)) {
>>>>         if (prepare_owner == current) {
>>>>             prepare_refcnt++;
>>>>             return;
>>>>         }
>>>>         mutex_lock(&prepare_lock);
>>>>     }
>>>>
>>>> You can see that it would lock the mutex if prepare_owner != current, in
>>>> my case it's idle thread != interactive gov. thread.
>>>
>>> Right, but that would imply that someone else is actively doing
>>> something with a clock. However, if we are entering LP2, then that
>>> implies that all CPUs are idle and so I still don't understand the
>>> scenario where this would be locked in that case. May be there is
>>> something I am overlooking here?
>>>
>>>>>> However, cpufreq_interactive governor is android specific governor and
>>>>>> isn't in upstream kernel yet. Quick googling shows that recent
>>>>>> "upstreaming" patch uses same cpufreq_interactive_speedchange_task:
>>>>>> https://lkml.org/lkml/2016/5/20/41
>>>>>
>>>>> Do you know if this version they are upstreaming could also yield
>>>>> during
>>>>> the clk_set_rate()?
>>>>>
>>>>
>>>> I think it should be assumed that any clk_set_rate() potentially could.
>>>> Please correct me if I'm wrong.
>>>>
>>>>>> I'm not aware of other possibility to reproduce this issue, it needs
>>>>>> some CCF interaction from a separate task. So the current upstream
>>>>>> kernel shouldn't be affected, I guess.
>>>>>
>>>>> What still does not make sense to me is why any frequency changes have
>>>>> not completed before we attempt to enter the LP2 state?
>>>>>
>>>> Why not? I don't see any CPUIDLE <-> CPUFREQ interlocking. Do you think
>>>> it could be harmful somehow?
>>>
>>> Like I said before, I still don't understand that scenario that is
>>> causing this and without being able to fully understand it, I have no
>>> idea what the exact problem we are trying to fix here is.
>>>
>>
>> That's how I see it:
>>
>> +----------------------------------------------+
>> |                    CPU 0                     |
>> +-------------------+--------------------------+
>> |    Idle thread    | Interactive gov. thread  |
>> +----------------------------------------------+
>> |     inactive      |                          |
>> |                   |                          |
>> |                   |   CPU freq. change       |
>> |                   |                          |
>> |                   |   clk_set_rate()         |
>> |                   |                          |
>> |       ...         |   clk_prepare_lock()     |
>> |                   |                          |
>> |                   |   PRE rate notifier call |
>> |                   |                          |
>> |                   |   schedule               |
>
> What is this notifier doing? Is there some sort of hardware activity
> that it is waiting for to complete?
>

It changes regulator voltage if required. So at least I2C would cause scheduling 
on wait_for_completion_timeout().
Jon Hunter May 27, 2016, 12:46 p.m. UTC | #12
On 26/05/16 18:01, Dmitry Osipenko wrote:
> On 26.05.2016 18:27, Jon Hunter wrote:
>> On 26/05/16 15:57, Dmitry Osipenko wrote:

...

>>> That's how I see it:
>>>
>>> +----------------------------------------------+
>>> |                    CPU 0                     |
>>> +-------------------+--------------------------+
>>> |    Idle thread    | Interactive gov. thread  |
>>> +----------------------------------------------+
>>> |     inactive      |                          |
>>> |                   |                          |
>>> |                   |   CPU freq. change       |
>>> |                   |                          |
>>> |                   |   clk_set_rate()         |
>>> |                   |                          |
>>> |       ...         |   clk_prepare_lock()     |
>>> |                   |                          |
>>> |                   |   PRE rate notifier call |
>>> |                   |                          |
>>> |                   |   schedule               |
>>
>> What is this notifier doing? Is there some sort of hardware activity
>> that it is waiting for to complete?
>>
> 
> It changes regulator voltage if required. So at least I2C would cause
> scheduling on wait_for_completion_timeout().

Yes, of course that would make sense. What is interesting/odd in this
case is that the frequency is increasing (voltage scaled pre frequency
change) but yet you are entering LP2. May be that is possible? I guess
this problem may also occur on reducing frequency as well?

What are you using in the v3.18 kernel for exit_latency and
target_residency? The current mainline has 5000us and 10000us,
respectively.

It does seem that this could be triggered in the right circumstances and
I have to say I don't like the fact that this could be fragile as it is
today. Have you thought about adding a post clock notifier for pclk in
the PMC driver as an alternative to the change you are suggesting?

Cheers
Jon
Dmitry Osipenko May 27, 2016, 2:43 p.m. UTC | #13
On 27.05.2016 15:46, Jon Hunter wrote:
>
> On 26/05/16 18:01, Dmitry Osipenko wrote:
>> On 26.05.2016 18:27, Jon Hunter wrote:
>>> On 26/05/16 15:57, Dmitry Osipenko wrote:
>
> ...
>
>>>> That's how I see it:
>>>>
>>>> +----------------------------------------------+
>>>> |                    CPU 0                     |
>>>> +-------------------+--------------------------+
>>>> |    Idle thread    | Interactive gov. thread  |
>>>> +----------------------------------------------+
>>>> |     inactive      |                          |
>>>> |                   |                          |
>>>> |                   |   CPU freq. change       |
>>>> |                   |                          |
>>>> |                   |   clk_set_rate()         |
>>>> |                   |                          |
>>>> |       ...         |   clk_prepare_lock()     |
>>>> |                   |                          |
>>>> |                   |   PRE rate notifier call |
>>>> |                   |                          |
>>>> |                   |   schedule               |
>>>
>>> What is this notifier doing? Is there some sort of hardware activity
>>> that it is waiting for to complete?
>>>
>>
>> It changes regulator voltage if required. So at least I2C would cause
>> scheduling on wait_for_completion_timeout().
>
> Yes, of course that would make sense. What is interesting/odd in this
> case is that the frequency is increasing (voltage scaled pre frequency
> change) but yet you are entering LP2. May be that is possible? I guess
> this problem may also occur on reducing frequency as well?
>

Sorry, possible what? Surely it might happen on the POST notify, I just used PRE 
for example.

There are no active tasks while CPU changing the frequency, so kernel enters 
idle mode while waiting for the HW completion - the interrupt. I don't see 
anything wrong here.

> What are you using in the v3.18 kernel for exit_latency and
> target_residency? The current mainline has 5000us and 10000us,
> respectively.
>

Default stock latencies, not sure why you are asking. It's essentially a 
mainline kernel with some added device drivers and android patches.

> It does seem that this could be triggered in the right circumstances and
> I have to say I don't like the fact that this could be fragile as it is
> today. Have you thought about adding a post clock notifier for pclk in
> the PMC driver as an alternative to the change you are suggesting?
>

No, I haven't. Sounds like a good idea, thanks for the suggestion! I'll try it 
and send V2 if it will be okay, otherwise will report the problem.
diff mbox

Patch

diff --git a/drivers/clk/tegra/clk-tegra-pmc.c b/drivers/clk/tegra/clk-tegra-pmc.c
index 91377ab..1ccf414 100644
--- a/drivers/clk/tegra/clk-tegra-pmc.c
+++ b/drivers/clk/tegra/clk-tegra-pmc.c
@@ -78,6 +78,8 @@  static struct pmc_clk_init_data pmc_clks[] = {
 	PMC_CLK(3, 22, 18),
 };
 
+static struct clk_hw *pclk_hw;
+
 void __init tegra_pmc_clk_init(void __iomem *pmc_base,
 				struct tegra_clk *tegra_clks)
 {
@@ -112,6 +114,9 @@  void __init tegra_pmc_clk_init(void __iomem *pmc_base,
 		clk_register_clkdev(clk, data->dev_name, data->gate_name);
 	}
 
+	dt_clk = tegra_lookup_dt_id(tegra_clk_pclk, tegra_clks);
+	pclk_hw = __clk_get_hw(*dt_clk);
+
 	/* blink */
 	writel_relaxed(0, pmc_base + PMC_BLINK_TIMER);
 	clk = clk_register_gate(NULL, "blink_override", "clk_32k", 0,
@@ -129,3 +134,7 @@  void __init tegra_pmc_clk_init(void __iomem *pmc_base,
 	*dt_clk = clk;
 }
 
+unsigned long tegra_pmc_get_pclk_rate(void)
+{
+	return clk_hw_get_rate(pclk_hw);
+}
diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index 08966c2..6752714 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -638,7 +638,7 @@  void tegra_pmc_enter_suspend_mode(enum tegra_suspend_mode mode)
 		break;
 
 	case TEGRA_SUSPEND_LP2:
-		rate = clk_get_rate(pmc->clk);
+		rate = tegra_pmc_get_pclk_rate();
 		break;
 
 	default:
diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index 57bf7aa..717d71d 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -121,4 +121,6 @@  static inline void tegra_cpu_clock_resume(void)
 }
 #endif
 
+unsigned long tegra_pmc_get_pclk_rate(void);
+
 #endif /* __LINUX_CLK_TEGRA_H_ */