diff mbox series

i2c: imx: Fix external abort on early interrupt

Message ID 1591796802-23504-1-git-send-email-krzk@kernel.org
State Superseded
Headers show
Series i2c: imx: Fix external abort on early interrupt | expand

Commit Message

Krzysztof Kozlowski June 10, 2020, 1:46 p.m. UTC
If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
the i2c_imx_isr() will access registers before the I2C hardware is
initialized.  This leads to external abort on non-linefetch on Toradex
Colibri VF50 module (with Vybrid VF5xx):

    Unhandled fault: external abort on non-linefetch (0x1008) at 0x8882d003
    Internal error: : 1008 [#1] ARM
    Modules linked in:
    CPU: 0 PID: 1 Comm: swapper Not tainted 5.7.0 #607
    Hardware name: Freescale Vybrid VF5xx/VF6xx (Device Tree)
      (i2c_imx_isr) from [<8017009c>] (free_irq+0x25c/0x3b0)
      (free_irq) from [<805844ec>] (release_nodes+0x178/0x284)
      (release_nodes) from [<80580030>] (really_probe+0x10c/0x348)
      (really_probe) from [<80580380>] (driver_probe_device+0x60/0x170)
      (driver_probe_device) from [<80580630>] (device_driver_attach+0x58/0x60)
      (device_driver_attach) from [<805806bc>] (__driver_attach+0x84/0xc0)
      (__driver_attach) from [<8057e228>] (bus_for_each_dev+0x68/0xb4)
      (bus_for_each_dev) from [<8057f3ec>] (bus_add_driver+0x144/0x1ec)
      (bus_add_driver) from [<80581320>] (driver_register+0x78/0x110)
      (driver_register) from [<8010213c>] (do_one_initcall+0xa8/0x2f4)
      (do_one_initcall) from [<80c0100c>] (kernel_init_freeable+0x178/0x1dc)
      (kernel_init_freeable) from [<80807048>] (kernel_init+0x8/0x110)
      (kernel_init) from [<80100114>] (ret_from_fork+0x14/0x20)

Additionally, the i2c_imx_isr() could wake up the wait queue
(imx_i2c_struct->queue) before its initialization happens.

Fixes: 1c4b6c3bcf30 ("i2c: imx: implement bus recovery")
Cc: <stable@vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
---
 drivers/i2c/busses/i2c-imx.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

Comments

Oleksij Rempel June 12, 2020, 5:51 a.m. UTC | #1
Hi Krzysztof,

thank you for your patch.

On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
> the i2c_imx_isr() will access registers before the I2C hardware is
> initialized.  This leads to external abort on non-linefetch on Toradex
> Colibri VF50 module (with Vybrid VF5xx):
> 
>     Unhandled fault: external abort on non-linefetch (0x1008) at 0x8882d003
>     Internal error: : 1008 [#1] ARM
>     Modules linked in:
>     CPU: 0 PID: 1 Comm: swapper Not tainted 5.7.0 #607
>     Hardware name: Freescale Vybrid VF5xx/VF6xx (Device Tree)
>       (i2c_imx_isr) from [<8017009c>] (free_irq+0x25c/0x3b0)
>       (free_irq) from [<805844ec>] (release_nodes+0x178/0x284)
>       (release_nodes) from [<80580030>] (really_probe+0x10c/0x348)
>       (really_probe) from [<80580380>] (driver_probe_device+0x60/0x170)
>       (driver_probe_device) from [<80580630>] (device_driver_attach+0x58/0x60)
>       (device_driver_attach) from [<805806bc>] (__driver_attach+0x84/0xc0)
>       (__driver_attach) from [<8057e228>] (bus_for_each_dev+0x68/0xb4)
>       (bus_for_each_dev) from [<8057f3ec>] (bus_add_driver+0x144/0x1ec)
>       (bus_add_driver) from [<80581320>] (driver_register+0x78/0x110)
>       (driver_register) from [<8010213c>] (do_one_initcall+0xa8/0x2f4)
>       (do_one_initcall) from [<80c0100c>] (kernel_init_freeable+0x178/0x1dc)
>       (kernel_init_freeable) from [<80807048>] (kernel_init+0x8/0x110)
>       (kernel_init) from [<80100114>] (ret_from_fork+0x14/0x20)
> 
> Additionally, the i2c_imx_isr() could wake up the wait queue
> (imx_i2c_struct->queue) before its initialization happens.
> 
> Fixes: 1c4b6c3bcf30 ("i2c: imx: implement bus recovery")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>


I assume register access is aborted, because the IP core clock is not
enabled. In this case we have bigger problem then just probe.

Since this driver support runtime power management, the clock will be
disabled as soon as transfer is done. It means, on shared interrupt, we
will get in trouble even if there is no active transfer.

So, probably the only way to fix it, is to check in i2c_imx_isr() if the
HW is expected to be active and register access should be save.

Regards,
Oleksij


> ---
>  drivers/i2c/busses/i2c-imx.c | 20 ++++++++++----------
>  1 file changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c
> index 0ab5381aa012..e28a39f4840f 100644
> --- a/drivers/i2c/busses/i2c-imx.c
> +++ b/drivers/i2c/busses/i2c-imx.c
> @@ -1171,14 +1171,6 @@ static int i2c_imx_probe(struct platform_device *pdev)
>  		return ret;
>  	}
>  
> -	/* Request IRQ */
> -	ret = devm_request_irq(&pdev->dev, irq, i2c_imx_isr, IRQF_SHARED,
> -				pdev->name, i2c_imx);
> -	if (ret) {
> -		dev_err(&pdev->dev, "can't claim irq %d\n", irq);
> -		goto clk_disable;
> -	}
> -
>  	/* Init queue */
>  	init_waitqueue_head(&i2c_imx->queue);
>  
> @@ -1223,6 +1215,14 @@ static int i2c_imx_probe(struct platform_device *pdev)
>  	if (ret < 0)
>  		goto clk_notifier_unregister;
>  
> +	/* Request IRQ */
> +	ret = devm_request_irq(&pdev->dev, irq, i2c_imx_isr, IRQF_SHARED,
> +				pdev->name, i2c_imx);
> +	if (ret) {
> +		dev_err(&pdev->dev, "can't claim irq %d\n", irq);
> +		goto i2c_del_adapter;
> +	}
> +
>  	pm_runtime_mark_last_busy(&pdev->dev);
>  	pm_runtime_put_autosuspend(&pdev->dev);
>  
> @@ -1237,6 +1237,8 @@ static int i2c_imx_probe(struct platform_device *pdev)
>  
>  	return 0;   /* Return OK */
>  
> +i2c_del_adapter:
> +	i2c_del_adapter(&i2c_imx->adapter);
>  clk_notifier_unregister:
>  	clk_notifier_unregister(i2c_imx->clk, &i2c_imx->clk_change_nb);
>  rpm_disable:
> @@ -1244,8 +1246,6 @@ static int i2c_imx_probe(struct platform_device *pdev)
>  	pm_runtime_disable(&pdev->dev);
>  	pm_runtime_set_suspended(&pdev->dev);
>  	pm_runtime_dont_use_autosuspend(&pdev->dev);
> -
> -clk_disable:
>  	clk_disable_unprepare(i2c_imx->clk);
>  	return ret;
>  }
> -- 
> 2.7.4
> 
>
Krzysztof Kozlowski June 12, 2020, 7:38 a.m. UTC | #2
On Fri, Jun 12, 2020 at 07:51:14AM +0200, Oleksij Rempel wrote:
> Hi Krzysztof,
> 
> thank you for your patch.
> 
> On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> > If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
> > the i2c_imx_isr() will access registers before the I2C hardware is
> > initialized.  This leads to external abort on non-linefetch on Toradex
> > Colibri VF50 module (with Vybrid VF5xx):
> > 
> >     Unhandled fault: external abort on non-linefetch (0x1008) at 0x8882d003
> >     Internal error: : 1008 [#1] ARM
> >     Modules linked in:
> >     CPU: 0 PID: 1 Comm: swapper Not tainted 5.7.0 #607
> >     Hardware name: Freescale Vybrid VF5xx/VF6xx (Device Tree)
> >       (i2c_imx_isr) from [<8017009c>] (free_irq+0x25c/0x3b0)
> >       (free_irq) from [<805844ec>] (release_nodes+0x178/0x284)
> >       (release_nodes) from [<80580030>] (really_probe+0x10c/0x348)
> >       (really_probe) from [<80580380>] (driver_probe_device+0x60/0x170)
> >       (driver_probe_device) from [<80580630>] (device_driver_attach+0x58/0x60)
> >       (device_driver_attach) from [<805806bc>] (__driver_attach+0x84/0xc0)
> >       (__driver_attach) from [<8057e228>] (bus_for_each_dev+0x68/0xb4)
> >       (bus_for_each_dev) from [<8057f3ec>] (bus_add_driver+0x144/0x1ec)
> >       (bus_add_driver) from [<80581320>] (driver_register+0x78/0x110)
> >       (driver_register) from [<8010213c>] (do_one_initcall+0xa8/0x2f4)
> >       (do_one_initcall) from [<80c0100c>] (kernel_init_freeable+0x178/0x1dc)
> >       (kernel_init_freeable) from [<80807048>] (kernel_init+0x8/0x110)
> >       (kernel_init) from [<80100114>] (ret_from_fork+0x14/0x20)
> > 
> > Additionally, the i2c_imx_isr() could wake up the wait queue
> > (imx_i2c_struct->queue) before its initialization happens.
> > 
> > Fixes: 1c4b6c3bcf30 ("i2c: imx: implement bus recovery")
> > Cc: <stable@vger.kernel.org>
> > Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
> 
> 
> I assume register access is aborted, because the IP core clock is not
> enabled. In this case we have bigger problem then just probe.

If by IP core clock you mean the clock which driver is getting, then
answer is no. This clock is enabled.

> Since this driver support runtime power management, the clock will be
> disabled as soon as transfer is done. It means, on shared interrupt, we
> will get in trouble even if there is no active transfer.

The driver's runtime PM plays only with this one clock, so it seems
you meant i2c_imx->clk. It is not this problem.

> 
> So, probably the only way to fix it, is to check in i2c_imx_isr() if the
> HW is expected to be active and register access should be save.

Checking in every interrupt whether the interrupt should be serviced
based on some SW flag because HW might be disabled? That looks unusual,
like a hack.

No, the interrupt should be registered when the driver and some other
pieces of HW are ready to service it.

Best regards,
Krzysztof
Oleksij Rempel June 12, 2020, 8:02 a.m. UTC | #3
On Fri, Jun 12, 2020 at 09:38:15AM +0200, Krzysztof Kozlowski wrote:
> On Fri, Jun 12, 2020 at 07:51:14AM +0200, Oleksij Rempel wrote:
> > Hi Krzysztof,
> > 
> > thank you for your patch.
> > 
> > On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> > > If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
> > > the i2c_imx_isr() will access registers before the I2C hardware is
> > > initialized.  This leads to external abort on non-linefetch on Toradex
> > > Colibri VF50 module (with Vybrid VF5xx):
> > > 
> > >     Unhandled fault: external abort on non-linefetch (0x1008) at 0x8882d003
> > >     Internal error: : 1008 [#1] ARM
> > >     Modules linked in:
> > >     CPU: 0 PID: 1 Comm: swapper Not tainted 5.7.0 #607
> > >     Hardware name: Freescale Vybrid VF5xx/VF6xx (Device Tree)
> > >       (i2c_imx_isr) from [<8017009c>] (free_irq+0x25c/0x3b0)
> > >       (free_irq) from [<805844ec>] (release_nodes+0x178/0x284)
> > >       (release_nodes) from [<80580030>] (really_probe+0x10c/0x348)
> > >       (really_probe) from [<80580380>] (driver_probe_device+0x60/0x170)
> > >       (driver_probe_device) from [<80580630>] (device_driver_attach+0x58/0x60)
> > >       (device_driver_attach) from [<805806bc>] (__driver_attach+0x84/0xc0)
> > >       (__driver_attach) from [<8057e228>] (bus_for_each_dev+0x68/0xb4)
> > >       (bus_for_each_dev) from [<8057f3ec>] (bus_add_driver+0x144/0x1ec)
> > >       (bus_add_driver) from [<80581320>] (driver_register+0x78/0x110)
> > >       (driver_register) from [<8010213c>] (do_one_initcall+0xa8/0x2f4)
> > >       (do_one_initcall) from [<80c0100c>] (kernel_init_freeable+0x178/0x1dc)
> > >       (kernel_init_freeable) from [<80807048>] (kernel_init+0x8/0x110)
> > >       (kernel_init) from [<80100114>] (ret_from_fork+0x14/0x20)
> > > 
> > > Additionally, the i2c_imx_isr() could wake up the wait queue
> > > (imx_i2c_struct->queue) before its initialization happens.
> > > 
> > > Fixes: 1c4b6c3bcf30 ("i2c: imx: implement bus recovery")
> > > Cc: <stable@vger.kernel.org>
> > > Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
> > 
> > 
> > I assume register access is aborted, because the IP core clock is not
> > enabled. In this case we have bigger problem then just probe.
> 
> If by IP core clock you mean the clock which driver is getting, then
> answer is no. This clock is enabled.
> 
> > Since this driver support runtime power management, the clock will be
> > disabled as soon as transfer is done. It means, on shared interrupt, we
> > will get in trouble even if there is no active transfer.
> 
> The driver's runtime PM plays only with this one clock, so it seems
> you meant i2c_imx->clk. It is not this problem.
> 
> > 
> > So, probably the only way to fix it, is to check in i2c_imx_isr() if the
> > HW is expected to be active and register access should be save.
> 
> Checking in every interrupt whether the interrupt should be serviced
> based on some SW flag because HW might be disabled? That looks unusual,
> like a hack.
> 
> No, the interrupt should be registered when the driver and some other
> pieces of HW are ready to service it.

OK.
please make sure, irq is probed before calling
i2c_add_numbered_adapter(). This will trigger deferred probing of
slave devices. Since the irq handler will be added later, tx completion
of some requests will be lost or fail.

Regards,
Oleksij
Krzysztof Kozlowski June 12, 2020, 8:23 a.m. UTC | #4
On Fri, Jun 12, 2020 at 10:02:40AM +0200, Oleksij Rempel wrote:
> On Fri, Jun 12, 2020 at 09:38:15AM +0200, Krzysztof Kozlowski wrote:
> > On Fri, Jun 12, 2020 at 07:51:14AM +0200, Oleksij Rempel wrote:
> > > Hi Krzysztof,
> > > 
> > > thank you for your patch.
> > > 
> > > On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> > > > If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
> > > > the i2c_imx_isr() will access registers before the I2C hardware is
> > > > initialized.  This leads to external abort on non-linefetch on Toradex
> > > > Colibri VF50 module (with Vybrid VF5xx):
> > > > 
> > > >     Unhandled fault: external abort on non-linefetch (0x1008) at 0x8882d003
> > > >     Internal error: : 1008 [#1] ARM
> > > >     Modules linked in:
> > > >     CPU: 0 PID: 1 Comm: swapper Not tainted 5.7.0 #607
> > > >     Hardware name: Freescale Vybrid VF5xx/VF6xx (Device Tree)
> > > >       (i2c_imx_isr) from [<8017009c>] (free_irq+0x25c/0x3b0)
> > > >       (free_irq) from [<805844ec>] (release_nodes+0x178/0x284)
> > > >       (release_nodes) from [<80580030>] (really_probe+0x10c/0x348)
> > > >       (really_probe) from [<80580380>] (driver_probe_device+0x60/0x170)
> > > >       (driver_probe_device) from [<80580630>] (device_driver_attach+0x58/0x60)
> > > >       (device_driver_attach) from [<805806bc>] (__driver_attach+0x84/0xc0)
> > > >       (__driver_attach) from [<8057e228>] (bus_for_each_dev+0x68/0xb4)
> > > >       (bus_for_each_dev) from [<8057f3ec>] (bus_add_driver+0x144/0x1ec)
> > > >       (bus_add_driver) from [<80581320>] (driver_register+0x78/0x110)
> > > >       (driver_register) from [<8010213c>] (do_one_initcall+0xa8/0x2f4)
> > > >       (do_one_initcall) from [<80c0100c>] (kernel_init_freeable+0x178/0x1dc)
> > > >       (kernel_init_freeable) from [<80807048>] (kernel_init+0x8/0x110)
> > > >       (kernel_init) from [<80100114>] (ret_from_fork+0x14/0x20)
> > > > 
> > > > Additionally, the i2c_imx_isr() could wake up the wait queue
> > > > (imx_i2c_struct->queue) before its initialization happens.
> > > > 
> > > > Fixes: 1c4b6c3bcf30 ("i2c: imx: implement bus recovery")
> > > > Cc: <stable@vger.kernel.org>
> > > > Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
> > > 
> > > 
> > > I assume register access is aborted, because the IP core clock is not
> > > enabled. In this case we have bigger problem then just probe.
> > 
> > If by IP core clock you mean the clock which driver is getting, then
> > answer is no. This clock is enabled.
> > 
> > > Since this driver support runtime power management, the clock will be
> > > disabled as soon as transfer is done. It means, on shared interrupt, we
> > > will get in trouble even if there is no active transfer.
> > 
> > The driver's runtime PM plays only with this one clock, so it seems
> > you meant i2c_imx->clk. It is not this problem.
> > 
> > > 
> > > So, probably the only way to fix it, is to check in i2c_imx_isr() if the
> > > HW is expected to be active and register access should be save.
> > 
> > Checking in every interrupt whether the interrupt should be serviced
> > based on some SW flag because HW might be disabled? That looks unusual,
> > like a hack.
> > 
> > No, the interrupt should be registered when the driver and some other
> > pieces of HW are ready to service it.
> 
> OK.
> please make sure, irq is probed before calling
> i2c_add_numbered_adapter(). This will trigger deferred probing of
> slave devices. Since the irq handler will be added later, tx completion
> of some requests will be lost or fail.

Right. I'll move the devm_request_irq().

Best regards,
Krzysztof
Wolfram Sang June 12, 2020, 9:05 a.m. UTC | #5
On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),

That code is disabled since 2011 (6d83f94db95c ("genirq: Disable the
SHIRQ_DEBUG call in request_threaded_irq for now"))? So, you had this
without fake injection, I assume?
Krzysztof Kozlowski June 12, 2020, 9:29 a.m. UTC | #6
On Fri, Jun 12, 2020 at 11:05:17AM +0200, Wolfram Sang wrote:
> On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> > If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
> 
> That code is disabled since 2011 (6d83f94db95c ("genirq: Disable the
> SHIRQ_DEBUG call in request_threaded_irq for now"))? So, you had this
> without fake injection, I assume?

No, I observed it only after enabling DEBUG_SHIRQ (to a kernel with
some debugging options already).

Best regards,
Krzysztof
Wolfram Sang June 12, 2020, 9:56 a.m. UTC | #7
On Fri, Jun 12, 2020 at 11:29:41AM +0200, Krzysztof Kozlowski wrote:
> On Fri, Jun 12, 2020 at 11:05:17AM +0200, Wolfram Sang wrote:
> > On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> > > If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
> > 
> > That code is disabled since 2011 (6d83f94db95c ("genirq: Disable the
> > SHIRQ_DEBUG call in request_threaded_irq for now"))? So, you had this
> > without fake injection, I assume?
> 
> No, I observed it only after enabling DEBUG_SHIRQ (to a kernel with
> some debugging options already).

Interesting. Maybe probe was deferred and you got the extra irq when
deregistering?
Krzysztof Kozlowski June 12, 2020, 10:21 a.m. UTC | #8
On Fri, Jun 12, 2020 at 11:56:04AM +0200, Wolfram Sang wrote:
> On Fri, Jun 12, 2020 at 11:29:41AM +0200, Krzysztof Kozlowski wrote:
> > On Fri, Jun 12, 2020 at 11:05:17AM +0200, Wolfram Sang wrote:
> > > On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> > > > If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
> > > 
> > > That code is disabled since 2011 (6d83f94db95c ("genirq: Disable the
> > > SHIRQ_DEBUG call in request_threaded_irq for now"))? So, you had this
> > > without fake injection, I assume?
> > 
> > No, I observed it only after enabling DEBUG_SHIRQ (to a kernel with
> > some debugging options already).
> 
> Interesting. Maybe probe was deferred and you got the extra irq when
> deregistering?

Yes, good catch. The abort happens right after deferred probe exit.  It
could be then different reason than I thought - the interrupt is freed
through devm infrastructure quite late.  At this time, the clock might
be indeed disabled (error path of probe()).

Best regards,
Krzysztof
Oleksij Rempel June 12, 2020, 10:31 a.m. UTC | #9
On Fri, Jun 12, 2020 at 12:21:13PM +0200, Krzysztof Kozlowski wrote:
> On Fri, Jun 12, 2020 at 11:56:04AM +0200, Wolfram Sang wrote:
> > On Fri, Jun 12, 2020 at 11:29:41AM +0200, Krzysztof Kozlowski wrote:
> > > On Fri, Jun 12, 2020 at 11:05:17AM +0200, Wolfram Sang wrote:
> > > > On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> > > > > If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
> > > > 
> > > > That code is disabled since 2011 (6d83f94db95c ("genirq: Disable the
> > > > SHIRQ_DEBUG call in request_threaded_irq for now"))? So, you had this
> > > > without fake injection, I assume?
> > > 
> > > No, I observed it only after enabling DEBUG_SHIRQ (to a kernel with
> > > some debugging options already).
> > 
> > Interesting. Maybe probe was deferred and you got the extra irq when
> > deregistering?
> 
> Yes, good catch. The abort happens right after deferred probe exit.  It
> could be then different reason than I thought - the interrupt is freed
> through devm infrastructure quite late.  At this time, the clock might
> be indeed disabled (error path of probe()).

This line looks suspicious to me:
 Unhandled fault: external abort on non-linefetch (0x1008) at 0x8882d003

0x8882d003 looks like not initialized pointer.
The only not initialized value at devm_request_irq stage is i2c_imx->queue.
Marc Kleine-Budde June 12, 2020, 10:34 a.m. UTC | #10
On 6/12/20 12:31 PM, Oleksij Rempel wrote:
> On Fri, Jun 12, 2020 at 12:21:13PM +0200, Krzysztof Kozlowski wrote:
>> On Fri, Jun 12, 2020 at 11:56:04AM +0200, Wolfram Sang wrote:
>>> On Fri, Jun 12, 2020 at 11:29:41AM +0200, Krzysztof Kozlowski wrote:
>>>> On Fri, Jun 12, 2020 at 11:05:17AM +0200, Wolfram Sang wrote:
>>>>> On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
>>>>>> If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
>>>>>
>>>>> That code is disabled since 2011 (6d83f94db95c ("genirq: Disable the
>>>>> SHIRQ_DEBUG call in request_threaded_irq for now"))? So, you had this
>>>>> without fake injection, I assume?
>>>>
>>>> No, I observed it only after enabling DEBUG_SHIRQ (to a kernel with
>>>> some debugging options already).
>>>
>>> Interesting. Maybe probe was deferred and you got the extra irq when
>>> deregistering?
>>
>> Yes, good catch. The abort happens right after deferred probe exit.  It
>> could be then different reason than I thought - the interrupt is freed
>> through devm infrastructure quite late.  At this time, the clock might
>> be indeed disabled (error path of probe()).

From my point of view, the clocks are disabled as Oleksij pointed out, due to
RUNTIME_PM at the end of probe():

> 	pm_runtime_mark_last_busy(&pdev->dev);
> 	pm_runtime_put_autosuspend(&pdev->dev);

Marc
Krzysztof Kozlowski June 12, 2020, 10:39 a.m. UTC | #11
On Fri, Jun 12, 2020 at 12:31:49PM +0200, Oleksij Rempel wrote:
> On Fri, Jun 12, 2020 at 12:21:13PM +0200, Krzysztof Kozlowski wrote:
> > On Fri, Jun 12, 2020 at 11:56:04AM +0200, Wolfram Sang wrote:
> > > On Fri, Jun 12, 2020 at 11:29:41AM +0200, Krzysztof Kozlowski wrote:
> > > > On Fri, Jun 12, 2020 at 11:05:17AM +0200, Wolfram Sang wrote:
> > > > > On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> > > > > > If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
> > > > > 
> > > > > That code is disabled since 2011 (6d83f94db95c ("genirq: Disable the
> > > > > SHIRQ_DEBUG call in request_threaded_irq for now"))? So, you had this
> > > > > without fake injection, I assume?
> > > > 
> > > > No, I observed it only after enabling DEBUG_SHIRQ (to a kernel with
> > > > some debugging options already).
> > > 
> > > Interesting. Maybe probe was deferred and you got the extra irq when
> > > deregistering?
> > 
> > Yes, good catch. The abort happens right after deferred probe exit.  It
> > could be then different reason than I thought - the interrupt is freed
> > through devm infrastructure quite late.  At this time, the clock might
> > be indeed disabled (error path of probe()).
> 
> This line looks suspicious to me:
>  Unhandled fault: external abort on non-linefetch (0x1008) at 0x8882d003
> 
> 0x8882d003 looks like not initialized pointer.
> The only not initialized value at devm_request_irq stage is i2c_imx->queue.

The queue should be good at this time because it is part of i2c_imx
which is allocated before interrupt (so freed after interrupt).

Like Wolfram suggested, the interrupt comes because of deferred probe.
The only solution would be to free the IRQ in error path... and in
driver remove.

This basically kills the concept of devm for interrupts. Some other
drivers experience exactly the same pattern. I now reproduced it on
unbind of dspi driver of VF5xx:

echo 4002d000.spi > /sys/devices/platform/soc/40000000.bus/4002d000.spi/driver/unbind
[  218.391867] Unhandled fault: external abort on non-linefetch (0x1008) at 0x8887f02c
...
[  218.754493] [<806185c4>] (regmap_mmio_read32le) from [<8061885c>] (regmap_mmio_read+0x48/0x68)
[  218.820049] [<80678c64>] (dspi_interrupt) from [<8017acec>] (free_irq+0x26c/0x3cc)
[  218.827853]  r5:86312200 r4:85a71d40
[  218.831602] [<8017aa80>] (free_irq) from [<8017dcec>] (devm_irq_release+0x1c/0x20)
[  218.839420]  r10:805f91fc r9:8630dac8 r8:8630dac8 r7:805f9214 r6:8630d810 r5:85a54780
[  218.847468]  r4:85a54800
[  218.850152] [<8017dcd0>] (devm_irq_release) from [<805f98ec>] (release_nodes+0x1e4/0x298)

Best regards,
Krzysztof
Krzysztof Kozlowski June 12, 2020, 10:44 a.m. UTC | #12
On Fri, Jun 12, 2020 at 12:34:47PM +0200, Marc Kleine-Budde wrote:
> On 6/12/20 12:31 PM, Oleksij Rempel wrote:
> > On Fri, Jun 12, 2020 at 12:21:13PM +0200, Krzysztof Kozlowski wrote:
> >> On Fri, Jun 12, 2020 at 11:56:04AM +0200, Wolfram Sang wrote:
> >>> On Fri, Jun 12, 2020 at 11:29:41AM +0200, Krzysztof Kozlowski wrote:
> >>>> On Fri, Jun 12, 2020 at 11:05:17AM +0200, Wolfram Sang wrote:
> >>>>> On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
> >>>>>> If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
> >>>>>
> >>>>> That code is disabled since 2011 (6d83f94db95c ("genirq: Disable the
> >>>>> SHIRQ_DEBUG call in request_threaded_irq for now"))? So, you had this
> >>>>> without fake injection, I assume?
> >>>>
> >>>> No, I observed it only after enabling DEBUG_SHIRQ (to a kernel with
> >>>> some debugging options already).
> >>>
> >>> Interesting. Maybe probe was deferred and you got the extra irq when
> >>> deregistering?
> >>
> >> Yes, good catch. The abort happens right after deferred probe exit.  It
> >> could be then different reason than I thought - the interrupt is freed
> >> through devm infrastructure quite late.  At this time, the clock might
> >> be indeed disabled (error path of probe()).
> 
> From my point of view, the clocks are disabled as Oleksij pointed out, due to
> RUNTIME_PM at the end of probe():
> 
> > 	pm_runtime_mark_last_busy(&pdev->dev);
> > 	pm_runtime_put_autosuspend(&pdev->dev);

These lines come from regular successful probe path, not deferred error path.

The clock is indeed disabled but not because of runtime PM, but:
clk_disable:
	clk_disable_unprepare(i2c_imx->clk);


Best regards,
Krzysztof
Marc Kleine-Budde June 12, 2020, 10:50 a.m. UTC | #13
On 6/12/20 12:44 PM, Krzysztof Kozlowski wrote:
> On Fri, Jun 12, 2020 at 12:34:47PM +0200, Marc Kleine-Budde wrote:
>> On 6/12/20 12:31 PM, Oleksij Rempel wrote:
>>> On Fri, Jun 12, 2020 at 12:21:13PM +0200, Krzysztof Kozlowski wrote:
>>>> On Fri, Jun 12, 2020 at 11:56:04AM +0200, Wolfram Sang wrote:
>>>>> On Fri, Jun 12, 2020 at 11:29:41AM +0200, Krzysztof Kozlowski wrote:
>>>>>> On Fri, Jun 12, 2020 at 11:05:17AM +0200, Wolfram Sang wrote:
>>>>>>> On Wed, Jun 10, 2020 at 03:46:42PM +0200, Krzysztof Kozlowski wrote:
>>>>>>>> If interrupt comes early (could be triggered with CONFIG_DEBUG_SHIRQ),
>>>>>>>
>>>>>>> That code is disabled since 2011 (6d83f94db95c ("genirq: Disable the
>>>>>>> SHIRQ_DEBUG call in request_threaded_irq for now"))? So, you had this
>>>>>>> without fake injection, I assume?
>>>>>>
>>>>>> No, I observed it only after enabling DEBUG_SHIRQ (to a kernel with
>>>>>> some debugging options already).
>>>>>
>>>>> Interesting. Maybe probe was deferred and you got the extra irq when
>>>>> deregistering?
>>>>
>>>> Yes, good catch. The abort happens right after deferred probe exit.  It
>>>> could be then different reason than I thought - the interrupt is freed
>>>> through devm infrastructure quite late.  At this time, the clock might
>>>> be indeed disabled (error path of probe()).
>>
>> From my point of view, the clocks are disabled as Oleksij pointed out, due to
>> RUNTIME_PM at the end of probe():
>>
>>> 	pm_runtime_mark_last_busy(&pdev->dev);
>>> 	pm_runtime_put_autosuspend(&pdev->dev);
> 
> These lines come from regular successful probe path, not deferred error path.
> 
> The clock is indeed disabled but not because of runtime PM, but:
> clk_disable:
> 	clk_disable_unprepare(i2c_imx->clk);

ACK.

I think your analysis is correct: devm for shared IRQs does not work for IP
cores that need enabled clocks for register access.

And from Oleksij's comment we can conclude that runtime_pm and shared IRQ
handlers are also tricky for these IP cores :(

Marc
Wolfram Sang June 12, 2020, 11:51 a.m. UTC | #14
> This basically kills the concept of devm for interrupts. Some other

It only works when you can ensure you have all interrupts disabled (and
none pending) in remove() or the error paths of probe() etc.
Marc Kleine-Budde June 12, 2020, 12:18 p.m. UTC | #15
On 6/12/20 1:51 PM, Wolfram Sang wrote:
> 
>> This basically kills the concept of devm for interrupts. Some other
> 
> It only works when you can ensure you have all interrupts disabled (and
> none pending) in remove() or the error paths of probe() etc.

But when requesting the interrupt as shared the interrupt handler can get called
any time, even if you have disabled the IRQ source in your IP core....The shared
IRQ debug code tests this.

Marc
Wolfram Sang June 12, 2020, 1 p.m. UTC | #16
On Fri, Jun 12, 2020 at 02:18:06PM +0200, Marc Kleine-Budde wrote:
> On 6/12/20 1:51 PM, Wolfram Sang wrote:
> > 
> >> This basically kills the concept of devm for interrupts. Some other
> > 
> > It only works when you can ensure you have all interrupts disabled (and
> > none pending) in remove() or the error paths of probe() etc.
> 
> But when requesting the interrupt as shared the interrupt handler can get called
> any time, even if you have disabled the IRQ source in your IP core....The shared
> IRQ debug code tests this.

Yes, so you'd need something like

	if (clks_are_off)
		return IRQ_NONE;

or skip devm_ for interrupts and handle it manually. (IIRC the input
subsystem really frowns upon devm + irqs for such reasons)

D'accord?
Krzysztof Kozlowski June 12, 2020, 1:09 p.m. UTC | #17
On Fri, Jun 12, 2020 at 03:00:03PM +0200, Wolfram Sang wrote:
> On Fri, Jun 12, 2020 at 02:18:06PM +0200, Marc Kleine-Budde wrote:
> > On 6/12/20 1:51 PM, Wolfram Sang wrote:
> > > 
> > >> This basically kills the concept of devm for interrupts. Some other
> > > 
> > > It only works when you can ensure you have all interrupts disabled (and
> > > none pending) in remove() or the error paths of probe() etc.
> > 
> > But when requesting the interrupt as shared the interrupt handler can get called
> > any time, even if you have disabled the IRQ source in your IP core....The shared
> > IRQ debug code tests this.
> 
> Yes, so you'd need something like
> 
> 	if (clks_are_off)
> 		return IRQ_NONE;

Maybe then:
	if (pm_runtime_enabled())?

The device structure should be valid at this point so the call should
work.


> 
> or skip devm_ for interrupts and handle it manually. (IIRC the input
> subsystem really frowns upon devm + irqs for such reasons)
> 
> D'accord?

I guess dream of managing every resource automatically is an utopia :)

Best regards,
Krzysztof
Dong Aisheng June 15, 2020, 7:37 a.m. UTC | #18
> From: Wolfram Sang <wsa@kernel.org>
> Sent: Friday, June 12, 2020 9:00 PM
> 
> On Fri, Jun 12, 2020 at 02:18:06PM +0200, Marc Kleine-Budde wrote:
> > On 6/12/20 1:51 PM, Wolfram Sang wrote:
> > >
> > >> This basically kills the concept of devm for interrupts. Some other
> > >
> > > It only works when you can ensure you have all interrupts disabled
> > > (and none pending) in remove() or the error paths of probe() etc.
> >
> > But when requesting the interrupt as shared the interrupt handler can
> > get called any time, even if you have disabled the IRQ source in your
> > IP core....The shared IRQ debug code tests this.
> 
> Yes, so you'd need something like
> 
> 	if (clks_are_off)
> 		return IRQ_NONE;
> 

I understand this. But ..

> or skip devm_ for interrupts and handle it manually. (IIRC the input subsystem
> really frowns upon devm + irqs for such reasons)
> 
> D'accord?

Handle it manually looks to me can only address the issue for probe error path.
But how can it handle the normal running cases that shared irq arrived when device
is in runtime suspend state?

Regards
Aisheng
diff mbox series

Patch

diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c
index 0ab5381aa012..e28a39f4840f 100644
--- a/drivers/i2c/busses/i2c-imx.c
+++ b/drivers/i2c/busses/i2c-imx.c
@@ -1171,14 +1171,6 @@  static int i2c_imx_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	/* Request IRQ */
-	ret = devm_request_irq(&pdev->dev, irq, i2c_imx_isr, IRQF_SHARED,
-				pdev->name, i2c_imx);
-	if (ret) {
-		dev_err(&pdev->dev, "can't claim irq %d\n", irq);
-		goto clk_disable;
-	}
-
 	/* Init queue */
 	init_waitqueue_head(&i2c_imx->queue);
 
@@ -1223,6 +1215,14 @@  static int i2c_imx_probe(struct platform_device *pdev)
 	if (ret < 0)
 		goto clk_notifier_unregister;
 
+	/* Request IRQ */
+	ret = devm_request_irq(&pdev->dev, irq, i2c_imx_isr, IRQF_SHARED,
+				pdev->name, i2c_imx);
+	if (ret) {
+		dev_err(&pdev->dev, "can't claim irq %d\n", irq);
+		goto i2c_del_adapter;
+	}
+
 	pm_runtime_mark_last_busy(&pdev->dev);
 	pm_runtime_put_autosuspend(&pdev->dev);
 
@@ -1237,6 +1237,8 @@  static int i2c_imx_probe(struct platform_device *pdev)
 
 	return 0;   /* Return OK */
 
+i2c_del_adapter:
+	i2c_del_adapter(&i2c_imx->adapter);
 clk_notifier_unregister:
 	clk_notifier_unregister(i2c_imx->clk, &i2c_imx->clk_change_nb);
 rpm_disable:
@@ -1244,8 +1246,6 @@  static int i2c_imx_probe(struct platform_device *pdev)
 	pm_runtime_disable(&pdev->dev);
 	pm_runtime_set_suspended(&pdev->dev);
 	pm_runtime_dont_use_autosuspend(&pdev->dev);
-
-clk_disable:
 	clk_disable_unprepare(i2c_imx->clk);
 	return ret;
 }