diff mbox series

libata: Use per port sync for detach

Message ID 20200515110916.11556-1-kai.heng.feng@canonical.com
State Not Applicable
Delegated to: David Miller
Headers show
Series libata: Use per port sync for detach | expand

Commit Message

Kai-Heng Feng May 15, 2020, 11:09 a.m. UTC
Commit 130f4caf145c ("libata: Ensure ata_port probe has completed before
detach") may cause system freeze during suspend.

Using async_synchronize_full() in PM callbacks is wrong, since async
callbacks that are already scheduled may wait for not-yet-scheduled
callbacks, causes a circular dependency.

Instead of using big hammer like async_synchronize_full(), use async
cookie to make sure port probe are synced, without affecting other
scheduled PM callbacks.

Fixes: 130f4caf145c ("libata: Ensure ata_port probe has completed before detach")
BugLink: https://bugs.launchpad.net/bugs/1867983
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
 drivers/ata/libata-core.c | 6 +++---
 include/linux/libata.h    | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

Comments

John Garry May 15, 2020, 12:38 p.m. UTC | #1
On 15/05/2020 12:09, Kai-Heng Feng wrote:
> Commit 130f4caf145c ("libata: Ensure ata_port probe has completed before
> detach") may cause system freeze during suspend.
> 
> Using async_synchronize_full() in PM callbacks is wrong, since async
> callbacks that are already scheduled may wait for not-yet-scheduled
> callbacks, causes a circular dependency.

It would be nice to describe this circular dependency a bit more.

> 
> Instead of using big hammer like async_synchronize_full(), use async
> cookie to make sure port probe are synced, without affecting other
> scheduled PM callbacks.

oh, I thought that we could avoid the hassle of per-port cookie 
management. Sorry.

Did you check if the original issue which I tried to remedy is still 
suppressed?

I tried your patch, and got this:

[   28.190587] ------------[ cut here ]------------
[   28.195194] WARNING: CPU: 79 PID: 1 at drivers/ata/libata-core.c:5888 
ata_hos
t_detach+0x238/0x248
[   28.204025] Modules linked in:
[   28.207072] CPU: 79 PID: 1 Comm: swapper/0 Tainted: G        W 
  5.7.0-
rc5-g644cd6f-dirty #84
[   28.216076] Hardware name: Huawei TaiShan 2280 V2/BC82AMDC, BIOS 
2280-V2 CS V
3.B220.02 03/27/2020
[   28.224906] pstate: 20c00009 (nzCv daif +PAN +UAO)
[   28.229677] pc : ata_host_detach+0x238/0x248
[   28.233929] lr : ata_host_detach+0x12c/0x248
[   28.238181] sp : ffff0026dc74f980
[   28.241481] x29: ffff0026dc74f980 x28: 0000000000000000
[   28.246769] x27: ffff2026c541c010 x26: 00000000000038e0
[   28.252059] x25: 0000000000003590 x24: ffff0026d3a0a018
[   28.257350] x23: 0000000000000001 x22: ffff0026d3a0a000
[   28.262638] x21: 0000000000013ec0 x20: ffff2026c541c000
[   28.267928] x19: ffff2026c541c020 x18: 0000000000000000
[   28.273215] x17: 0000000000000000 x16: 0000000000000000
[   28.278504] x15: 00000000000006c0 x14: 0000000000000000
[   28.283792] x13: 0000000000000000 x12: 1fffe004da744317
[   28.289081] x11: ffff8004da744317 x10: dfffa00000000000
[   28.294370] x9 : ffff8004da744318 x8 : ffff0026d3a218bc
[   28.299657] x7 : 0000000000000001 x6 : ffff8004da744318
[   28.304945] x5 : ffff8004da744318 x4 : dfffa00000000000
[   28.310233] x3 : ffffa00075ea18b4 x2 : 0000000000000003
[   28.315521] x1 : 0000000000000000 x0 : 0000000000400200
[   28.320808] Call trace:
[   28.323246]  ata_host_detach+0x238/0x248
[   28.327153]  ata_pci_remove_one+0x24/0x38
[   28.331147]  ahci_remove_one+0x54/0x88
[   28.334881]  pci_device_remove+0x70/0x148
[   28.338874]  really_probe+0x17c/0x570
[   28.342522]  driver_probe_device+0x80/0x150
[   28.346690]  device_driver_attach+0x9c/0xa8
[   28.350856]  __driver_attach+0xac/0x118
[   28.354677]  bus_for_each_dev+0xf0/0x168
[   28.358584]  driver_attach+0x34/0x48
[   28.362146]  bus_add_driver+0x240/0x300
[   28.365966]  driver_register+0xc0/0x1e0
[   28.369787]  __pci_register_driver+0xb4/0xd0
[   28.374039]  ahci_pci_driver_init+0x24/0x30
[   28.378205]  do_one_initcall+0xb8/0x268
[   28.382027]  kernel_init_freeable+0x294/0x30c
[   28.386366]  kernel_init+0x14/0x120
[   28.389841]  ret_from_fork+0x10/0x1c
[   28.393400] ---[ end trace 9972785c7052048f ]---
[   28.435826] ahci 0000:b4:03.0: SSS flag set, parallel bus scan disabled


Thanks,
John

> 
> Fixes: 130f4caf145c ("libata: Ensure ata_port probe has completed before detach")
> BugLink: https://bugs.launchpad.net/bugs/1867983
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ---
>   drivers/ata/libata-core.c | 6 +++---
>   include/linux/libata.h    | 3 +++
>   2 files changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
> index beca5f91bb4c..4a698f6388cd 100644
> --- a/drivers/ata/libata-core.c
> +++ b/drivers/ata/libata-core.c
> @@ -42,7 +42,6 @@
>   #include <linux/workqueue.h>
>   #include <linux/scatterlist.h>
>   #include <linux/io.h>
> -#include <linux/async.h>
>   #include <linux/log2.h>
>   #include <linux/slab.h>
>   #include <linux/glob.h>
> @@ -5778,7 +5777,7 @@ int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
>   	/* perform each probe asynchronously */
>   	for (i = 0; i < host->n_ports; i++) {
>   		struct ata_port *ap = host->ports[i];
> -		async_schedule(async_port_probe, ap);
> +		ap->cookie = async_schedule(async_port_probe, ap);
>   	}
>   
>   	return 0;
> @@ -5921,7 +5920,8 @@ void ata_host_detach(struct ata_host *host)
>   	int i;
>   
>   	/* Ensure ata_port probe has completed */
> -	async_synchronize_full();
> +	for (i = 0; i < host->n_ports; i++)
> +		async_synchronize_cookie(host->ports[i]->cookie);
>   
>   	for (i = 0; i < host->n_ports; i++)

Is it possible to combine these "for" loops?

>   		ata_port_detach(host->ports[i]);
> diff --git a/include/linux/libata.h b/include/linux/libata.h
> index cffa4714bfa8..ae6dfc107ea8 100644
> --- a/include/linux/libata.h
> +++ b/include/linux/libata.h
> @@ -22,6 +22,7 @@
>   #include <linux/acpi.h>
>   #include <linux/cdrom.h>
>   #include <linux/sched.h>
> +#include <linux/async.h>
>   

alphabetic?

>   /*
>    * Define if arch has non-standard setup.  This is a _PCI_ standard
> @@ -872,6 +873,8 @@ struct ata_port {
>   	struct timer_list	fastdrain_timer;
>   	unsigned long		fastdrain_cnt;
>   
> +	async_cookie_t		cookie;
> +
>   	int			em_message_type;
>   	void			*private_data;
>   
>
Kai-Heng Feng May 15, 2020, 5:48 p.m. UTC | #2
> On May 15, 2020, at 20:38, John Garry <john.garry@huawei.com> wrote:
> 
> On 15/05/2020 12:09, Kai-Heng Feng wrote:
>> Commit 130f4caf145c ("libata: Ensure ata_port probe has completed before
>> detach") may cause system freeze during suspend.
>> Using async_synchronize_full() in PM callbacks is wrong, since async
>> callbacks that are already scheduled may wait for not-yet-scheduled
>> callbacks, causes a circular dependency.
> 
> It would be nice to describe this circular dependency a bit more.

Hmm, I think it's quite self-explanatory. Which part do you want me to add? 

> 
>> Instead of using big hammer like async_synchronize_full(), use async
>> cookie to make sure port probe are synced, without affecting other
>> scheduled PM callbacks.
> 
> oh, I thought that we could avoid the hassle of per-port cookie management. Sorry.
> 
> Did you check if the original issue which I tried to remedy is still suppressed?
> 
> I tried your patch, and got this:
> 
> [   28.190587] ------------[ cut here ]------------
> [   28.195194] WARNING: CPU: 79 PID: 1 at drivers/ata/libata-core.c:5888 ata_hos
> t_detach+0x238/0x248
> [   28.204025] Modules linked in:
> [   28.207072] CPU: 79 PID: 1 Comm: swapper/0 Tainted: G        W  5.7.0-
> rc5-g644cd6f-dirty #84
> [   28.216076] Hardware name: Huawei TaiShan 2280 V2/BC82AMDC, BIOS 2280-V2 CS V
> 3.B220.02 03/27/2020
> [   28.224906] pstate: 20c00009 (nzCv daif +PAN +UAO)
> [   28.229677] pc : ata_host_detach+0x238/0x248
> [   28.233929] lr : ata_host_detach+0x12c/0x248
> [   28.238181] sp : ffff0026dc74f980
> [   28.241481] x29: ffff0026dc74f980 x28: 0000000000000000
> [   28.246769] x27: ffff2026c541c010 x26: 00000000000038e0
> [   28.252059] x25: 0000000000003590 x24: ffff0026d3a0a018
> [   28.257350] x23: 0000000000000001 x22: ffff0026d3a0a000
> [   28.262638] x21: 0000000000013ec0 x20: ffff2026c541c000
> [   28.267928] x19: ffff2026c541c020 x18: 0000000000000000
> [   28.273215] x17: 0000000000000000 x16: 0000000000000000
> [   28.278504] x15: 00000000000006c0 x14: 0000000000000000
> [   28.283792] x13: 0000000000000000 x12: 1fffe004da744317
> [   28.289081] x11: ffff8004da744317 x10: dfffa00000000000
> [   28.294370] x9 : ffff8004da744318 x8 : ffff0026d3a218bc
> [   28.299657] x7 : 0000000000000001 x6 : ffff8004da744318
> [   28.304945] x5 : ffff8004da744318 x4 : dfffa00000000000
> [   28.310233] x3 : ffffa00075ea18b4 x2 : 0000000000000003
> [   28.315521] x1 : 0000000000000000 x0 : 0000000000400200
> [   28.320808] Call trace:
> [   28.323246]  ata_host_detach+0x238/0x248
> [   28.327153]  ata_pci_remove_one+0x24/0x38
> [   28.331147]  ahci_remove_one+0x54/0x88
> [   28.334881]  pci_device_remove+0x70/0x148
> [   28.338874]  really_probe+0x17c/0x570
> [   28.342522]  driver_probe_device+0x80/0x150
> [   28.346690]  device_driver_attach+0x9c/0xa8
> [   28.350856]  __driver_attach+0xac/0x118
> [   28.354677]  bus_for_each_dev+0xf0/0x168
> [   28.358584]  driver_attach+0x34/0x48
> [   28.362146]  bus_add_driver+0x240/0x300
> [   28.365966]  driver_register+0xc0/0x1e0
> [   28.369787]  __pci_register_driver+0xb4/0xd0
> [   28.374039]  ahci_pci_driver_init+0x24/0x30
> [   28.378205]  do_one_initcall+0xb8/0x268
> [   28.382027]  kernel_init_freeable+0x294/0x30c
> [   28.386366]  kernel_init+0x14/0x120
> [   28.389841]  ret_from_fork+0x10/0x1c
> [   28.393400] ---[ end trace 9972785c7052048f ]---
> [   28.435826] ahci 0000:b4:03.0: SSS flag set, parallel bus scan disabled

Can you please test the following patch:

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 474c6c34fe02..51ee0cc4d414 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -3583,8 +3583,10 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
        rc = 0;
 
        /* if UNLOADING, finish immediately */
-       if (ap->pflags & ATA_PFLAG_UNLOADING)
+       if (ap->pflags & ATA_PFLAG_UNLOADING) {
+               ap->pflags |= ATA_PFLAG_UNLOADED;
                goto out;
+       }

It's only compile-tested, many drivers panic with CONFIG_DEBUG_TEST_DRIVER_REMOVE enabled, so the system I have can't even boot properly :(

Probably worth some time to fix them one by one...

Kai-Heng

> 
> 
> Thanks,
> John
> 
>> Fixes: 130f4caf145c ("libata: Ensure ata_port probe has completed before detach")
>> BugLink: https://bugs.launchpad.net/bugs/1867983
>> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
>> ---
>>  drivers/ata/libata-core.c | 6 +++---
>>  include/linux/libata.h    | 3 +++
>>  2 files changed, 6 insertions(+), 3 deletions(-)
>> diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
>> index beca5f91bb4c..4a698f6388cd 100644
>> --- a/drivers/ata/libata-core.c
>> +++ b/drivers/ata/libata-core.c
>> @@ -42,7 +42,6 @@
>>  #include <linux/workqueue.h>
>>  #include <linux/scatterlist.h>
>>  #include <linux/io.h>
>> -#include <linux/async.h>
>>  #include <linux/log2.h>
>>  #include <linux/slab.h>
>>  #include <linux/glob.h>
>> @@ -5778,7 +5777,7 @@ int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
>>  	/* perform each probe asynchronously */
>>  	for (i = 0; i < host->n_ports; i++) {
>>  		struct ata_port *ap = host->ports[i];
>> -		async_schedule(async_port_probe, ap);
>> +		ap->cookie = async_schedule(async_port_probe, ap);
>>  	}
>>    	return 0;
>> @@ -5921,7 +5920,8 @@ void ata_host_detach(struct ata_host *host)
>>  	int i;
>>    	/* Ensure ata_port probe has completed */
>> -	async_synchronize_full();
>> +	for (i = 0; i < host->n_ports; i++)
>> +		async_synchronize_cookie(host->ports[i]->cookie);
>>    	for (i = 0; i < host->n_ports; i++)
> 
> Is it possible to combine these "for" loops?
> 
>>  		ata_port_detach(host->ports[i]);
>> diff --git a/include/linux/libata.h b/include/linux/libata.h
>> index cffa4714bfa8..ae6dfc107ea8 100644
>> --- a/include/linux/libata.h
>> +++ b/include/linux/libata.h
>> @@ -22,6 +22,7 @@
>>  #include <linux/acpi.h>
>>  #include <linux/cdrom.h>
>>  #include <linux/sched.h>
>> +#include <linux/async.h>
>>  
> 
> alphabetic?
> 
>>  /*
>>   * Define if arch has non-standard setup.  This is a _PCI_ standard
>> @@ -872,6 +873,8 @@ struct ata_port {
>>  	struct timer_list	fastdrain_timer;
>>  	unsigned long		fastdrain_cnt;
>>  +	async_cookie_t		cookie;
>> +
>>  	int			em_message_type;
>>  	void			*private_data;
John Garry May 18, 2020, 9:06 a.m. UTC | #3
On 15/05/2020 18:48, Kai-Heng Feng wrote:
>> 841]  ret_from_fork+0x10/0x1c
>> [   28.393400] ---[ end trace 9972785c7052048f ]---
>> [   28.435826] ahci 0000:b4:03.0: SSS flag set, parallel bus scan disabled
> Can you please test the following patch:
> 
> diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
> index 474c6c34fe02..51ee0cc4d414 100644
> --- a/drivers/ata/libata-eh.c
> +++ b/drivers/ata/libata-eh.c
> @@ -3583,8 +3583,10 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
>          rc = 0;
>   
>          /* if UNLOADING, finish immediately */
> -       if (ap->pflags & ATA_PFLAG_UNLOADING)
> +       if (ap->pflags & ATA_PFLAG_UNLOADING) {
> +               ap->pflags |= ATA_PFLAG_UNLOADED;
>                  goto out;
> +       }
> 
> It's only compile-tested, many drivers panic with CONFIG_DEBUG_TEST_DRIVER_REMOVE enabled, so the system I have can't even boot properly:(

ok, let me debug this today.

Thanks,
John
John Garry May 18, 2020, 3:21 p.m. UTC | #4
On 18/05/2020 10:06, John Garry wrote:
> On 15/05/2020 18:48, Kai-Heng Feng wrote:
>>> 841]  ret_from_fork+0x10/0x1c
>>> [   28.393400] ---[ end trace 9972785c7052048f ]---
>>> [   28.435826] ahci 0000:b4:03.0: SSS flag set, parallel bus scan 
>>> disabled
>> Can you please test the following patch:
>>
>> diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
>> index 474c6c34fe02..51ee0cc4d414 100644
>> --- a/drivers/ata/libata-eh.c
>> +++ b/drivers/ata/libata-eh.c
>> @@ -3583,8 +3583,10 @@ int ata_eh_recover(struct ata_port *ap, 
>> ata_prereset_fn_t prereset,
>>          rc = 0;
>>          /* if UNLOADING, finish immediately */
>> -       if (ap->pflags & ATA_PFLAG_UNLOADING)
>> +       if (ap->pflags & ATA_PFLAG_UNLOADING) {
>> +               ap->pflags |= ATA_PFLAG_UNLOADED;
>>                  goto out;
>> +       }
>>
>> It's only compile-tested, many drivers panic with 
>> CONFIG_DEBUG_TEST_DRIVER_REMOVE enabled, so the system I have can't 
>> even boot properly:(
> 

According to the comment for async_synchronize_cookie(), we sync upto 
(but excluding) the cookie:

/**
  * async_synchronize_cookie - synchronize asynchronous function calls 
with cookie checkpointing
  *
  * This function waits until all asynchronous function calls prior to 
@cookie
  * have been done.

So maybe it should be:

+	for (i = 0; i < host->n_ports; i++)
+		async_synchronize_cookie(host->ports[i]->cookie + 1);

That is how other callsites use this API, and that change resolves the 
WARN for me.

Thanks,
John
Kai-Heng Feng May 19, 2020, 6:09 a.m. UTC | #5
Hi John,

> On May 18, 2020, at 23:21, John Garry <john.garry@huawei.com> wrote:
> 
> On 18/05/2020 10:06, John Garry wrote:
>> On 15/05/2020 18:48, Kai-Heng Feng wrote:
>>>> 841]  ret_from_fork+0x10/0x1c
>>>> [   28.393400] ---[ end trace 9972785c7052048f ]---
>>>> [   28.435826] ahci 0000:b4:03.0: SSS flag set, parallel bus scan disabled
>>> Can you please test the following patch:
>>> 
>>> diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
>>> index 474c6c34fe02..51ee0cc4d414 100644
>>> --- a/drivers/ata/libata-eh.c
>>> +++ b/drivers/ata/libata-eh.c
>>> @@ -3583,8 +3583,10 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
>>>          rc = 0;
>>>          /* if UNLOADING, finish immediately */
>>> -       if (ap->pflags & ATA_PFLAG_UNLOADING)
>>> +       if (ap->pflags & ATA_PFLAG_UNLOADING) {
>>> +               ap->pflags |= ATA_PFLAG_UNLOADED;
>>>                  goto out;
>>> +       }
>>> 
>>> It's only compile-tested, many drivers panic with CONFIG_DEBUG_TEST_DRIVER_REMOVE enabled, so the system I have can't even boot properly:(
> 
> According to the comment for async_synchronize_cookie(), we sync upto (but excluding) the cookie:
> 
> /**
> * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
> *
> * This function waits until all asynchronous function calls prior to @cookie
> * have been done.
> 
> So maybe it should be:
> 
> +	for (i = 0; i < host->n_ports; i++)
> +		async_synchronize_cookie(host->ports[i]->cookie + 1);
> 
> That is how other callsites use this API, and that change resolves the WARN for me.

Thanks for catching this up!
Let me ask the user to test it, and I'll send v2 base on this.

Kai-Heng

> 
> Thanks,
> John
diff mbox series

Patch

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index beca5f91bb4c..4a698f6388cd 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -42,7 +42,6 @@ 
 #include <linux/workqueue.h>
 #include <linux/scatterlist.h>
 #include <linux/io.h>
-#include <linux/async.h>
 #include <linux/log2.h>
 #include <linux/slab.h>
 #include <linux/glob.h>
@@ -5778,7 +5777,7 @@  int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
 	/* perform each probe asynchronously */
 	for (i = 0; i < host->n_ports; i++) {
 		struct ata_port *ap = host->ports[i];
-		async_schedule(async_port_probe, ap);
+		ap->cookie = async_schedule(async_port_probe, ap);
 	}
 
 	return 0;
@@ -5921,7 +5920,8 @@  void ata_host_detach(struct ata_host *host)
 	int i;
 
 	/* Ensure ata_port probe has completed */
-	async_synchronize_full();
+	for (i = 0; i < host->n_ports; i++)
+		async_synchronize_cookie(host->ports[i]->cookie);
 
 	for (i = 0; i < host->n_ports; i++)
 		ata_port_detach(host->ports[i]);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index cffa4714bfa8..ae6dfc107ea8 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -22,6 +22,7 @@ 
 #include <linux/acpi.h>
 #include <linux/cdrom.h>
 #include <linux/sched.h>
+#include <linux/async.h>
 
 /*
  * Define if arch has non-standard setup.  This is a _PCI_ standard
@@ -872,6 +873,8 @@  struct ata_port {
 	struct timer_list	fastdrain_timer;
 	unsigned long		fastdrain_cnt;
 
+	async_cookie_t		cookie;
+
 	int			em_message_type;
 	void			*private_data;