
[V2] PCI: AER: fix deadlock in do_recovery

Message ID 20170930054938.43271-1-gvaradar@cisco.com
State Changes Requested
Series: [V2] PCI: AER: fix deadlock in do_recovery

Commit Message

Govindarajulu Varadarajan Sept. 30, 2017, 5:49 a.m. UTC
CPU0					CPU1
---------------------------------------------------------------------
__driver_attach()
device_lock(&dev->mutex) <--- device mutex lock here
driver_probe_device()
pci_enable_sriov()
pci_iov_add_virtfn()
pci_device_add()
					aer_isr()		<--- pci aer error
					do_recovery()
					broadcast_error_message()
					pci_walk_bus()
					down_read(&pci_bus_sem) <--- rd sem
down_write(&pci_bus_sem) <-- stuck on wr sem
					report_error_detected()
					device_lock(&dev->mutex)<--- DEAD LOCK

This can also happen when an AER error occurs while the driver's
sriov_configure() callback is running.

This patch does a pci_walk_bus() and adds all the devices to a list. After
unlocking (up_read) &pci_bus_sem, we go through the list and call the
err_handler of each device with device_lock() held. This way, we don't try
to hold both locks at the same time.

v2:
* Drop patch 1, 2 & 4.
* Instead of locking 50+ devices, do get_device() and add them to a list.
  After unlocking &pci_bus_sem, go through the list and call err_handler.

v1:
* Previous discussion here: https://lkml.org/lkml/2017/9/27/720

[   70.984091] pcieport 0000:00:02.0: AER: Uncorrected (Non-Fatal) error received: id=0010
[   70.984112] pcieport 0000:00:02.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=0010(Requester ID)
[   70.984116] pcieport 0000:00:02.0:   device [8086:3c04] error status/mask=00004000/00100000
[   70.984120] pcieport 0000:00:02.0:    [14] Completion Timeout     (First)
...
[  107.484190] INFO: task kworker/0:1:76 blocked for more than 30 seconds.
[  107.563619]       Not tainted 4.13.0+ #28
[  107.611618] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  107.705368] kworker/0:1     D    0    76      2 0x80000000
[  107.771050] Workqueue: events aer_isr
[  107.814895] Call Trace:
[  107.844181]  __schedule+0x312/0xa40
[  107.885928]  schedule+0x3d/0x90
[  107.923506]  schedule_preempt_disabled+0x15/0x20
[  107.978773]  __mutex_lock+0x304/0xa30
[  108.022594]  ? dev_printk_emit+0x3b/0x50
[  108.069534]  ? report_error_detected+0xa6/0x210
[  108.123770]  mutex_lock_nested+0x1b/0x20
[  108.170713]  ? mutex_lock_nested+0x1b/0x20
[  108.219730]  report_error_detected+0xa6/0x210
[  108.271881]  ? aer_recover_queue+0xe0/0xe0
[  108.320904]  pci_walk_bus+0x46/0x90
[  108.362645]  ? aer_recover_queue+0xe0/0xe0
[  108.411658]  broadcast_error_message+0xc3/0xf0
[  108.464835]  do_recovery+0x34/0x220
[  108.506569]  ? get_device_error_info+0x92/0x130
[  108.560785]  aer_isr+0x28f/0x3b0
[  108.599410]  process_one_work+0x277/0x6c0
[  108.647399]  worker_thread+0x4d/0x3b0
[  108.691218]  kthread+0x171/0x190
[  108.729830]  ? process_one_work+0x6c0/0x6c0
[  108.779888]  ? kthread_create_on_node+0x40/0x40
[  108.834110]  ret_from_fork+0x2a/0x40
[  108.876916] INFO: task kworker/0:2:205 blocked for more than 30 seconds.
[  108.957129]       Not tainted 4.13.0+ #28
[  109.005114] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  109.098873] kworker/0:2     D    0   205      2 0x80000000
[  109.164544] Workqueue: events work_for_cpu_fn
[  109.216681] Call Trace:
[  109.245943]  __schedule+0x312/0xa40
[  109.287687]  ? rwsem_down_write_failed+0x308/0x4f0
[  109.345021]  schedule+0x3d/0x90
[  109.382603]  rwsem_down_write_failed+0x30d/0x4f0
[  109.437869]  ? __lock_acquire+0x75c/0x1410
[  109.486910]  call_rwsem_down_write_failed+0x17/0x30
[  109.545287]  ? call_rwsem_down_write_failed+0x17/0x30
[  109.605752]  down_write+0x88/0xb0
[  109.645410]  pci_device_add+0x158/0x240
[  109.691313]  pci_iov_add_virtfn+0x24f/0x340
[  109.741375]  pci_enable_sriov+0x32b/0x420
[  109.789466]  ? pci_read+0x2c/0x30
[  109.829142]  enic_probe+0x5d4/0xff0 [enic]
[  109.878184]  ? trace_hardirqs_on+0xd/0x10
[  109.926180]  local_pci_probe+0x42/0xa0
[  109.971037]  work_for_cpu_fn+0x14/0x20
[  110.015898]  process_one_work+0x277/0x6c0
[  110.063884]  worker_thread+0x1d6/0x3b0
[  110.108750]  kthread+0x171/0x190
[  110.147363]  ? process_one_work+0x6c0/0x6c0
[  110.197426]  ? kthread_create_on_node+0x40/0x40
[  110.251642]  ret_from_fork+0x2a/0x40
[  110.294448] INFO: task systemd-udevd:492 blocked for more than 30 seconds.
[  110.376742]       Not tainted 4.13.0+ #28
[  110.424715] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  110.518457] systemd-udevd   D    0   492    444 0x80000180
[  110.584116] Call Trace:
[  110.613382]  __schedule+0x312/0xa40
[  110.655127]  ? wait_for_completion+0x14a/0x1d0
[  110.708302]  schedule+0x3d/0x90
[  110.745875]  schedule_timeout+0x26e/0x5b0
[  110.793887]  ? wait_for_completion+0x14a/0x1d0
[  110.847068]  wait_for_completion+0x169/0x1d0
[  110.898165]  ? wait_for_completion+0x169/0x1d0
[  110.951354]  ? wake_up_q+0x80/0x80
[  110.992060]  flush_work+0x237/0x300
[  111.033795]  ? flush_workqueue_prep_pwqs+0x1b0/0x1b0
[  111.093224]  ? wait_for_completion+0x5a/0x1d0
[  111.145363]  ? flush_work+0x237/0x300
[  111.189189]  work_on_cpu+0x94/0xb0
[  111.229894]  ? work_is_static_object+0x20/0x20
[  111.283070]  ? pci_device_shutdown+0x60/0x60
[  111.334173]  pci_device_probe+0x17a/0x190
[  111.382163]  driver_probe_device+0x2ff/0x450
[  111.433260]  __driver_attach+0x103/0x140
[  111.480195]  ? driver_probe_device+0x450/0x450
[  111.533381]  bus_for_each_dev+0x74/0xb0
[  111.579276]  driver_attach+0x1e/0x20
[  111.622056]  bus_add_driver+0x1ca/0x270
[  111.667955]  ? 0xffffffffc039c000
[  111.707616]  driver_register+0x60/0xe0
[  111.752472]  ? 0xffffffffc039c000
[  111.792126]  __pci_register_driver+0x6b/0x70
[  111.843275]  enic_init_module+0x38/0x1000 [enic]
[  111.898533]  do_one_initcall+0x50/0x192
[  111.944428]  ? trace_hardirqs_on+0xd/0x10
[  111.992408]  do_init_module+0x5f/0x1f2
[  112.037274]  load_module+0x1740/0x1f70
[  112.082148]  SYSC_finit_module+0xd7/0xf0
[  112.129083]  ? SYSC_finit_module+0xd7/0xf0
[  112.178106]  SyS_finit_module+0xe/0x10
[  112.222972]  do_syscall_64+0x69/0x180
[  112.266793]  entry_SYSCALL64_slow_path+0x25/0x25
[  112.322047] RIP: 0033:0x7f3da098b559
[  112.364826] RSP: 002b:00007ffeb3306a38 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
[  112.455447] RAX: ffffffffffffffda RBX: 0000557fe41ed3d0 RCX: 00007f3da098b559
[  112.540860] RDX: 0000000000000000 RSI: 00007f3da14c79c5 RDI: 0000000000000006
[  112.626281] RBP: 00007f3da14c79c5 R08: 0000000000000000 R09: 00007ffeb3306b50
[  112.711698] R10: 0000000000000006 R11: 0000000000000246 R12: 0000000000000000
[  112.797114] R13: 0000557fe420e210 R14: 0000000000020000 R15: 0000557fe2c1ef4a
[  112.882568]
               Showing all locks held in the system:
[  112.956545] 5 locks held by kworker/0:1/76:
[  113.006616]  #0:  ("events"){+.+.}, at: [<ffffffffb00b10ed>] process_one_work+0x1ed/0x6c0
[  113.104535]  #1:  ((&rpc->dpc_handler)){+.+.}, at: [<ffffffffb00b10ed>] process_one_work+0x1ed/0x6c0
[  113.213894]  #2:  (&rpc->rpc_mutex){+.+.}, at: [<ffffffffb0505ca2>] aer_isr+0x32/0x3b0
[  113.308711]  #3:  (pci_bus_sem){++++}, at: [<ffffffffb04ea18a>] pci_walk_bus+0x2a/0x90
[  113.403501]  #4:  (&dev->mutex){....}, at: [<ffffffffb0505706>] report_error_detected+0xa6/0x210
[  113.508715] 3 locks held by kworker/0:2/205:
[  113.559808]  #0:  ("events"){+.+.}, at: [<ffffffffb00b10ed>] process_one_work+0x1ed/0x6c0
[  113.657718]  #1:  ((&wfc.work)){+.+.}, at: [<ffffffffb00b10ed>] process_one_work+0x1ed/0x6c0
[  113.758745]  #2:  (pci_bus_sem){++++}, at: [<ffffffffb04ec978>] pci_device_add+0x158/0x240
[  113.857710] 1 lock held by khungtaskd/239:
[  113.906729]  #0:  (tasklist_lock){.+.+}, at: [<ffffffffb00f07dd>] debug_show_all_locks+0x3d/0x1a0
[  114.012972] 2 locks held by systemd-udevd/492:
[  114.066148]  #0:  (&dev->mutex){....}, at: [<ffffffffb06254d5>] __driver_attach+0x55/0x140
[  114.165107]  #1:  (&dev->mutex){....}, at: [<ffffffffb06254f2>] __driver_attach+0x72/0x140

[  114.281879] =============================================

Signed-off-by: Govindarajulu Varadarajan <gvaradar@cisco.com>
---
 drivers/pci/pcie/aer/aerdrv.h      |  5 +++++
 drivers/pci/pcie/aer/aerdrv_core.c | 45 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 2 deletions(-)

Comments

Sinan Kaya Sept. 30, 2017, 1:31 p.m. UTC | #1
On 9/30/2017 1:49 AM, Govindarajulu Varadarajan wrote:
> This patch does a pci_bus_walk and adds all the devices to a list. After
> unlocking (up_read) &pci_bus_sem, we go through the list and call
> err_handler of the devices with devic_lock() held. This way, we dont try
> to hold both locks at same time.

I do like this approach, with some more feedback below.

I need a little bit of help here from someone who knows the get/put_device() calls.

I understand get_device() and put_device() are there to increment/decrement
reference counters. This patch seems to use them as an alternative to the
device_lock() and device_unlock() API.

If this is a good assumption, then you can get away with just replacing device_lock()
with get_device() and device_unlock() with put_device() in the existing code as
well. Then you don't need to build a linked list.

A nit: the version history in your commit message belongs in a cover letter.
Christoph Hellwig Oct. 1, 2017, 7:55 a.m. UTC | #2
Can you please start with an explanation of the problem in text, and cut
down the chain of involved functions to those actually relevant?

That'll make reviewing the patch a bit easier.

>  2 files changed, 48 insertions(+), 2 deletions(-)

> diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
> index d51e4a57b190..f6d8761dc656 100644
> --- a/drivers/pci/pcie/aer/aerdrv.h
> +++ b/drivers/pci/pcie/aer/aerdrv.h
> @@ -105,6 +105,11 @@ static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
>  	return orig;
>  }
>  
> +struct aer_device_list {
> +	struct device *dev;
> +	struct hlist_node node;
> +};
> +
>  extern struct bus_type pcie_port_bus_type;
>  void aer_isr(struct work_struct *work);
>  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
> index 890efcc574cb..d524f2c2c288 100644
> --- a/drivers/pci/pcie/aer/aerdrv_core.c
> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> @@ -346,6 +346,47 @@ static int report_resume(struct pci_dev *dev, void *data)
>  	return 0;
>  }
>  
> +int aer_get_pci_dev(struct pci_dev *pdev, void *data)
> +{
> +	struct hlist_head *hhead = (struct hlist_head *)data;
> +	struct device *dev = &pdev->dev;
> +	struct aer_device_list *entry;
> +
> +	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
> +	if (!entry)
> +		/* continue with other devices, lets not return error */
> +		return 0;
> +
> +	entry->dev = get_device(dev);
> +	hlist_add_head(&entry->node, hhead);
> +
> +	return 0;

Can we just embed the list_head in the pci_dev?  Or is there a way
we can have multiple chains like this pending at the same time?

> +static void aer_pci_walk_bus(struct pci_bus *top,
> +			     int (*cb)(struct pci_dev *, void *),
> +			     struct aer_broadcast_data *result)
> +{
> +	HLIST_HEAD(dev_hlist);
> +	struct hlist_node *tmp;
> +	struct aer_device_list *entry;

Do we want to offer this as generic PCIe functionality?  If not we can
probably hardcode the callback and callback data to simplify this a lot.
Govindarajulu Varadarajan Oct. 3, 2017, 12:14 a.m. UTC | #3
On Sun, 1 Oct 2017, Christoph Hellwig wrote:

>> +struct aer_device_list {
>> +	struct device *dev;
>> +	struct hlist_node node;
>> +};
>> +
>>  extern struct bus_type pcie_port_bus_type;
>>  void aer_isr(struct work_struct *work);
>>  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
>> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
>> index 890efcc574cb..d524f2c2c288 100644
>> --- a/drivers/pci/pcie/aer/aerdrv_core.c
>> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
>> @@ -346,6 +346,47 @@ static int report_resume(struct pci_dev *dev, void *data)
>>  	return 0;
>>  }
>>
>> +int aer_get_pci_dev(struct pci_dev *pdev, void *data)
>> +{
>> +	struct hlist_head *hhead = (struct hlist_head *)data;
>> +	struct device *dev = &pdev->dev;
>> +	struct aer_device_list *entry;
>> +
>> +	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
>> +	if (!entry)
>> +		/* continue with other devices, lets not return error */
>> +		return 0;
>> +
>> +	entry->dev = get_device(dev);
>> +	hlist_add_head(&entry->node, hhead);
>> +
>> +	return 0;
>
> Can we just embedded the list_head in the pci_dev?  Or is there a way
> we can have multiple chains like this pending at the same time?
>

I would avoid increasing the size of pci_dev. The list_head would be empty after
we call aer_pci_walk_bus(). We already have pci_bus *subordinate to link all the
devices. Making the list_head available to others would require a lock, and I
would avoid that.

>> +static void aer_pci_walk_bus(struct pci_bus *top,
>> +			     int (*cb)(struct pci_dev *, void *),
>> +			     struct aer_broadcast_data *result)
>> +{
>> +	HLIST_HEAD(dev_hlist);
>> +	struct hlist_node *tmp;
>> +	struct aer_device_list *entry;
>
> Do we want to offer this as generic PCIe functionality?  If not we can
> probably hardcode the callback and callback data to simplify this a lot.
>

I could not find any other code which acquires the device_lock in a pci_walk_bus cb
function. Can you tell me how we can hardcode the callback and callback data here?
Govindarajulu Varadarajan Oct. 3, 2017, 12:19 a.m. UTC | #4
On Sat, 30 Sep 2017, Sinan Kaya wrote:

> On 9/30/2017 1:49 AM, Govindarajulu Varadarajan wrote:
>> This patch does a pci_bus_walk and adds all the devices to a list. After
>> unlocking (up_read) &pci_bus_sem, we go through the list and call
>> err_handler of the devices with devic_lock() held. This way, we dont try
>> to hold both locks at same time.
>
> I do like this approach with some more feedback.
>
> I need a little bit of help here from someone that knows get/put device calls.
>
> I understand get_device() and put_device() are there to increment/decrement
> reference counters. This patch seems to use them as an alternative for device_lock()
> and device_unlock() API.
>

get/put_device() is not used as an alternative to device_lock() here. We are
incrementing the counter to make sure no one else frees the device while it
is in our list. report_error_detected() and the other callbacks will still
acquire device_lock().
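
A minimal sketch of that distinction (purely illustrative, not part of the patch):
get_device() only pins the lifetime of the struct device, while device_lock() is
what actually serializes against the probe/remove paths.

#include <linux/device.h>

/* Illustration only: the refcount and the lock do different jobs. */
static void example(struct device *dev)
{
	get_device(dev);	/* pin the refcount: dev cannot be freed */

	/* ... dev may still be probed or removed concurrently ... */

	device_lock(dev);	/* mutual exclusion vs. probe/remove */
	/* safe to call the driver's err_handler callbacks here */
	device_unlock(dev);

	put_device(dev);	/* drop the reference taken above */
}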


> If this is a good assumption, then you can get away with just replacing device_lock()
> with get_device() and device_unlock() with put_device() in the existing code as
> well. Then, you don't need to build a linklist.
Christoph Hellwig Oct. 3, 2017, 8:09 a.m. UTC | #5
On Mon, Oct 02, 2017 at 05:14:09PM -0700, Govindarajulu Varadarajan wrote:
>
> I would avoid increasing the size of pci_dev. The list_head would be empty after
> we call aer_pci_walk_bus(). We already have pci_bus *subordinate to link all the
> 'device's. Making list_head available to others would require a lock. I would
> avoid that.

It does not require a new lock if you clearly document the concurrency
semantics.  And I'd much rather add a hlist_node (which should be enough
if we want to be small) to pci_dev than require these additional
memory allocations and boilerplate code in an error-recovery situation.
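
Roughly, that suggestion might look like the sketch below. The aer_node member
is hypothetical (struct pci_dev has no such field today), so this is only an
illustration of how the kmalloc() in the error path would go away:

#include <linux/pci.h>

/*
 * Illustration only: assumes struct pci_dev gained an AER-private member
 * such as "struct hlist_node aer_node;".  The collect callback would then
 * need no memory allocation in the error-recovery path.
 */
static int aer_collect_dev(struct pci_dev *pdev, void *data)
{
	struct hlist_head *head = data;

	get_device(&pdev->dev);			/* keep pdev alive after pci_bus_sem is dropped */
	hlist_add_head(&pdev->aer_node, head);	/* aer_node is the hypothetical member */
	return 0;
}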

>
>>> +static void aer_pci_walk_bus(struct pci_bus *top,
>>> +			     int (*cb)(struct pci_dev *, void *),
>>> +			     struct aer_broadcast_data *result)
>>> +{
>>> +	HLIST_HEAD(dev_hlist);
>>> +	struct hlist_node *tmp;
>>> +	struct aer_device_list *entry;
>>
>> Do we want to offer this as generic PCIe functionality?  If not we can
>> probably hardcode the callback and callback data to simplify this a lot.
>>
>
> I could not find any other code which aquires device_lock in pci_walk_bus cb
> function. Can you tell me how we can hardcore callback and callback data here?

The word is hardcode.  If you follow the above suggestion we won't
really need the aer_pci_walk_bus helper but could just open-code most
of it.  Or maybe just keep passing the cb for simplicity; it was
just extending the above idea.
Bjorn Helgaas Oct. 3, 2017, 9:15 p.m. UTC | #6
[+cc Alex, Gavin, Wei]

On Fri, Sep 29, 2017 at 10:49:38PM -0700, Govindarajulu Varadarajan wrote:
> CPU0					CPU1
> ---------------------------------------------------------------------
> __driver_attach()
> device_lock(&dev->mutex) <--- device mutex lock here
> driver_probe_device()
> pci_enable_sriov()
> pci_iov_add_virtfn()
> pci_device_add()
> 					aer_isr()		<--- pci aer error
> 					do_recovery()
> 					broadcast_error_message()
> 					pci_walk_bus()
> 					down_read(&pci_bus_sem) <--- rd sem
> down_write(&pci_bus_sem) <-- stuck on wr sem
> 					report_error_detected()
> 					device_lock(&dev->mutex)<--- DEAD LOCK
> 
> This can also happen when aer error occurs while pci_dev->sriov_config() is
> called.
> 
> This patch does a pci_bus_walk and adds all the devices to a list. After
> unlocking (up_read) &pci_bus_sem, we go through the list and call
> err_handler of the devices with devic_lock() held. This way, we dont try
> to hold both locks at same time.

I feel like we're working too hard to come up with an ad hoc solution
for this lock ordering problem: the __driver_attach() path acquires
the device lock, then the pci_bus_sem; the AER path acquires
pci_bus_sem, then the device lock.

To me, the pci_bus_sem, then device lock order seems natural.  The
pci_bus_sem protects all the bus device lists, so it makes sense to
hold it while iterating over those lists.  And if we're operating on
one of those devices while we're iterating, it makes sense to acquire
the device lock.

The pci_enable_sriov() path is the one that feels strange to me.
We're in a driver probe method, and, surprise!, brand-new devices show
up and we basically ask the PCI core to enumerate them synchronously
while still in the probe method.

Is there some reason this enumeration has to be done synchronously?
I wonder if we can get that piece out of the driver probe path, e.g.,
by queuing up the pci_iov_add_virtfn() part to be done later, in a
path where we're not holding a device lock?
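
For illustration only, that "queue it up" idea might look roughly like the sketch
below. All names and plumbing here are hypothetical and untested, and how errors
get reported back to the PF driver is exactly the open question:

#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/*
 * Hypothetical sketch: defer VF enumeration out of the PF probe path so
 * that pci_bus_sem is never taken while the PF's device lock is held.
 */
struct virtfn_work {
	struct work_struct work;
	struct pci_dev *pf;	/* reference held until the work runs */
	int nr_virtfn;
};

static void add_virtfns_fn(struct work_struct *work)
{
	struct virtfn_work *vw = container_of(work, struct virtfn_work, work);

	/*
	 * The pci_iov_add_virtfn() loop from sriov_enable() would run here,
	 * outside any device_lock(); a failure would be logged and the VF
	 * skipped, as with any other enumeration error.
	 */

	pci_dev_put(vw->pf);
	kfree(vw);
}

static int queue_virtfn_add(struct pci_dev *pf, int nr_virtfn)
{
	struct virtfn_work *vw = kzalloc(sizeof(*vw), GFP_KERNEL);

	if (!vw)
		return -ENOMEM;

	INIT_WORK(&vw->work, add_virtfns_fn);
	vw->pf = pci_dev_get(pf);
	vw->nr_virtfn = nr_virtfn;
	schedule_work(&vw->work);

	return 0;
}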

We have this problem with AER today, but it's conceivable that we'll
have a similar problem somewhere else tomorrow, and having to build a
list, fiddle with reference counts, etc., doesn't seem like a very
general purpose solution.

> v2:
> * Drop patch 1, 2 & 4.
> * Instead of locking 50+ devices, do get_device() and add them to a list.
>   After unlocking &pci_bus_sem, go through the list call err_handler.
> 
> v1:
> * Previous discussion here: https://lkml.org/lkml/2017/9/27/720
> 
> [   70.984091] pcieport 0000:00:02.0: AER: Uncorrected (Non-Fatal) error received: id=0010
> [   70.984112] pcieport 0000:00:02.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=0010(Requester ID)
> [   70.984116] pcieport 0000:00:02.0:   device [8086:3c04] error status/mask=00004000/00100000
> [   70.984120] pcieport 0000:00:02.0:    [14] Completion Timeout     (First)
> ...
> [  107.484190] INFO: task kworker/0:1:76 blocked for more than 30 seconds.
> [  107.563619]       Not tainted 4.13.0+ #28
> [  107.611618] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  107.705368] kworker/0:1     D    0    76      2 0x80000000
> [  107.771050] Workqueue: events aer_isr
> [  107.814895] Call Trace:
> [  107.844181]  __schedule+0x312/0xa40
> [  107.885928]  schedule+0x3d/0x90
> [  107.923506]  schedule_preempt_disabled+0x15/0x20
> [  107.978773]  __mutex_lock+0x304/0xa30
> [  108.022594]  ? dev_printk_emit+0x3b/0x50
> [  108.069534]  ? report_error_detected+0xa6/0x210
> [  108.123770]  mutex_lock_nested+0x1b/0x20
> [  108.170713]  ? mutex_lock_nested+0x1b/0x20
> [  108.219730]  report_error_detected+0xa6/0x210
> [  108.271881]  ? aer_recover_queue+0xe0/0xe0
> [  108.320904]  pci_walk_bus+0x46/0x90
> [  108.362645]  ? aer_recover_queue+0xe0/0xe0
> [  108.411658]  broadcast_error_message+0xc3/0xf0
> [  108.464835]  do_recovery+0x34/0x220
> [  108.506569]  ? get_device_error_info+0x92/0x130
> [  108.560785]  aer_isr+0x28f/0x3b0
> [  108.599410]  process_one_work+0x277/0x6c0
> [  108.647399]  worker_thread+0x4d/0x3b0
> [  108.691218]  kthread+0x171/0x190
> [  108.729830]  ? process_one_work+0x6c0/0x6c0
> [  108.779888]  ? kthread_create_on_node+0x40/0x40
> [  108.834110]  ret_from_fork+0x2a/0x40
> [  108.876916] INFO: task kworker/0:2:205 blocked for more than 30 seconds.
> [  108.957129]       Not tainted 4.13.0+ #28
> [  109.005114] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  109.098873] kworker/0:2     D    0   205      2 0x80000000
> [  109.164544] Workqueue: events work_for_cpu_fn
> [  109.216681] Call Trace:
> [  109.245943]  __schedule+0x312/0xa40
> [  109.287687]  ? rwsem_down_write_failed+0x308/0x4f0
> [  109.345021]  schedule+0x3d/0x90
> [  109.382603]  rwsem_down_write_failed+0x30d/0x4f0
> [  109.437869]  ? __lock_acquire+0x75c/0x1410
> [  109.486910]  call_rwsem_down_write_failed+0x17/0x30
> [  109.545287]  ? call_rwsem_down_write_failed+0x17/0x30
> [  109.605752]  down_write+0x88/0xb0
> [  109.645410]  pci_device_add+0x158/0x240
> [  109.691313]  pci_iov_add_virtfn+0x24f/0x340
> [  109.741375]  pci_enable_sriov+0x32b/0x420
> [  109.789466]  ? pci_read+0x2c/0x30
> [  109.829142]  enic_probe+0x5d4/0xff0 [enic]
> [  109.878184]  ? trace_hardirqs_on+0xd/0x10
> [  109.926180]  local_pci_probe+0x42/0xa0
> [  109.971037]  work_for_cpu_fn+0x14/0x20
> [  110.015898]  process_one_work+0x277/0x6c0
> [  110.063884]  worker_thread+0x1d6/0x3b0
> [  110.108750]  kthread+0x171/0x190
> [  110.147363]  ? process_one_work+0x6c0/0x6c0
> [  110.197426]  ? kthread_create_on_node+0x40/0x40
> [  110.251642]  ret_from_fork+0x2a/0x40
> [  110.294448] INFO: task systemd-udevd:492 blocked for more than 30 seconds.
> [  110.376742]       Not tainted 4.13.0+ #28
> [  110.424715] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  110.518457] systemd-udevd   D    0   492    444 0x80000180
> [  110.584116] Call Trace:
> [  110.613382]  __schedule+0x312/0xa40
> [  110.655127]  ? wait_for_completion+0x14a/0x1d0
> [  110.708302]  schedule+0x3d/0x90
> [  110.745875]  schedule_timeout+0x26e/0x5b0
> [  110.793887]  ? wait_for_completion+0x14a/0x1d0
> [  110.847068]  wait_for_completion+0x169/0x1d0
> [  110.898165]  ? wait_for_completion+0x169/0x1d0
> [  110.951354]  ? wake_up_q+0x80/0x80
> [  110.992060]  flush_work+0x237/0x300
> [  111.033795]  ? flush_workqueue_prep_pwqs+0x1b0/0x1b0
> [  111.093224]  ? wait_for_completion+0x5a/0x1d0
> [  111.145363]  ? flush_work+0x237/0x300
> [  111.189189]  work_on_cpu+0x94/0xb0
> [  111.229894]  ? work_is_static_object+0x20/0x20
> [  111.283070]  ? pci_device_shutdown+0x60/0x60
> [  111.334173]  pci_device_probe+0x17a/0x190
> [  111.382163]  driver_probe_device+0x2ff/0x450
> [  111.433260]  __driver_attach+0x103/0x140
> [  111.480195]  ? driver_probe_device+0x450/0x450
> [  111.533381]  bus_for_each_dev+0x74/0xb0
> [  111.579276]  driver_attach+0x1e/0x20
> [  111.622056]  bus_add_driver+0x1ca/0x270
> [  111.667955]  ? 0xffffffffc039c000
> [  111.707616]  driver_register+0x60/0xe0
> [  111.752472]  ? 0xffffffffc039c000
> [  111.792126]  __pci_register_driver+0x6b/0x70
> [  111.843275]  enic_init_module+0x38/0x1000 [enic]
> [  111.898533]  do_one_initcall+0x50/0x192
> [  111.944428]  ? trace_hardirqs_on+0xd/0x10
> [  111.992408]  do_init_module+0x5f/0x1f2
> [  112.037274]  load_module+0x1740/0x1f70
> [  112.082148]  SYSC_finit_module+0xd7/0xf0
> [  112.129083]  ? SYSC_finit_module+0xd7/0xf0
> [  112.178106]  SyS_finit_module+0xe/0x10
> [  112.222972]  do_syscall_64+0x69/0x180
> [  112.266793]  entry_SYSCALL64_slow_path+0x25/0x25
> [  112.322047] RIP: 0033:0x7f3da098b559
> [  112.364826] RSP: 002b:00007ffeb3306a38 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
> [  112.455447] RAX: ffffffffffffffda RBX: 0000557fe41ed3d0 RCX: 00007f3da098b559
> [  112.540860] RDX: 0000000000000000 RSI: 00007f3da14c79c5 RDI: 0000000000000006
> [  112.626281] RBP: 00007f3da14c79c5 R08: 0000000000000000 R09: 00007ffeb3306b50
> [  112.711698] R10: 0000000000000006 R11: 0000000000000246 R12: 0000000000000000
> [  112.797114] R13: 0000557fe420e210 R14: 0000000000020000 R15: 0000557fe2c1ef4a
> [  112.882568]
>                Showing all locks held in the system:
> [  112.956545] 5 locks held by kworker/0:1/76:
> [  113.006616]  #0:  ("events"){+.+.}, at: [<ffffffffb00b10ed>] process_one_work+0x1ed/0x6c0
> [  113.104535]  #1:  ((&rpc->dpc_handler)){+.+.}, at: [<ffffffffb00b10ed>] process_one_work+0x1ed/0x6c0
> [  113.213894]  #2:  (&rpc->rpc_mutex){+.+.}, at: [<ffffffffb0505ca2>] aer_isr+0x32/0x3b0
> [  113.308711]  #3:  (pci_bus_sem){++++}, at: [<ffffffffb04ea18a>] pci_walk_bus+0x2a/0x90
> [  113.403501]  #4:  (&dev->mutex){....}, at: [<ffffffffb0505706>] report_error_detected+0xa6/0x210
> [  113.508715] 3 locks held by kworker/0:2/205:
> [  113.559808]  #0:  ("events"){+.+.}, at: [<ffffffffb00b10ed>] process_one_work+0x1ed/0x6c0
> [  113.657718]  #1:  ((&wfc.work)){+.+.}, at: [<ffffffffb00b10ed>] process_one_work+0x1ed/0x6c0
> [  113.758745]  #2:  (pci_bus_sem){++++}, at: [<ffffffffb04ec978>] pci_device_add+0x158/0x240
> [  113.857710] 1 lock held by khungtaskd/239:
> [  113.906729]  #0:  (tasklist_lock){.+.+}, at: [<ffffffffb00f07dd>] debug_show_all_locks+0x3d/0x1a0
> [  114.012972] 2 locks held by systemd-udevd/492:
> [  114.066148]  #0:  (&dev->mutex){....}, at: [<ffffffffb06254d5>] __driver_attach+0x55/0x140
> [  114.165107]  #1:  (&dev->mutex){....}, at: [<ffffffffb06254f2>] __driver_attach+0x72/0x140
> 
> [  114.281879] =============================================
> 
> Signed-off-by: Govindarajulu Varadarajan <gvaradar@cisco.com>
> ---
>  drivers/pci/pcie/aer/aerdrv.h      |  5 +++++
>  drivers/pci/pcie/aer/aerdrv_core.c | 45 ++++++++++++++++++++++++++++++++++++--
>  2 files changed, 48 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
> index d51e4a57b190..f6d8761dc656 100644
> --- a/drivers/pci/pcie/aer/aerdrv.h
> +++ b/drivers/pci/pcie/aer/aerdrv.h
> @@ -105,6 +105,11 @@ static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
>  	return orig;
>  }
>  
> +struct aer_device_list {
> +	struct device *dev;
> +	struct hlist_node node;
> +};
> +
>  extern struct bus_type pcie_port_bus_type;
>  void aer_isr(struct work_struct *work);
>  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
> index 890efcc574cb..d524f2c2c288 100644
> --- a/drivers/pci/pcie/aer/aerdrv_core.c
> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> @@ -346,6 +346,47 @@ static int report_resume(struct pci_dev *dev, void *data)
>  	return 0;
>  }
>  
> +int aer_get_pci_dev(struct pci_dev *pdev, void *data)
> +{
> +	struct hlist_head *hhead = (struct hlist_head *)data;
> +	struct device *dev = &pdev->dev;
> +	struct aer_device_list *entry;
> +
> +	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
> +	if (!entry)
> +		/* continue with other devices, lets not return error */
> +		return 0;
> +
> +	entry->dev = get_device(dev);
> +	hlist_add_head(&entry->node, hhead);
> +
> +	return 0;
> +}
> +
> +static void aer_pci_walk_bus(struct pci_bus *top,
> +			     int (*cb)(struct pci_dev *, void *),
> +			     struct aer_broadcast_data *result)
> +{
> +	HLIST_HEAD(dev_hlist);
> +	struct hlist_node *tmp;
> +	struct aer_device_list *entry;
> +
> +	pci_walk_bus(top, aer_get_pci_dev, &dev_hlist);
> +	hlist_for_each_entry_safe(entry, tmp, &dev_hlist, node) {
> +		struct pci_dev *pdev = container_of(entry->dev, struct pci_dev,
> +						    dev);
> +		int err;
> +
> +		err = cb(pdev, result);
> +		if (err)
> +			dev_err(entry->dev, "AER: recovery handler failed: %d",
> +				err);
> +		put_device(entry->dev);
> +		hlist_del(&entry->node);
> +		kfree(entry);
> +	}
> +}
> +
>  /**
>   * broadcast_error_message - handle message broadcast to downstream drivers
>   * @dev: pointer to from where in a hierarchy message is broadcasted down
> @@ -380,7 +421,7 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
>  		 */
>  		if (cb == report_error_detected)
>  			dev->error_state = state;
> -		pci_walk_bus(dev->subordinate, cb, &result_data);
> +		aer_pci_walk_bus(dev->subordinate, cb, &result_data);
>  		if (cb == report_resume) {
>  			pci_cleanup_aer_uncorrect_error_status(dev);
>  			dev->error_state = pci_channel_io_normal;
> @@ -390,7 +431,7 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
>  		 * If the error is reported by an end point, we think this
>  		 * error is related to the upstream link of the end point.
>  		 */
> -		pci_walk_bus(dev->bus, cb, &result_data);
> +		aer_pci_walk_bus(dev->bus, cb, &result_data);
>  	}
>  
>  	return result_data.result;
> -- 
> 2.14.1
>
Wei Yang Oct. 5, 2017, 3:05 p.m. UTC | #7
On Wed, Oct 4, 2017 at 5:15 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> [+cc Alex, Gavin, Wei]
>
> On Fri, Sep 29, 2017 at 10:49:38PM -0700, Govindarajulu Varadarajan wrote:
>> CPU0                                  CPU1
>> ---------------------------------------------------------------------
>> __driver_attach()
>> device_lock(&dev->mutex) <--- device mutex lock here
>> driver_probe_device()
>> pci_enable_sriov()
>> pci_iov_add_virtfn()
>> pci_device_add()
>>                                       aer_isr()               <--- pci aer error
>>                                       do_recovery()
>>                                       broadcast_error_message()
>>                                       pci_walk_bus()
>>                                       down_read(&pci_bus_sem) <--- rd sem
>> down_write(&pci_bus_sem) <-- stuck on wr sem
>>                                       report_error_detected()
>>                                       device_lock(&dev->mutex)<--- DEAD LOCK
>>
>> This can also happen when aer error occurs while pci_dev->sriov_config() is
>> called.
>>
>> This patch does a pci_bus_walk and adds all the devices to a list. After
>> unlocking (up_read) &pci_bus_sem, we go through the list and call
>> err_handler of the devices with devic_lock() held. This way, we dont try
>> to hold both locks at same time.
>
> I feel like we're working too hard to come up with an ad hoc solution
> for this lock ordering problem: the __driver_attach() path acquires
> the device lock, then the pci_bus_sem; the AER path acquires
> pci_bus_sem, then the device lock.
>
> To me, the pci_bus_sem, then device lock order seems natural.  The
> pci_bus_sem protects all the bus device lists, so it makes sense to
> hold it while iterating over those lists.  And if we're operating on
> one of those devices while we're iterating, it makes sense to acquire
> the device lock.
>
> The pci_enable_sriov() path is the one that feels strange to me.
> We're in a driver probe method, and, surprise!, brand-new devices show
> up and we basically ask the PCI core to enumerate them synchronously
> while still in the probe method.
>
> Is there some reason this enumeration has to be done synchronously?
> I wonder if we can get that piece out of the driver probe path, e.g.,
> by queuing up the pci_iov_add_virtfn() part to be done later, in a
> path where we're not holding a device lock?
>

Hi, Bjorn,

First let me catch up with the thread.

We have two locking sequences:
1. pci_bus_sem -> device lock, which is natural
2. device lock -> pci_bus_sem, which is not

pci_enable_sriov() sits in class #2, and your suggestion is to move the
pci_iov_add_virtfn() part to some deferred queue, which would avoid case #2.

If we want to implement your suggestion, one thing unclear to me is
how we would handle the error path. Add a notification for the
failure? This would be easy for the core kernel, but a big change
for those drivers.

This is just my quick thought; maybe you have a better solution.

> We have this problem with AER today, but it's conceivable that we'll
> have a similar problem somewhere else tomorrow, and having to build a
> list, fiddle with reference counts, etc., doesn't seem like a very
> general purpose solution.
>
Bjorn Helgaas Oct. 5, 2017, 6:42 p.m. UTC | #8
On Thu, Oct 05, 2017 at 11:05:12PM +0800, Wei Yang wrote:
> On Wed, Oct 4, 2017 at 5:15 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> > [+cc Alex, Gavin, Wei]
> >
> > On Fri, Sep 29, 2017 at 10:49:38PM -0700, Govindarajulu Varadarajan wrote:
> >> CPU0                                  CPU1
> >> ---------------------------------------------------------------------
> >> __driver_attach()
> >> device_lock(&dev->mutex) <--- device mutex lock here
> >> driver_probe_device()
> >> pci_enable_sriov()
> >> pci_iov_add_virtfn()
> >> pci_device_add()
> >>                                       aer_isr()               <--- pci aer error
> >>                                       do_recovery()
> >>                                       broadcast_error_message()
> >>                                       pci_walk_bus()
> >>                                       down_read(&pci_bus_sem) <--- rd sem
> >> down_write(&pci_bus_sem) <-- stuck on wr sem
> >>                                       report_error_detected()
> >>                                       device_lock(&dev->mutex)<--- DEAD LOCK
> >>
> >> This can also happen when aer error occurs while pci_dev->sriov_config() is
> >> called.
> >>
> >> This patch does a pci_bus_walk and adds all the devices to a list. After
> >> unlocking (up_read) &pci_bus_sem, we go through the list and call
> >> err_handler of the devices with devic_lock() held. This way, we dont try
> >> to hold both locks at same time.
> >
> > I feel like we're working too hard to come up with an ad hoc solution
> > for this lock ordering problem: the __driver_attach() path acquires
> > the device lock, then the pci_bus_sem; the AER path acquires
> > pci_bus_sem, then the device lock.
> >
> > To me, the pci_bus_sem, then device lock order seems natural.  The
> > pci_bus_sem protects all the bus device lists, so it makes sense to
> > hold it while iterating over those lists.  And if we're operating on
> > one of those devices while we're iterating, it makes sense to acquire
> > the device lock.
> >
> > The pci_enable_sriov() path is the one that feels strange to me.
> > We're in a driver probe method, and, surprise!, brand-new devices show
> > up and we basically ask the PCI core to enumerate them synchronously
> > while still in the probe method.
> >
> > Is there some reason this enumeration has to be done synchronously?
> > I wonder if we can get that piece out of the driver probe path, e.g.,
> > by queuing up the pci_iov_add_virtfn() part to be done later, in a
> > path where we're not holding a device lock?
> >
> 
> Hi, Bjorn,
> 
> First let me catch up with the thread.
> 
> We have two locking sequence:
> 1. pci_bus_sem -> device lock, which is natural
> 2. device lock -> pci_bus_sem, which is not

Right.  Or at least, that's my assertion :)  I could be convinced
otherwise.

> pci_enable_sriov() sits in class #2 and your suggestion is to move the
> pci_iov_add_virtfn() to some queue which will avoid case #2.
> 
> If we want to implement your suggestion, one thing unclear to me is
> how would we handle the error path? Add a notification for the
> failure? This would be easy for the core kernel, while some big change
> for those drivers.

My suggestion was for discussion.  It's entirely possible it will turn
out not to be feasible.

We're only talking about errors from pci_iov_add_virtfn() here.  We
can still return all the other existing errors from sriov_enable(),
which the driver can see.  These errors seem more directly related to
the PF itself.

The pci_iov_add_virtfn() errors are enumeration-type errors (failure
to add a bus, failure to read config space of a VF, etc.)  These
feel more like PCI core issues to me.  The driver isn't going to be
able to do anything about them.

The end result would likely be that a VF is enabled in the hardware
but not added as a PCI device.  The same errors can occur during
boot-time or hotplug-time enumeration of non-SR-IOV devices.

Are these sort of errors important to the PF driver?  If the PF driver
can get along without them, maybe we can use the same strategy as when
we enumerate all other devices, i.e., log something in dmesg and
continue on without the device.

Bjorn
Wei Yang Oct. 6, 2017, 1:11 a.m. UTC | #9
On Thu, Oct 05, 2017 at 01:42:09PM -0500, Bjorn Helgaas wrote:
>On Thu, Oct 05, 2017 at 11:05:12PM +0800, Wei Yang wrote:
>> On Wed, Oct 4, 2017 at 5:15 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
>> > [+cc Alex, Gavin, Wei]
>> >
>> > On Fri, Sep 29, 2017 at 10:49:38PM -0700, Govindarajulu Varadarajan wrote:
>> >> CPU0                                  CPU1
>> >> ---------------------------------------------------------------------
>> >> __driver_attach()
>> >> device_lock(&dev->mutex) <--- device mutex lock here
>> >> driver_probe_device()
>> >> pci_enable_sriov()
>> >> pci_iov_add_virtfn()
>> >> pci_device_add()
>> >>                                       aer_isr()               <--- pci aer error
>> >>                                       do_recovery()
>> >>                                       broadcast_error_message()
>> >>                                       pci_walk_bus()
>> >>                                       down_read(&pci_bus_sem) <--- rd sem
>> >> down_write(&pci_bus_sem) <-- stuck on wr sem
>> >>                                       report_error_detected()
>> >>                                       device_lock(&dev->mutex)<--- DEAD LOCK
>> >>
>> >> This can also happen when aer error occurs while pci_dev->sriov_config() is
>> >> called.
>> >>
>> >> This patch does a pci_bus_walk and adds all the devices to a list. After
>> >> unlocking (up_read) &pci_bus_sem, we go through the list and call
>> >> err_handler of the devices with devic_lock() held. This way, we dont try
>> >> to hold both locks at same time.
>> >
>> > I feel like we're working too hard to come up with an ad hoc solution
>> > for this lock ordering problem: the __driver_attach() path acquires
>> > the device lock, then the pci_bus_sem; the AER path acquires
>> > pci_bus_sem, then the device lock.
>> >
>> > To me, the pci_bus_sem, then device lock order seems natural.  The
>> > pci_bus_sem protects all the bus device lists, so it makes sense to
>> > hold it while iterating over those lists.  And if we're operating on
>> > one of those devices while we're iterating, it makes sense to acquire
>> > the device lock.
>> >
>> > The pci_enable_sriov() path is the one that feels strange to me.
>> > We're in a driver probe method, and, surprise!, brand-new devices show
>> > up and we basically ask the PCI core to enumerate them synchronously
>> > while still in the probe method.
>> >
>> > Is there some reason this enumeration has to be done synchronously?
>> > I wonder if we can get that piece out of the driver probe path, e.g.,
>> > by queuing up the pci_iov_add_virtfn() part to be done later, in a
>> > path where we're not holding a device lock?
>> >
>> 
>> Hi, Bjorn,
>> 
>> First let me catch up with the thread.
>> 
>> We have two locking sequence:
>> 1. pci_bus_sem -> device lock, which is natural
>> 2. device lock -> pci_bus_sem, which is not
>
>Right.  Or at least, that's my assertion :)  I could be convinced
>otherwise.
>
>> pci_enable_sriov() sits in class #2 and your suggestion is to move the
>> pci_iov_add_virtfn() to some queue which will avoid case #2.
>> 
>> If we want to implement your suggestion, one thing unclear to me is
>> how would we handle the error path? Add a notification for the
>> failure? This would be easy for the core kernel, while some big change
>> for those drivers.
>
>My suggestion was for discussion.  It's entirely possible it will turn
>out not to be feasible.
>
>We're only talking about errors from pci_iov_add_virtfn() here.  We
>can still return all the other existing errors from sriov_enable(),
>which the driver can see.  These errors seem more directly related to
>the PF itself.
>
>The pci_iov_add_virtfn() errors are enumeration-type errors (failure
>to add a bus, failure to read config space of a VF, etc.)  These
>feel more like PCI core issues to me.  The driver isn't going to be
>able to do anything about them.
>

Ideally, the PF and the VFs have their own probe functions and they don't
interfere with each other. From this point of view, I agree these failures
are not for the drivers to handle.

But in the real implementation, I am not 100% sure the PF driver
operates without knowledge of the enabled VFs.

>The end result would likely be that a VF is enabled in the hardware
>but not added as a PCI device.  The same errors can occur during
>boot-time or hotplug-time enumeration of non-SR-IOV devices.
>
>Are these sort of errors important to the PF driver?  If the PF driver
>can get along without them, maybe we can use the same strategy as when
>we enumerate all other devices, i.e., log something in dmesg and
>continue on without the device.
>

Besides the functionality, I have another concern about the behavior change.

The current behavior is that the VFs are enabled ALL or NONE; this change would
add a third condition, PARTIAL.

For example, the sysadmin asks for 5 VFs but ends up with only 3 enabled.

Hmm, not a big deal, but we would need to inform the users.

>Bjorn

Patch

diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index d51e4a57b190..f6d8761dc656 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -105,6 +105,11 @@  static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
 	return orig;
 }
 
+struct aer_device_list {
+	struct device *dev;
+	struct hlist_node node;
+};
+
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 890efcc574cb..d524f2c2c288 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -346,6 +346,47 @@  static int report_resume(struct pci_dev *dev, void *data)
 	return 0;
 }
 
+int aer_get_pci_dev(struct pci_dev *pdev, void *data)
+{
+	struct hlist_head *hhead = (struct hlist_head *)data;
+	struct device *dev = &pdev->dev;
+	struct aer_device_list *entry;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		/* continue with other devices, lets not return error */
+		return 0;
+
+	entry->dev = get_device(dev);
+	hlist_add_head(&entry->node, hhead);
+
+	return 0;
+}
+
+static void aer_pci_walk_bus(struct pci_bus *top,
+			     int (*cb)(struct pci_dev *, void *),
+			     struct aer_broadcast_data *result)
+{
+	HLIST_HEAD(dev_hlist);
+	struct hlist_node *tmp;
+	struct aer_device_list *entry;
+
+	pci_walk_bus(top, aer_get_pci_dev, &dev_hlist);
+	hlist_for_each_entry_safe(entry, tmp, &dev_hlist, node) {
+		struct pci_dev *pdev = container_of(entry->dev, struct pci_dev,
+						    dev);
+		int err;
+
+		err = cb(pdev, result);
+		if (err)
+			dev_err(entry->dev, "AER: recovery handler failed: %d",
+				err);
+		put_device(entry->dev);
+		hlist_del(&entry->node);
+		kfree(entry);
+	}
+}
+
 /**
  * broadcast_error_message - handle message broadcast to downstream drivers
  * @dev: pointer to from where in a hierarchy message is broadcasted down
@@ -380,7 +421,7 @@  static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
 		 */
 		if (cb == report_error_detected)
 			dev->error_state = state;
-		pci_walk_bus(dev->subordinate, cb, &result_data);
+		aer_pci_walk_bus(dev->subordinate, cb, &result_data);
 		if (cb == report_resume) {
 			pci_cleanup_aer_uncorrect_error_status(dev);
 			dev->error_state = pci_channel_io_normal;
@@ -390,7 +431,7 @@  static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
 		 * If the error is reported by an end point, we think this
 		 * error is related to the upstream link of the end point.
 		 */
-		pci_walk_bus(dev->bus, cb, &result_data);
+		aer_pci_walk_bus(dev->bus, cb, &result_data);
 	}
 
 	return result_data.result;