
[net,v2] ice: avoid bonding causing auxiliary plug/unplug under RTNL lock

Message ID 20230215191757.1826508-1-david.m.ertman@intel.com
State Accepted
Delegated to: Anthony Nguyen
Series [net,v2] ice: avoid bonding causing auxiliary plug/unplug under RTNL lock

Commit Message

Dave Ertman Feb. 15, 2023, 7:17 p.m. UTC
RDMA is not supported in ice on a PF that has been added to a bonded
interface. To enforce this, when an interface enters a bond, we unplug
the auxiliary device that supports RDMA functionality.  This unplug
currently happens in the context of handling the netdev bonding event.
This event is sent to the ice driver under RTNL context.  This is causing
a deadlock where the RDMA driver is waiting for the RTNL lock to complete
the removal.

Defer the unplugging/re-plugging of the auxiliary device to the service
task so that it is not performed under the RTNL lock context.

Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Link: https://lore.kernel.org/linux-rdma/68b14b11-d0c7-65c9-4eeb-0487c95e395d@leemhuis.info/
Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave")
Fixes: 425c9bd06b7a ("RDMA/irdma: Report the correct link speed")
Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
---
Changes since v1:
Reversed order of bit processing in ice_service_task for PLUG/UNPLUG

 drivers/net/ethernet/intel/ice/ice.h      | 14 +++++---------
 drivers/net/ethernet/intel/ice/ice_main.c | 19 ++++++++-----------
 2 files changed, 13 insertions(+), 20 deletions(-)
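
The mechanics of the fix, condensed: the bonding event handler (which runs under RTNL) no longer calls ice_plug_aux_dev()/ice_unplug_aux_dev() directly; it only sets flag bits, and ice_service_task() later acts on them with RTNL not held, checking UNPLUG before PLUG. Below is a minimal userspace sketch of that pattern, not the driver code itself -- the helper names are hypothetical and C11 atomics stand in for the kernel's set_bit()/clear_bit()/test_and_clear_bit():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for ICE_FLAG_PLUG_AUX_DEV / ICE_FLAG_UNPLUG_AUX_DEV */
enum { FLAG_PLUG_AUX_DEV, FLAG_UNPLUG_AUX_DEV };

static atomic_uint flags;

static bool test_and_clear(unsigned int bit)
{
	unsigned int mask = 1u << bit;

	/* atomically clear the bit and report whether it was set */
	return atomic_fetch_and(&flags, ~mask) & mask;
}

/* Analogue of ice_clear_rdma_cap(): called from the bonding event
 * handler under RTNL, so it must not block. It only records the
 * request and cancels any plug that has not been serviced yet.
 */
static void request_unplug(void)
{
	atomic_fetch_and(&flags, ~(1u << FLAG_PLUG_AUX_DEV));
	atomic_fetch_or(&flags, 1u << FLAG_UNPLUG_AUX_DEV);
}

/* Analogue of the ice_service_task() hunk: RTNL is not held here.
 * UNPLUG is checked before PLUG (the v2 change), so an unplug that
 * was requested while a plug was still pending leaves the device
 * unplugged rather than re-plugged.
 */
static void service_task(void)
{
	if (test_and_clear(FLAG_UNPLUG_AUX_DEV))
		puts("unplug aux dev");		/* ice_unplug_aux_dev(pf) */
	if (test_and_clear(FLAG_PLUG_AUX_DEV))
		puts("plug aux dev");		/* ice_plug_aux_dev(pf) */
}

int main(void)
{
	request_unplug();
	service_task();		/* prints "unplug aux dev" */
	return 0;
}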

Comments

Petr Oros Feb. 16, 2023, 8:54 a.m. UTC | #1
Hi Dave,

The first version was sent and commented on the upstream mailing list.
Why did you omit the Cc to upstream (netdev@vger.kernel.org)?
Can you resend this to the upstream list so this critical fix gets the
valuable feedback there?

Many Thanks,
Petr


Dave Ertman wrote on Wed, 15 Feb 2023 at 11:17 -0800:
> RDMA is not supported in ice on a PF that has been added to a bonded
> interface. To enforce this, when an interface enters a bond, we unplug
> the auxiliary device that supports RDMA functionality.  This unplug
> currently happens in the context of handling the netdev bonding event.
> This event is sent to the ice driver under RTNL context.  This is causing
> a deadlock where the RDMA driver is waiting for the RTNL lock to complete
> the removal.
> 
> Defer the unplugging/re-plugging of the auxiliary device to the service
> task so that it is not performed under the RTNL lock context.
> 
> Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
> Link: https://lore.kernel.org/linux-rdma/68b14b11-d0c7-65c9-4eeb-0487c95e395d@leemhuis.info/
> Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave")
> Fixes: 425c9bd06b7a ("RDMA/irdma: Report the correct link speed")
> Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
> ---
> Changes since v1:
> Reversed order of bit processing in ice_service_task for PLUG/UNPLUG
> 
>  drivers/net/ethernet/intel/ice/ice.h      | 14 +++++---------
>  drivers/net/ethernet/intel/ice/ice_main.c | 19 ++++++++-----------
>  2 files changed, 13 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> index 2f0b604abc5e..0ad9bab84617 100644
> --- a/drivers/net/ethernet/intel/ice/ice.h
> +++ b/drivers/net/ethernet/intel/ice/ice.h
> @@ -506,6 +506,7 @@ enum ice_pf_flags {
>         ICE_FLAG_VF_VLAN_PRUNING,
>         ICE_FLAG_LINK_LENIENT_MODE_ENA,
>         ICE_FLAG_PLUG_AUX_DEV,
> +       ICE_FLAG_UNPLUG_AUX_DEV,
>         ICE_FLAG_MTU_CHANGED,
>         ICE_FLAG_GNSS,                  /* GNSS successfully initialized */
>         ICE_PF_FLAGS_NBITS              /* must be last */
> @@ -950,16 +951,11 @@ static inline void ice_set_rdma_cap(struct ice_pf *pf)
>   */
>  static inline void ice_clear_rdma_cap(struct ice_pf *pf)
>  {
> -       /* We can directly unplug aux device here only if the flag bit
> -        * ICE_FLAG_PLUG_AUX_DEV is not set because ice_unplug_aux_dev()
> -        * could race with ice_plug_aux_dev() called from
> -        * ice_service_task(). In this case we only clear that bit now and
> -        * aux device will be unplugged later once ice_plug_aux_device()
> -        * called from ice_service_task() finishes (see ice_service_task()).
> +       /* defer unplug to service task to avoid RTNL lock and
> +        * clear PLUG bit so that pending plugs don't interfere
>          */
> -       if (!test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags))
> -               ice_unplug_aux_dev(pf);
> -
> +       clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags);
> +       set_bit(ICE_FLAG_UNPLUG_AUX_DEV, pf->flags);
>         clear_bit(ICE_FLAG_RDMA_ENA, pf->flags);
>  }
>  #endif /* _ICE_H_ */
> diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> index a9a7f8b52140..b62c2f18eb5f 100644
> --- a/drivers/net/ethernet/intel/ice/ice_main.c
> +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> @@ -2290,18 +2290,15 @@ static void ice_service_task(struct work_struct *work)
>                 }
>         }
>  
> -       if (test_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) {
> -               /* Plug aux device per request */
> -               ice_plug_aux_dev(pf);
> +       /* unplug aux dev per request, if an unplug request came in
> +        * while processing a plug request, this will handle it
> +        */
> +       if (test_and_clear_bit(ICE_FLAG_UNPLUG_AUX_DEV, pf->flags))
> +               ice_unplug_aux_dev(pf);
>  
> -               /* Mark plugging as done but check whether unplug was
> -                * requested during ice_plug_aux_dev() call
> -                * (e.g. from ice_clear_rdma_cap()) and if so then
> -                * plug aux device.
> -                */
> -               if (!test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags))
> -                       ice_unplug_aux_dev(pf);
> -       }
> +       /* Plug aux device per request */
> +       if (test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags))
> +               ice_plug_aux_dev(pf);
>  
>         if (test_and_clear_bit(ICE_FLAG_MTU_CHANGED, pf->flags)) {
>                 struct iidc_event *event;
Linux regression tracking (Thorsten Leemhuis) Feb. 16, 2023, 10:33 a.m. UTC | #2
On 15.02.23 20:17, Dave Ertman wrote:
> RDMA is not supported in ice on a PF that has been added to a bonded
> interface. To enforce this, when an interface enters a bond, we unplug
> the auxiliary device that supports RDMA functionality.  This unplug
> currently happens in the context of handling the netdev bonding event.
> This event is sent to the ice driver under RTNL context.  This is causing
> a deadlock where the RDMA driver is waiting for the RTNL lock to complete
> the removal.
> 
> Defer the unplugging/re-plugging of the auxiliary device to the service
> task so that it is not performed under the RTNL lock context.

Thanks for taking care of this.

> Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
> Link: https://lore.kernel.org/linux-rdma/68b14b11-d0c7-65c9-4eeb-0487c95e395d@leemhuis.info/

FWIW, that should be

Link: https://lore.kernel.org/netdev/CAK8fFZ6A_Gphw_3-QMGKEFQk=sfCw1Qmq0TVZK3rtAi7vb621A@mail.gmail.com/

instead, as that's Jaroslav's report.

> Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave")
> Fixes: 425c9bd06b7a ("RDMA/irdma: Report the correct link speed")
> Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
> [...]

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
--
Everything you wanna know about Linux kernel regression tracking:
https://linux-regtracking.leemhuis.info/about/#tldr
If I did something stupid, please tell me, as explained on that page.
Linux regression tracking (Thorsten Leemhuis) Feb. 16, 2023, 10:36 a.m. UTC | #3
Sorry, forgot something

On 16.02.23 11:33, Linux regression tracking (Thorsten Leemhuis) wrote:
> On 15.02.23 20:17, Dave Ertman wrote:
>> RDMA is not supported in ice on a PF that has been added to a bonded
>> interface. To enforce this, when an interface enters a bond, we unplug
> [...]
>> Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
>> Link: https://lore.kernel.org/linux-rdma/68b14b11-d0c7-65c9-4eeb-0487c95e395d@leemhuis.info/
> 
> FWIW, that should be
> 
> Link: https://lore.kernel.org/netdev/CAK8fFZ6A_Gphw_3-QMGKEFQk=sfCw1Qmq0TVZK3rtAi7vb621A@mail.gmail.com/
> 
> instead, as that's Jaroslav's report.
> 
>> Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave")
>> Fixes: 425c9bd06b7a ("RDMA/irdma: Report the correct link speed")

As this was a regression that was noticed in 6.1.2, this afaics also needs:

Cc: stable@vger.kernel.org # 6.1.x

[no, fixes tags alone do not suffice, see docs]

Ciao, Thorsten
Linus Heckemann Feb. 16, 2023, 5:24 p.m. UTC | #4
Dave Ertman <david.m.ertman@intel.com> writes:
> RDMA is not supported in ice on a PF that has been added to a bonded
> interface. To enforce this, when an interface enters a bond, we unplug
> the auxiliary device that supports RDMA functionality.  This unplug
> currently happens in the context of handling the netdev bonding event.
> This event is sent to the ice driver under RTNL context.  This is causing
> a deadlock where the RDMA driver is waiting for the RTNL lock to complete
> the removal.
>
> Defer the unplugging/re-plugging of the auxiliary device to the service
> task so that it is not performed under the RTNL lock context.
>
> Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
> Link: https://lore.kernel.org/linux-rdma/68b14b11-d0c7-65c9-4eeb-0487c95e395d@leemhuis.info/
> Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave")
> Fixes: 425c9bd06b7a ("RDMA/irdma: Report the correct link speed")
> Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
> ---
> Changes since v1:
> Reversed order of bit processing in ice_service_task for PLUG/UNPLUG

Hi Dave,

Thanks for your continued work on this! We've tested this on a system
affected by the original issue (with 8086:1593 cards) and, unlike v1 of
the patch, it appears not to resolve it:

[  250.826308] INFO: task kworker/0:0:7 blocked for more than 122 seconds.
[  250.832925]       Not tainted 5.15.86 #1-NixOS
[  250.837370] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  250.845195] task:kworker/0:0     state:D stack:    0 pid:    7 ppid:     2 flags:0x00004000
[  250.845200] Workqueue: infiniband ib_cache_event_task [ib_core]
[  250.845227] Call Trace:
[  250.845228]  <TASK>
[  250.845230]  __schedule+0x2e6/0x13a0
[  250.845238]  ? dev_vprintk_emit+0x171/0x199
[  250.845242]  schedule+0x5b/0xd0
[  250.845244]  schedule_preempt_disabled+0xa/0x10
[  250.845247]  __mutex_lock.constprop.0+0x216/0x400
[  250.845249]  ib_get_eth_speed+0x65/0x190 [ib_core]
[  250.845262]  ? update_load_avg+0x7a/0x5c0
[  250.845265]  ? newidle_balance+0x121/0x400
[  250.845268]  irdma_query_port+0xac/0x110 [irdma]
[  250.845283]  ib_query_port+0x16f/0x1f0 [ib_core]
[  250.845301]  ib_cache_update.part.0+0x5e/0x280 [ib_core]
[  250.845313]  ? __schedule+0x2ee/0x13a0
[  250.845316]  ib_cache_event_task+0x53/0x70 [ib_core]
[  250.845327]  process_one_work+0x1ee/0x390
[  250.845330]  worker_thread+0x53/0x3e0
[  250.845332]  ? process_one_work+0x390/0x390
[  250.845334]  kthread+0x124/0x150
[  250.845336]  ? set_kthread_struct+0x50/0x50
[  250.845339]  ret_from_fork+0x1f/0x30
[  250.845344]  </TASK>
[  250.845418] INFO: task kworker/19:1:750 blocked for more than 122 seconds.
[  250.852294]       Not tainted 5.15.86 #1-NixOS
[  250.856739] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  250.864565] task:kworker/19:1    state:D stack:    0 pid:  750 ppid:     2 flags:0x00004000
[  250.864567] Workqueue: events linkwatch_event
[  250.864571] Call Trace:
[  250.864572]  <TASK>
[  250.864573]  __schedule+0x2e6/0x13a0
[  250.864576]  ? newidle_balance+0x121/0x400
[  250.864578]  schedule+0x5b/0xd0
[  250.864581]  schedule_preempt_disabled+0xa/0x10
[  250.864583]  __mutex_lock.constprop.0+0x216/0x400
[  250.864585]  linkwatch_event+0xa/0x30
[  250.864588]  process_one_work+0x1ee/0x390
[  250.864589]  worker_thread+0x53/0x3e0
[  250.864591]  ? process_one_work+0x390/0x390
[  250.864592]  kthread+0x124/0x150
[  250.864594]  ? set_kthread_struct+0x50/0x50
[  250.864597]  ret_from_fork+0x1f/0x30
[  250.864600]  </TASK>
[  250.864621] INFO: task kworker/119:1:946 blocked for more than 122 seconds.
[  250.871578]       Not tainted 5.15.86 #1-NixOS
[  250.876025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  250.883848] task:kworker/119:1   state:D stack:    0 pid:  946 ppid:     2 flags:0x00004000
[  250.883851] Workqueue: events_power_efficient reg_check_chans_work [cfg80211]
[  250.883890] Call Trace:
[  250.883891]  <TASK>
[  250.883892]  __schedule+0x2e6/0x13a0
[  250.883895]  ? ttwu_do_wakeup+0x17/0x150
[  250.883898]  ? try_to_wake_up+0x1d1/0x420
[  250.883901]  schedule+0x5b/0xd0
[  250.883903]  schedule_preempt_disabled+0xa/0x10
[  250.883905]  __mutex_lock.constprop.0+0x216/0x400
[  250.883907]  reg_check_chans_work+0x2d/0x390 [cfg80211]
[  250.883925]  ? add_timer_on+0xe9/0x120
[  250.883928]  process_one_work+0x1ee/0x390
[  250.883930]  worker_thread+0x53/0x3e0
[  250.883931]  ? process_one_work+0x390/0x390
[  250.883932]  kthread+0x124/0x150
[  250.883935]  ? set_kthread_struct+0x50/0x50
[  250.883937]  ret_from_fork+0x1f/0x30
[  250.883940]  </TASK>
[  250.883943] INFO: task kworker/0:3:1082 blocked for more than 122 seconds.
[  250.890817]       Not tainted 5.15.86 #1-NixOS
[  250.895262] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  250.903088] task:kworker/0:3     state:D stack:    0 pid: 1082 ppid:     2 flags:0x00004000
[  250.903090] Workqueue: infiniband ib_cache_event_task [ib_core]
[  250.903104] Call Trace:
[  250.903105]  <TASK>
[  250.903106]  __schedule+0x2e6/0x13a0
[  250.903109]  schedule+0x5b/0xd0
[  250.903111]  schedule_preempt_disabled+0xa/0x10
[  250.903114]  __mutex_lock.constprop.0+0x216/0x400
[  250.903115]  ib_get_eth_speed+0x65/0x190 [ib_core]
[  250.903127]  ? raw_spin_rq_lock_nested+0x12/0x70
[  250.903129]  ? newidle_balance+0x2ee/0x400
[  250.903131]  irdma_query_port+0xac/0x110 [irdma]
[  250.903144]  ib_query_port+0x16f/0x1f0 [ib_core]
[  250.903155]  ib_cache_update.part.0+0x5e/0x280 [ib_core]
[  250.903166]  ? __schedule+0x2ee/0x13a0
[  250.903169]  ib_cache_event_task+0x53/0x70 [ib_core]
[  250.903178]  process_one_work+0x1ee/0x390
[  250.903180]  worker_thread+0x53/0x3e0
[  250.903181]  ? process_one_work+0x390/0x390
[  250.903183]  kthread+0x124/0x150
[  250.903185]  ? set_kthread_struct+0x50/0x50
[  250.903187]  ret_from_fork+0x1f/0x30
[  250.903190]  </TASK>
[  250.903195] INFO: task systemd-network:2679 blocked for more than 122 seconds.
[  250.910414]       Not tainted 5.15.86 #1-NixOS
[  250.914858] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  250.922685] task:systemd-network state:D stack:    0 pid: 2679 ppid:     1 flags:0x00004226
[  250.922687] Call Trace:
[  250.922688]  <TASK>
[  250.922688]  ? usleep_range_state+0x90/0x90
[  250.922690]  __schedule+0x2e6/0x13a0
[  250.922693]  ? __wake_up_common+0x7d/0x190
[  250.922695]  ? usleep_range_state+0x90/0x90
[  250.922696]  schedule+0x5b/0xd0
[  250.922699]  schedule_timeout+0x104/0x140
[  250.922700]  ? __prepare_to_swait+0x4f/0x70
[  250.922702]  __wait_for_common+0xac/0x160
[  250.922704]  flush_workqueue+0x13a/0x3f0
[  250.922706]  ib_cache_cleanup_one+0x1c/0xe0 [ib_core]
[  250.922719]  __ib_unregister_device+0x6a/0xb0 [ib_core]
[  250.922730]  ib_unregister_device+0x22/0x30 [ib_core]
[  250.922742]  irdma_remove+0x1a/0x60 [irdma]
[  250.922752]  auxiliary_bus_remove+0x18/0x30
[  250.922754]  __device_release_driver+0x17b/0x250
[  250.922758]  device_release_driver+0x24/0x30
[  250.922760]  bus_remove_device+0xd8/0x150
[  250.922762]  device_del+0x18b/0x400
[  250.922765]  ice_unplug_aux_dev+0x42/0x60 [ice]
[  250.922786]  ice_lag_event_handler+0x4c9/0x580 [ice]
[  250.922800]  raw_notifier_call_chain+0x41/0x60
[  250.922802]  __netdev_upper_dev_link+0x1a0/0x370
[  250.922806]  netdev_master_upper_dev_link+0x3d/0x60
[  250.922809]  bond_enslave+0xbe3/0x1620 [bonding]
[  250.922819]  ? __nla_parse+0x22/0x30
[  250.922822]  ? inet6_set_link_af+0x6a/0x3d0
[  250.922825]  do_setlink+0x28e/0x1050
[  250.922828]  ? select_task_rq_fair+0x131/0x1090
[  250.922830]  ? select_task_rq_fair+0x131/0x1090
[  250.922832]  ? __nla_validate_parse+0x5f/0xc10
[  250.922834]  ? sched_clock_cpu+0x9/0xa0
[  250.922835]  ? __smp_call_single_queue+0x23/0x40
[  250.922837]  ? ttwu_queue_wakelist+0xf4/0x110
[  250.922840]  rtnl_setlink+0xe5/0x180
[  250.922844]  ? security_capable+0x33/0x60
[  250.922846]  rtnetlink_rcv_msg+0x12e/0x380
[  250.922849]  ? rtnl_calcit.isra.0+0x130/0x130
[  250.922851]  netlink_rcv_skb+0x4e/0x100
[  250.922854]  netlink_unicast+0x204/0x2e0
[  250.922855]  netlink_sendmsg+0x24e/0x4b0
[  250.922857]  sock_sendmsg+0x5f/0x70
[  250.922861]  __sys_sendto+0xf0/0x160
[  250.922862]  ? __seccomp_filter+0x389/0x580
[  250.922867]  __x64_sys_sendto+0x20/0x30
[  250.922868]  do_syscall_64+0x38/0x90
[  250.922872]  entry_SYSCALL_64_after_hwframe+0x61/0xcb
[  250.922876] RIP: 0033:0x7f4340b8befa
[  250.922894] RSP: 002b:00007ffd17815b68 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
[  250.922896] RAX: ffffffffffffffda RBX: 00005603231d29a8 RCX: 00007f4340b8befa
[  250.922897] RDX: 0000000000000028 RSI: 00005603231d2930 RDI: 0000000000000003
[  250.922898] RBP: 00005603231aade0 R08: 00007ffd17815b70 R09: 0000000000000080
[  250.922899] R10: 0000000000000000 R11: 0000000000000246 R12: 00005603231d24a0
[  250.922900] R13: 00005603231d2960 R14: 0000000000000000 R15: 0000560323063140
[  250.922902]  </TASK>
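
Read together, the traces above close a single wait cycle: systemd-network takes RTNL and, still holding it, removes the auxiliary device; the removal flushes the infiniband cache workqueue; and the pending cache work is itself blocked waiting for RTNL. Simplified call chains, reconstructed from the frames above:

systemd-network (holds RTNL):
  rtnl_setlink()                      /* acquires rtnl_lock */
    bond_enslave()
      ice_lag_event_handler()
        ice_unplug_aux_dev()
          irdma_remove()
            ib_unregister_device()
              ib_cache_cleanup_one()  /* flush_workqueue(): waits on the work below */

kworker (infiniband workqueue):
  ib_cache_event_task()
    ib_cache_update()
      irdma_query_port()
        ib_get_eth_speed()            /* blocks waiting for rtnl_lock */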
Dave Ertman Feb. 24, 2023, 6:33 p.m. UTC | #5
> -----Original Message-----
> From: Linus Heckemann <git@sphalerite.org>
> Sent: Thursday, February 16, 2023 9:24 AM
> To: Ertman, David M <david.m.ertman@intel.com>; intel-wired-lan@lists.osuosl.org
> Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
> Subject: Re: [Intel-wired-lan] [PATCH net v2] ice: avoid bonding causing auxiliary plug/unplug under RTNL lock
> 
> Dave Ertman <david.m.ertman@intel.com> writes:
> > RDMA is not supported in ice on a PF that has been added to a bonded
> > interface. To enforce this, when an interface enters a bond, we unplug
> > the auxiliary device that supports RDMA functionality.  This unplug
> > currently happens in the context of handling the netdev bonding event.
> > This event is sent to the ice driver under RTNL context.  This is causing
> > a deadlock where the RDMA driver is waiting for the RTNL lock to complete
> > the removal.
> >
> > Defer the unplugging/re-plugging of the auxiliary device to the service
> > task so that it is not performed under the RTNL lock context.
> >
> > Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
> > Link: https://lore.kernel.org/linux-rdma/68b14b11-d0c7-65c9-4eeb-0487c95e395d@leemhuis.info/
> > Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave")
> > Fixes: 425c9bd06b7a ("RDMA/irdma: Report the correct link speed")
> > Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
> > ---
> > Changes since v1:
> > Reversed order of bit processing in ice_service_task for PLUG/UNPLUG
> 
> Hi Dave,
> 
> Thanks for your continued work on this! We've tested this on a system
> affected by the original issue (with 8086:1593 cards) and, unlike v1 of
> the patch, it appears not to resolve it:

Hi Linus,

This error confuses me.  The only difference between v1 and v2 of this patch
is the order in which we process state bits in the service task thread.  They are
still being processed outside of RTNL context.

Can you provide the steps you used to reproduce this issue? 

Thanks,
DaveE
Petr Oros March 1, 2023, 1:12 p.m. UTC | #6
Ertman, David M wrote on Fri, 24 Feb 2023 at 18:33 +0000:
> > -----Original Message-----
> > From: Linus Heckemann <git@sphalerite.org>
> > Sent: Thursday, February 16, 2023 9:24 AM
> > To: Ertman, David M <david.m.ertman@intel.com>; intel-wired-lan@lists.osuosl.org
> > Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
> > Subject: Re: [Intel-wired-lan] [PATCH net v2] ice: avoid bonding causing auxiliary plug/unplug under RTNL lock
> > 
> > Dave Ertman <david.m.ertman@intel.com> writes:
> > > RDMA is not supported in ice on a PF that has been added to a bonded
> > > interface. To enforce this, when an interface enters a bond, we unplug
> > > the auxiliary device that supports RDMA functionality.  This unplug
> > > currently happens in the context of handling the netdev bonding event.
> > > This event is sent to the ice driver under RTNL context.  This is
> > > causing a deadlock where the RDMA driver is waiting for the RTNL lock
> > > to complete the removal.
> > > 
> > > Defer the unplugging/re-plugging of the auxiliary device to the
> > > service task so that it is not performed under the RTNL lock context.
> > > 
> > > Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
> > > Link: https://lore.kernel.org/linux-rdma/68b14b11-d0c7-65c9-4eeb-0487c95e395d@leemhuis.info/
> > > Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave")
> > > Fixes: 425c9bd06b7a ("RDMA/irdma: Report the correct link speed")
> > > Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
> > > ---
> > > Changes since v1:
> > > Reversed order of bit processing in ice_service_task for PLUG/UNPLUG
> > 
> > Hi Dave,
> > 
> > Thanks for your continued work on this! We've tested this on a system
> > affected by the original issue (with 8086:1593 cards) and, unlike v1 of
> > the patch, it appears not to resolve it:
> 
> Hi Linus,
> 
> This error confuses me.  The only difference between v1 and v2 of this
> patch is the order in which we process state bits in the service task
> thread.  They are still being processed outside of RTNL context.
> 
> Can you provide the steps you used to reproduce this issue?
Hi all,

I have tested this fix and I can confirm that the issue is resolved
with v2.

With patch (v1 or v2)
$ modprobe -v bonding mode=1 miimon=100 max_bonds=1
insmod /lib/modules/6.2.0+/kernel/net/tls/tls.ko 
insmod /lib/modules/6.2.0+/kernel/drivers/net/bonding/bonding.ko max_bonds=0 mode=1 miimon=100 max_bonds=1
$ ip link set up bond0
$ ifenslave bond0 enp65s0f0np0 enp65s0f1np1
$ 

Without patch
$ modprobe -v bonding mode=1 miimon=100 max_bonds=1
insmod /lib/modules/6.2.0+/kernel/net/tls/tls.ko 
insmod /lib/modules/6.2.0+/kernel/drivers/net/bonding/bonding.ko max_bonds=0 mode=1 miimon=100 max_bonds=1
$ ip link set up bond0
$ ifenslave bond0 enp65s0f0np0 enp65s0f1np1
^^^^^^ HANG

Regards,
Petr

> 
> Thanks,
> DaveE
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan@osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan
>
Linus Heckemann March 2, 2023, 12:38 p.m. UTC | #7
"Ertman, David M" <david.m.ertman@intel.com> writes:

>> -----Original Message-----
>> From: Linus Heckemann <git@sphalerite.org>
>> Sent: Thursday, February 16, 2023 9:24 AM
>> To: Ertman, David M <david.m.ertman@intel.com>; intel-wired-lan@lists.osuosl.org
>> Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
>> Subject: Re: [Intel-wired-lan] [PATCH net v2] ice: avoid bonding causing auxiliary plug/unplug under RTNL lock
>> 
>> Dave Ertman <david.m.ertman@intel.com> writes:
>> > RDMA is not supported in ice on a PF that has been added to a bonded
>> > interface. To enforce this, when an interface enters a bond, we unplug
>> > the auxiliary device that supports RDMA functionality.  This unplug
>> > currently happens in the context of handling the netdev bonding event.
>> > This event is sent to the ice driver under RTNL context.  This is causing
>> > a deadlock where the RDMA driver is waiting for the RTNL lock to complete
>> > the removal.
>> >
>> > Defer the unplugging/re-plugging of the auxiliary device to the service
>> > task so that it is not performed under the RTNL lock context.
>> >
>> > Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
>> > Link: https://lore.kernel.org/linux-rdma/68b14b11-d0c7-65c9-4eeb-0487c95e395d@leemhuis.info/
>> > Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave")
>> > Fixes: 425c9bd06b7a ("RDMA/irdma: Report the correct link speed")
>> > Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
>> > ---
>> > Changes since v1:
>> > Reversed order of bit processing in ice_service_task for PLUG/UNPLUG
>> 
>> Hi Dave,
>> 
>> Thanks for your continued work on this! We've tested this on a system
>> affected by the original issue (with 8086:1593 cards) and, unlike v1 of
>> the patch, it appears not to resolve it:
>
> Hi Linus,
>
> This error confuses me.  The only difference between v1 and v2 of this patch
> is the order in which we process state bits in the service task thread.  They are
> still being processed outside of RTNL context.
>
> Can you provide the steps you used to reproduce this issue? 
>
> Thanks,
> DaveE

Hi Dave,

It confuses me as well!

Like before, this was reproduced by booting a system configured to bond
the interfaces provided by two of the cards (using systemd-networkd,
relevant config below). The failure occurred less frequently than prior
to applying the patch, but still enough to be quite an annoyance!

According to the provider, the machine's card was on an older firmware
(3.00 0x8000893f 20.5.13), and upgrading to the latest available version
resolved this issue for our purposes. Nevertheless, I think the kernel
shouldn't deadlock on the RTNL lock regardless of which firmware
version is running. If there's any more information that would be
helpful for debugging, let us know -- though we can't get at machines
running the old firmware trivially, so it's hard for us to reproduce at
this point.

As mentioned, upgrading the firmware has resolved the problem for us,
though it certainly feels unsatisfying to leave the bug there. I have no
strong opinion on whether the patch should be included as is
anyway. Maybe the firmware version info is enough to help you reproduce
the problem?

Linus
Arland, ArpanaX March 6, 2023, 11:03 a.m. UTC | #8
> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf Of Ertman, David M
> Sent: Thursday, February 16, 2023 12:48 AM
> To: intel-wired-lan@lists.osuosl.org
> Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
> Subject: [Intel-wired-lan] [PATCH net v2] ice: avoid bonding causing auxiliary plug/unplug under RTNL lock
>
> RDMA is not supported in ice on a PF that has been added to a bonded interface. To enforce this, when an interface enters a bond, we unplug the auxiliary device that supports RDMA functionality.  This unplug currently happens in the context of handling the netdev bonding event.
> This event is sent to the ice driver under RTNL context.  This is causing a deadlock where the RDMA driver is waiting for the RTNL lock to complete the removal.
>
> Defer the unplugging/re-plugging of the auxiliary device to the service task so that it is not performed under the RTNL lock context.
>
> Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
> Link: https://lore.kernel.org/linux-rdma/68b14b11-d0c7-65c9-4eeb-0487c95e395d@leemhuis.info/
> Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave")
> Fixes: 425c9bd06b7a ("RDMA/irdma: Report the correct link speed")
> Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
> ---
> Changes since v1:
> Reversed order of bit processing in ice_service_task for PLUG/UNPLUG
>
>  drivers/net/ethernet/intel/ice/ice.h      | 14 +++++---------
>  drivers/net/ethernet/intel/ice/ice_main.c | 19 ++++++++-----------
>  2 files changed, 13 insertions(+), 20 deletions(-)
>
Tested-by: Arpana Arland <arpanax.arland@intel.com> (A Contingent worker at Intel)
Tony Nguyen March 10, 2023, 7:03 p.m. UTC | #9
On 3/2/2023 4:38 AM, Linus Heckemann wrote:
> "Ertman, David M" <david.m.ertman@intel.com> writes:
> 
>>> -----Original Message-----
>>> From: Linus Heckemann <git@sphalerite.org>
>>> Sent: Thursday, February 16, 2023 9:24 AM
>>> To: Ertman, David M <david.m.ertman@intel.com>; intel-wired-lan@lists.osuosl.org
>>> Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
>>> Subject: Re: [Intel-wired-lan] [PATCH net v2] ice: avoid bonding causing auxiliary plug/unplug under RTNL lock
>>>
>>> Dave Ertman <david.m.ertman@intel.com> writes:
>>>> RDMA is not supported in ice on a PF that has been added to a bonded
>>>> interface. To enforce this, when an interface enters a bond, we unplug
>>>> the auxiliary device that supports RDMA functionality.  This unplug
>>>> currently happens in the context of handling the netdev bonding event.
>>>> This event is sent to the ice driver under RTNL context.  This is causing
>>>> a deadlock where the RDMA driver is waiting for the RTNL lock to complete
>>>> the removal.
>>>>
>>>> Defer the unplugging/re-plugging of the auxiliary device to the service
>>>> task so that it is not performed under the RTNL lock context.
>>>>
>>>> Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
>>>> Link: https://lore.kernel.org/linux-rdma/68b14b11-d0c7-65c9-4eeb-0487c95e395d@leemhuis.info/
>>>> Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave")
>>>> Fixes: 425c9bd06b7a ("RDMA/irdma: Report the correct link speed")
>>>> Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
>>>> ---
>>>> Changes since v1:
>>>> Reversed order of bit processing in ice_service_task for PLUG/UNPLUG
>>>
>>> Hi Dave,
>>>
>>> Thanks for your continued work on this! We've tested this on a system
>>> affected by the original issue (with 8086:1593 cards) and, unlike v1 of
>>> the patch, it appears not to resolve it:
>>
>> Hi Linus,
>>
>> This error confuses me.  The only difference between v1 and v2 of this patch
>> is the order in which we process state bits in the service task thread.  They are
>> still being processed outside of RTNL context.
>>
>> Can you provide the steps you used to reproduce this issue?
>>
>> Thanks,
>> DaveE
> 
> Hi Dave,
> 
> It confuses me as well!
> 
> Like before, this was reproduced by booting a system configured to bond
> the interfaces provided by two of the cards (using systemd-networkd,
> relevant config below). The failure occurred less frequently than prior
> to applying the patch, but still enough to be quite an annoyance!
> 
> According to the provider, the machine's card was on an older firmware
> (3.00 0x8000893f 20.5.13), and upgrading to the latest available version
> resolved this issue for our purposes. Nevertheless, I think the kernel
> shouldn't deadlock on the RTNL lock regardless of which firmware
> version is running. If there's any more information that would be
> helpful for debugging, let us know -- though we can't get at machines
> running the old firmware trivially, so it's hard for us to reproduce at
> this point.
> 
> As mentioned, upgrading the firmware has resolved the problem for us,
> though it certainly feels unsatisfying to leave the bug there. I have no
> strong opinion on whether the patch should be included as is
> anyway. Maybe the firmware version info is enough to help you reproduce
> the problem?

Thanks for the additional information, Linus. Unfortunately, our
validation was unable to reproduce the issue with the specified firmware
and setup. We were able to see the failure without the patch, but not with
the patch applied. If you run across the issue again or have any other
setup info that may help with a reproduction, please let us know.

Thanks,
Tony

> Linus
> 
> 
> ________
> /etc/systemd/network/10-bond0.netdev:
> [NetDev]
> Kind=bond
> Name=bond0
> 
> [Bond]
> DownDelaySec=0.200000
> LACPTransmitRate=fast
> MIIMonitorSec=0.100000
> Mode=802.3ad
> TransmitHashPolicy=layer3+4
> UpDelaySec=0.200000
> 
> ________
> /etc/systemd/network/40-bond0.network:
> [Match]
> Name=bond0
> 
> [Link]
> #MACAddress=<omitted>
> RequiredForOnline=carrier
> 
> [Network]
> LinkLocalAddressing=no
> 
> # some Address and Route sections omitted
> ________
> /etc/systemd/network/30-eno12419.network:
> [Match]
> Name=eno12419
> 
> [Network]
> Bond=bond0
> 
> ________
> /etc/systemd/network/30-eno12399.network:
> [Match]
> Name=eno12399
> 
> [Network]
> Bond=bond0
> 
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan@osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan

Patch

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 2f0b604abc5e..0ad9bab84617 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -506,6 +506,7 @@ enum ice_pf_flags {
 	ICE_FLAG_VF_VLAN_PRUNING,
 	ICE_FLAG_LINK_LENIENT_MODE_ENA,
 	ICE_FLAG_PLUG_AUX_DEV,
+	ICE_FLAG_UNPLUG_AUX_DEV,
 	ICE_FLAG_MTU_CHANGED,
 	ICE_FLAG_GNSS,			/* GNSS successfully initialized */
 	ICE_PF_FLAGS_NBITS		/* must be last */
@@ -950,16 +951,11 @@ static inline void ice_set_rdma_cap(struct ice_pf *pf)
  */
 static inline void ice_clear_rdma_cap(struct ice_pf *pf)
 {
-	/* We can directly unplug aux device here only if the flag bit
-	 * ICE_FLAG_PLUG_AUX_DEV is not set because ice_unplug_aux_dev()
-	 * could race with ice_plug_aux_dev() called from
-	 * ice_service_task(). In this case we only clear that bit now and
-	 * aux device will be unplugged later once ice_plug_aux_device()
-	 * called from ice_service_task() finishes (see ice_service_task()).
+	/* defer unplug to service task to avoid RTNL lock and
+	 * clear PLUG bit so that pending plugs don't interfere
 	 */
-	if (!test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags))
-		ice_unplug_aux_dev(pf);
-
+	clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags);
+	set_bit(ICE_FLAG_UNPLUG_AUX_DEV, pf->flags);
 	clear_bit(ICE_FLAG_RDMA_ENA, pf->flags);
 }
 #endif /* _ICE_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index a9a7f8b52140..b62c2f18eb5f 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -2290,18 +2290,15 @@ static void ice_service_task(struct work_struct *work)
 		}
 	}
 
-	if (test_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) {
-		/* Plug aux device per request */
-		ice_plug_aux_dev(pf);
+	/* unplug aux dev per request, if an unplug request came in
+	 * while processing a plug request, this will handle it
+	 */
+	if (test_and_clear_bit(ICE_FLAG_UNPLUG_AUX_DEV, pf->flags))
+		ice_unplug_aux_dev(pf);
 
-		/* Mark plugging as done but check whether unplug was
-		 * requested during ice_plug_aux_dev() call
-		 * (e.g. from ice_clear_rdma_cap()) and if so then
-		 * plug aux device.
-		 */
-		if (!test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags))
-			ice_unplug_aux_dev(pf);
-	}
+	/* Plug aux device per request */
+	if (test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags))
+		ice_plug_aux_dev(pf);
 
 	if (test_and_clear_bit(ICE_FLAG_MTU_CHANGED, pf->flags)) {
 		struct iidc_event *event;