Message ID | 1565140434-8109-6-git-send-email-wenxu@ucloud.cn |
---|---|
State | Awaiting Upstream |
Delegated to: | Pablo Neira |
Headers | show |
Series | [net-next,v7,1/6] cls_api: modify the tc_indr_block_ing_cmd parameters. | expand |
On Wed 07 Aug 2019 at 04:13, wenxu@ucloud.cn wrote: > From: wenxu <wenxu@ucloud.cn> > > It provide a callback list to find the blocks of tc > and nft subsystems > > Signed-off-by: wenxu <wenxu@ucloud.cn> > Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com> > --- > v7: add a mutex lock for add/del flow_indr_block_ing_cb > > include/net/flow_offload.h | 10 ++++++++- > net/core/flow_offload.c | 51 ++++++++++++++++++++++++++++++++++------------ > net/sched/cls_api.c | 9 +++++++- > 3 files changed, 55 insertions(+), 15 deletions(-) > > diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h > index 46b8777..e8069b6 100644 > --- a/include/net/flow_offload.h > +++ b/include/net/flow_offload.h > @@ -379,6 +379,15 @@ typedef void flow_indr_block_ing_cmd_t(struct net_device *dev, > void *cb_priv, > enum flow_block_command command); > > +struct flow_indr_block_ing_entry { > + flow_indr_block_ing_cmd_t *cb; > + struct list_head list; > +}; > + > +void flow_indr_add_block_ing_cb(struct flow_indr_block_ing_entry *entry); > + > +void flow_indr_del_block_ing_cb(struct flow_indr_block_ing_entry *entry); > + > int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, > flow_indr_block_bind_cb_t *cb, > void *cb_ident); > @@ -395,7 +404,6 @@ void flow_indr_block_cb_unregister(struct net_device *dev, > void *cb_ident); > > void flow_indr_block_call(struct net_device *dev, > - flow_indr_block_ing_cmd_t *cb, > struct flow_block_offload *bo, > enum flow_block_command command); > > diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c > index 4cc18e4..64c3d4d 100644 > --- a/net/core/flow_offload.c > +++ b/net/core/flow_offload.c > @@ -3,6 +3,7 @@ > #include <linux/slab.h> > #include <net/flow_offload.h> > #include <linux/rtnetlink.h> > +#include <linux/mutex.h> > > struct flow_rule *flow_rule_alloc(unsigned int num_actions) > { > @@ -282,6 +283,8 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, > } > 
EXPORT_SYMBOL(flow_block_cb_setup_simple); > > +static LIST_HEAD(block_ing_cb_list); > + > static struct rhashtable indr_setup_block_ht; > > struct flow_indr_block_cb { > @@ -295,7 +298,6 @@ struct flow_indr_block_dev { > struct rhash_head ht_node; > struct net_device *dev; > unsigned int refcnt; > - flow_indr_block_ing_cmd_t *block_ing_cmd_cb; > struct list_head cb_list; > }; > > @@ -389,6 +391,20 @@ static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb) > kfree(indr_block_cb); > } > > +static void flow_block_ing_cmd(struct net_device *dev, > + flow_indr_block_bind_cb_t *cb, > + void *cb_priv, > + enum flow_block_command command) > +{ > + struct flow_indr_block_ing_entry *entry; > + > + rcu_read_lock(); > + list_for_each_entry_rcu(entry, &block_ing_cb_list, list) { > + entry->cb(dev, cb, cb_priv, command); > + } > + rcu_read_unlock(); > +} Hi, I'm getting following incorrect rcu usage warnings with this patch caused by rcu_read_lock in flow_block_ing_cmd: [ 401.510948] ============================= [ 401.510952] WARNING: suspicious RCU usage [ 401.510993] 5.3.0-rc3+ #589 Not tainted [ 401.510996] ----------------------------- [ 401.511001] include/linux/rcupdate.h:265 Illegal context switch in RCU read-side critical section! 
[ 401.511004] other info that might help us debug this: [ 401.511008] rcu_scheduler_active = 2, debug_locks = 1 [ 401.511012] 7 locks held by test-ecmp-add-v/7576: [ 401.511015] #0: 00000000081d71a5 (sb_writers#4){.+.+}, at: vfs_write+0x166/0x1d0 [ 401.511037] #1: 000000002bd338c3 (&of->mutex){+.+.}, at: kernfs_fop_write+0xef/0x1b0 [ 401.511051] #2: 00000000c921c634 (kn->count#317){.+.+}, at: kernfs_fop_write+0xf7/0x1b0 [ 401.511062] #3: 00000000a19cdd56 (&dev->mutex){....}, at: sriov_numvfs_store+0x6b/0x130 [ 401.511079] #4: 000000005425fa52 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x30/0x140 [ 401.511092] #5: 00000000c5822793 (rtnl_mutex){+.+.}, at: unregister_netdevice_notifier+0x35/0x140 [ 401.511101] #6: 00000000c2f3507e (rcu_read_lock){....}, at: flow_block_ing_cmd+0x5/0x130 [ 401.511115] stack backtrace: [ 401.511121] CPU: 21 PID: 7576 Comm: test-ecmp-add-v Not tainted 5.3.0-rc3+ #589 [ 401.511124] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 401.511127] Call Trace: [ 401.511138] dump_stack+0x85/0xc0 [ 401.511146] ___might_sleep+0x100/0x180 [ 401.511154] __mutex_lock+0x5b/0x960 [ 401.511162] ? find_held_lock+0x2b/0x80 [ 401.511173] ? __tcf_get_next_chain+0x1d/0xb0 [ 401.511179] ? mark_held_locks+0x49/0x70 [ 401.511194] ? __tcf_get_next_chain+0x1d/0xb0 [ 401.511198] __tcf_get_next_chain+0x1d/0xb0 [ 401.511251] ? uplink_rep_async_event+0x70/0x70 [mlx5_core] [ 401.511261] tcf_block_playback_offloads+0x39/0x160 [ 401.511276] tcf_block_setup+0x1b0/0x240 [ 401.511312] ? mlx5e_rep_indr_setup_tc_cb+0xca/0x290 [mlx5_core] [ 401.511347] ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core] [ 401.511359] tc_indr_block_get_and_ing_cmd+0x11b/0x1e0 [ 401.511404] ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core] [ 401.511414] flow_block_ing_cmd+0x7e/0x130 [ 401.511453] ? 
mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core] [ 401.511462] __flow_indr_block_cb_unregister+0x7f/0xf0 [ 401.511502] mlx5e_nic_rep_netdevice_event+0x75/0xb0 [mlx5_core] [ 401.511513] unregister_netdevice_notifier+0xe9/0x140 [ 401.511554] mlx5e_cleanup_rep_tx+0x6f/0xe0 [mlx5_core] [ 401.511597] mlx5e_detach_netdev+0x4b/0x60 [mlx5_core] [ 401.511637] mlx5e_vport_rep_unload+0x71/0xc0 [mlx5_core] [ 401.511679] esw_offloads_disable+0x5b/0x90 [mlx5_core] [ 401.511724] mlx5_eswitch_disable.cold+0xdf/0x176 [mlx5_core] [ 401.511759] mlx5_device_disable_sriov+0xab/0xb0 [mlx5_core] [ 401.511794] mlx5_core_sriov_configure+0xaf/0xd0 [mlx5_core] [ 401.511805] sriov_numvfs_store+0xf8/0x130 [ 401.511817] kernfs_fop_write+0x122/0x1b0 [ 401.511826] vfs_write+0xdb/0x1d0 [ 401.511835] ksys_write+0x65/0xe0 [ 401.511847] do_syscall_64+0x5c/0xb0 [ 401.511857] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 401.511862] RIP: 0033:0x7fad892d30f8 [ 401.511868] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 96 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 60 c3 0f 1f 80 00 00 00 00 48 83 ec 28 48 89 [ 401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8 [ 401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001 [ 401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a [ 401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002 [ 401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740 I don't think it is correct approach to try to call these callbacks with rcu protection because: - Cls API uses sleeping locks that cannot be used in rcu read section (hence the included trace). - It assumes that all implementation of classifier ops reoffload() don't sleep. 
- And that all driver offload callbacks (both block and classifier setup) don't sleep, which is not the case. I don't see any straightforward way to fix this, besides using some other locking mechanism to protect block_ing_cb_list. Regards, Vlad
On 8/12/2019 10:11 PM, Vlad Buslov wrote: > >> +static void flow_block_ing_cmd(struct net_device *dev, >> + flow_indr_block_bind_cb_t *cb, >> + void *cb_priv, >> + enum flow_block_command command) >> +{ >> + struct flow_indr_block_ing_entry *entry; >> + >> + rcu_read_lock(); >> + list_for_each_entry_rcu(entry, &block_ing_cb_list, list) { >> + entry->cb(dev, cb, cb_priv, command); >> + } >> + rcu_read_unlock(); >> +} > Hi, > > I'm getting following incorrect rcu usage warnings with this patch > caused by rcu_read_lock in flow_block_ing_cmd: > > [ 401.510948] ============================= > [ 401.510952] WARNING: suspicious RCU usage > [ 401.510993] 5.3.0-rc3+ #589 Not tainted > [ 401.510996] ----------------------------- > [ 401.511001] include/linux/rcupdate.h:265 Illegal context switch in RCU read-side critical section! > [ 401.511004] > other info that might help us debug this: > > [ 401.511008] > rcu_scheduler_active = 2, debug_locks = 1 > [ 401.511012] 7 locks held by test-ecmp-add-v/7576: > [ 401.511015] #0: 00000000081d71a5 (sb_writers#4){.+.+}, at: vfs_write+0x166/0x1d0 > [ 401.511037] #1: 000000002bd338c3 (&of->mutex){+.+.}, at: kernfs_fop_write+0xef/0x1b0 > [ 401.511051] #2: 00000000c921c634 (kn->count#317){.+.+}, at: kernfs_fop_write+0xf7/0x1b0 > [ 401.511062] #3: 00000000a19cdd56 (&dev->mutex){....}, at: sriov_numvfs_store+0x6b/0x130 > [ 401.511079] #4: 000000005425fa52 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x30/0x140 > [ 401.511092] #5: 00000000c5822793 (rtnl_mutex){+.+.}, at: unregister_netdevice_notifier+0x35/0x140 > [ 401.511101] #6: 00000000c2f3507e (rcu_read_lock){....}, at: flow_block_ing_cmd+0x5/0x130 > [ 401.511115] > stack backtrace: > [ 401.511121] CPU: 21 PID: 7576 Comm: test-ecmp-add-v Not tainted 5.3.0-rc3+ #589 > [ 401.511124] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 > [ 401.511127] Call Trace: > [ 401.511138] dump_stack+0x85/0xc0 > [ 401.511146] ___might_sleep+0x100/0x180 > [ 
401.511154] __mutex_lock+0x5b/0x960 > [ 401.511162] ? find_held_lock+0x2b/0x80 > [ 401.511173] ? __tcf_get_next_chain+0x1d/0xb0 > [ 401.511179] ? mark_held_locks+0x49/0x70 > [ 401.511194] ? __tcf_get_next_chain+0x1d/0xb0 > [ 401.511198] __tcf_get_next_chain+0x1d/0xb0 > [ 401.511251] ? uplink_rep_async_event+0x70/0x70 [mlx5_core] > [ 401.511261] tcf_block_playback_offloads+0x39/0x160 > [ 401.511276] tcf_block_setup+0x1b0/0x240 > [ 401.511312] ? mlx5e_rep_indr_setup_tc_cb+0xca/0x290 [mlx5_core] > [ 401.511347] ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core] > [ 401.511359] tc_indr_block_get_and_ing_cmd+0x11b/0x1e0 > [ 401.511404] ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core] > [ 401.511414] flow_block_ing_cmd+0x7e/0x130 > [ 401.511453] ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core] > [ 401.511462] __flow_indr_block_cb_unregister+0x7f/0xf0 > [ 401.511502] mlx5e_nic_rep_netdevice_event+0x75/0xb0 [mlx5_core] > [ 401.511513] unregister_netdevice_notifier+0xe9/0x140 > [ 401.511554] mlx5e_cleanup_rep_tx+0x6f/0xe0 [mlx5_core] > [ 401.511597] mlx5e_detach_netdev+0x4b/0x60 [mlx5_core] > [ 401.511637] mlx5e_vport_rep_unload+0x71/0xc0 [mlx5_core] > [ 401.511679] esw_offloads_disable+0x5b/0x90 [mlx5_core] > [ 401.511724] mlx5_eswitch_disable.cold+0xdf/0x176 [mlx5_core] > [ 401.511759] mlx5_device_disable_sriov+0xab/0xb0 [mlx5_core] > [ 401.511794] mlx5_core_sriov_configure+0xaf/0xd0 [mlx5_core] > [ 401.511805] sriov_numvfs_store+0xf8/0x130 > [ 401.511817] kernfs_fop_write+0x122/0x1b0 > [ 401.511826] vfs_write+0xdb/0x1d0 > [ 401.511835] ksys_write+0x65/0xe0 > [ 401.511847] do_syscall_64+0x5c/0xb0 > [ 401.511857] entry_SYSCALL_64_after_hwframe+0x49/0xbe > [ 401.511862] RIP: 0033:0x7fad892d30f8 > [ 401.511868] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 96 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 60 c3 0f 1f 80 00 00 00 00 48 83 > ec 28 48 89 > [ 401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 
00000246 ORIG_RAX: 0000000000000001 > [ 401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8 > [ 401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001 > [ 401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a > [ 401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002 > [ 401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740 > > I don't think it is correct approach to try to call these callbacks with > rcu protection because: > > - Cls API uses sleeping locks that cannot be used in rcu read section > (hence the included trace). > > - It assumes that all implementation of classifier ops reoffload() don't > sleep. > > - And that all driver offload callbacks (both block and classifier > setup) don't sleep, which is not the case. > > I don't see any straightforward way to fix this, besides using some > other locking mechanism to protect block_ing_cb_list. > > Regards, > Vlad Maybe take the mutex flow_indr_block_ing_cb_lock for lookup, add, and delete on the callback list? Add and delete happen only in the module init case, and the lookup is also infrequent (only on [un]register), so it can be protected with the same lock.
On Wed 14 Aug 2019 at 05:50, wenxu <wenxu@ucloud.cn> wrote: > On 8/12/2019 10:11 PM, Vlad Buslov wrote: >> >>> +static void flow_block_ing_cmd(struct net_device *dev, >>> + flow_indr_block_bind_cb_t *cb, >>> + void *cb_priv, >>> + enum flow_block_command command) >>> +{ >>> + struct flow_indr_block_ing_entry *entry; >>> + >>> + rcu_read_lock(); >>> + list_for_each_entry_rcu(entry, &block_ing_cb_list, list) { >>> + entry->cb(dev, cb, cb_priv, command); >>> + } >>> + rcu_read_unlock(); >>> +} >> Hi, >> >> I'm getting following incorrect rcu usage warnings with this patch >> caused by rcu_read_lock in flow_block_ing_cmd: >> >> [ 401.510948] ============================= >> [ 401.510952] WARNING: suspicious RCU usage >> [ 401.510993] 5.3.0-rc3+ #589 Not tainted >> [ 401.510996] ----------------------------- >> [ 401.511001] include/linux/rcupdate.h:265 Illegal context switch in RCU read-side critical section! >> [ 401.511004] >> other info that might help us debug this: >> >> [ 401.511008] >> rcu_scheduler_active = 2, debug_locks = 1 >> [ 401.511012] 7 locks held by test-ecmp-add-v/7576: >> [ 401.511015] #0: 00000000081d71a5 (sb_writers#4){.+.+}, at: vfs_write+0x166/0x1d0 >> [ 401.511037] #1: 000000002bd338c3 (&of->mutex){+.+.}, at: kernfs_fop_write+0xef/0x1b0 >> [ 401.511051] #2: 00000000c921c634 (kn->count#317){.+.+}, at: kernfs_fop_write+0xf7/0x1b0 >> [ 401.511062] #3: 00000000a19cdd56 (&dev->mutex){....}, at: sriov_numvfs_store+0x6b/0x130 >> [ 401.511079] #4: 000000005425fa52 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x30/0x140 >> [ 401.511092] #5: 00000000c5822793 (rtnl_mutex){+.+.}, at: unregister_netdevice_notifier+0x35/0x140 >> [ 401.511101] #6: 00000000c2f3507e (rcu_read_lock){....}, at: flow_block_ing_cmd+0x5/0x130 >> [ 401.511115] >> stack backtrace: >> [ 401.511121] CPU: 21 PID: 7576 Comm: test-ecmp-add-v Not tainted 5.3.0-rc3+ #589 >> [ 401.511124] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 >> [ 401.511127] 
Call Trace: >> [ 401.511138] dump_stack+0x85/0xc0 >> [ 401.511146] ___might_sleep+0x100/0x180 >> [ 401.511154] __mutex_lock+0x5b/0x960 >> [ 401.511162] ? find_held_lock+0x2b/0x80 >> [ 401.511173] ? __tcf_get_next_chain+0x1d/0xb0 >> [ 401.511179] ? mark_held_locks+0x49/0x70 >> [ 401.511194] ? __tcf_get_next_chain+0x1d/0xb0 >> [ 401.511198] __tcf_get_next_chain+0x1d/0xb0 >> [ 401.511251] ? uplink_rep_async_event+0x70/0x70 [mlx5_core] >> [ 401.511261] tcf_block_playback_offloads+0x39/0x160 >> [ 401.511276] tcf_block_setup+0x1b0/0x240 >> [ 401.511312] ? mlx5e_rep_indr_setup_tc_cb+0xca/0x290 [mlx5_core] >> [ 401.511347] ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core] >> [ 401.511359] tc_indr_block_get_and_ing_cmd+0x11b/0x1e0 >> [ 401.511404] ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core] >> [ 401.511414] flow_block_ing_cmd+0x7e/0x130 >> [ 401.511453] ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core] >> [ 401.511462] __flow_indr_block_cb_unregister+0x7f/0xf0 >> [ 401.511502] mlx5e_nic_rep_netdevice_event+0x75/0xb0 [mlx5_core] >> [ 401.511513] unregister_netdevice_notifier+0xe9/0x140 >> [ 401.511554] mlx5e_cleanup_rep_tx+0x6f/0xe0 [mlx5_core] >> [ 401.511597] mlx5e_detach_netdev+0x4b/0x60 [mlx5_core] >> [ 401.511637] mlx5e_vport_rep_unload+0x71/0xc0 [mlx5_core] >> [ 401.511679] esw_offloads_disable+0x5b/0x90 [mlx5_core] >> [ 401.511724] mlx5_eswitch_disable.cold+0xdf/0x176 [mlx5_core] >> [ 401.511759] mlx5_device_disable_sriov+0xab/0xb0 [mlx5_core] >> [ 401.511794] mlx5_core_sriov_configure+0xaf/0xd0 [mlx5_core] >> [ 401.511805] sriov_numvfs_store+0xf8/0x130 >> [ 401.511817] kernfs_fop_write+0x122/0x1b0 >> [ 401.511826] vfs_write+0xdb/0x1d0 >> [ 401.511835] ksys_write+0x65/0xe0 >> [ 401.511847] do_syscall_64+0x5c/0xb0 >> [ 401.511857] entry_SYSCALL_64_after_hwframe+0x49/0xbe >> [ 401.511862] RIP: 0033:0x7fad892d30f8 >> [ 401.511868] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 96 0d 00 8b 00 85 c0 75 17 b8 01 00 
00 00 0f 05 <48> 3d 00 f0 ff ff 77 60 c3 0f 1f 80 00 00 00 00 48 83 >> ec 28 48 89 >> [ 401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 >> [ 401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8 >> [ 401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001 >> [ 401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a >> [ 401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002 >> [ 401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740 >> >> I don't think it is correct approach to try to call these callbacks with >> rcu protection because: >> >> - Cls API uses sleeping locks that cannot be used in rcu read section >> (hence the included trace). >> >> - It assumes that all implementation of classifier ops reoffload() don't >> sleep. >> >> - And that all driver offload callbacks (both block and classifier >> setup) don't sleep, which is not the case. >> >> I don't see any straightforward way to fix this, besides using some >> other locking mechanism to protect block_ing_cb_list. >> >> Regards, >> Vlad > > Maybe get the mutex flow_indr_block_ing_cb_lock for both lookup, add, delete? > > the callbacks_lists. the add and delete is work only on modules init case. So the > > lookup is also not frequently(ony [un]register) and can protect with the locks. That should do the job. I'll send the patch.
On Fri, 16 Aug 2019 15:04:44 +0000, Vlad Buslov wrote: > >> [ 401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 > >> [ 401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8 > >> [ 401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001 > >> [ 401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a > >> [ 401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002 > >> [ 401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740 > >> > >> I don't think it is correct approach to try to call these callbacks with > >> rcu protection because: > >> > >> - Cls API uses sleeping locks that cannot be used in rcu read section > >> (hence the included trace). > >> > >> - It assumes that all implementation of classifier ops reoffload() don't > >> sleep. > >> > >> - And that all driver offload callbacks (both block and classifier > >> setup) don't sleep, which is not the case. > >> > >> I don't see any straightforward way to fix this, besides using some > >> other locking mechanism to protect block_ing_cb_list. > >> > >> Regards, > >> Vlad > > > > Maybe get the mutex flow_indr_block_ing_cb_lock for both lookup, add, delete? > > > > the callbacks_lists. the add and delete is work only on modules init case. So the > > > > lookup is also not frequently(ony [un]register) and can protect with the locks. > > That should do the job. I'll send the patch. Hi Vlad! While looking into this, would you mind also adding the missing flow_block_cb_is_busy() calls in the indirect handlers in the drivers? LMK if you're too busy, I don't want this to get forgotten :)
On Fri 16 Aug 2019 at 20:56, Jakub Kicinski <jakub.kicinski@netronome.com> wrote: > On Fri, 16 Aug 2019 15:04:44 +0000, Vlad Buslov wrote: >> >> [ 401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 >> >> [ 401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8 >> >> [ 401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001 >> >> [ 401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a >> >> [ 401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002 >> >> [ 401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740 >> >> >> >> I don't think it is correct approach to try to call these callbacks with >> >> rcu protection because: >> >> >> >> - Cls API uses sleeping locks that cannot be used in rcu read section >> >> (hence the included trace). >> >> >> >> - It assumes that all implementation of classifier ops reoffload() don't >> >> sleep. >> >> >> >> - And that all driver offload callbacks (both block and classifier >> >> setup) don't sleep, which is not the case. >> >> >> >> I don't see any straightforward way to fix this, besides using some >> >> other locking mechanism to protect block_ing_cb_list. >> >> >> >> Regards, >> >> Vlad >> > >> > Maybe get the mutex flow_indr_block_ing_cb_lock for both lookup, add, delete? >> > >> > the callbacks_lists. the add and delete is work only on modules init case. So the >> > >> > lookup is also not frequently(ony [un]register) and can protect with the locks. >> >> That should do the job. I'll send the patch. > > Hi Vlad! > > While looking into this, would you mind also add the missing > flow_block_cb_is_busy() calls in the indirect handlers in the drivers? > > LMK if you're too busy, I don't want this to get forgotten :) Hi Jakub, Will do!
On Fri 16 Aug 2019 at 20:56, Jakub Kicinski <jakub.kicinski@netronome.com> wrote: > On Fri, 16 Aug 2019 15:04:44 +0000, Vlad Buslov wrote: >> >> [ 401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 >> >> [ 401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8 >> >> [ 401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001 >> >> [ 401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a >> >> [ 401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002 >> >> [ 401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740 >> >> >> >> I don't think it is correct approach to try to call these callbacks with >> >> rcu protection because: >> >> >> >> - Cls API uses sleeping locks that cannot be used in rcu read section >> >> (hence the included trace). >> >> >> >> - It assumes that all implementation of classifier ops reoffload() don't >> >> sleep. >> >> >> >> - And that all driver offload callbacks (both block and classifier >> >> setup) don't sleep, which is not the case. >> >> >> >> I don't see any straightforward way to fix this, besides using some >> >> other locking mechanism to protect block_ing_cb_list. >> >> >> >> Regards, >> >> Vlad >> > >> > Maybe get the mutex flow_indr_block_ing_cb_lock for both lookup, add, delete? >> > >> > the callbacks_lists. the add and delete is work only on modules init case. So the >> > >> > lookup is also not frequently(ony [un]register) and can protect with the locks. >> >> That should do the job. I'll send the patch. > > Hi Vlad! > > While looking into this, would you mind also add the missing > flow_block_cb_is_busy() calls in the indirect handlers in the drivers? 
> > LMK if you're too busy, I don't want this to get forgotten :) Hi Jakub, I've checked the code and it looks like only the nfp driver is affected: - I added a check in nfp to look up cb_priv with nfp_flower_indr_block_cb_priv_lookup() and call flow_block_cb_is_busy() if cb_priv exists. - In mlx5 en_rep.c there is already a check that indr_priv exists, so trying to look up block_cb->cb_ident==indr_priv is redundant. - Switch drivers (mlxsw and ocelot) take a reference to block_cb on FLOW_BLOCK_BIND, so they should not require any modifications. Tell me if I missed anything. Sending the patch for nfp. Regards, Vlad
On Mon, 19 Aug 2019 07:26:07 +0000, Vlad Buslov wrote: > On Fri 16 Aug 2019 at 20:56, Jakub Kicinski <jakub.kicinski@netronome.com> wrote: > > Hi Vlad! > > > > While looking into this, would you mind also adding the missing > > flow_block_cb_is_busy() calls in the indirect handlers in the drivers? > > > > LMK if you're too busy, I don't want this to get forgotten :) > > Hi Jakub, > > I've checked the code and it looks like only the nfp driver is affected: > > - I added a check in nfp to look up cb_priv with > nfp_flower_indr_block_cb_priv_lookup() and call > flow_block_cb_is_busy() if cb_priv exists. > > - In mlx5 en_rep.c there is already a check that indr_priv exists, so > trying to look up block_cb->cb_ident==indr_priv is redundant. > > - Switch drivers (mlxsw and ocelot) take a reference to block_cb on > FLOW_BLOCK_BIND, so they should not require any modifications. > > Tell me if I missed anything. Sending the patch for nfp. Ah, that sounds plausible, I've only checked the nfp driver.
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 46b8777..e8069b6 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -379,6 +379,15 @@ typedef void flow_indr_block_ing_cmd_t(struct net_device *dev, void *cb_priv, enum flow_block_command command); +struct flow_indr_block_ing_entry { + flow_indr_block_ing_cmd_t *cb; + struct list_head list; +}; + +void flow_indr_add_block_ing_cb(struct flow_indr_block_ing_entry *entry); + +void flow_indr_del_block_ing_cb(struct flow_indr_block_ing_entry *entry); + int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, flow_indr_block_bind_cb_t *cb, void *cb_ident); @@ -395,7 +404,6 @@ void flow_indr_block_cb_unregister(struct net_device *dev, void *cb_ident); void flow_indr_block_call(struct net_device *dev, - flow_indr_block_ing_cmd_t *cb, struct flow_block_offload *bo, enum flow_block_command command); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 4cc18e4..64c3d4d 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -3,6 +3,7 @@ #include <linux/slab.h> #include <net/flow_offload.h> #include <linux/rtnetlink.h> +#include <linux/mutex.h> struct flow_rule *flow_rule_alloc(unsigned int num_actions) { @@ -282,6 +283,8 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, } EXPORT_SYMBOL(flow_block_cb_setup_simple); +static LIST_HEAD(block_ing_cb_list); + static struct rhashtable indr_setup_block_ht; struct flow_indr_block_cb { @@ -295,7 +298,6 @@ struct flow_indr_block_dev { struct rhash_head ht_node; struct net_device *dev; unsigned int refcnt; - flow_indr_block_ing_cmd_t *block_ing_cmd_cb; struct list_head cb_list; }; @@ -389,6 +391,20 @@ static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb) kfree(indr_block_cb); } +static void flow_block_ing_cmd(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, + void *cb_priv, + enum flow_block_command command) +{ + struct flow_indr_block_ing_entry 
*entry; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &block_ing_cb_list, list) { + entry->cb(dev, cb, cb_priv, command); + } + rcu_read_unlock(); +} + int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, flow_indr_block_bind_cb_t *cb, void *cb_ident) @@ -406,10 +422,8 @@ int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, if (err) goto err_dev_put; - if (indr_dev->block_ing_cmd_cb) - indr_dev->block_ing_cmd_cb(dev, indr_block_cb->cb, - indr_block_cb->cb_priv, - FLOW_BLOCK_BIND); + flow_block_ing_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, + FLOW_BLOCK_BIND); return 0; @@ -448,10 +462,8 @@ void __flow_indr_block_cb_unregister(struct net_device *dev, if (!indr_block_cb) return; - if (indr_dev->block_ing_cmd_cb) - indr_dev->block_ing_cmd_cb(dev, indr_block_cb->cb, - indr_block_cb->cb_priv, - FLOW_BLOCK_UNBIND); + flow_block_ing_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, + FLOW_BLOCK_UNBIND); flow_indr_block_cb_del(indr_block_cb); flow_indr_block_dev_put(indr_dev); @@ -469,7 +481,6 @@ void flow_indr_block_cb_unregister(struct net_device *dev, EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); void flow_indr_block_call(struct net_device *dev, - flow_indr_block_ing_cmd_t cb, struct flow_block_offload *bo, enum flow_block_command command) { @@ -480,15 +491,29 @@ void flow_indr_block_call(struct net_device *dev, if (!indr_dev) return; - indr_dev->block_ing_cmd_cb = command == FLOW_BLOCK_BIND - ? 
cb : NULL; - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, bo); } EXPORT_SYMBOL_GPL(flow_indr_block_call); +static DEFINE_MUTEX(flow_indr_block_ing_cb_lock); +void flow_indr_add_block_ing_cb(struct flow_indr_block_ing_entry *entry) +{ + mutex_lock(&flow_indr_block_ing_cb_lock); + list_add_tail_rcu(&entry->list, &block_ing_cb_list); + mutex_unlock(&flow_indr_block_ing_cb_lock); +} +EXPORT_SYMBOL_GPL(flow_indr_add_block_ing_cb); + +void flow_indr_del_block_ing_cb(struct flow_indr_block_ing_entry *entry) +{ + mutex_lock(&flow_indr_block_ing_cb_lock); + list_del_rcu(&entry->list); + mutex_unlock(&flow_indr_block_ing_cb_lock); +} +EXPORT_SYMBOL_GPL(flow_indr_del_block_ing_cb); + static int __init init_flow_indr_rhashtable(void) { return rhashtable_init(&indr_setup_block_ht, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0b0dde2..e0d8b45 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -621,7 +621,7 @@ static void tc_indr_block_call(struct tcf_block *block, }; INIT_LIST_HEAD(&bo.cb_list); - flow_indr_block_call(dev, tc_indr_block_get_and_ing_cmd, &bo, command); + flow_indr_block_call(dev, &bo, command); tcf_block_setup(block, &bo); } @@ -3183,6 +3183,11 @@ static void __net_exit tcf_net_exit(struct net *net) .size = sizeof(struct tcf_net), }; +static struct flow_indr_block_ing_entry block_ing_entry = { + .cb = tc_indr_block_get_and_ing_cmd, + .list = LIST_HEAD_INIT(block_ing_entry.list), +}; + static int __init tc_filter_init(void) { int err; @@ -3195,6 +3200,8 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; + flow_indr_add_block_ing_cb(&block_ing_entry); + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL,