diff mbox series

[net-next,v7,5/6] flow_offload: support get multi-subsystem block

Message ID 1565140434-8109-6-git-send-email-wenxu@ucloud.cn
State Awaiting Upstream
Delegated to: Pablo Neira
Headers show
Series [net-next,v7,1/6] cls_api: modify the tc_indr_block_ing_cmd parameters. | expand

Commit Message

wenxu Aug. 7, 2019, 1:13 a.m. UTC
From: wenxu <wenxu@ucloud.cn>

Provide a callback list to find the blocks of the tc
and nft subsystems.

Signed-off-by: wenxu <wenxu@ucloud.cn>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
v7: add a mutex lock for add/del flow_indr_block_ing_cb

 include/net/flow_offload.h | 10 ++++++++-
 net/core/flow_offload.c    | 51 ++++++++++++++++++++++++++++++++++------------
 net/sched/cls_api.c        |  9 +++++++-
 3 files changed, 55 insertions(+), 15 deletions(-)

Comments

Vlad Buslov Aug. 12, 2019, 2:11 p.m. UTC | #1
On Wed 07 Aug 2019 at 04:13, wenxu@ucloud.cn wrote:
> From: wenxu <wenxu@ucloud.cn>
>
> It provide a callback list to find the blocks of tc
> and nft subsystems
>
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> ---
> v7: add a mutex lock for add/del flow_indr_block_ing_cb
>
>  include/net/flow_offload.h | 10 ++++++++-
>  net/core/flow_offload.c    | 51 ++++++++++++++++++++++++++++++++++------------
>  net/sched/cls_api.c        |  9 +++++++-
>  3 files changed, 55 insertions(+), 15 deletions(-)
>
> diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
> index 46b8777..e8069b6 100644
> --- a/include/net/flow_offload.h
> +++ b/include/net/flow_offload.h
> @@ -379,6 +379,15 @@ typedef void flow_indr_block_ing_cmd_t(struct net_device *dev,
>  					void *cb_priv,
>  					enum flow_block_command command);
>  
> +struct flow_indr_block_ing_entry {
> +	flow_indr_block_ing_cmd_t *cb;
> +	struct list_head	list;
> +};
> +
> +void flow_indr_add_block_ing_cb(struct flow_indr_block_ing_entry *entry);
> +
> +void flow_indr_del_block_ing_cb(struct flow_indr_block_ing_entry *entry);
> +
>  int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
>  				  flow_indr_block_bind_cb_t *cb,
>  				  void *cb_ident);
> @@ -395,7 +404,6 @@ void flow_indr_block_cb_unregister(struct net_device *dev,
>  				   void *cb_ident);
>  
>  void flow_indr_block_call(struct net_device *dev,
> -			  flow_indr_block_ing_cmd_t *cb,
>  			  struct flow_block_offload *bo,
>  			  enum flow_block_command command);
>  
> diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
> index 4cc18e4..64c3d4d 100644
> --- a/net/core/flow_offload.c
> +++ b/net/core/flow_offload.c
> @@ -3,6 +3,7 @@
>  #include <linux/slab.h>
>  #include <net/flow_offload.h>
>  #include <linux/rtnetlink.h>
> +#include <linux/mutex.h>
>  
>  struct flow_rule *flow_rule_alloc(unsigned int num_actions)
>  {
> @@ -282,6 +283,8 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
>  }
>  EXPORT_SYMBOL(flow_block_cb_setup_simple);
>  
> +static LIST_HEAD(block_ing_cb_list);
> +
>  static struct rhashtable indr_setup_block_ht;
>  
>  struct flow_indr_block_cb {
> @@ -295,7 +298,6 @@ struct flow_indr_block_dev {
>  	struct rhash_head ht_node;
>  	struct net_device *dev;
>  	unsigned int refcnt;
> -	flow_indr_block_ing_cmd_t  *block_ing_cmd_cb;
>  	struct list_head cb_list;
>  };
>  
> @@ -389,6 +391,20 @@ static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb)
>  	kfree(indr_block_cb);
>  }
>  
> +static void flow_block_ing_cmd(struct net_device *dev,
> +			       flow_indr_block_bind_cb_t *cb,
> +			       void *cb_priv,
> +			       enum flow_block_command command)
> +{
> +	struct flow_indr_block_ing_entry *entry;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(entry, &block_ing_cb_list, list) {
> +		entry->cb(dev, cb, cb_priv, command);
> +	}
> +	rcu_read_unlock();
> +}

Hi,

I'm getting the following incorrect RCU usage warning with this patch,
caused by rcu_read_lock() in flow_block_ing_cmd():

[  401.510948] =============================
[  401.510952] WARNING: suspicious RCU usage
[  401.510993] 5.3.0-rc3+ #589 Not tainted
[  401.510996] -----------------------------
[  401.511001] include/linux/rcupdate.h:265 Illegal context switch in RCU read-side critical section!
[  401.511004]
               other info that might help us debug this:

[  401.511008]
               rcu_scheduler_active = 2, debug_locks = 1
[  401.511012] 7 locks held by test-ecmp-add-v/7576:
[  401.511015]  #0: 00000000081d71a5 (sb_writers#4){.+.+}, at: vfs_write+0x166/0x1d0
[  401.511037]  #1: 000000002bd338c3 (&of->mutex){+.+.}, at: kernfs_fop_write+0xef/0x1b0
[  401.511051]  #2: 00000000c921c634 (kn->count#317){.+.+}, at: kernfs_fop_write+0xf7/0x1b0
[  401.511062]  #3: 00000000a19cdd56 (&dev->mutex){....}, at: sriov_numvfs_store+0x6b/0x130
[  401.511079]  #4: 000000005425fa52 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x30/0x140
[  401.511092]  #5: 00000000c5822793 (rtnl_mutex){+.+.}, at: unregister_netdevice_notifier+0x35/0x140
[  401.511101]  #6: 00000000c2f3507e (rcu_read_lock){....}, at: flow_block_ing_cmd+0x5/0x130
[  401.511115]
               stack backtrace:
[  401.511121] CPU: 21 PID: 7576 Comm: test-ecmp-add-v Not tainted 5.3.0-rc3+ #589
[  401.511124] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
[  401.511127] Call Trace:
[  401.511138]  dump_stack+0x85/0xc0
[  401.511146]  ___might_sleep+0x100/0x180
[  401.511154]  __mutex_lock+0x5b/0x960
[  401.511162]  ? find_held_lock+0x2b/0x80
[  401.511173]  ? __tcf_get_next_chain+0x1d/0xb0
[  401.511179]  ? mark_held_locks+0x49/0x70
[  401.511194]  ? __tcf_get_next_chain+0x1d/0xb0
[  401.511198]  __tcf_get_next_chain+0x1d/0xb0
[  401.511251]  ? uplink_rep_async_event+0x70/0x70 [mlx5_core]
[  401.511261]  tcf_block_playback_offloads+0x39/0x160
[  401.511276]  tcf_block_setup+0x1b0/0x240
[  401.511312]  ? mlx5e_rep_indr_setup_tc_cb+0xca/0x290 [mlx5_core]
[  401.511347]  ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core]
[  401.511359]  tc_indr_block_get_and_ing_cmd+0x11b/0x1e0
[  401.511404]  ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core]
[  401.511414]  flow_block_ing_cmd+0x7e/0x130
[  401.511453]  ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core]
[  401.511462]  __flow_indr_block_cb_unregister+0x7f/0xf0
[  401.511502]  mlx5e_nic_rep_netdevice_event+0x75/0xb0 [mlx5_core]
[  401.511513]  unregister_netdevice_notifier+0xe9/0x140
[  401.511554]  mlx5e_cleanup_rep_tx+0x6f/0xe0 [mlx5_core]
[  401.511597]  mlx5e_detach_netdev+0x4b/0x60 [mlx5_core]
[  401.511637]  mlx5e_vport_rep_unload+0x71/0xc0 [mlx5_core]
[  401.511679]  esw_offloads_disable+0x5b/0x90 [mlx5_core]
[  401.511724]  mlx5_eswitch_disable.cold+0xdf/0x176 [mlx5_core]
[  401.511759]  mlx5_device_disable_sriov+0xab/0xb0 [mlx5_core]
[  401.511794]  mlx5_core_sriov_configure+0xaf/0xd0 [mlx5_core]
[  401.511805]  sriov_numvfs_store+0xf8/0x130
[  401.511817]  kernfs_fop_write+0x122/0x1b0
[  401.511826]  vfs_write+0xdb/0x1d0
[  401.511835]  ksys_write+0x65/0xe0
[  401.511847]  do_syscall_64+0x5c/0xb0
[  401.511857]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  401.511862] RIP: 0033:0x7fad892d30f8
[  401.511868] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 96 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 60 c3 0f 1f 80 00 00 00 00 48 83
 ec 28 48 89
[  401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8
[  401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001
[  401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a
[  401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002
[  401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740

I don't think it is the correct approach to call these callbacks under
RCU protection because:

- The cls API uses sleeping locks that cannot be used in an RCU read-side
  section (hence the included trace).

- It assumes that all implementations of the classifier ops reoffload()
  don't sleep.

- And that all driver offload callbacks (both block and classifier
  setup) don't sleep, which is not the case.

I don't see any straightforward way to fix this, besides using some
other locking mechanism to protect block_ing_cb_list.

Regards,
Vlad
wenxu Aug. 14, 2019, 2:50 a.m. UTC | #2
On 8/12/2019 10:11 PM, Vlad Buslov wrote:
>
>> +static void flow_block_ing_cmd(struct net_device *dev,
>> +			       flow_indr_block_bind_cb_t *cb,
>> +			       void *cb_priv,
>> +			       enum flow_block_command command)
>> +{
>> +	struct flow_indr_block_ing_entry *entry;
>> +
>> +	rcu_read_lock();
>> +	list_for_each_entry_rcu(entry, &block_ing_cb_list, list) {
>> +		entry->cb(dev, cb, cb_priv, command);
>> +	}
>> +	rcu_read_unlock();
>> +}
> Hi,
>
> I'm getting following incorrect rcu usage warnings with this patch
> caused by rcu_read_lock in flow_block_ing_cmd:
>
> [  401.510948] =============================
> [  401.510952] WARNING: suspicious RCU usage
> [  401.510993] 5.3.0-rc3+ #589 Not tainted
> [  401.510996] -----------------------------
> [  401.511001] include/linux/rcupdate.h:265 Illegal context switch in RCU read-side critical section!
> [  401.511004]
>                other info that might help us debug this:
>
> [  401.511008]
>                rcu_scheduler_active = 2, debug_locks = 1
> [  401.511012] 7 locks held by test-ecmp-add-v/7576:
> [  401.511015]  #0: 00000000081d71a5 (sb_writers#4){.+.+}, at: vfs_write+0x166/0x1d0
> [  401.511037]  #1: 000000002bd338c3 (&of->mutex){+.+.}, at: kernfs_fop_write+0xef/0x1b0
> [  401.511051]  #2: 00000000c921c634 (kn->count#317){.+.+}, at: kernfs_fop_write+0xf7/0x1b0
> [  401.511062]  #3: 00000000a19cdd56 (&dev->mutex){....}, at: sriov_numvfs_store+0x6b/0x130
> [  401.511079]  #4: 000000005425fa52 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x30/0x140
> [  401.511092]  #5: 00000000c5822793 (rtnl_mutex){+.+.}, at: unregister_netdevice_notifier+0x35/0x140
> [  401.511101]  #6: 00000000c2f3507e (rcu_read_lock){....}, at: flow_block_ing_cmd+0x5/0x130
> [  401.511115]
>                stack backtrace:
> [  401.511121] CPU: 21 PID: 7576 Comm: test-ecmp-add-v Not tainted 5.3.0-rc3+ #589
> [  401.511124] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
> [  401.511127] Call Trace:
> [  401.511138]  dump_stack+0x85/0xc0
> [  401.511146]  ___might_sleep+0x100/0x180
> [  401.511154]  __mutex_lock+0x5b/0x960
> [  401.511162]  ? find_held_lock+0x2b/0x80
> [  401.511173]  ? __tcf_get_next_chain+0x1d/0xb0
> [  401.511179]  ? mark_held_locks+0x49/0x70
> [  401.511194]  ? __tcf_get_next_chain+0x1d/0xb0
> [  401.511198]  __tcf_get_next_chain+0x1d/0xb0
> [  401.511251]  ? uplink_rep_async_event+0x70/0x70 [mlx5_core]
> [  401.511261]  tcf_block_playback_offloads+0x39/0x160
> [  401.511276]  tcf_block_setup+0x1b0/0x240
> [  401.511312]  ? mlx5e_rep_indr_setup_tc_cb+0xca/0x290 [mlx5_core]
> [  401.511347]  ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core]
> [  401.511359]  tc_indr_block_get_and_ing_cmd+0x11b/0x1e0
> [  401.511404]  ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core]
> [  401.511414]  flow_block_ing_cmd+0x7e/0x130
> [  401.511453]  ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core]
> [  401.511462]  __flow_indr_block_cb_unregister+0x7f/0xf0
> [  401.511502]  mlx5e_nic_rep_netdevice_event+0x75/0xb0 [mlx5_core]
> [  401.511513]  unregister_netdevice_notifier+0xe9/0x140
> [  401.511554]  mlx5e_cleanup_rep_tx+0x6f/0xe0 [mlx5_core]
> [  401.511597]  mlx5e_detach_netdev+0x4b/0x60 [mlx5_core]
> [  401.511637]  mlx5e_vport_rep_unload+0x71/0xc0 [mlx5_core]
> [  401.511679]  esw_offloads_disable+0x5b/0x90 [mlx5_core]
> [  401.511724]  mlx5_eswitch_disable.cold+0xdf/0x176 [mlx5_core]
> [  401.511759]  mlx5_device_disable_sriov+0xab/0xb0 [mlx5_core]
> [  401.511794]  mlx5_core_sriov_configure+0xaf/0xd0 [mlx5_core]
> [  401.511805]  sriov_numvfs_store+0xf8/0x130
> [  401.511817]  kernfs_fop_write+0x122/0x1b0
> [  401.511826]  vfs_write+0xdb/0x1d0
> [  401.511835]  ksys_write+0x65/0xe0
> [  401.511847]  do_syscall_64+0x5c/0xb0
> [  401.511857]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  401.511862] RIP: 0033:0x7fad892d30f8
> [  401.511868] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 96 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 60 c3 0f 1f 80 00 00 00 00 48 83
>  ec 28 48 89
> [  401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> [  401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8
> [  401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001
> [  401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a
> [  401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002
> [  401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740
>
> I don't think it is correct approach to try to call these callbacks with
> rcu protection because:
>
> - Cls API uses sleeping locks that cannot be used in rcu read section
>   (hence the included trace).
>
> - It assumes that all implementation of classifier ops reoffload() don't
>   sleep.
>
> - And that all driver offload callbacks (both block and classifier
>   setup) don't sleep, which is not the case.
>
> I don't see any straightforward way to fix this, besides using some
> other locking mechanism to protect block_ing_cb_list.
>
> Regards,
> Vlad

Maybe take the mutex flow_indr_block_ing_cb_lock for lookup, add and delete
on the callback list? The add and delete only happen in the module init case,
and the lookup is also infrequent (only on [un]register), so it can be
protected with the same lock.
Vlad Buslov Aug. 16, 2019, 3:04 p.m. UTC | #3
On Wed 14 Aug 2019 at 05:50, wenxu <wenxu@ucloud.cn> wrote:
> On 8/12/2019 10:11 PM, Vlad Buslov wrote:
>>
>>> +static void flow_block_ing_cmd(struct net_device *dev,
>>> +			       flow_indr_block_bind_cb_t *cb,
>>> +			       void *cb_priv,
>>> +			       enum flow_block_command command)
>>> +{
>>> +	struct flow_indr_block_ing_entry *entry;
>>> +
>>> +	rcu_read_lock();
>>> +	list_for_each_entry_rcu(entry, &block_ing_cb_list, list) {
>>> +		entry->cb(dev, cb, cb_priv, command);
>>> +	}
>>> +	rcu_read_unlock();
>>> +}
>> Hi,
>>
>> I'm getting following incorrect rcu usage warnings with this patch
>> caused by rcu_read_lock in flow_block_ing_cmd:
>>
>> [  401.510948] =============================
>> [  401.510952] WARNING: suspicious RCU usage
>> [  401.510993] 5.3.0-rc3+ #589 Not tainted
>> [  401.510996] -----------------------------
>> [  401.511001] include/linux/rcupdate.h:265 Illegal context switch in RCU read-side critical section!
>> [  401.511004]
>>                other info that might help us debug this:
>>
>> [  401.511008]
>>                rcu_scheduler_active = 2, debug_locks = 1
>> [  401.511012] 7 locks held by test-ecmp-add-v/7576:
>> [  401.511015]  #0: 00000000081d71a5 (sb_writers#4){.+.+}, at: vfs_write+0x166/0x1d0
>> [  401.511037]  #1: 000000002bd338c3 (&of->mutex){+.+.}, at: kernfs_fop_write+0xef/0x1b0
>> [  401.511051]  #2: 00000000c921c634 (kn->count#317){.+.+}, at: kernfs_fop_write+0xf7/0x1b0
>> [  401.511062]  #3: 00000000a19cdd56 (&dev->mutex){....}, at: sriov_numvfs_store+0x6b/0x130
>> [  401.511079]  #4: 000000005425fa52 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x30/0x140
>> [  401.511092]  #5: 00000000c5822793 (rtnl_mutex){+.+.}, at: unregister_netdevice_notifier+0x35/0x140
>> [  401.511101]  #6: 00000000c2f3507e (rcu_read_lock){....}, at: flow_block_ing_cmd+0x5/0x130
>> [  401.511115]
>>                stack backtrace:
>> [  401.511121] CPU: 21 PID: 7576 Comm: test-ecmp-add-v Not tainted 5.3.0-rc3+ #589
>> [  401.511124] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
>> [  401.511127] Call Trace:
>> [  401.511138]  dump_stack+0x85/0xc0
>> [  401.511146]  ___might_sleep+0x100/0x180
>> [  401.511154]  __mutex_lock+0x5b/0x960
>> [  401.511162]  ? find_held_lock+0x2b/0x80
>> [  401.511173]  ? __tcf_get_next_chain+0x1d/0xb0
>> [  401.511179]  ? mark_held_locks+0x49/0x70
>> [  401.511194]  ? __tcf_get_next_chain+0x1d/0xb0
>> [  401.511198]  __tcf_get_next_chain+0x1d/0xb0
>> [  401.511251]  ? uplink_rep_async_event+0x70/0x70 [mlx5_core]
>> [  401.511261]  tcf_block_playback_offloads+0x39/0x160
>> [  401.511276]  tcf_block_setup+0x1b0/0x240
>> [  401.511312]  ? mlx5e_rep_indr_setup_tc_cb+0xca/0x290 [mlx5_core]
>> [  401.511347]  ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core]
>> [  401.511359]  tc_indr_block_get_and_ing_cmd+0x11b/0x1e0
>> [  401.511404]  ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core]
>> [  401.511414]  flow_block_ing_cmd+0x7e/0x130
>> [  401.511453]  ? mlx5e_rep_indr_tc_block_unbind+0x50/0x50 [mlx5_core]
>> [  401.511462]  __flow_indr_block_cb_unregister+0x7f/0xf0
>> [  401.511502]  mlx5e_nic_rep_netdevice_event+0x75/0xb0 [mlx5_core]
>> [  401.511513]  unregister_netdevice_notifier+0xe9/0x140
>> [  401.511554]  mlx5e_cleanup_rep_tx+0x6f/0xe0 [mlx5_core]
>> [  401.511597]  mlx5e_detach_netdev+0x4b/0x60 [mlx5_core]
>> [  401.511637]  mlx5e_vport_rep_unload+0x71/0xc0 [mlx5_core]
>> [  401.511679]  esw_offloads_disable+0x5b/0x90 [mlx5_core]
>> [  401.511724]  mlx5_eswitch_disable.cold+0xdf/0x176 [mlx5_core]
>> [  401.511759]  mlx5_device_disable_sriov+0xab/0xb0 [mlx5_core]
>> [  401.511794]  mlx5_core_sriov_configure+0xaf/0xd0 [mlx5_core]
>> [  401.511805]  sriov_numvfs_store+0xf8/0x130
>> [  401.511817]  kernfs_fop_write+0x122/0x1b0
>> [  401.511826]  vfs_write+0xdb/0x1d0
>> [  401.511835]  ksys_write+0x65/0xe0
>> [  401.511847]  do_syscall_64+0x5c/0xb0
>> [  401.511857]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>> [  401.511862] RIP: 0033:0x7fad892d30f8
>> [  401.511868] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 96 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 60 c3 0f 1f 80 00 00 00 00 48 83
>>  ec 28 48 89
>> [  401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
>> [  401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8
>> [  401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001
>> [  401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a
>> [  401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002
>> [  401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740
>>
>> I don't think it is correct approach to try to call these callbacks with
>> rcu protection because:
>>
>> - Cls API uses sleeping locks that cannot be used in rcu read section
>>   (hence the included trace).
>>
>> - It assumes that all implementation of classifier ops reoffload() don't
>>   sleep.
>>
>> - And that all driver offload callbacks (both block and classifier
>>   setup) don't sleep, which is not the case.
>>
>> I don't see any straightforward way to fix this, besides using some
>> other locking mechanism to protect block_ing_cb_list.
>>
>> Regards,
>> Vlad
>
> Maybe get the  mutex flow_indr_block_ing_cb_lock for both lookup, add, delete? 
>
> the callbacks_lists. the add and delete is work only on modules init case. So the
>
> lookup is also not frequently(ony [un]register) and can protect with the locks.

That should do the job. I'll send the patch.
Jakub Kicinski Aug. 16, 2019, 5:56 p.m. UTC | #4
On Fri, 16 Aug 2019 15:04:44 +0000, Vlad Buslov wrote:
> >> [  401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> >> [  401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8
> >> [  401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001
> >> [  401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a
> >> [  401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002
> >> [  401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740
> >>
> >> I don't think it is correct approach to try to call these callbacks with
> >> rcu protection because:
> >>
> >> - Cls API uses sleeping locks that cannot be used in rcu read section
> >>   (hence the included trace).
> >>
> >> - It assumes that all implementation of classifier ops reoffload() don't
> >>   sleep.
> >>
> >> - And that all driver offload callbacks (both block and classifier
> >>   setup) don't sleep, which is not the case.
> >>
> >> I don't see any straightforward way to fix this, besides using some
> >> other locking mechanism to protect block_ing_cb_list.
> >>
> >> Regards,
> >> Vlad  
> >
> > Maybe get the  mutex flow_indr_block_ing_cb_lock for both lookup, add, delete? 
> >
> > the callbacks_lists. the add and delete is work only on modules init case. So the
> >
> > lookup is also not frequently(ony [un]register) and can protect with the locks.  
> 
> That should do the job. I'll send the patch.

Hi Vlad! 

While looking into this, would you mind also adding the missing
flow_block_cb_is_busy() calls in the indirect handlers in the drivers?

LMK if you're too busy, I don't want this to get forgotten :)
Vlad Buslov Aug. 16, 2019, 6:44 p.m. UTC | #5
On Fri 16 Aug 2019 at 20:56, Jakub Kicinski <jakub.kicinski@netronome.com> wrote:
> On Fri, 16 Aug 2019 15:04:44 +0000, Vlad Buslov wrote:
>> >> [  401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
>> >> [  401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8
>> >> [  401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001
>> >> [  401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a
>> >> [  401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002
>> >> [  401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740
>> >>
>> >> I don't think it is correct approach to try to call these callbacks with
>> >> rcu protection because:
>> >>
>> >> - Cls API uses sleeping locks that cannot be used in rcu read section
>> >>   (hence the included trace).
>> >>
>> >> - It assumes that all implementation of classifier ops reoffload() don't
>> >>   sleep.
>> >>
>> >> - And that all driver offload callbacks (both block and classifier
>> >>   setup) don't sleep, which is not the case.
>> >>
>> >> I don't see any straightforward way to fix this, besides using some
>> >> other locking mechanism to protect block_ing_cb_list.
>> >>
>> >> Regards,
>> >> Vlad  
>> >
>> > Maybe get the  mutex flow_indr_block_ing_cb_lock for both lookup, add, delete? 
>> >
>> > the callbacks_lists. the add and delete is work only on modules init case. So the
>> >
>> > lookup is also not frequently(ony [un]register) and can protect with the locks.  
>> 
>> That should do the job. I'll send the patch.
>
> Hi Vlad! 
>
> While looking into this, would you mind also add the missing
> flow_block_cb_is_busy() calls in the indirect handlers in the drivers?
>
> LMK if you're too busy, I don't want this to get forgotten :)

Hi Jakub,

Will do!
Vlad Buslov Aug. 19, 2019, 7:26 a.m. UTC | #6
On Fri 16 Aug 2019 at 20:56, Jakub Kicinski <jakub.kicinski@netronome.com> wrote:
> On Fri, 16 Aug 2019 15:04:44 +0000, Vlad Buslov wrote:
>> >> [  401.511871] RSP: 002b:00007ffca2a9fad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
>> >> [  401.511875] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fad892d30f8
>> >> [  401.511878] RDX: 0000000000000002 RSI: 000055afeb072a90 RDI: 0000000000000001
>> >> [  401.511881] RBP: 000055afeb072a90 R08: 00000000ffffffff R09: 000000000000000a
>> >> [  401.511884] R10: 000055afeb058710 R11: 0000000000000246 R12: 0000000000000002
>> >> [  401.511887] R13: 00007fad893a8780 R14: 0000000000000002 R15: 00007fad893a3740
>> >>
>> >> I don't think it is correct approach to try to call these callbacks with
>> >> rcu protection because:
>> >>
>> >> - Cls API uses sleeping locks that cannot be used in rcu read section
>> >>   (hence the included trace).
>> >>
>> >> - It assumes that all implementation of classifier ops reoffload() don't
>> >>   sleep.
>> >>
>> >> - And that all driver offload callbacks (both block and classifier
>> >>   setup) don't sleep, which is not the case.
>> >>
>> >> I don't see any straightforward way to fix this, besides using some
>> >> other locking mechanism to protect block_ing_cb_list.
>> >>
>> >> Regards,
>> >> Vlad
>> >
>> > Maybe get the  mutex flow_indr_block_ing_cb_lock for both lookup, add, delete?
>> >
>> > the callbacks_lists. the add and delete is work only on modules init case. So the
>> >
>> > lookup is also not frequently(ony [un]register) and can protect with the locks.
>>
>> That should do the job. I'll send the patch.
>
> Hi Vlad!
>
> While looking into this, would you mind also add the missing
> flow_block_cb_is_busy() calls in the indirect handlers in the drivers?
>
> LMK if you're too busy, I don't want this to get forgotten :)

Hi Jakub,

I've checked the code and it looks like only the nfp driver is affected:

- I added a check in nfp to look up cb_priv with
  nfp_flower_indr_block_cb_priv_lookup() and call
  flow_block_cb_is_busy() if cb_priv exists.

- In mlx5 en_rep.c there is already a check that indr_priv exists, so
  trying to look up block_cb->cb_ident==indr_priv is redundant.

- Switch drivers (mlxsw and ocelot) take a reference to block_cb on
  FLOW_BLOCK_BIND, so they should not require any modifications.

Tell me if I missed anything. Sending the patch for nfp.

Regards,
Vlad
Jakub Kicinski Aug. 19, 2019, 8:27 p.m. UTC | #7
On Mon, 19 Aug 2019 07:26:07 +0000, Vlad Buslov wrote:
> On Fri 16 Aug 2019 at 20:56, Jakub Kicinski <jakub.kicinski@netronome.com> wrote:
> > Hi Vlad!
> >
> > While looking into this, would you mind also add the missing
> > flow_block_cb_is_busy() calls in the indirect handlers in the drivers?
> >
> > LMK if you're too busy, I don't want this to get forgotten :)  
> 
> Hi Jakub,
> 
> I've checked the code and it looks like only nfp driver is affected:
> 
> - I added check in nfp to lookup cb_priv with
>   nfp_flower_indr_block_cb_priv_lookup() and call
>   flow_block_cb_is_busy() if cb_priv exists.
> 
> - In mlx5 en_rep.c there is already a check that indr_priv exists, so
>   trying to lookup block_cb->cb_indent==indr_priv is redundant.
> 
> - Switch drivers (mlxsw and ocelot) take reference to block_cb on
>   FLOW_BLOCK_BIND, so they should not require any modifications.
> 
> Tell me if I missed anything. Sending the patch for nfp.

Ah, that sounds plausible, I've only checked the nfp driver.
diff mbox series

Patch

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 46b8777..e8069b6 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -379,6 +379,15 @@  typedef void flow_indr_block_ing_cmd_t(struct net_device *dev,
 					void *cb_priv,
 					enum flow_block_command command);
 
+struct flow_indr_block_ing_entry {
+	flow_indr_block_ing_cmd_t *cb;
+	struct list_head	list;
+};
+
+void flow_indr_add_block_ing_cb(struct flow_indr_block_ing_entry *entry);
+
+void flow_indr_del_block_ing_cb(struct flow_indr_block_ing_entry *entry);
+
 int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
 				  flow_indr_block_bind_cb_t *cb,
 				  void *cb_ident);
@@ -395,7 +404,6 @@  void flow_indr_block_cb_unregister(struct net_device *dev,
 				   void *cb_ident);
 
 void flow_indr_block_call(struct net_device *dev,
-			  flow_indr_block_ing_cmd_t *cb,
 			  struct flow_block_offload *bo,
 			  enum flow_block_command command);
 
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 4cc18e4..64c3d4d 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -3,6 +3,7 @@ 
 #include <linux/slab.h>
 #include <net/flow_offload.h>
 #include <linux/rtnetlink.h>
+#include <linux/mutex.h>
 
 struct flow_rule *flow_rule_alloc(unsigned int num_actions)
 {
@@ -282,6 +283,8 @@  int flow_block_cb_setup_simple(struct flow_block_offload *f,
 }
 EXPORT_SYMBOL(flow_block_cb_setup_simple);
 
+static LIST_HEAD(block_ing_cb_list);
+
 static struct rhashtable indr_setup_block_ht;
 
 struct flow_indr_block_cb {
@@ -295,7 +298,6 @@  struct flow_indr_block_dev {
 	struct rhash_head ht_node;
 	struct net_device *dev;
 	unsigned int refcnt;
-	flow_indr_block_ing_cmd_t  *block_ing_cmd_cb;
 	struct list_head cb_list;
 };
 
@@ -389,6 +391,20 @@  static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb)
 	kfree(indr_block_cb);
 }
 
+static void flow_block_ing_cmd(struct net_device *dev,
+			       flow_indr_block_bind_cb_t *cb,
+			       void *cb_priv,
+			       enum flow_block_command command)
+{
+	struct flow_indr_block_ing_entry *entry;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &block_ing_cb_list, list) {
+		entry->cb(dev, cb, cb_priv, command);
+	}
+	rcu_read_unlock();
+}
+
 int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
 				  flow_indr_block_bind_cb_t *cb,
 				  void *cb_ident)
@@ -406,10 +422,8 @@  int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
 	if (err)
 		goto err_dev_put;
 
-	if (indr_dev->block_ing_cmd_cb)
-		indr_dev->block_ing_cmd_cb(dev, indr_block_cb->cb,
-					   indr_block_cb->cb_priv,
-					   FLOW_BLOCK_BIND);
+	flow_block_ing_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv,
+			   FLOW_BLOCK_BIND);
 
 	return 0;
 
@@ -448,10 +462,8 @@  void __flow_indr_block_cb_unregister(struct net_device *dev,
 	if (!indr_block_cb)
 		return;
 
-	if (indr_dev->block_ing_cmd_cb)
-		indr_dev->block_ing_cmd_cb(dev, indr_block_cb->cb,
-					   indr_block_cb->cb_priv,
-					   FLOW_BLOCK_UNBIND);
+	flow_block_ing_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv,
+			   FLOW_BLOCK_UNBIND);
 
 	flow_indr_block_cb_del(indr_block_cb);
 	flow_indr_block_dev_put(indr_dev);
@@ -469,7 +481,6 @@  void flow_indr_block_cb_unregister(struct net_device *dev,
 EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister);
 
 void flow_indr_block_call(struct net_device *dev,
-			  flow_indr_block_ing_cmd_t cb,
 			  struct flow_block_offload *bo,
 			  enum flow_block_command command)
 {
@@ -480,15 +491,29 @@  void flow_indr_block_call(struct net_device *dev,
 	if (!indr_dev)
 		return;
 
-	indr_dev->block_ing_cmd_cb = command == FLOW_BLOCK_BIND
-				     ? cb : NULL;
-
 	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
 		indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
 				  bo);
 }
 EXPORT_SYMBOL_GPL(flow_indr_block_call);
 
+static DEFINE_MUTEX(flow_indr_block_ing_cb_lock);
+void flow_indr_add_block_ing_cb(struct flow_indr_block_ing_entry *entry)
+{
+	mutex_lock(&flow_indr_block_ing_cb_lock);
+	list_add_tail_rcu(&entry->list, &block_ing_cb_list);
+	mutex_unlock(&flow_indr_block_ing_cb_lock);
+}
+EXPORT_SYMBOL_GPL(flow_indr_add_block_ing_cb);
+
+void flow_indr_del_block_ing_cb(struct flow_indr_block_ing_entry *entry)
+{
+	mutex_lock(&flow_indr_block_ing_cb_lock);
+	list_del_rcu(&entry->list);
+	mutex_unlock(&flow_indr_block_ing_cb_lock);
+}
+EXPORT_SYMBOL_GPL(flow_indr_del_block_ing_cb);
+
 static int __init init_flow_indr_rhashtable(void)
 {
 	return rhashtable_init(&indr_setup_block_ht,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 0b0dde2..e0d8b45 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -621,7 +621,7 @@  static void tc_indr_block_call(struct tcf_block *block,
 	};
 	INIT_LIST_HEAD(&bo.cb_list);
 
-	flow_indr_block_call(dev, tc_indr_block_get_and_ing_cmd, &bo, command);
+	flow_indr_block_call(dev, &bo, command);
 	tcf_block_setup(block, &bo);
 }
 
@@ -3183,6 +3183,11 @@  static void __net_exit tcf_net_exit(struct net *net)
 	.size = sizeof(struct tcf_net),
 };
 
+static struct flow_indr_block_ing_entry block_ing_entry = {
+	.cb = tc_indr_block_get_and_ing_cmd,
+	.list = LIST_HEAD_INIT(block_ing_entry.list),
+};
+
 static int __init tc_filter_init(void)
 {
 	int err;
@@ -3195,6 +3200,8 @@  static int __init tc_filter_init(void)
 	if (err)
 		goto err_register_pernet_subsys;
 
+	flow_indr_add_block_ing_cb(&block_ing_entry);
+
 	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
 		      RTNL_FLAG_DOIT_UNLOCKED);
 	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL,