
[net-next] net: sched: flower: refactor reoffload for concurrent access

Message ID: 20190422072135.4176-1-vladbu@mellanox.com
State: Changes Requested
Delegated to: David Miller
Series: [net-next] net: sched: flower: refactor reoffload for concurrent access

Commit Message

Vlad Buslov April 22, 2019, 7:21 a.m. UTC
Recent changes that introduced unlocked flower did not properly account for
the case when reoffload is initiated concurrently with filter updates. To
fix the issue, extend flower with a 'hw_filters' list that is used to store
filters that don't have the 'skip_hw' flag set. A filter is added to the
list when it is inserted into hardware and only removed from it after being
unoffloaded from all drivers that the parent block is attached to. This
ensures that concurrent reoffload can still access a filter that is being
deleted and prevents a race condition where a driver callback can be
removed while the filter is no longer accessible through idr, but is still
present in hardware.

Refactor fl_change() to respect the new filter reference counter and to
release the filter reference with __fl_put() in case of error, instead of
directly deallocating filter memory. This allows concurrent access to the
filter from fl_reoffload() and protects it with reference counting.
Refactor fl_reoffload() to iterate over the hw_filters list instead of the
idr. Implement the fl_get_next_hw_filter() helper function, which iterates
over the hw_filters list with reference counting and skips filters that are
being concurrently deleted.

Fixes: 92149190067d ("net: sched: flower: set unlocked flag for flower proto ops")
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 20 deletions(-)
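
For orientation before the review discussion, here is a condensed sketch of
the iteration pattern the patch introduces (names are taken from the patch
below; the per-filter offload call and error handling are elided):

	struct cls_fl_filter *f = NULL;

	/* hw_filters holds every filter that was actually installed in
	 * hardware. The iterator takes tp->lock, skips filters that are
	 * concurrently being deleted (when adding) and returns the next
	 * entry with an extra reference, so it remains valid after the
	 * lock is dropped.
	 */
	while ((f = fl_get_next_hw_filter(tp, f, add))) {
		/* replay or remove the rule via tc_setup_cb_call() */
		__fl_put(f);	/* drop the reference taken by the iterator */
	}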

Comments

Saeed Mahameed April 22, 2019, 8:34 p.m. UTC | #1
On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
> Recent changes that introduced unlocked flower did not properly
> account for
> case when reoffload is initiated concurrently with filter updates. To
> fix
> the issue, extend flower with 'hw_filters' list that is used to store
> filters that don't have 'skip_hw' flag set. Filter is added to the
> list
> when it is inserted to hardware and only removed from it after being
> unoffloaded from all drivers that parent block is attached to. This
> ensures
> that concurrent reoffload can still access filter that is being
> deleted and
> prevents race condition when driver callback can be removed when
> filter is
> no longer accessible trough idr, but is still present in hardware.
> 
> Refactor fl_change() to respect new filter reference counter and to
> release
> filter reference with __fl_put() in case of error, instead of
> directly
> deallocating filter memory. This allows for concurrent access to
> filter
> from fl_reoffload() and protects it with reference counting. Refactor
> fl_reoffload() to iterate over hw_filters list instead of idr.
> Implement
> fl_get_next_hw_filter() helper function that is used to iterate over
> hw_filters list with reference counting and skips filters that are
> being
> concurrently deleted.
> 
> Fixes: 92149190067d ("net: sched: flower: set unlocked flag for
> flower proto ops")
> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> ---
>  net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++---------
> --
>  1 file changed, 57 insertions(+), 20 deletions(-)
> 
> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> index 4b5585358699..524b86560af3 100644
> --- a/net/sched/cls_flower.c
> +++ b/net/sched/cls_flower.c
> @@ -90,6 +90,7 @@ struct cls_fl_head {
>  	struct rhashtable ht;
>  	spinlock_t masks_lock; /* Protect masks list */
>  	struct list_head masks;
> +	struct list_head hw_filters;
>  	struct rcu_work rwork;
>  	struct idr handle_idr;
>  };
> @@ -102,6 +103,7 @@ struct cls_fl_filter {
>  	struct tcf_result res;
>  	struct fl_flow_key key;
>  	struct list_head list;
> +	struct list_head hw_list;
>  	u32 handle;
>  	u32 flags;
>  	u32 in_hw_count;
> @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
>  
>  	spin_lock_init(&head->masks_lock);
>  	INIT_LIST_HEAD_RCU(&head->masks);
> +	INIT_LIST_HEAD(&head->hw_filters);
>  	rcu_assign_pointer(tp->root, head);
>  	idr_init(&head->handle_idr);
>  
> @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head
> *head, struct fl_flow_mask *mask)
>  	return true;
>  }
>  
> +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
> +{
> +	/* Flower classifier only changes root pointer during init and
> destroy.
> +	 * Users must obtain reference to tcf_proto instance before
> calling its
> +	 * API, so tp->root pointer is protected from concurrent call
> to
> +	 * fl_destroy() by reference counting.
> +	 */
> +	return rcu_dereference_raw(tp->root);
> +}
> +
>  static void __fl_destroy_filter(struct cls_fl_filter *f)
>  {
>  	tcf_exts_destroy(&f->exts);
> @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto
> *tp, struct cls_fl_filter *f,
>  
>  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower,
> false);
>  	spin_lock(&tp->lock);
> +	if (!list_empty(&f->hw_list))
> +		list_del_init(&f->hw_list);
>  	tcf_block_offload_dec(block, &f->flags);
>  	spin_unlock(&tp->lock);
>  
> @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct tcf_proto
> *tp,
>  				struct cls_fl_filter *f, bool
> rtnl_held,
>  				struct netlink_ext_ack *extack)
>  {
> +	struct cls_fl_head *head = fl_head_dereference(tp);
>  	struct tc_cls_flower_offload cls_flower = {};
>  	struct tcf_block *block = tp->chain->block;
>  	bool skip_sw = tc_skip_sw(f->flags);
> @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct tcf_proto
> *tp,
>  		goto errout;
>  	}
>  
> +	spin_lock(&tp->lock);
> +	list_add(&f->hw_list, &head->hw_filters);
> +	spin_unlock(&tp->lock);
>  errout:
>  	if (!rtnl_held)
>  		rtnl_unlock();
> @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct tcf_proto
> *tp, struct cls_fl_filter *f,
>  		rtnl_unlock();
>  }
>  
> -static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
> -{
> -	/* Flower classifier only changes root pointer during init and
> destroy.
> -	 * Users must obtain reference to tcf_proto instance before
> calling its
> -	 * API, so tp->root pointer is protected from concurrent call
> to
> -	 * fl_destroy() by reference counting.
> -	 */
> -	return rcu_dereference_raw(tp->root);
> -}
> -
>  static void __fl_put(struct cls_fl_filter *f)
>  {
>  	if (!refcount_dec_and_test(&f->refcnt))
>  		return;
>  
> -	WARN_ON(!f->deleted);
> -
>  	if (tcf_exts_get_net(&f->exts))
>  		tcf_queue_work(&f->rwork, fl_destroy_filter_work);
>  	else
> @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net, struct
> sk_buff *in_skb,
>  		err = -ENOBUFS;
>  		goto errout_tb;
>  	}
> +	INIT_LIST_HEAD(&fnew->hw_list);
>  	refcount_set(&fnew->refcnt, 1);
>  
>  	err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
> @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net, struct
> sk_buff *in_skb,
>  		goto errout_hw;
>  	}
>  
> -	refcount_inc(&fnew->refcnt);
>  	if (fold) {
>  		/* Fold filter was deleted concurrently. Retry lookup.
> */
>  		if (fold->deleted) {
> @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net, struct
> sk_buff *in_skb,
>  			in_ht = true;
>  		}
>  
> +		refcount_inc(&fnew->refcnt);
>  		rhashtable_remove_fast(&fold->mask->ht,
>  				       &fold->ht_node,
>  				       fold->mask->filter_ht_params);
> @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net, struct
> sk_buff *in_skb,
>  		if (err)
>  			goto errout_hw;
>  
> +		refcount_inc(&fnew->refcnt);
>  		fnew->handle = handle;
>  		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
>  		spin_unlock(&tp->lock);
> @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net, struct
> sk_buff *in_skb,
>  	kfree(mask);
>  	return 0;
>  
> +errout_ht:
> +	spin_lock(&tp->lock);
>  errout_hw:
> +	fnew->deleted = true;
>  	spin_unlock(&tp->lock);
>  	if (!tc_skip_hw(fnew->flags))
>  		fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
> -errout_ht:
>  	if (in_ht)
>  		rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
>  				       fnew->mask->filter_ht_params);
>  errout_mask:
>  	fl_mask_put(head, fnew->mask);
>  errout:
> -	tcf_exts_get_net(&fnew->exts);
> -	tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
> +	__fl_put(fnew);
>  errout_tb:
>  	kfree(tb);
>  errout_mask_alloc:
> @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp,
> struct tcf_walker *arg,
>  	}
>  }
>  
> +static struct cls_fl_filter *
> +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
> bool add)
> +{
> +	struct cls_fl_head *head = fl_head_dereference(tp);
> +
> +	spin_lock(&tp->lock);
> +	if (!f) {
> +		if (list_empty(&head->hw_filters)) {
> +			spin_unlock(&tp->lock);
> +			return NULL;
> +		}

Shouldn't this be a pre-condition to the whole function ? i mean
regardless of whether 'f' is NULL or not ?

> +
> +		f = list_first_entry(&head->hw_filters, struct
> cls_fl_filter,
> +				     hw_list);
> +	} else {
> +		f = list_next_entry(f, hw_list);
> +	}
> +

Maybe if you use  list_for_each_entry_continue below, might simplify
the above logic. it is weird that you need to figure out next entry
then call list_for_each_from, list 'continue' variation is made for
such use cases.

> +	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
> +		if (!(add && f->deleted) && refcount_inc_not_zero(&f-
> >refcnt)) {
> +			spin_unlock(&tp->lock);
> +			return f;
> +		}
> +	}
> +
> +	spin_unlock(&tp->lock);
> +	return NULL;
> +}
> +
>  static int fl_reoffload(struct tcf_proto *tp, bool add,
> tc_setup_cb_t *cb,
>  			void *cb_priv, struct netlink_ext_ack *extack)
>  {
>  	struct tc_cls_flower_offload cls_flower = {};
>  	struct tcf_block *block = tp->chain->block;
> -	unsigned long handle = 0;
> -	struct cls_fl_filter *f;
> +	struct cls_fl_filter *f = NULL;
>  	int err;
>  
> -	while ((f = fl_get_next_filter(tp, &handle))) {
> +	while ((f = fl_get_next_hw_filter(tp, f, add))) {
>  		if (tc_skip_hw(f->flags))
>  			goto next_flow;

this can never be true as it is already a pre-condition for
fl_hw_replace_filter which actually adds  to the hw_filters list, i
think it needs to be removed. if it is, then i think it should be part
of fl_get_next_hw_filter and not the caller responsibility.

>  
> @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto *tp,
> bool add, tc_setup_cb_t *cb,
>  					  add);
>  		spin_unlock(&tp->lock);
>  next_flow:
> -		handle++;
>  		__fl_put(f);
>  	}
>
Jakub Kicinski April 22, 2019, 9:02 p.m. UTC | #2
On Mon, 22 Apr 2019 10:21:34 +0300, Vlad Buslov wrote:
> Recent changes that introduced unlocked flower did not properly account for
> case when reoffload is initiated concurrently with filter updates. To fix
> the issue, extend flower with 'hw_filters' list that is used to store
> filters that don't have 'skip_hw' flag set. Filter is added to the list
> when it is inserted to hardware and only removed from it after being
> unoffloaded from all drivers that parent block is attached to. This ensures
> that concurrent reoffload can still access filter that is being deleted and
> prevents race condition when driver callback can be removed when filter is
> no longer accessible trough idr, but is still present in hardware.
> 
> Refactor fl_change() to respect new filter reference counter and to release
> filter reference with __fl_put() in case of error, instead of directly
> deallocating filter memory. This allows for concurrent access to filter
> from fl_reoffload() and protects it with reference counting. Refactor
> fl_reoffload() to iterate over hw_filters list instead of idr. Implement
> fl_get_next_hw_filter() helper function that is used to iterate over
> hw_filters list with reference counting and skips filters that are being
> concurrently deleted.
> 
> Fixes: 92149190067d ("net: sched: flower: set unlocked flag for flower proto ops")
> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Perhaps it'd be good to add an ASSERT_RTNL() (maybe with a comment?) 
to fl_reoffload()?

> @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
>  
>  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
>  	spin_lock(&tp->lock);
> +	if (!list_empty(&f->hw_list))
> +		list_del_init(&f->hw_list);

Mm. I thought list_del_init() on an empty list should be fine?

>  	tcf_block_offload_dec(block, &f->flags);
>  	spin_unlock(&tp->lock);
>
Vlad Buslov April 23, 2019, 7:34 a.m. UTC | #3
On Tue 23 Apr 2019 at 00:02, Jakub Kicinski <jakub.kicinski@netronome.com> wrote:
> On Mon, 22 Apr 2019 10:21:34 +0300, Vlad Buslov wrote:
>> Recent changes that introduced unlocked flower did not properly account for
>> case when reoffload is initiated concurrently with filter updates. To fix
>> the issue, extend flower with 'hw_filters' list that is used to store
>> filters that don't have 'skip_hw' flag set. Filter is added to the list
>> when it is inserted to hardware and only removed from it after being
>> unoffloaded from all drivers that parent block is attached to. This ensures
>> that concurrent reoffload can still access filter that is being deleted and
>> prevents race condition when driver callback can be removed when filter is
>> no longer accessible trough idr, but is still present in hardware.
>> 
>> Refactor fl_change() to respect new filter reference counter and to release
>> filter reference with __fl_put() in case of error, instead of directly
>> deallocating filter memory. This allows for concurrent access to filter
>> from fl_reoffload() and protects it with reference counting. Refactor
>> fl_reoffload() to iterate over hw_filters list instead of idr. Implement
>> fl_get_next_hw_filter() helper function that is used to iterate over
>> hw_filters list with reference counting and skips filters that are being
>> concurrently deleted.
>> 
>> Fixes: 92149190067d ("net: sched: flower: set unlocked flag for flower proto ops")
>> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>> Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>
> Perhaps it'd be good to add an ASSERT_RTNL() (maybe with a comment?) 
> to fl_reoffload()?

Good idea.
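
A minimal sketch of what that could look like (the placement and the comment
wording are assumptions, not part of the posted patch; the loop body is
unchanged from the diff):

static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
			void *cb_priv, struct netlink_ext_ack *extack)
{
	struct cls_fl_filter *f = NULL;

	/* Reoffload is driven from block bind/unbind and is expected to
	 * run under rtnl; assert that explicitly now that the rest of
	 * the classifier no longer relies on it.
	 */
	ASSERT_RTNL();

	while ((f = fl_get_next_hw_filter(tp, f, add))) {
		/* per-filter offload replay exactly as in the patch */
		__fl_put(f);
	}
	return 0;
}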

>
>> @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
>>  
>>  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
>>  	spin_lock(&tp->lock);
>> +	if (!list_empty(&f->hw_list))
>> +		list_del_init(&f->hw_list);
>
> Mm. I thought list_del_init() on an empty list should be fine?

Is it? Implementation of list_del_init() doesn't seem to check if list
is empty before re-initializing its pointers. Technically it seems like
it can work because the implementation will just set pointers of empty
list to point to itself (which is how empty list head is defined), but
should we assume this is intended behavior and not just implementation
detail? I don't see anything in comments for this function that suggests
that it is okay to call list_del_init() on empty list head.

>
>>  	tcf_block_offload_dec(block, &f->flags);
>>  	spin_unlock(&tp->lock);
>>
Vlad Buslov April 23, 2019, 7:43 a.m. UTC | #4
On Mon 22 Apr 2019 at 23:34, Saeed Mahameed <saeedm@mellanox.com> wrote:
> On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
>> Recent changes that introduced unlocked flower did not properly
>> account for
>> case when reoffload is initiated concurrently with filter updates. To
>> fix
>> the issue, extend flower with 'hw_filters' list that is used to store
>> filters that don't have 'skip_hw' flag set. Filter is added to the
>> list
>> when it is inserted to hardware and only removed from it after being
>> unoffloaded from all drivers that parent block is attached to. This
>> ensures
>> that concurrent reoffload can still access filter that is being
>> deleted and
>> prevents race condition when driver callback can be removed when
>> filter is
>> no longer accessible trough idr, but is still present in hardware.
>>
>> Refactor fl_change() to respect new filter reference counter and to
>> release
>> filter reference with __fl_put() in case of error, instead of
>> directly
>> deallocating filter memory. This allows for concurrent access to
>> filter
>> from fl_reoffload() and protects it with reference counting. Refactor
>> fl_reoffload() to iterate over hw_filters list instead of idr.
>> Implement
>> fl_get_next_hw_filter() helper function that is used to iterate over
>> hw_filters list with reference counting and skips filters that are
>> being
>> concurrently deleted.
>>
>> Fixes: 92149190067d ("net: sched: flower: set unlocked flag for
>> flower proto ops")
>> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>> Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>> ---
>>  net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++---------
>> --
>>  1 file changed, 57 insertions(+), 20 deletions(-)
>>
>> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
>> index 4b5585358699..524b86560af3 100644
>> --- a/net/sched/cls_flower.c
>> +++ b/net/sched/cls_flower.c
>> @@ -90,6 +90,7 @@ struct cls_fl_head {
>>  	struct rhashtable ht;
>>  	spinlock_t masks_lock; /* Protect masks list */
>>  	struct list_head masks;
>> +	struct list_head hw_filters;
>>  	struct rcu_work rwork;
>>  	struct idr handle_idr;
>>  };
>> @@ -102,6 +103,7 @@ struct cls_fl_filter {
>>  	struct tcf_result res;
>>  	struct fl_flow_key key;
>>  	struct list_head list;
>> +	struct list_head hw_list;
>>  	u32 handle;
>>  	u32 flags;
>>  	u32 in_hw_count;
>> @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
>>
>>  	spin_lock_init(&head->masks_lock);
>>  	INIT_LIST_HEAD_RCU(&head->masks);
>> +	INIT_LIST_HEAD(&head->hw_filters);
>>  	rcu_assign_pointer(tp->root, head);
>>  	idr_init(&head->handle_idr);
>>
>> @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head
>> *head, struct fl_flow_mask *mask)
>>  	return true;
>>  }
>>
>> +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
>> +{
>> +	/* Flower classifier only changes root pointer during init and
>> destroy.
>> +	 * Users must obtain reference to tcf_proto instance before
>> calling its
>> +	 * API, so tp->root pointer is protected from concurrent call
>> to
>> +	 * fl_destroy() by reference counting.
>> +	 */
>> +	return rcu_dereference_raw(tp->root);
>> +}
>> +
>>  static void __fl_destroy_filter(struct cls_fl_filter *f)
>>  {
>>  	tcf_exts_destroy(&f->exts);
>> @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto
>> *tp, struct cls_fl_filter *f,
>>
>>  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower,
>> false);
>>  	spin_lock(&tp->lock);
>> +	if (!list_empty(&f->hw_list))
>> +		list_del_init(&f->hw_list);
>>  	tcf_block_offload_dec(block, &f->flags);
>>  	spin_unlock(&tp->lock);
>>
>> @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct tcf_proto
>> *tp,
>>  				struct cls_fl_filter *f, bool
>> rtnl_held,
>>  				struct netlink_ext_ack *extack)
>>  {
>> +	struct cls_fl_head *head = fl_head_dereference(tp);
>>  	struct tc_cls_flower_offload cls_flower = {};
>>  	struct tcf_block *block = tp->chain->block;
>>  	bool skip_sw = tc_skip_sw(f->flags);
>> @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct tcf_proto
>> *tp,
>>  		goto errout;
>>  	}
>>
>> +	spin_lock(&tp->lock);
>> +	list_add(&f->hw_list, &head->hw_filters);
>> +	spin_unlock(&tp->lock);
>>  errout:
>>  	if (!rtnl_held)
>>  		rtnl_unlock();
>> @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct tcf_proto
>> *tp, struct cls_fl_filter *f,
>>  		rtnl_unlock();
>>  }
>>
>> -static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
>> -{
>> -	/* Flower classifier only changes root pointer during init and
>> destroy.
>> -	 * Users must obtain reference to tcf_proto instance before
>> calling its
>> -	 * API, so tp->root pointer is protected from concurrent call
>> to
>> -	 * fl_destroy() by reference counting.
>> -	 */
>> -	return rcu_dereference_raw(tp->root);
>> -}
>> -
>>  static void __fl_put(struct cls_fl_filter *f)
>>  {
>>  	if (!refcount_dec_and_test(&f->refcnt))
>>  		return;
>>
>> -	WARN_ON(!f->deleted);
>> -
>>  	if (tcf_exts_get_net(&f->exts))
>>  		tcf_queue_work(&f->rwork, fl_destroy_filter_work);
>>  	else
>> @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net, struct
>> sk_buff *in_skb,
>>  		err = -ENOBUFS;
>>  		goto errout_tb;
>>  	}
>> +	INIT_LIST_HEAD(&fnew->hw_list);
>>  	refcount_set(&fnew->refcnt, 1);
>>
>>  	err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
>> @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net, struct
>> sk_buff *in_skb,
>>  		goto errout_hw;
>>  	}
>>
>> -	refcount_inc(&fnew->refcnt);
>>  	if (fold) {
>>  		/* Fold filter was deleted concurrently. Retry lookup.
>> */
>>  		if (fold->deleted) {
>> @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net, struct
>> sk_buff *in_skb,
>>  			in_ht = true;
>>  		}
>>
>> +		refcount_inc(&fnew->refcnt);
>>  		rhashtable_remove_fast(&fold->mask->ht,
>>  				       &fold->ht_node,
>>  				       fold->mask->filter_ht_params);
>> @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net, struct
>> sk_buff *in_skb,
>>  		if (err)
>>  			goto errout_hw;
>>
>> +		refcount_inc(&fnew->refcnt);
>>  		fnew->handle = handle;
>>  		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
>>  		spin_unlock(&tp->lock);
>> @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net, struct
>> sk_buff *in_skb,
>>  	kfree(mask);
>>  	return 0;
>>
>> +errout_ht:
>> +	spin_lock(&tp->lock);
>>  errout_hw:
>> +	fnew->deleted = true;
>>  	spin_unlock(&tp->lock);
>>  	if (!tc_skip_hw(fnew->flags))
>>  		fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
>> -errout_ht:
>>  	if (in_ht)
>>  		rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
>>  				       fnew->mask->filter_ht_params);
>>  errout_mask:
>>  	fl_mask_put(head, fnew->mask);
>>  errout:
>> -	tcf_exts_get_net(&fnew->exts);
>> -	tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
>> +	__fl_put(fnew);
>>  errout_tb:
>>  	kfree(tb);
>>  errout_mask_alloc:
>> @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp,
>> struct tcf_walker *arg,
>>  	}
>>  }
>>
>> +static struct cls_fl_filter *
>> +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
>> bool add)
>> +{
>> +	struct cls_fl_head *head = fl_head_dereference(tp);
>> +
>> +	spin_lock(&tp->lock);
>> +	if (!f) {
>> +		if (list_empty(&head->hw_filters)) {
>> +			spin_unlock(&tp->lock);
>> +			return NULL;
>> +		}
>
> Shouldn't this be a pre-condition to the whole function ? i mean
> regardless of whether 'f' is NULL or not ?

List can't be empty if we already have an element of the list (f), so
why check this on every iteration?

>
>> +
>> +		f = list_first_entry(&head->hw_filters, struct
>> cls_fl_filter,
>> +				     hw_list);
>> +	} else {
>> +		f = list_next_entry(f, hw_list);
>> +	}
>> +
>
> Maybe if you use  list_for_each_entry_continue below, might simplify
> the above logic. it is weird that you need to figure out next entry
> then call list_for_each_from, list 'continue' variation is made for
> such use cases.

list_for_each_entry_continue requires initialized cursor and will skip
first element if we obtain initial cursor with list_first_entry(). We
can have two loops - one that uses list_for_each_entry for initial
iteration, and another one that uses list_for_each_entry_continue for
case when f!=NULL, but I don't see how that would be any simpler.

>
>> +	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
>> +		if (!(add && f->deleted) && refcount_inc_not_zero(&f-
>> >refcnt)) {
>> +			spin_unlock(&tp->lock);
>> +			return f;
>> +		}
>> +	}
>> +
>> +	spin_unlock(&tp->lock);
>> +	return NULL;
>> +}
>> +
>>  static int fl_reoffload(struct tcf_proto *tp, bool add,
>> tc_setup_cb_t *cb,
>>  			void *cb_priv, struct netlink_ext_ack *extack)
>>  {
>>  	struct tc_cls_flower_offload cls_flower = {};
>>  	struct tcf_block *block = tp->chain->block;
>> -	unsigned long handle = 0;
>> -	struct cls_fl_filter *f;
>> +	struct cls_fl_filter *f = NULL;
>>  	int err;
>>
>> -	while ((f = fl_get_next_filter(tp, &handle))) {
>> +	while ((f = fl_get_next_hw_filter(tp, f, add))) {
>>  		if (tc_skip_hw(f->flags))
>>  			goto next_flow;
>
> this can never be true as it is already a pre-condition for
> fl_hw_replace_filter which actually adds  to the hw_filters list, i
> think it needs to be removed. if it is, then i think it should be part
> of fl_get_next_hw_filter and not the caller responsibility.

Good catch.

>
>>
>> @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto *tp,
>> bool add, tc_setup_cb_t *cb,
>>  					  add);
>>  		spin_unlock(&tp->lock);
>>  next_flow:
>> -		handle++;
>>  		__fl_put(f);
>>  	}
>>
Jakub Kicinski April 23, 2019, 4:51 p.m. UTC | #5
On Tue, 23 Apr 2019 07:34:20 +0000, Vlad Buslov wrote:
> >> @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
> >>  
> >>  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
> >>  	spin_lock(&tp->lock);
> >> +	if (!list_empty(&f->hw_list))
> >> +		list_del_init(&f->hw_list);  
> >
> > Mm. I thought list_del_init() on an empty list should be fine?  
> 
> Is it? Implementation of list_del_init() doesn't seem to check if list
> is empty before re-initializing its pointers. Technically it seems like
> it can work because the implementation will just set pointers of empty
> list to point to itself (which is how empty list head is defined), but
> should we assume this is intended behavior and not just implementation
> detail? I don't see anything in comments for this function that suggests
> that it is okay to call list_del_init() on empty list head.

Mm.. I'd do it, IDK if there was ever an official ruling by the
supreme court of Linus or any such ;)  __list_del_entry_valid() 
looks like it'd not complain.  Up to you, in general it didn't 
read very idiomatic, that's all.
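
For reference, the relevant helpers from include/linux/list.h, condensed
(the CONFIG_DEBUG_LIST entry-validation step is elided). On an entry that
already points to itself, i.e. one that was previously removed with
list_del_init() or never added, the __list_del() step just rewrites the
self-pointers, which is why the unconditional call is harmless:

static inline void __list_del(struct list_head *prev, struct list_head *next)
{
	next->prev = prev;
	WRITE_ONCE(prev->next, next);
}

static inline void list_del_init(struct list_head *entry)
{
	__list_del(entry->prev, entry->next); /* no-op if entry points to itself */
	INIT_LIST_HEAD(entry);                /* entry points back to itself */
}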
Saeed Mahameed April 23, 2019, 4:52 p.m. UTC | #6
On Tue, 2019-04-23 at 07:43 +0000, Vlad Buslov wrote:
> On Mon 22 Apr 2019 at 23:34, Saeed Mahameed <saeedm@mellanox.com>
> wrote:
> > On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
> > > Recent changes that introduced unlocked flower did not properly
> > > account for
> > > case when reoffload is initiated concurrently with filter
> > > updates. To
> > > fix
> > > the issue, extend flower with 'hw_filters' list that is used to
> > > store
> > > filters that don't have 'skip_hw' flag set. Filter is added to
> > > the
> > > list
> > > when it is inserted to hardware and only removed from it after
> > > being
> > > unoffloaded from all drivers that parent block is attached to.
> > > This
> > > ensures
> > > that concurrent reoffload can still access filter that is being
> > > deleted and
> > > prevents race condition when driver callback can be removed when
> > > filter is
> > > no longer accessible trough idr, but is still present in
> > > hardware.
> > > 
> > > Refactor fl_change() to respect new filter reference counter and
> > > to
> > > release
> > > filter reference with __fl_put() in case of error, instead of
> > > directly
> > > deallocating filter memory. This allows for concurrent access to
> > > filter
> > > from fl_reoffload() and protects it with reference counting.
> > > Refactor
> > > fl_reoffload() to iterate over hw_filters list instead of idr.
> > > Implement
> > > fl_get_next_hw_filter() helper function that is used to iterate
> > > over
> > > hw_filters list with reference counting and skips filters that
> > > are
> > > being
> > > concurrently deleted.
> > > 
> > > Fixes: 92149190067d ("net: sched: flower: set unlocked flag for
> > > flower proto ops")
> > > Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> > > Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> > > ---
> > >  net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++-----
> > > ----
> > > --
> > >  1 file changed, 57 insertions(+), 20 deletions(-)
> > > 
> > > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> > > index 4b5585358699..524b86560af3 100644
> > > --- a/net/sched/cls_flower.c
> > > +++ b/net/sched/cls_flower.c
> > > @@ -90,6 +90,7 @@ struct cls_fl_head {
> > >  	struct rhashtable ht;
> > >  	spinlock_t masks_lock; /* Protect masks list */
> > >  	struct list_head masks;
> > > +	struct list_head hw_filters;
> > >  	struct rcu_work rwork;
> > >  	struct idr handle_idr;
> > >  };
> > > @@ -102,6 +103,7 @@ struct cls_fl_filter {
> > >  	struct tcf_result res;
> > >  	struct fl_flow_key key;
> > >  	struct list_head list;
> > > +	struct list_head hw_list;
> > >  	u32 handle;
> > >  	u32 flags;
> > >  	u32 in_hw_count;
> > > @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
> > > 
> > >  	spin_lock_init(&head->masks_lock);
> > >  	INIT_LIST_HEAD_RCU(&head->masks);
> > > +	INIT_LIST_HEAD(&head->hw_filters);
> > >  	rcu_assign_pointer(tp->root, head);
> > >  	idr_init(&head->handle_idr);
> > > 
> > > @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head
> > > *head, struct fl_flow_mask *mask)
> > >  	return true;
> > >  }
> > > 
> > > +static struct cls_fl_head *fl_head_dereference(struct tcf_proto
> > > *tp)
> > > +{
> > > +	/* Flower classifier only changes root pointer during init and
> > > destroy.
> > > +	 * Users must obtain reference to tcf_proto instance before
> > > calling its
> > > +	 * API, so tp->root pointer is protected from concurrent call
> > > to
> > > +	 * fl_destroy() by reference counting.
> > > +	 */
> > > +	return rcu_dereference_raw(tp->root);
> > > +}
> > > +
> > >  static void __fl_destroy_filter(struct cls_fl_filter *f)
> > >  {
> > >  	tcf_exts_destroy(&f->exts);
> > > @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct
> > > tcf_proto
> > > *tp, struct cls_fl_filter *f,
> > > 
> > >  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower,
> > > false);
> > >  	spin_lock(&tp->lock);
> > > +	if (!list_empty(&f->hw_list))
> > > +		list_del_init(&f->hw_list);
> > >  	tcf_block_offload_dec(block, &f->flags);
> > >  	spin_unlock(&tp->lock);
> > > 
> > > @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct
> > > tcf_proto
> > > *tp,
> > >  				struct cls_fl_filter *f, bool
> > > rtnl_held,
> > >  				struct netlink_ext_ack *extack)
> > >  {
> > > +	struct cls_fl_head *head = fl_head_dereference(tp);
> > >  	struct tc_cls_flower_offload cls_flower = {};
> > >  	struct tcf_block *block = tp->chain->block;
> > >  	bool skip_sw = tc_skip_sw(f->flags);
> > > @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct
> > > tcf_proto
> > > *tp,
> > >  		goto errout;
> > >  	}
> > > 
> > > +	spin_lock(&tp->lock);
> > > +	list_add(&f->hw_list, &head->hw_filters);
> > > +	spin_unlock(&tp->lock);
> > >  errout:
> > >  	if (!rtnl_held)
> > >  		rtnl_unlock();
> > > @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct
> > > tcf_proto
> > > *tp, struct cls_fl_filter *f,
> > >  		rtnl_unlock();
> > >  }
> > > 
> > > -static struct cls_fl_head *fl_head_dereference(struct tcf_proto
> > > *tp)
> > > -{
> > > -	/* Flower classifier only changes root pointer during init and
> > > destroy.
> > > -	 * Users must obtain reference to tcf_proto instance before
> > > calling its
> > > -	 * API, so tp->root pointer is protected from concurrent call
> > > to
> > > -	 * fl_destroy() by reference counting.
> > > -	 */
> > > -	return rcu_dereference_raw(tp->root);
> > > -}
> > > -
> > >  static void __fl_put(struct cls_fl_filter *f)
> > >  {
> > >  	if (!refcount_dec_and_test(&f->refcnt))
> > >  		return;
> > > 
> > > -	WARN_ON(!f->deleted);
> > > -
> > >  	if (tcf_exts_get_net(&f->exts))
> > >  		tcf_queue_work(&f->rwork, fl_destroy_filter_work);
> > >  	else
> > > @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net,
> > > struct
> > > sk_buff *in_skb,
> > >  		err = -ENOBUFS;
> > >  		goto errout_tb;
> > >  	}
> > > +	INIT_LIST_HEAD(&fnew->hw_list);
> > >  	refcount_set(&fnew->refcnt, 1);
> > > 
> > >  	err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
> > > @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net,
> > > struct
> > > sk_buff *in_skb,
> > >  		goto errout_hw;
> > >  	}
> > > 
> > > -	refcount_inc(&fnew->refcnt);
> > >  	if (fold) {
> > >  		/* Fold filter was deleted concurrently. Retry lookup.
> > > */
> > >  		if (fold->deleted) {
> > > @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net,
> > > struct
> > > sk_buff *in_skb,
> > >  			in_ht = true;
> > >  		}
> > > 
> > > +		refcount_inc(&fnew->refcnt);
> > >  		rhashtable_remove_fast(&fold->mask->ht,
> > >  				       &fold->ht_node,
> > >  				       fold->mask->filter_ht_params);
> > > @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net,
> > > struct
> > > sk_buff *in_skb,
> > >  		if (err)
> > >  			goto errout_hw;
> > > 
> > > +		refcount_inc(&fnew->refcnt);
> > >  		fnew->handle = handle;
> > >  		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
> > >  		spin_unlock(&tp->lock);
> > > @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net,
> > > struct
> > > sk_buff *in_skb,
> > >  	kfree(mask);
> > >  	return 0;
> > > 
> > > +errout_ht:
> > > +	spin_lock(&tp->lock);
> > >  errout_hw:
> > > +	fnew->deleted = true;
> > >  	spin_unlock(&tp->lock);
> > >  	if (!tc_skip_hw(fnew->flags))
> > >  		fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
> > > -errout_ht:
> > >  	if (in_ht)
> > >  		rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
> > >  				       fnew->mask->filter_ht_params);
> > >  errout_mask:
> > >  	fl_mask_put(head, fnew->mask);
> > >  errout:
> > > -	tcf_exts_get_net(&fnew->exts);
> > > -	tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
> > > +	__fl_put(fnew);
> > >  errout_tb:
> > >  	kfree(tb);
> > >  errout_mask_alloc:
> > > @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp,
> > > struct tcf_walker *arg,
> > >  	}
> > >  }
> > > 
> > > +static struct cls_fl_filter *
> > > +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter
> > > *f,
> > > bool add)
> > > +{
> > > +	struct cls_fl_head *head = fl_head_dereference(tp);
> > > +
> > > +	spin_lock(&tp->lock);
> > > +	if (!f) {
> > > +		if (list_empty(&head->hw_filters)) {
> > > +			spin_unlock(&tp->lock);
> > > +			return NULL;
> > > +		}
> > 
> > Shouldn't this be a pre-condition to the whole function ? i mean
> > regardless of whether 'f' is NULL or not ?
> 
> List can't be empty if we already have an element of the list (f), so
> why check this on every iteration?
> 

Because you release the lock on every iteration, you can't just assume
that the list is still intact.

> > > +
> > > +		f = list_first_entry(&head->hw_filters, struct
> > > cls_fl_filter,
> > > +				     hw_list);
> > > +	} else {
> > > +		f = list_next_entry(f, hw_list);
> > > +	}
> > > +
> > 
> > Maybe if you use  list_for_each_entry_continue below, might
> > simplify
> > the above logic. it is weird that you need to figure out next entry
> > then call list_for_each_from, list 'continue' variation is made for
> > such use cases.
> 
> list_for_each_entry_continue requires initialized cursor and will
> skip
> first element if we obtain initial cursor with list_first_entry(). We
> can have two loops - one that uses list_for_each_entry for initial
> iteration, and another one that uses list_for_each_entry_continue for
> case when f!=NULL, but I don't see how that would be any simpler.
> 

you can set the initial cursor to list->head and not first entry.

It would be simpler if you do:

if (list_empty())
    return NULL;
pos = f ? f : list->head;
list_for_each_entry_continue(pos, ...);

> > > +	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
> > > +		if (!(add && f->deleted) && refcount_inc_not_zero(&f-
> > > > refcnt)) {
> > > +			spin_unlock(&tp->lock);
> > > +			return f;
> > > +		}
> > > +	}
> > > +
> > > +	spin_unlock(&tp->lock);
> > > +	return NULL;
> > > +}
> > > +
> > >  static int fl_reoffload(struct tcf_proto *tp, bool add,
> > > tc_setup_cb_t *cb,
> > >  			void *cb_priv, struct netlink_ext_ack *extack)
> > >  {
> > >  	struct tc_cls_flower_offload cls_flower = {};
> > >  	struct tcf_block *block = tp->chain->block;
> > > -	unsigned long handle = 0;
> > > -	struct cls_fl_filter *f;
> > > +	struct cls_fl_filter *f = NULL;
> > >  	int err;
> > > 
> > > -	while ((f = fl_get_next_filter(tp, &handle))) {
> > > +	while ((f = fl_get_next_hw_filter(tp, f, add))) {
> > >  		if (tc_skip_hw(f->flags))
> > >  			goto next_flow;
> > 
> > this can never be true as it is already a pre-condition for
> > fl_hw_replace_filter which actually adds  to the hw_filters list, i
> > think it needs to be removed. if it is, then i think it should be
> > part
> > of fl_get_next_hw_filter and not the caller responsibility.
> 
> Good catch.
> 
> > > @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto
> > > *tp,
> > > bool add, tc_setup_cb_t *cb,
> > >  					  add);
> > >  		spin_unlock(&tp->lock);
> > >  next_flow:
> > > -		handle++;
> > >  		__fl_put(f);
> > >  	}
> > >
Vlad Buslov April 24, 2019, 7:50 a.m. UTC | #7
On Tue 23 Apr 2019 at 19:52, Saeed Mahameed <saeedm@mellanox.com> wrote:
> On Tue, 2019-04-23 at 07:43 +0000, Vlad Buslov wrote:
>> On Mon 22 Apr 2019 at 23:34, Saeed Mahameed <saeedm@mellanox.com>
>> wrote:
>> > On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
>> > > Recent changes that introduced unlocked flower did not properly
>> > > account for
>> > > case when reoffload is initiated concurrently with filter
>> > > updates. To
>> > > fix
>> > > the issue, extend flower with 'hw_filters' list that is used to
>> > > store
>> > > filters that don't have 'skip_hw' flag set. Filter is added to
>> > > the
>> > > list
>> > > when it is inserted to hardware and only removed from it after
>> > > being
>> > > unoffloaded from all drivers that parent block is attached to.
>> > > This
>> > > ensures
>> > > that concurrent reoffload can still access filter that is being
>> > > deleted and
>> > > prevents race condition when driver callback can be removed when
>> > > filter is
>> > > no longer accessible trough idr, but is still present in
>> > > hardware.
>> > > 
>> > > Refactor fl_change() to respect new filter reference counter and
>> > > to
>> > > release
>> > > filter reference with __fl_put() in case of error, instead of
>> > > directly
>> > > deallocating filter memory. This allows for concurrent access to
>> > > filter
>> > > from fl_reoffload() and protects it with reference counting.
>> > > Refactor
>> > > fl_reoffload() to iterate over hw_filters list instead of idr.
>> > > Implement
>> > > fl_get_next_hw_filter() helper function that is used to iterate
>> > > over
>> > > hw_filters list with reference counting and skips filters that
>> > > are
>> > > being
>> > > concurrently deleted.
>> > > 
>> > > Fixes: 92149190067d ("net: sched: flower: set unlocked flag for
>> > > flower proto ops")
>> > > Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>> > > Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>> > > ---
>> > >  net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++-----
>> > > ----
>> > > --
>> > >  1 file changed, 57 insertions(+), 20 deletions(-)
>> > > 
>> > > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
>> > > index 4b5585358699..524b86560af3 100644
>> > > --- a/net/sched/cls_flower.c
>> > > +++ b/net/sched/cls_flower.c
>> > > @@ -90,6 +90,7 @@ struct cls_fl_head {
>> > >  	struct rhashtable ht;
>> > >  	spinlock_t masks_lock; /* Protect masks list */
>> > >  	struct list_head masks;
>> > > +	struct list_head hw_filters;
>> > >  	struct rcu_work rwork;
>> > >  	struct idr handle_idr;
>> > >  };
>> > > @@ -102,6 +103,7 @@ struct cls_fl_filter {
>> > >  	struct tcf_result res;
>> > >  	struct fl_flow_key key;
>> > >  	struct list_head list;
>> > > +	struct list_head hw_list;
>> > >  	u32 handle;
>> > >  	u32 flags;
>> > >  	u32 in_hw_count;
>> > > @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
>> > > 
>> > >  	spin_lock_init(&head->masks_lock);
>> > >  	INIT_LIST_HEAD_RCU(&head->masks);
>> > > +	INIT_LIST_HEAD(&head->hw_filters);
>> > >  	rcu_assign_pointer(tp->root, head);
>> > >  	idr_init(&head->handle_idr);
>> > > 
>> > > @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head
>> > > *head, struct fl_flow_mask *mask)
>> > >  	return true;
>> > >  }
>> > > 
>> > > +static struct cls_fl_head *fl_head_dereference(struct tcf_proto
>> > > *tp)
>> > > +{
>> > > +	/* Flower classifier only changes root pointer during init and
>> > > destroy.
>> > > +	 * Users must obtain reference to tcf_proto instance before
>> > > calling its
>> > > +	 * API, so tp->root pointer is protected from concurrent call
>> > > to
>> > > +	 * fl_destroy() by reference counting.
>> > > +	 */
>> > > +	return rcu_dereference_raw(tp->root);
>> > > +}
>> > > +
>> > >  static void __fl_destroy_filter(struct cls_fl_filter *f)
>> > >  {
>> > >  	tcf_exts_destroy(&f->exts);
>> > > @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct
>> > > tcf_proto
>> > > *tp, struct cls_fl_filter *f,
>> > > 
>> > >  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower,
>> > > false);
>> > >  	spin_lock(&tp->lock);
>> > > +	if (!list_empty(&f->hw_list))
>> > > +		list_del_init(&f->hw_list);
>> > >  	tcf_block_offload_dec(block, &f->flags);
>> > >  	spin_unlock(&tp->lock);
>> > > 
>> > > @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct
>> > > tcf_proto
>> > > *tp,
>> > >  				struct cls_fl_filter *f, bool
>> > > rtnl_held,
>> > >  				struct netlink_ext_ack *extack)
>> > >  {
>> > > +	struct cls_fl_head *head = fl_head_dereference(tp);
>> > >  	struct tc_cls_flower_offload cls_flower = {};
>> > >  	struct tcf_block *block = tp->chain->block;
>> > >  	bool skip_sw = tc_skip_sw(f->flags);
>> > > @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct
>> > > tcf_proto
>> > > *tp,
>> > >  		goto errout;
>> > >  	}
>> > > 
>> > > +	spin_lock(&tp->lock);
>> > > +	list_add(&f->hw_list, &head->hw_filters);
>> > > +	spin_unlock(&tp->lock);
>> > >  errout:
>> > >  	if (!rtnl_held)
>> > >  		rtnl_unlock();
>> > > @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct
>> > > tcf_proto
>> > > *tp, struct cls_fl_filter *f,
>> > >  		rtnl_unlock();
>> > >  }
>> > > 
>> > > -static struct cls_fl_head *fl_head_dereference(struct tcf_proto
>> > > *tp)
>> > > -{
>> > > -	/* Flower classifier only changes root pointer during init and
>> > > destroy.
>> > > -	 * Users must obtain reference to tcf_proto instance before
>> > > calling its
>> > > -	 * API, so tp->root pointer is protected from concurrent call
>> > > to
>> > > -	 * fl_destroy() by reference counting.
>> > > -	 */
>> > > -	return rcu_dereference_raw(tp->root);
>> > > -}
>> > > -
>> > >  static void __fl_put(struct cls_fl_filter *f)
>> > >  {
>> > >  	if (!refcount_dec_and_test(&f->refcnt))
>> > >  		return;
>> > > 
>> > > -	WARN_ON(!f->deleted);
>> > > -
>> > >  	if (tcf_exts_get_net(&f->exts))
>> > >  		tcf_queue_work(&f->rwork, fl_destroy_filter_work);
>> > >  	else
>> > > @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net,
>> > > struct
>> > > sk_buff *in_skb,
>> > >  		err = -ENOBUFS;
>> > >  		goto errout_tb;
>> > >  	}
>> > > +	INIT_LIST_HEAD(&fnew->hw_list);
>> > >  	refcount_set(&fnew->refcnt, 1);
>> > > 
>> > >  	err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
>> > > @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net,
>> > > struct
>> > > sk_buff *in_skb,
>> > >  		goto errout_hw;
>> > >  	}
>> > > 
>> > > -	refcount_inc(&fnew->refcnt);
>> > >  	if (fold) {
>> > >  		/* Fold filter was deleted concurrently. Retry lookup.
>> > > */
>> > >  		if (fold->deleted) {
>> > > @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net,
>> > > struct
>> > > sk_buff *in_skb,
>> > >  			in_ht = true;
>> > >  		}
>> > > 
>> > > +		refcount_inc(&fnew->refcnt);
>> > >  		rhashtable_remove_fast(&fold->mask->ht,
>> > >  				       &fold->ht_node,
>> > >  				       fold->mask->filter_ht_params);
>> > > @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net,
>> > > struct
>> > > sk_buff *in_skb,
>> > >  		if (err)
>> > >  			goto errout_hw;
>> > > 
>> > > +		refcount_inc(&fnew->refcnt);
>> > >  		fnew->handle = handle;
>> > >  		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
>> > >  		spin_unlock(&tp->lock);
>> > > @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net,
>> > > struct
>> > > sk_buff *in_skb,
>> > >  	kfree(mask);
>> > >  	return 0;
>> > > 
>> > > +errout_ht:
>> > > +	spin_lock(&tp->lock);
>> > >  errout_hw:
>> > > +	fnew->deleted = true;
>> > >  	spin_unlock(&tp->lock);
>> > >  	if (!tc_skip_hw(fnew->flags))
>> > >  		fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
>> > > -errout_ht:
>> > >  	if (in_ht)
>> > >  		rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
>> > >  				       fnew->mask->filter_ht_params);
>> > >  errout_mask:
>> > >  	fl_mask_put(head, fnew->mask);
>> > >  errout:
>> > > -	tcf_exts_get_net(&fnew->exts);
>> > > -	tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
>> > > +	__fl_put(fnew);
>> > >  errout_tb:
>> > >  	kfree(tb);
>> > >  errout_mask_alloc:
>> > > @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp,
>> > > struct tcf_walker *arg,
>> > >  	}
>> > >  }
>> > > 
>> > > +static struct cls_fl_filter *
>> > > +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter
>> > > *f,
>> > > bool add)
>> > > +{
>> > > +	struct cls_fl_head *head = fl_head_dereference(tp);
>> > > +
>> > > +	spin_lock(&tp->lock);
>> > > +	if (!f) {
>> > > +		if (list_empty(&head->hw_filters)) {
>> > > +			spin_unlock(&tp->lock);
>> > > +			return NULL;
>> > > +		}
>> > 
>> > Shouldn't this be a pre-condition to the whole function ? i mean
>> > regardless of whether 'f' is NULL or not ?
>> 
>> List can't be empty if we already have an element of the list (f), so
>> why check this on every iteration?
>> 
>
> Because you release the lock on every iteration, you can't just assume
> that the list is still intact.
>
>> > > +
>> > > +		f = list_first_entry(&head->hw_filters, struct
>> > > cls_fl_filter,
>> > > +				     hw_list);
>> > > +	} else {
>> > > +		f = list_next_entry(f, hw_list);
>> > > +	}
>> > > +
>> > 
>> > Maybe if you use  list_for_each_entry_continue below, might
>> > simplify
>> > the above logic. it is weird that you need to figure out next entry
>> > then call list_for_each_from, list 'continue' variation is made for
>> > such use cases.
>> 
>> list_for_each_entry_continue requires initialized cursor and will
>> skip
>> first element if we obtain initial cursor with list_first_entry(). We
>> can have two loops - one that uses list_for_each_entry for initial
>> iteration, and another one that uses list_for_each_entry_continue for
>> case when f!=NULL, but I don't see how that would be any simpler.
>> 
>
> you can set the initial cursor to list->head and not first entry.
>
> It would be simpler if you do:
>
> if (list_empty())
>     return NULL;
> pos = f ? f : list->head;
> list_for_each_entry_continue(pos, ...);

Okay.
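
A sketch of how the helper might look with that suggestion applied (an
illustration of the proposal only, reusing the names from the patch; this is
not the code that was posted):

static struct cls_fl_filter *
fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
{
	struct cls_fl_head *head = fl_head_dereference(tp);

	spin_lock(&tp->lock);
	if (list_empty(&head->hw_filters)) {
		spin_unlock(&tp->lock);
		return NULL;
	}

	/* With no cursor, start from the list head itself so that
	 * list_for_each_entry_continue() begins at the first entry.
	 */
	if (!f)
		f = list_entry(&head->hw_filters, struct cls_fl_filter,
			       hw_list);

	list_for_each_entry_continue(f, &head->hw_filters, hw_list) {
		if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
			spin_unlock(&tp->lock);
			return f;
		}
	}

	spin_unlock(&tp->lock);
	return NULL;
}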

>
>> > > +	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
>> > > +		if (!(add && f->deleted) && refcount_inc_not_zero(&f-
>> > > > refcnt)) {
>> > > +			spin_unlock(&tp->lock);
>> > > +			return f;
>> > > +		}
>> > > +	}
>> > > +
>> > > +	spin_unlock(&tp->lock);
>> > > +	return NULL;
>> > > +}
>> > > +
>> > >  static int fl_reoffload(struct tcf_proto *tp, bool add,
>> > > tc_setup_cb_t *cb,
>> > >  			void *cb_priv, struct netlink_ext_ack *extack)
>> > >  {
>> > >  	struct tc_cls_flower_offload cls_flower = {};
>> > >  	struct tcf_block *block = tp->chain->block;
>> > > -	unsigned long handle = 0;
>> > > -	struct cls_fl_filter *f;
>> > > +	struct cls_fl_filter *f = NULL;
>> > >  	int err;
>> > > 
>> > > -	while ((f = fl_get_next_filter(tp, &handle))) {
>> > > +	while ((f = fl_get_next_hw_filter(tp, f, add))) {
>> > >  		if (tc_skip_hw(f->flags))
>> > >  			goto next_flow;
>> > 
>> > this can never be true as it is already a pre-condition for
>> > fl_hw_replace_filter which actually adds  to the hw_filters list, i
>> > think it needs to be removed. if it is, then i think it should be
>> > part
>> > of fl_get_next_hw_filter and not the caller responsibility.
>> 
>> Good catch.
>> 
>> > > @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto
>> > > *tp,
>> > > bool add, tc_setup_cb_t *cb,
>> > >  					  add);
>> > >  		spin_unlock(&tp->lock);
>> > >  next_flow:
>> > > -		handle++;
>> > >  		__fl_put(f);
>> > >  	}
>> > >
Vlad Buslov April 24, 2019, 7:50 a.m. UTC | #8
On Tue 23 Apr 2019 at 19:51, Jakub Kicinski <jakub.kicinski@netronome.com> wrote:
> On Tue, 23 Apr 2019 07:34:20 +0000, Vlad Buslov wrote:
>> >> @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
>> >>  
>> >>  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
>> >>  	spin_lock(&tp->lock);
>> >> +	if (!list_empty(&f->hw_list))
>> >> +		list_del_init(&f->hw_list);  
>> >
>> > Mm. I thought list_del_init() on an empty list should be fine?  
>> 
>> Is it? Implementation of list_del_init() doesn't seem to check if list
>> is empty before re-initializing its pointers. Technically it seems like
>> it can work because the implementation will just set pointers of empty
>> list to point to itself (which is how empty list head is defined), but
>> should we assume this is intended behavior and not just implementation
>> detail? I don't see anything in comments for this function that suggests
>> that it is okay to call list_del_init() on empty list head.
>
> Mm.. I'd do it, IDK if there was ever an official ruling by the
> supreme court of Linus or any such ;)  __list_del_entry_valid() 
> looks like it'd not complain.  Up to you, in general it didn't 
> read very idiomatic, that's all.

Okay.

Patch

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 4b5585358699..524b86560af3 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -90,6 +90,7 @@  struct cls_fl_head {
 	struct rhashtable ht;
 	spinlock_t masks_lock; /* Protect masks list */
 	struct list_head masks;
+	struct list_head hw_filters;
 	struct rcu_work rwork;
 	struct idr handle_idr;
 };
@@ -102,6 +103,7 @@  struct cls_fl_filter {
 	struct tcf_result res;
 	struct fl_flow_key key;
 	struct list_head list;
+	struct list_head hw_list;
 	u32 handle;
 	u32 flags;
 	u32 in_hw_count;
@@ -315,6 +317,7 @@  static int fl_init(struct tcf_proto *tp)
 
 	spin_lock_init(&head->masks_lock);
 	INIT_LIST_HEAD_RCU(&head->masks);
+	INIT_LIST_HEAD(&head->hw_filters);
 	rcu_assign_pointer(tp->root, head);
 	idr_init(&head->handle_idr);
 
@@ -352,6 +355,16 @@  static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
 	return true;
 }
 
+static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
+{
+	/* Flower classifier only changes root pointer during init and destroy.
+	 * Users must obtain reference to tcf_proto instance before calling its
+	 * API, so tp->root pointer is protected from concurrent call to
+	 * fl_destroy() by reference counting.
+	 */
+	return rcu_dereference_raw(tp->root);
+}
+
 static void __fl_destroy_filter(struct cls_fl_filter *f)
 {
 	tcf_exts_destroy(&f->exts);
@@ -382,6 +395,8 @@  static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
 
 	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
 	spin_lock(&tp->lock);
+	if (!list_empty(&f->hw_list))
+		list_del_init(&f->hw_list);
 	tcf_block_offload_dec(block, &f->flags);
 	spin_unlock(&tp->lock);
 
@@ -393,6 +408,7 @@  static int fl_hw_replace_filter(struct tcf_proto *tp,
 				struct cls_fl_filter *f, bool rtnl_held,
 				struct netlink_ext_ack *extack)
 {
+	struct cls_fl_head *head = fl_head_dereference(tp);
 	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
 	bool skip_sw = tc_skip_sw(f->flags);
@@ -444,6 +460,9 @@  static int fl_hw_replace_filter(struct tcf_proto *tp,
 		goto errout;
 	}
 
+	spin_lock(&tp->lock);
+	list_add(&f->hw_list, &head->hw_filters);
+	spin_unlock(&tp->lock);
 errout:
 	if (!rtnl_held)
 		rtnl_unlock();
@@ -475,23 +494,11 @@  static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
 		rtnl_unlock();
 }
 
-static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
-{
-	/* Flower classifier only changes root pointer during init and destroy.
-	 * Users must obtain reference to tcf_proto instance before calling its
-	 * API, so tp->root pointer is protected from concurrent call to
-	 * fl_destroy() by reference counting.
-	 */
-	return rcu_dereference_raw(tp->root);
-}
-
 static void __fl_put(struct cls_fl_filter *f)
 {
 	if (!refcount_dec_and_test(&f->refcnt))
 		return;
 
-	WARN_ON(!f->deleted);
-
 	if (tcf_exts_get_net(&f->exts))
 		tcf_queue_work(&f->rwork, fl_destroy_filter_work);
 	else
@@ -1522,6 +1529,7 @@  static int fl_change(struct net *net, struct sk_buff *in_skb,
 		err = -ENOBUFS;
 		goto errout_tb;
 	}
+	INIT_LIST_HEAD(&fnew->hw_list);
 	refcount_set(&fnew->refcnt, 1);
 
 	err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
@@ -1569,7 +1577,6 @@  static int fl_change(struct net *net, struct sk_buff *in_skb,
 		goto errout_hw;
 	}
 
-	refcount_inc(&fnew->refcnt);
 	if (fold) {
 		/* Fold filter was deleted concurrently. Retry lookup. */
 		if (fold->deleted) {
@@ -1591,6 +1598,7 @@  static int fl_change(struct net *net, struct sk_buff *in_skb,
 			in_ht = true;
 		}
 
+		refcount_inc(&fnew->refcnt);
 		rhashtable_remove_fast(&fold->mask->ht,
 				       &fold->ht_node,
 				       fold->mask->filter_ht_params);
@@ -1631,6 +1639,7 @@  static int fl_change(struct net *net, struct sk_buff *in_skb,
 		if (err)
 			goto errout_hw;
 
+		refcount_inc(&fnew->refcnt);
 		fnew->handle = handle;
 		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
 		spin_unlock(&tp->lock);
@@ -1642,19 +1651,20 @@  static int fl_change(struct net *net, struct sk_buff *in_skb,
 	kfree(mask);
 	return 0;
 
+errout_ht:
+	spin_lock(&tp->lock);
 errout_hw:
+	fnew->deleted = true;
 	spin_unlock(&tp->lock);
 	if (!tc_skip_hw(fnew->flags))
 		fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
-errout_ht:
 	if (in_ht)
 		rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
 				       fnew->mask->filter_ht_params);
 errout_mask:
 	fl_mask_put(head, fnew->mask);
 errout:
-	tcf_exts_get_net(&fnew->exts);
-	tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
+	__fl_put(fnew);
 errout_tb:
 	kfree(tb);
 errout_mask_alloc:
@@ -1699,16 +1709,44 @@  static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
 	}
 }
 
+static struct cls_fl_filter *
+fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
+{
+	struct cls_fl_head *head = fl_head_dereference(tp);
+
+	spin_lock(&tp->lock);
+	if (!f) {
+		if (list_empty(&head->hw_filters)) {
+			spin_unlock(&tp->lock);
+			return NULL;
+		}
+
+		f = list_first_entry(&head->hw_filters, struct cls_fl_filter,
+				     hw_list);
+	} else {
+		f = list_next_entry(f, hw_list);
+	}
+
+	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
+		if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
+			spin_unlock(&tp->lock);
+			return f;
+		}
+	}
+
+	spin_unlock(&tp->lock);
+	return NULL;
+}
+
 static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
 			void *cb_priv, struct netlink_ext_ack *extack)
 {
 	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
-	unsigned long handle = 0;
-	struct cls_fl_filter *f;
+	struct cls_fl_filter *f = NULL;
 	int err;
 
-	while ((f = fl_get_next_filter(tp, &handle))) {
+	while ((f = fl_get_next_hw_filter(tp, f, add))) {
 		if (tc_skip_hw(f->flags))
 			goto next_flow;
 
@@ -1757,7 +1795,6 @@  static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
 					  add);
 		spin_unlock(&tp->lock);
 next_flow:
-		handle++;
 		__fl_put(f);
 	}