
[net-next,02/19] netfilter: nft_set_rbtree: prefer sync gc to async worker

Message ID: 20231025212555.132775-3-pablo@netfilter.org
State: Awaiting Upstream
Series: [net-next,01/19] netfilter: nft_set_rbtree: rename gc deactivate+erase function

Commit Message

Pablo Neira Ayuso Oct. 25, 2023, 9:25 p.m. UTC
From: Florian Westphal <fw@strlen.de>

There is no need for asynchronous garbage collection; rbtree inserts
can only happen from the netlink control plane.

We already perform on-demand gc on insertion, in the area of the
tree where the insertion takes place, but we don't do a full tree
walk there for performance reasons.

Do a full gc walk at the end of the transaction instead and
remove the async worker.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_rbtree.c | 124 +++++++++++++++++----------------
 1 file changed, 65 insertions(+), 59 deletions(-)

Comments

Simon Horman Nov. 3, 2023, 5:34 p.m. UTC | #1
On Wed, Oct 25, 2023 at 11:25:38PM +0200, Pablo Neira Ayuso wrote:
> From: Florian Westphal <fw@strlen.de>
> 
> There is no need for asynchronous garbage collection; rbtree inserts
> can only happen from the netlink control plane.
> 
> We already perform on-demand gc on insertion, in the area of the
> tree where the insertion takes place, but we don't do a full tree
> walk there for performance reasons.
> 
> Do a full gc walk at the end of the transaction instead and
> remove the async worker.
> 
> Signed-off-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

...

> @@ -515,11 +523,7 @@ static void nft_rbtree_remove(const struct net *net,
>  	struct nft_rbtree *priv = nft_set_priv(set);
>  	struct nft_rbtree_elem *rbe = elem->priv;
>  
> -	write_lock_bh(&priv->lock);
> -	write_seqcount_begin(&priv->count);
> -	rb_erase(&rbe->node, &priv->root);
> -	write_seqcount_end(&priv->count);
> -	write_unlock_bh(&priv->lock);
> +	nft_rbtree_erase(priv, rbe);
>  }
>  
>  static void nft_rbtree_activate(const struct net *net,
> @@ -613,45 +617,40 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
>  	read_unlock_bh(&priv->lock);
>  }
>  
> -static void nft_rbtree_gc(struct work_struct *work)
> +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
> +				 struct nft_rbtree *priv,
> +				 struct nft_rbtree_elem *rbe)
>  {
> +	struct nft_set_elem elem = {
> +		.priv	= rbe,
> +	};
> +
> +	nft_setelem_data_deactivate(net, set, &elem);
> +	nft_rbtree_erase(priv, rbe);
> +}
> +
> +static void nft_rbtree_gc(struct nft_set *set)
> +{
> +	struct nft_rbtree *priv = nft_set_priv(set);
>  	struct nft_rbtree_elem *rbe, *rbe_end = NULL;
>  	struct nftables_pernet *nft_net;

Hi Florian and Pablo,

I understand that this patch has been accepted upstream,
so this feedback comes rather late, but I noticed that
with this patch nft_net is now set but otherwise unused
in this function.

As flagged by clang-16 and gcc-13 W=1 builds.

> -	struct nft_rbtree *priv;
> +	struct rb_node *node, *next;
>  	struct nft_trans_gc *gc;
> -	struct rb_node *node;
> -	struct nft_set *set;
> -	unsigned int gc_seq;
>  	struct net *net;
>  
> -	priv = container_of(work, struct nft_rbtree, gc_work.work);
>  	set  = nft_set_container_of(priv);
>  	net  = read_pnet(&set->net);
>  	nft_net = nft_pernet(net);
> -	gc_seq  = READ_ONCE(nft_net->gc_seq);
>  
> -	if (nft_set_gc_is_pending(set))
> -		goto done;
> -
> -	gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
> +	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
>  	if (!gc)
> -		goto done;
> -
> -	read_lock_bh(&priv->lock);
> -	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
> +		return;
>  
> -		/* Ruleset has been updated, try later. */
> -		if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
> -			nft_trans_gc_destroy(gc);
> -			gc = NULL;
> -			goto try_later;
> -		}
> +	for (node = rb_first(&priv->root); node ; node = next) {
> +		next = rb_next(node);
>  
>  		rbe = rb_entry(node, struct nft_rbtree_elem, node);
>  
> -		if (nft_set_elem_is_dead(&rbe->ext))
> -			goto dead_elem;
> -
>  		/* elements are reversed in the rbtree for historical reasons,
>  		 * from highest to lowest value, that is why end element is
>  		 * always visited before the start element.

...
Florian Westphal Nov. 3, 2023, 5:55 p.m. UTC | #2
Simon Horman <horms@kernel.org> wrote:
> Hi Florian and Pablo,
> 
> I understand that this patch has been accepted upstream,
> so this feedback comes rather late, but I noticed that
> with this patch nft_net is now set but otherwise unused
> in this function.

There is a patch queued for this; we'll pass it to -net next week.

Patch

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index d59be2bc6e6c..7d1004f9e7d2 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -19,7 +19,7 @@ struct nft_rbtree {
 	struct rb_root		root;
 	rwlock_t		lock;
 	seqcount_rwlock_t	count;
-	struct delayed_work	gc_work;
+	unsigned long		last_gc;
 };
 
 struct nft_rbtree_elem {
@@ -48,8 +48,7 @@ static int nft_rbtree_cmp(const struct nft_set *set,
 
 static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
 {
-	return nft_set_elem_expired(&rbe->ext) ||
-	       nft_set_elem_is_dead(&rbe->ext);
+	return nft_set_elem_expired(&rbe->ext);
 }
 
 static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
@@ -508,6 +507,15 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 	return err;
 }
 
+static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe)
+{
+	write_lock_bh(&priv->lock);
+	write_seqcount_begin(&priv->count);
+	rb_erase(&rbe->node, &priv->root);
+	write_seqcount_end(&priv->count);
+	write_unlock_bh(&priv->lock);
+}
+
 static void nft_rbtree_remove(const struct net *net,
 			      const struct nft_set *set,
 			      const struct nft_set_elem *elem)
@@ -515,11 +523,7 @@ static void nft_rbtree_remove(const struct net *net,
 	struct nft_rbtree *priv = nft_set_priv(set);
 	struct nft_rbtree_elem *rbe = elem->priv;
 
-	write_lock_bh(&priv->lock);
-	write_seqcount_begin(&priv->count);
-	rb_erase(&rbe->node, &priv->root);
-	write_seqcount_end(&priv->count);
-	write_unlock_bh(&priv->lock);
+	nft_rbtree_erase(priv, rbe);
 }
 
 static void nft_rbtree_activate(const struct net *net,
@@ -613,45 +617,40 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 	read_unlock_bh(&priv->lock);
 }
 
-static void nft_rbtree_gc(struct work_struct *work)
+static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
+				 struct nft_rbtree *priv,
+				 struct nft_rbtree_elem *rbe)
 {
+	struct nft_set_elem elem = {
+		.priv	= rbe,
+	};
+
+	nft_setelem_data_deactivate(net, set, &elem);
+	nft_rbtree_erase(priv, rbe);
+}
+
+static void nft_rbtree_gc(struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
 	struct nft_rbtree_elem *rbe, *rbe_end = NULL;
 	struct nftables_pernet *nft_net;
-	struct nft_rbtree *priv;
+	struct rb_node *node, *next;
 	struct nft_trans_gc *gc;
-	struct rb_node *node;
-	struct nft_set *set;
-	unsigned int gc_seq;
 	struct net *net;
 
-	priv = container_of(work, struct nft_rbtree, gc_work.work);
 	set  = nft_set_container_of(priv);
 	net  = read_pnet(&set->net);
 	nft_net = nft_pernet(net);
-	gc_seq  = READ_ONCE(nft_net->gc_seq);
 
-	if (nft_set_gc_is_pending(set))
-		goto done;
-
-	gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
 	if (!gc)
-		goto done;
-
-	read_lock_bh(&priv->lock);
-	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+		return;
 
-		/* Ruleset has been updated, try later. */
-		if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
-			nft_trans_gc_destroy(gc);
-			gc = NULL;
-			goto try_later;
-		}
+	for (node = rb_first(&priv->root); node ; node = next) {
+		next = rb_next(node);
 
 		rbe = rb_entry(node, struct nft_rbtree_elem, node);
 
-		if (nft_set_elem_is_dead(&rbe->ext))
-			goto dead_elem;
-
 		/* elements are reversed in the rbtree for historical reasons,
 		 * from highest to lowest value, that is why end element is
 		 * always visited before the start element.
@@ -663,37 +662,34 @@ static void nft_rbtree_gc(struct work_struct *work)
 		if (!nft_set_elem_expired(&rbe->ext))
 			continue;
 
-		nft_set_elem_dead(&rbe->ext);
-
-		if (!rbe_end)
-			continue;
-
-		nft_set_elem_dead(&rbe_end->ext);
-
-		gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
 		if (!gc)
 			goto try_later;
 
-		nft_trans_gc_elem_add(gc, rbe_end);
-		rbe_end = NULL;
-dead_elem:
-		gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+		/* end element needs to be removed first, it has
+		 * no timeout extension.
+		 */
+		if (rbe_end) {
+			nft_rbtree_gc_remove(net, set, priv, rbe_end);
+			nft_trans_gc_elem_add(gc, rbe_end);
+			rbe_end = NULL;
+		}
+
+		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
 		if (!gc)
 			goto try_later;
 
+		nft_rbtree_gc_remove(net, set, priv, rbe);
 		nft_trans_gc_elem_add(gc, rbe);
 	}
 
-	gc = nft_trans_gc_catchall_async(gc, gc_seq);
-
 try_later:
-	read_unlock_bh(&priv->lock);
 
-	if (gc)
-		nft_trans_gc_queue_async_done(gc);
-done:
-	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
-			   nft_set_gc_interval(set));
+	if (gc) {
+		gc = nft_trans_gc_catchall_sync(gc);
+		nft_trans_gc_queue_sync_done(gc);
+		priv->last_gc = jiffies;
+	}
 }
 
 static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
@@ -712,11 +708,6 @@ static int nft_rbtree_init(const struct nft_set *set,
 	seqcount_rwlock_init(&priv->count, &priv->lock);
 	priv->root = RB_ROOT;
 
-	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
-	if (set->flags & NFT_SET_TIMEOUT)
-		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
-				   nft_set_gc_interval(set));
-
 	return 0;
 }
 
@@ -727,8 +718,6 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx,
 	struct nft_rbtree_elem *rbe;
 	struct rb_node *node;
 
-	cancel_delayed_work_sync(&priv->gc_work);
-	rcu_barrier();
 	while ((node = priv->root.rb_node) != NULL) {
 		rb_erase(node, &priv->root);
 		rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -754,6 +743,21 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 	return true;
 }
 
+static void nft_rbtree_commit(struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+
+	if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+		nft_rbtree_gc(set);
+}
+
+static void nft_rbtree_gc_init(const struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+
+	priv->last_gc = jiffies;
+}
+
 const struct nft_set_type nft_set_rbtree_type = {
 	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
 	.ops		= {
@@ -767,6 +771,8 @@ const struct nft_set_type nft_set_rbtree_type = {
 		.deactivate	= nft_rbtree_deactivate,
 		.flush		= nft_rbtree_flush,
 		.activate	= nft_rbtree_activate,
+		.commit		= nft_rbtree_commit,
+		.gc_init	= nft_rbtree_gc_init,
 		.lookup		= nft_rbtree_lookup,
 		.walk		= nft_rbtree_walk,
 		.get		= nft_rbtree_get,