[nf-next] netfilter: flowtable: separate replace, destroy and stats to different workqueues

Message ID 20210303125953.11911-1-ozsh@nvidia.com
State Accepted
Delegated to: Pablo Neira
Series [nf-next] netfilter: flowtable: separate replace, destroy and stats to different workqueues

Commit Message

Oz Shlomo March 3, 2021, 12:59 p.m. UTC
Currently the flow table offload replace, destroy and stats work items are
executed on a single workqueue. As such, DESTROY and STATS commands may
be backlogged after a burst of REPLACE work items. This scenario can bloat
up memory and may cause active connections to age out.

Instantiate add, del and stats workqueues to avoid backlogs of non-dependent
actions. Provide sysfs control over the workqueue attributes, allowing
userspace applications to control the workqueue cpumask.

Signed-off-by: Oz Shlomo <ozsh@nvidia.com>
Reviewed-by: Paul Blakey <paulb@nvidia.com>
---
 net/netfilter/nf_flow_table_offload.c | 44 ++++++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 8 deletions(-)
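
For reference, WQ_SYSFS exposes each of the new workqueues under
/sys/devices/virtual/workqueue/<name>/, where attributes such as cpumask and
nice can be adjusted. A minimal sketch of pinning the add workqueue from
userspace (the path assumes the nf_ft_offload_add name introduced below; the
mask value is only an example):

/* Sketch only: restrict the nf_ft_offload_add workqueue to CPUs 0-3 by
 * writing a hex mask to its WQ_SYSFS cpumask attribute.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/devices/virtual/workqueue/nf_ft_offload_add/cpumask";
	const char *mask = "f\n";	/* CPUs 0-3 */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, mask, strlen(mask)) < 0)
		perror("write");
	close(fd);
	return 0;
}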

Comments

Pablo Neira Ayuso March 3, 2021, 4:11 p.m. UTC | #1
Hi,

On Wed, Mar 03, 2021 at 02:59:53PM +0200, Oz Shlomo wrote:
> Currently the flow table offload replace, destroy and stats work items are
> executed on a single workqueue. As such, DESTROY and STATS commands may
> be backlogged after a burst of REPLACE work items. This scenario can bloat
> up memory and may cause active connections to age out.
> 
> Instantiate add, del and stats workqueues to avoid backlogs of non-dependent
> actions. Provide sysfs control over the workqueue attributes, allowing
> userspace applications to control the workqueue cpumask.

Probably it would be good to place REPLACE and DESTROY in one single
queue so workqueues don't race? In case connections are quickly
created and destroyed, we might get an out of order execution, instead
of:

  REPLACE -> DESTROY -> REPLACE

events could be reordered to:

  REPLACE -> REPLACE -> DESTROY

So would it work for you if REPLACE and DESTROY go into one single
workqueue and stats go into another?

Or probably make the cookie unique is sufficient? The cookie refers to
the memory address but memory can be recycled very quickly. If the
cookie helps to catch the reorder scenario, then the conntrack id
could be used instead of the memory address as cookie.
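
Hypothetically, something along these lines (just a sketch of the idea, not
part of this patch; nf_ct_get_id() already exists):

	/* Hypothetical: derive the offload cookie from the conntrack id
	 * rather than a recyclable memory address, so a stale DESTROY
	 * cannot be matched against a newer REPLACE that reuses the same
	 * allocation.
	 */
	cls_flow->cookie = (unsigned long)nf_ct_get_id(flow->ct);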

Regarding exposing sysfs toggles, what kind of tuning are you
expecting from users? I'd prefer that the workqueue subsystem selects
for me what is best (autotuning). I'm not a fan of exposing toggles to
userspace when I don't know what users would do with them.

Let me know, thanks.

> Signed-off-by: Oz Shlomo <ozsh@nvidia.com>
> Reviewed-by: Paul Blakey <paulb@nvidia.com>
> ---
>  net/netfilter/nf_flow_table_offload.c | 44 ++++++++++++++++++++++++++++-------
>  1 file changed, 36 insertions(+), 8 deletions(-)
> 
> diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
> index 2a6993fa40d7..1b979c8b3ba0 100644
> --- a/net/netfilter/nf_flow_table_offload.c
> +++ b/net/netfilter/nf_flow_table_offload.c
> @@ -13,7 +13,9 @@
>  #include <net/netfilter/nf_conntrack_core.h>
>  #include <net/netfilter/nf_conntrack_tuple.h>
>  
> -static struct workqueue_struct *nf_flow_offload_wq;
> +static struct workqueue_struct *nf_flow_offload_add_wq;
> +static struct workqueue_struct *nf_flow_offload_del_wq;
> +static struct workqueue_struct *nf_flow_offload_stats_wq;
>  
>  struct flow_offload_work {
>  	struct list_head	list;
> @@ -826,7 +828,12 @@ static void flow_offload_work_handler(struct work_struct *work)
>  
>  static void flow_offload_queue_work(struct flow_offload_work *offload)
>  {
> -	queue_work(nf_flow_offload_wq, &offload->work);
> +	if (offload->cmd == FLOW_CLS_REPLACE)
> +		queue_work(nf_flow_offload_add_wq, &offload->work);
> +	else if (offload->cmd == FLOW_CLS_DESTROY)
> +		queue_work(nf_flow_offload_del_wq, &offload->work);
> +	else
> +		queue_work(nf_flow_offload_stats_wq, &offload->work);
>  }
>  
>  static struct flow_offload_work *
> @@ -898,8 +905,11 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable,
>  
>  void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
>  {
> -	if (nf_flowtable_hw_offload(flowtable))
> -		flush_workqueue(nf_flow_offload_wq);
> +	if (nf_flowtable_hw_offload(flowtable)) {
> +		flush_workqueue(nf_flow_offload_add_wq);
> +		flush_workqueue(nf_flow_offload_del_wq);
> +		flush_workqueue(nf_flow_offload_stats_wq);
> +	}
>  }
>  
>  static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
> @@ -1011,15 +1021,33 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
>  
>  int nf_flow_table_offload_init(void)
>  {
> -	nf_flow_offload_wq  = alloc_workqueue("nf_flow_table_offload",
> -					      WQ_UNBOUND, 0);
> -	if (!nf_flow_offload_wq)
> +	nf_flow_offload_add_wq  = alloc_workqueue("nf_ft_offload_add",
> +						  WQ_UNBOUND | WQ_SYSFS, 0);
> +	if (!nf_flow_offload_add_wq)
>  		return -ENOMEM;
>  
> +	nf_flow_offload_del_wq  = alloc_workqueue("nf_ft_offload_del",
> +						  WQ_UNBOUND | WQ_SYSFS, 0);
> +	if (!nf_flow_offload_del_wq)
> +		goto err_del_wq;
> +
> +	nf_flow_offload_stats_wq  = alloc_workqueue("nf_ft_offload_stats",
> +						    WQ_UNBOUND | WQ_SYSFS, 0);
> +	if (!nf_flow_offload_stats_wq)
> +		goto err_stats_wq;
> +
>  	return 0;
> +
> +err_stats_wq:
> +	destroy_workqueue(nf_flow_offload_del_wq);
> +err_del_wq:
> +	destroy_workqueue(nf_flow_offload_add_wq);
> +	return -ENOMEM;
>  }
>  
>  void nf_flow_table_offload_exit(void)
>  {
> -	destroy_workqueue(nf_flow_offload_wq);
> +	destroy_workqueue(nf_flow_offload_add_wq);
> +	destroy_workqueue(nf_flow_offload_del_wq);
> +	destroy_workqueue(nf_flow_offload_stats_wq);
>  }
> -- 
> 1.8.3.1
>
Pablo Neira Ayuso March 17, 2021, 11:36 p.m. UTC | #2
On Wed, Mar 03, 2021 at 02:59:53PM +0200, Oz Shlomo wrote:
> Currently the flow table offload replace, destroy and stats work items are
> executed on a single workqueue. As such, DESTROY and STATS commands may
> be backlogged after a burst of REPLACE work items. This scenario can bloat
> up memory and may cause active connections to age out.
> 
> Instantiate add, del and stats workqueues to avoid backlogs of non-dependent
> actions. Provide sysfs control over the workqueue attributes, allowing
> userspace applications to control the workqueue cpumask.

I'm going to apply this to nf-next, it should be possible to revisit
this problem incrementally.

Applied, thanks for your patience.
Marcelo Ricardo Leitner March 22, 2021, 6:09 p.m. UTC | #3
On Wed, Mar 03, 2021 at 05:11:47PM +0100, Pablo Neira Ayuso wrote:
> Hi,
> 
> On Wed, Mar 03, 2021 at 02:59:53PM +0200, Oz Shlomo wrote:
> > Currently the flow table offload replace, destroy and stats work items are
> > executed on a single workqueue. As such, DESTROY and STATS commands may
> > be backlogged after a burst of REPLACE work items. This scenario can bloat
> > up memory and may cause active connections to age out.
> > 
> > Instantiate add, del and stats workqueues to avoid backlogs of non-dependent
> > actions. Provide sysfs control over the workqueue attributes, allowing
> > userspace applications to control the workqueue cpumask.
> 
> Probably it would be good to place REPLACE and DESTROY in one single
> queue so workqueues don't race? In case connections are quickly
> created and destroyed, we might get an out of order execution, instead
> of:
> 
>   REPLACE -> DESTROY -> REPLACE
> 
> events could be reordered to:
> 
>   REPLACE -> REPLACE -> DESTROY
> 
> So would it work for you if REPLACE and DESTROY go into one single
> workqueue and stats go into another?
> 
> Or probably make the cookie unique is sufficient? The cookie refers to
> the memory address but memory can be recycled very quickly. If the
> cookie helps to catch the reorder scenario, then the conntrack id
> could be used instead of the memory address as cookie.

Something like this, if I got the idea right, would be even better. If
the entry actually expired before it had a chance of being offloaded,
there is no point in offloading it to then just remove it.

  Marcelo
Pablo Neira Ayuso March 24, 2021, 1:38 a.m. UTC | #4
Hi Marcelo,

On Mon, Mar 22, 2021 at 03:09:51PM -0300, Marcelo Ricardo Leitner wrote:
> On Wed, Mar 03, 2021 at 05:11:47PM +0100, Pablo Neira Ayuso wrote:
[...]
> > Or probably make the cookie unique is sufficient? The cookie refers to
> > the memory address but memory can be recycled very quickly. If the
> > cookie helps to catch the reorder scenario, then the conntrack id
> > could be used instead of the memory address as cookie.
> 
> Something like this, if I got the idea right, would be even better. If
> the entry actually expired before it had a chance of being offloaded,
> there is no point in offloading it to then just remove it.

It would be interesting to explore this idea you describe. Maybe a
flag can be set on stale objects, or simply remove the stale object
from the offload queue. So I guess it should be possible to recover
control on the list of pending requests as a batch that is passed
through one single queue_work call.
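
As a rough illustration of the idea (hypothetical, not part of this patch),
the REPLACE handler could simply skip flows that were already torn down by
the time their work item runs:

static void flow_offload_work_add(struct flow_offload_work *offload)
{
	/* Hypothetical sketch: skip offloading entries that went away while
	 * the REPLACE work item was still queued, instead of offloading them
	 * only to delete them right away. NF_FLOW_TEARDOWN is set by the gc
	 * thread on expiry and by flow_offload_teardown() on FIN/RST.
	 */
	if (test_bit(NF_FLOW_TEARDOWN, &offload->flow->flags))
		return;

	/* ... existing rule allocation and hardware insertion ... */
}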
Oz Shlomo March 24, 2021, 11:24 a.m. UTC | #5
Hi,

On 3/24/2021 3:38 AM, Pablo Neira Ayuso wrote:
> Hi Marcelo,
> 
> On Mon, Mar 22, 2021 at 03:09:51PM -0300, Marcelo Ricardo Leitner wrote:
>> On Wed, Mar 03, 2021 at 05:11:47PM +0100, Pablo Neira Ayuso wrote:
> [...]
>>> Or probably make the cookie unique is sufficient? The cookie refers to
>>> the memory address but memory can be recycled very quickly. If the
>>> cookie helps to catch the reorder scenario, then the conntrack id
>>> could be used instead of the memory address as cookie.
>>
>> Something like this, if I got the idea right, would be even better. If
>> the entry actually expired before it had a chance of being offloaded,
>> there is no point in offloading it to then just remove it.
> 
> It would be interesting to explore this idea you describe. Maybe a
> flag can be set on stale objects, or simply remove the stale object
> from the offload queue. So I guess it should be possible to recover
> control on the list of pending requests as a batch that is passed
> through one single queue_work call.
> 

Removing stale objects is a good optimization for cases when the rate of
established connections is greater than the hardware offload insertion rate.
However, with a single workqueue design, a burst of del commands may postpone
connection offload tasks. Postponed offloads may cause additional packets to
go through software, thus creating a chain effect which may diminish the
system's connection rate.

Marcelo, AFAIU add/del are synchronized by design since the del is triggered by the gc thread.
A del workqueue item will be instantiated only after a connection is in hardware.
Marcelo Ricardo Leitner March 24, 2021, 9:20 p.m. UTC | #6
On Wed, Mar 24, 2021 at 01:24:53PM +0200, Oz Shlomo wrote:
> Hi,

Hi,

> 
> On 3/24/2021 3:38 AM, Pablo Neira Ayuso wrote:
> > Hi Marcelo,
> > 
> > On Mon, Mar 22, 2021 at 03:09:51PM -0300, Marcelo Ricardo Leitner wrote:
> > > On Wed, Mar 03, 2021 at 05:11:47PM +0100, Pablo Neira Ayuso wrote:
> > [...]
> > > > Or probably make the cookie unique is sufficient? The cookie refers to
> > > > the memory address but memory can be recycled very quickly. If the
> > > > cookie helps to catch the reorder scenario, then the conntrack id
> > > > could be used instead of the memory address as cookie.
> > > 
> > > Something like this, if I got the idea right, would be even better. If
> > > the entry actually expired before it had a chance of being offloaded,
> > > there is no point in offloading it to then just remove it.
> > 
> > It would be interesting to explore this idea you describe. Maybe a
> > flag can be set on stale objects, or simply remove the stale object
> > from the offload queue. So I guess it should be possible to recover
> > control on the list of pending requests as a batch that is passed
> > through one single queue_work call.
> > 
> 
> Removing stale objects is a good optimization for cases when the rate of
> established connections is greater than the hardware offload insertion rate.
> However, with a single workqueue design, a burst of del commands may postpone connection offload tasks.
> Postponed offloads may cause additional packets to go through software, thus
> creating a chain effect which may diminish the system's connection rate.

Right. I didn't intend to object to multiqueues. I'm sorry if it
sounded that way.

> 
> Marcelo, AFAIU add/del are synchronized by design since the del is triggered by the gc thread.
> A del workqueue item will be instantiated only after a connection is in hardware.

They were synchronized, but after this patch, not anymore AFAICT:

tcf_ct_flow_table_add()
  flow_offload_add()
              if (nf_flowtable_hw_offload(flow_table)) {
                  __set_bit(NF_FLOW_HW, &flow->flags);    [A]
                  nf_flow_offload_add(flow_table, flow);
                           ^--- schedules on _add workqueue

then the gc thread:
nf_flow_offload_gc_step()
          if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct))
                  set_bit(NF_FLOW_TEARDOWN, &flow->flags);

          if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
	                   ^-- can also set by tcf_ct_flow_table_lookup()
			       on fin's, by calling flow_offload_teardown()
                  if (test_bit(NF_FLOW_HW, &flow->flags)) {
                                    ^--- this is set in [A], even if the _add is still queued
                          if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
                                  nf_flow_offload_del(flow_table, flow);

nf_flow_offload_del()
          offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY);
          if (!offload)
                  return;

          set_bit(NF_FLOW_HW_DYING, &flow->flags);
          flow_offload_queue_work(offload);

NF_FLOW_HW_DYING only avoids a double _del here.

Maybe I'm just missing it, but I'm not seeing how removals would only
happen after the entry is actually offloaded. As in, if the add queue
is very long and the datapath sees a FIN, it seems the next gc iteration
could try to remove it before it's actually offloaded. I think this is
what Pablo meant in his original reply here too, hence his idea of
having add/del work on the same queue.
Oz Shlomo March 25, 2021, 8:46 a.m. UTC | #7
Hi Marcelo,

On 3/24/2021 11:20 PM, Marcelo Ricardo Leitner wrote:
> On Wed, Mar 24, 2021 at 01:24:53PM +0200, Oz Shlomo wrote:
>> Hi,
> 
> Hi,
> 
>>
>> On 3/24/2021 3:38 AM, Pablo Neira Ayuso wrote:
>>> Hi Marcelo,
>>>
>>> On Mon, Mar 22, 2021 at 03:09:51PM -0300, Marcelo Ricardo Leitner wrote:
>>>> On Wed, Mar 03, 2021 at 05:11:47PM +0100, Pablo Neira Ayuso wrote:
>>> [...]
>>>>> Or probably make the cookie unique is sufficient? The cookie refers to
>>>>> the memory address but memory can be recycled very quickly. If the
>>>>> cookie helps to catch the reorder scenario, then the conntrack id
>>>>> could be used instead of the memory address as cookie.
>>>>
>>>> Something like this, if I got the idea right, would be even better. If
>>>> the entry actually expired before it had a chance of being offloaded,
>>>> there is no point in offloading it to then just remove it.
>>>
>>> It would be interesting to explore this idea you describe. Maybe a
>>> flag can be set on stale objects, or simply remove the stale object
>>> from the offload queue. So I guess it should be possible to recover
>>> control on the list of pending requests as a batch that is passed
>>> through one single queue_work call.
>>>
>>
>> Removing stale objects is a good optimization for cases when the rate of
>> established connections is greater than the hardware offload insertion rate.
>> However, with a single workqueue design, a burst of del commands may postpone connection offload tasks.
>> Postponed offloads may cause additional packets to go through software, thus
>> creating a chain effect which may diminish the system's connection rate.
> 
> Right. I didn't intend to object to multiqueues. I'm sorry if it
> sounded that way.
> 
>>
>> Marcelo, AFAIU add/del are synchronized by design since the del is triggered by the gc thread.
>> A del workqueue item will be instantiated only after a connection is in hardware.
> 
> They were synchronized, but after this patch, not anymore AFAICT:
> 
> tcf_ct_flow_table_add()
>    flow_offload_add()
>                if (nf_flowtable_hw_offload(flow_table)) {
>                    __set_bit(NF_FLOW_HW, &flow->flags);    [A]
>                    nf_flow_offload_add(flow_table, flow);
>                             ^--- schedules on _add workqueue
> 
> then the gc thread:
> nf_flow_offload_gc_step()
>            if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct))
>                    set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> 
>            if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
> 	                   ^-- can also set by tcf_ct_flow_table_lookup()
> 			       on fin's, by calling flow_offload_teardown()
>                    if (test_bit(NF_FLOW_HW, &flow->flags)) {
>                                      ^--- this is set in [A], even if the _add is still queued
>                            if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
>                                    nf_flow_offload_del(flow_table, flow);
> 
> nf_flow_offload_del()
>            offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY);
>            if (!offload)
>                    return;
> 
>            set_bit(NF_FLOW_HW_DYING, &flow->flags);
>            flow_offload_queue_work(offload);
> 
> NF_FLOW_HW_DYING only avoids a double _del here.
> 
> Maybe I'm just missing it, but I'm not seeing how removals would only
> happen after the entry is actually offloaded. As in, if the add queue
> is very long and the datapath sees a FIN, it seems the next gc iteration
> could try to remove it before it's actually offloaded. I think this is
> what Pablo meant in his original reply here too, hence his idea of
> having add/del work on the same queue.
> 

The work item will not be allocated if the hw offload is pending.

nf_flow_offload_work_alloc()
	if (test_and_set_bit(NF_FLOW_HW_PENDING, &flow->flags))
		return NULL;
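
The bit is cleared only once the queued work item has run, so at most one
work item (REPLACE, DESTROY or STATS) can be in flight per flow at any time,
regardless of which workqueue it lands on. A paraphrased sketch of the
relevant logic in nf_flow_table_offload.c (not a verbatim excerpt):

static struct flow_offload_work *
nf_flow_offload_work_alloc(struct nf_flowtable *flowtable,
			   struct flow_offload *flow, enum flow_cls_command cmd)
{
	struct flow_offload_work *offload;

	/* Only one outstanding work item per flow: a DESTROY or STATS
	 * request cannot be allocated while a REPLACE is still queued.
	 */
	if (test_and_set_bit(NF_FLOW_HW_PENDING, &flow->flags))
		return NULL;

	offload = kmalloc(sizeof(*offload), GFP_ATOMIC);
	if (!offload) {
		clear_bit(NF_FLOW_HW_PENDING, &flow->flags);
		return NULL;
	}
	/* ... fill in cmd, flow, flowtable and INIT_WORK() ... */
	return offload;
}

static void flow_offload_work_handler(struct work_struct *work)
{
	struct flow_offload_work *offload =
		container_of(work, struct flow_offload_work, work);

	/* ... run the REPLACE/DESTROY/STATS action for this flow ... */

	/* Only now can the next work item for this flow be allocated. */
	clear_bit(NF_FLOW_HW_PENDING, &offload->flow->flags);
	kfree(offload);
}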
Marcelo Ricardo Leitner March 26, 2021, 1:51 p.m. UTC | #8
On Thu, Mar 25, 2021 at 10:46:12AM +0200, Oz Shlomo wrote:
> Hi Marcelo,
> 
> On 3/24/2021 11:20 PM, Marcelo Ricardo Leitner wrote:
> > Maybe I'm just missing it, but I'm not seeing how removals would only
> > happen after the entry is actually offloaded. As in, if the add queue
> > is very long and the datapath sees a FIN, it seems the next gc iteration
> > could try to remove it before it's actually offloaded. I think this is
> > what Pablo meant in his original reply here too, hence his idea of
> > having add/del work on the same queue.
> > 
> 
> The work item will not be allocated if the hw offload is pending.
> 
> nf_flow_offload_work_alloc()
> 	if (test_and_set_bit(NF_FLOW_HW_PENDING, &flow->flags))
> 		return NULL;

Ahá! Right, and with that there can only be 1 flow_offload_work for a
flow at a time, so it can't fetch stats for a flow that is still to be
offloaded too. Got it.

Thanks,
Marcelo

Patch

diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 2a6993fa40d7..1b979c8b3ba0 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -13,7 +13,9 @@ 
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
 
-static struct workqueue_struct *nf_flow_offload_wq;
+static struct workqueue_struct *nf_flow_offload_add_wq;
+static struct workqueue_struct *nf_flow_offload_del_wq;
+static struct workqueue_struct *nf_flow_offload_stats_wq;
 
 struct flow_offload_work {
 	struct list_head	list;
@@ -826,7 +828,12 @@  static void flow_offload_work_handler(struct work_struct *work)
 
 static void flow_offload_queue_work(struct flow_offload_work *offload)
 {
-	queue_work(nf_flow_offload_wq, &offload->work);
+	if (offload->cmd == FLOW_CLS_REPLACE)
+		queue_work(nf_flow_offload_add_wq, &offload->work);
+	else if (offload->cmd == FLOW_CLS_DESTROY)
+		queue_work(nf_flow_offload_del_wq, &offload->work);
+	else
+		queue_work(nf_flow_offload_stats_wq, &offload->work);
 }
 
 static struct flow_offload_work *
@@ -898,8 +905,11 @@  void nf_flow_offload_stats(struct nf_flowtable *flowtable,
 
 void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
 {
-	if (nf_flowtable_hw_offload(flowtable))
-		flush_workqueue(nf_flow_offload_wq);
+	if (nf_flowtable_hw_offload(flowtable)) {
+		flush_workqueue(nf_flow_offload_add_wq);
+		flush_workqueue(nf_flow_offload_del_wq);
+		flush_workqueue(nf_flow_offload_stats_wq);
+	}
 }
 
 static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
@@ -1011,15 +1021,33 @@  int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
 
 int nf_flow_table_offload_init(void)
 {
-	nf_flow_offload_wq  = alloc_workqueue("nf_flow_table_offload",
-					      WQ_UNBOUND, 0);
-	if (!nf_flow_offload_wq)
+	nf_flow_offload_add_wq  = alloc_workqueue("nf_ft_offload_add",
+						  WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!nf_flow_offload_add_wq)
 		return -ENOMEM;
 
+	nf_flow_offload_del_wq  = alloc_workqueue("nf_ft_offload_del",
+						  WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!nf_flow_offload_del_wq)
+		goto err_del_wq;
+
+	nf_flow_offload_stats_wq  = alloc_workqueue("nf_ft_offload_stats",
+						    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!nf_flow_offload_stats_wq)
+		goto err_stats_wq;
+
 	return 0;
+
+err_stats_wq:
+	destroy_workqueue(nf_flow_offload_del_wq);
+err_del_wq:
+	destroy_workqueue(nf_flow_offload_add_wq);
+	return -ENOMEM;
 }
 
 void nf_flow_table_offload_exit(void)
 {
-	destroy_workqueue(nf_flow_offload_wq);
+	destroy_workqueue(nf_flow_offload_add_wq);
+	destroy_workqueue(nf_flow_offload_del_wq);
+	destroy_workqueue(nf_flow_offload_stats_wq);
 }