Message ID | 20220516191032.340243-3-vladbu@nvidia.com |
---|---|
State | Changes Requested |
Delegated to: | Pablo Neira |
Headers | show |
Series | Conntrack offload debuggability improvements | expand |
On Tue 17 May 2022 at 13:28, Pablo Neira Ayuso <pablo@netfilter.org> wrote: > On Mon, May 16, 2022 at 10:10:31PM +0300, Vlad Buslov wrote: >> To improve hardware offload debuggability and scalability introduce >> 'nf_flowtable_count_hw' and 'nf_flowtable_max_hw' sysctl entries in new >> dedicated 'net/netfilter/ft' namespace. Add new pernet struct nf_ft_net in >> order to store the counter and sysctl header of new sysctl table. >> >> Count the offloaded flows in workqueue add task handler. Verify that >> offloaded flow total is lower than allowed maximum before calling the >> driver callbacks. To prevent spamming the 'add' workqueue with tasks when >> flows can't be offloaded anymore also check that count is below limit >> before queuing offload work. This doesn't prevent all redundant workqueue >> task since counter can be taken by concurrent work handler after the check >> had been performed but before the offload job is executed but it still >> greatly reduces such occurrences. Note that flows that were not offloaded >> due to counter being larger than the cap can still be offloaded via refresh >> function. >> >> Ensure that flows are accounted correctly by verifying IPS_HW_OFFLOAD_BIT >> value before counting them. This ensures that add/refresh code path >> increments the counter exactly once per flow when setting the bit and >> decrements it only for accounted flows when deleting the flow with the bit >> set. >> >> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> >> Signed-off-by: Oz Shlomo <ozsh@nvidia.com> >> --- >> >> Notes: >> Changes V1 -> V2: >> >> - Combine patches that expose offloaded flow count and limit total >> offloaded flows. >> >> - Rework the counting logic to count in workqueue context. >> >> - Create dedicated namespace for flow table sysctls. >> >> .../networking/nf_flowtable-sysctl.rst | 17 ++++ >> include/net/netfilter/nf_flow_table.h | 25 ++++++ >> net/netfilter/Makefile | 3 +- >> net/netfilter/nf_flow_table_core.c | 55 +++++++++++- >> net/netfilter/nf_flow_table_offload.c | 36 ++++++-- >> net/netfilter/nf_flow_table_sysctl.c | 83 +++++++++++++++++++ >> 6 files changed, 210 insertions(+), 9 deletions(-) >> create mode 100644 Documentation/networking/nf_flowtable-sysctl.rst >> create mode 100644 net/netfilter/nf_flow_table_sysctl.c >> >> diff --git a/Documentation/networking/nf_flowtable-sysctl.rst b/Documentation/networking/nf_flowtable-sysctl.rst >> new file mode 100644 >> index 000000000000..932b4abeca6c >> --- /dev/null >> +++ b/Documentation/networking/nf_flowtable-sysctl.rst >> @@ -0,0 +1,17 @@ >> +.. SPDX-License-Identifier: GPL-2.0 >> + >> +=================================== >> +Netfilter Flowtable Sysfs variables >> +=================================== > > Better document the sysctl entries in the existing nf_conntrack.txt > file rather than this new file? Sure, no problem. > >> + >> +/proc/sys/net/netfilter/ft/nf_flowtable_* Variables: >> +================================================= >> + >> +nf_flowtable_count_hw - INTEGER (read-only) >> + Number of flowtable entries that are currently offloaded to hardware. >> + >> +nf_flowtable_max_hw - INTEGER >> + - 0 - disabled (default) >> + - not 0 - enabled >> + >> + Cap on maximum total amount of flowtable entries offloaded to hardware. >> diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h >> index 64daafd1fc41..e09c29fa034e 100644 >> --- a/include/net/netfilter/nf_flow_table.h >> +++ b/include/net/netfilter/nf_flow_table.h >> @@ -335,4 +335,29 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) >> return 0; >> } >> >> +struct nf_ft_net { >> + atomic_t count_hw; >> +#ifdef CONFIG_SYSCTL >> + struct ctl_table_header *sysctl_header; >> +#endif >> +}; >> + >> +extern unsigned int nf_ft_hw_max; >> +extern unsigned int nf_ft_net_id; >> + >> +#include <net/netns/generic.h> >> + >> +static inline struct nf_ft_net *nf_ft_pernet(const struct net *net) >> +{ >> + return net_generic(net, nf_ft_net_id); >> +} >> + >> +static inline struct nf_ft_net *nf_ft_pernet_get(struct nf_flowtable *flow_table) >> +{ >> + return nf_ft_pernet(read_pnet(&flow_table->net)); >> +} >> + >> +int nf_flow_table_init_sysctl(struct net *net); >> +void nf_flow_table_fini_sysctl(struct net *net); >> + >> #endif /* _NF_FLOW_TABLE_H */ >> diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile >> index 238b6a620e88..67e281842b61 100644 >> --- a/net/netfilter/Makefile >> +++ b/net/netfilter/Makefile >> @@ -127,7 +127,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o >> # flow table infrastructure >> obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o >> nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ >> - nf_flow_table_offload.o >> + nf_flow_table_offload.o \ >> + nf_flow_table_sysctl.o >> >> obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o >> >> diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c >> index 3db256da919b..e2598f98017c 100644 >> --- a/net/netfilter/nf_flow_table_core.c >> +++ b/net/netfilter/nf_flow_table_core.c >> @@ -277,6 +277,13 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { >> .automatic_shrinking = true, >> }; >> >> +static bool flow_max_hw_entries(struct nf_flowtable *flow_table) >> +{ >> + struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table); >> + >> + return nf_ft_hw_max && atomic_read(&fnet->count_hw) >= nf_ft_hw_max; >> +} >> + >> unsigned long flow_offload_get_timeout(struct flow_offload *flow) >> { >> unsigned long timeout = NF_FLOW_TIMEOUT; >> @@ -320,7 +327,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) >> >> nf_ct_offload_timeout(flow->ct); >> >> - if (nf_flowtable_hw_offload(flow_table)) { >> + if (nf_flowtable_hw_offload(flow_table) && >> + !flow_max_hw_entries(flow_table)) { >> __set_bit(NF_FLOW_HW, &flow->flags); >> nf_flow_offload_add(flow_table, flow); >> } >> @@ -338,9 +346,11 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, >> if (READ_ONCE(flow->timeout) != timeout) >> WRITE_ONCE(flow->timeout, timeout); >> >> - if (likely(!nf_flowtable_hw_offload(flow_table))) >> + if (likely(!nf_flowtable_hw_offload(flow_table) || >> + flow_max_hw_entries(flow_table))) >> return; >> >> + set_bit(NF_FLOW_HW, &flow->flags); >> nf_flow_offload_add(flow_table, flow); >> } >> EXPORT_SYMBOL_GPL(flow_offload_refresh); >> @@ -652,14 +662,53 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) >> } >> EXPORT_SYMBOL_GPL(nf_flow_table_free); >> >> +static int nf_flow_table_pernet_init(struct net *net) >> +{ >> + return nf_flow_table_init_sysctl(net); >> +} >> + >> +static void nf_flow_table_pernet_exit(struct list_head *net_exit_list) >> +{ >> + struct net *net; >> + >> + list_for_each_entry(net, net_exit_list, exit_list) >> + nf_flow_table_fini_sysctl(net); >> +} >> + >> +unsigned int nf_ft_net_id __read_mostly; >> + >> +static struct pernet_operations nf_flow_table_net_ops = { >> + .init = nf_flow_table_pernet_init, >> + .exit_batch = nf_flow_table_pernet_exit, >> + .id = &nf_ft_net_id, >> + .size = sizeof(struct nf_ft_net), >> +}; >> + >> static int __init nf_flow_table_module_init(void) >> { >> - return nf_flow_table_offload_init(); >> + int ret; >> + >> + nf_ft_hw_max = 0; >> + >> + ret = register_pernet_subsys(&nf_flow_table_net_ops); >> + if (ret < 0) >> + return ret; >> + >> + ret = nf_flow_table_offload_init(); >> + if (ret) >> + goto out_offload; >> + >> + return 0; >> + >> +out_offload: >> + unregister_pernet_subsys(&nf_flow_table_net_ops); >> + return ret; >> } >> >> static void __exit nf_flow_table_module_exit(void) >> { >> nf_flow_table_offload_exit(); >> + unregister_pernet_subsys(&nf_flow_table_net_ops); >> } >> >> module_init(nf_flow_table_module_init); >> diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c >> index 11b6e1942092..a6e763901eb9 100644 >> --- a/net/netfilter/nf_flow_table_offload.c >> +++ b/net/netfilter/nf_flow_table_offload.c >> @@ -903,30 +903,56 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, >> return 0; >> } >> >> +static bool flow_get_max_hw_entries(struct nf_flowtable *flow_table) >> +{ >> + struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table); >> + int count_hw = atomic_inc_return(&fnet->count_hw); >> + >> + if (nf_ft_hw_max && count_hw > nf_ft_hw_max) { >> + atomic_dec(&fnet->count_hw); >> + return false; >> + } >> + return true; >> +} >> + >> static void flow_offload_work_add(struct flow_offload_work *offload) >> { >> + struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable); >> struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; >> int err; >> >> + if (!flow_get_max_hw_entries(offload->flowtable)) >> + return; >> + >> err = nf_flow_offload_alloc(offload, flow_rule); >> if (err < 0) >> - return; >> + goto out_alloc; >> >> err = flow_offload_rule_add(offload, flow_rule); >> if (err < 0) >> - goto out; >> + goto out_add; >> >> - set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); >> + if (test_and_set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status)) >> + atomic_dec(&fnet->count_hw); >> + nf_flow_offload_destroy(flow_rule); >> + return; >> >> -out: >> +out_add: >> nf_flow_offload_destroy(flow_rule); >> +out_alloc: >> + atomic_dec(&fnet->count_hw); >> } >> >> static void flow_offload_work_del(struct flow_offload_work *offload) >> { >> - clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); >> + bool offloaded = test_and_clear_bit(IPS_HW_OFFLOAD_BIT, >> + &offload->flow->ct->status); >> + struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable); >> + >> flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); >> flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); >> + if (offloaded) >> + atomic_dec(&fnet->count_hw); >> set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); >> } >> >> diff --git a/net/netfilter/nf_flow_table_sysctl.c b/net/netfilter/nf_flow_table_sysctl.c >> new file mode 100644 >> index 000000000000..ce8c0a2c4bdb >> --- /dev/null >> +++ b/net/netfilter/nf_flow_table_sysctl.c >> @@ -0,0 +1,83 @@ >> +// SPDX-License-Identifier: GPL-2.0-only >> +#include <linux/kernel.h> >> +#include <net/netfilter/nf_flow_table.h> >> + >> +unsigned int nf_ft_hw_max __read_mostly; > > You can move this to nf_flow_table_core_offload.c for the Makefile > idea. > >> +#ifdef CONFIG_SYSCTL > > If you follow the Kconfig+Makefile approach, then this #ifdef can > likely go away as the entire file would be on/off thing. In following patch I also implement procfs stats in this file, which depends on another config variable (CONFIG_NF_FLOW_TABLE_PROCFS). Should I put procfs code in another new file that will also be conditionally compiled? > >> +enum nf_ct_sysctl_index { >> + NF_SYSCTL_FLOW_TABLE_MAX_HW, >> + NF_SYSCTL_FLOW_TABLE_COUNT_HW, >> + >> + __NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL, >> +}; >> + >> +#define NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL \ >> + (__NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL + 1) >> + >> +static struct ctl_table nf_flow_table_sysctl_table[] = { >> + [NF_SYSCTL_FLOW_TABLE_MAX_HW] = { >> + .procname = "nf_flowtable_max_hw", >> + .data = &nf_ft_hw_max, >> + .maxlen = sizeof(int), >> + .mode = 0644, >> + .proc_handler = proc_dointvec, >> + }, >> + [NF_SYSCTL_FLOW_TABLE_COUNT_HW] = { >> + .procname = "nf_flowtable_count_hw", >> + .maxlen = sizeof(int), >> + .mode = 0444, >> + .proc_handler = proc_dointvec, >> + }, >> + {} >> +}; >> + >> +int nf_flow_table_init_sysctl(struct net *net) >> +{ >> + struct nf_ft_net *fnet = nf_ft_pernet(net); >> + struct ctl_table *table; >> + >> + BUILD_BUG_ON(ARRAY_SIZE(nf_flow_table_sysctl_table) != NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL); >> + >> + table = kmemdup(nf_flow_table_sysctl_table, sizeof(nf_flow_table_sysctl_table), >> + GFP_KERNEL); >> + if (!table) >> + return -ENOMEM; >> + >> + table[NF_SYSCTL_FLOW_TABLE_COUNT_HW].data = &fnet->count_hw; >> + >> + /* Don't allow non-init_net ns to alter global sysctls */ >> + if (!net_eq(&init_net, net)) >> + table[NF_SYSCTL_FLOW_TABLE_MAX_HW].mode = 0444; >> + >> + fnet->sysctl_header = register_net_sysctl(net, "net/netfilter/ft", table); >> + if (!fnet->sysctl_header) >> + goto out_unregister_netfilter; >> + >> + return 0; >> + >> +out_unregister_netfilter: >> + kfree(table); >> + return -ENOMEM; >> +} >> + >> +void nf_flow_table_fini_sysctl(struct net *net) >> +{ >> + struct nf_ft_net *fnet = nf_ft_pernet(net); >> + struct ctl_table *table; >> + >> + table = fnet->sysctl_header->ctl_table_arg; >> + unregister_net_sysctl_table(fnet->sysctl_header); >> + kfree(table); >> +} >> + >> +#else >> +int nf_flow_table_init_sysctl(struct net *net) >> +{ >> + return 0; >> +} >> + >> +void nf_flow_table_fini_sysctl(struct net *net) >> +{ >> +} >> +#endif /* CONFIG_SYSCTL */ >> -- >> 2.31.1 >>
On Mon, May 16, 2022 at 10:10:31PM +0300, Vlad Buslov wrote: > To improve hardware offload debuggability and scalability introduce > 'nf_flowtable_count_hw' and 'nf_flowtable_max_hw' sysctl entries in new > dedicated 'net/netfilter/ft' namespace. Add new pernet struct nf_ft_net in > order to store the counter and sysctl header of new sysctl table. > > Count the offloaded flows in workqueue add task handler. Verify that > offloaded flow total is lower than allowed maximum before calling the > driver callbacks. To prevent spamming the 'add' workqueue with tasks when > flows can't be offloaded anymore also check that count is below limit > before queuing offload work. This doesn't prevent all redundant workqueue > task since counter can be taken by concurrent work handler after the check > had been performed but before the offload job is executed but it still > greatly reduces such occurrences. Note that flows that were not offloaded > due to counter being larger than the cap can still be offloaded via refresh > function. > > Ensure that flows are accounted correctly by verifying IPS_HW_OFFLOAD_BIT > value before counting them. This ensures that add/refresh code path > increments the counter exactly once per flow when setting the bit and > decrements it only for accounted flows when deleting the flow with the bit > set. > > Signed-off-by: Vlad Buslov <vladbu@nvidia.com> > Signed-off-by: Oz Shlomo <ozsh@nvidia.com> > --- > > Notes: > Changes V1 -> V2: > > - Combine patches that expose offloaded flow count and limit total > offloaded flows. > > - Rework the counting logic to count in workqueue context. > > - Create dedicated namespace for flow table sysctls. > > .../networking/nf_flowtable-sysctl.rst | 17 ++++ > include/net/netfilter/nf_flow_table.h | 25 ++++++ > net/netfilter/Makefile | 3 +- > net/netfilter/nf_flow_table_core.c | 55 +++++++++++- > net/netfilter/nf_flow_table_offload.c | 36 ++++++-- > net/netfilter/nf_flow_table_sysctl.c | 83 +++++++++++++++++++ > 6 files changed, 210 insertions(+), 9 deletions(-) > create mode 100644 Documentation/networking/nf_flowtable-sysctl.rst > create mode 100644 net/netfilter/nf_flow_table_sysctl.c > > diff --git a/Documentation/networking/nf_flowtable-sysctl.rst b/Documentation/networking/nf_flowtable-sysctl.rst > new file mode 100644 > index 000000000000..932b4abeca6c > --- /dev/null > +++ b/Documentation/networking/nf_flowtable-sysctl.rst > @@ -0,0 +1,17 @@ > +.. SPDX-License-Identifier: GPL-2.0 > + > +=================================== > +Netfilter Flowtable Sysfs variables > +=================================== Better document the sysctl entries in the existing nf_conntrack.txt file rather than this new file? > + > +/proc/sys/net/netfilter/ft/nf_flowtable_* Variables: > +================================================= > + > +nf_flowtable_count_hw - INTEGER (read-only) > + Number of flowtable entries that are currently offloaded to hardware. > + > +nf_flowtable_max_hw - INTEGER > + - 0 - disabled (default) > + - not 0 - enabled > + > + Cap on maximum total amount of flowtable entries offloaded to hardware. > diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h > index 64daafd1fc41..e09c29fa034e 100644 > --- a/include/net/netfilter/nf_flow_table.h > +++ b/include/net/netfilter/nf_flow_table.h > @@ -335,4 +335,29 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) > return 0; > } > > +struct nf_ft_net { > + atomic_t count_hw; > +#ifdef CONFIG_SYSCTL > + struct ctl_table_header *sysctl_header; > +#endif > +}; > + > +extern unsigned int nf_ft_hw_max; > +extern unsigned int nf_ft_net_id; > + > +#include <net/netns/generic.h> > + > +static inline struct nf_ft_net *nf_ft_pernet(const struct net *net) > +{ > + return net_generic(net, nf_ft_net_id); > +} > + > +static inline struct nf_ft_net *nf_ft_pernet_get(struct nf_flowtable *flow_table) > +{ > + return nf_ft_pernet(read_pnet(&flow_table->net)); > +} > + > +int nf_flow_table_init_sysctl(struct net *net); > +void nf_flow_table_fini_sysctl(struct net *net); > + > #endif /* _NF_FLOW_TABLE_H */ > diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile > index 238b6a620e88..67e281842b61 100644 > --- a/net/netfilter/Makefile > +++ b/net/netfilter/Makefile > @@ -127,7 +127,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o > # flow table infrastructure > obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o > nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ > - nf_flow_table_offload.o > + nf_flow_table_offload.o \ > + nf_flow_table_sysctl.o > > obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o > > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c > index 3db256da919b..e2598f98017c 100644 > --- a/net/netfilter/nf_flow_table_core.c > +++ b/net/netfilter/nf_flow_table_core.c > @@ -277,6 +277,13 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { > .automatic_shrinking = true, > }; > > +static bool flow_max_hw_entries(struct nf_flowtable *flow_table) > +{ > + struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table); > + > + return nf_ft_hw_max && atomic_read(&fnet->count_hw) >= nf_ft_hw_max; > +} > + > unsigned long flow_offload_get_timeout(struct flow_offload *flow) > { > unsigned long timeout = NF_FLOW_TIMEOUT; > @@ -320,7 +327,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) > > nf_ct_offload_timeout(flow->ct); > > - if (nf_flowtable_hw_offload(flow_table)) { > + if (nf_flowtable_hw_offload(flow_table) && > + !flow_max_hw_entries(flow_table)) { > __set_bit(NF_FLOW_HW, &flow->flags); > nf_flow_offload_add(flow_table, flow); > } > @@ -338,9 +346,11 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, > if (READ_ONCE(flow->timeout) != timeout) > WRITE_ONCE(flow->timeout, timeout); > > - if (likely(!nf_flowtable_hw_offload(flow_table))) > + if (likely(!nf_flowtable_hw_offload(flow_table) || > + flow_max_hw_entries(flow_table))) > return; > > + set_bit(NF_FLOW_HW, &flow->flags); > nf_flow_offload_add(flow_table, flow); > } > EXPORT_SYMBOL_GPL(flow_offload_refresh); > @@ -652,14 +662,53 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) > } > EXPORT_SYMBOL_GPL(nf_flow_table_free); > > +static int nf_flow_table_pernet_init(struct net *net) > +{ > + return nf_flow_table_init_sysctl(net); > +} > + > +static void nf_flow_table_pernet_exit(struct list_head *net_exit_list) > +{ > + struct net *net; > + > + list_for_each_entry(net, net_exit_list, exit_list) > + nf_flow_table_fini_sysctl(net); > +} > + > +unsigned int nf_ft_net_id __read_mostly; > + > +static struct pernet_operations nf_flow_table_net_ops = { > + .init = nf_flow_table_pernet_init, > + .exit_batch = nf_flow_table_pernet_exit, > + .id = &nf_ft_net_id, > + .size = sizeof(struct nf_ft_net), > +}; > + > static int __init nf_flow_table_module_init(void) > { > - return nf_flow_table_offload_init(); > + int ret; > + > + nf_ft_hw_max = 0; > + > + ret = register_pernet_subsys(&nf_flow_table_net_ops); > + if (ret < 0) > + return ret; > + > + ret = nf_flow_table_offload_init(); > + if (ret) > + goto out_offload; > + > + return 0; > + > +out_offload: > + unregister_pernet_subsys(&nf_flow_table_net_ops); > + return ret; > } > > static void __exit nf_flow_table_module_exit(void) > { > nf_flow_table_offload_exit(); > + unregister_pernet_subsys(&nf_flow_table_net_ops); > } > > module_init(nf_flow_table_module_init); > diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c > index 11b6e1942092..a6e763901eb9 100644 > --- a/net/netfilter/nf_flow_table_offload.c > +++ b/net/netfilter/nf_flow_table_offload.c > @@ -903,30 +903,56 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, > return 0; > } > > +static bool flow_get_max_hw_entries(struct nf_flowtable *flow_table) > +{ > + struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table); > + int count_hw = atomic_inc_return(&fnet->count_hw); > + > + if (nf_ft_hw_max && count_hw > nf_ft_hw_max) { > + atomic_dec(&fnet->count_hw); > + return false; > + } > + return true; > +} > + > static void flow_offload_work_add(struct flow_offload_work *offload) > { > + struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable); > struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; > int err; > > + if (!flow_get_max_hw_entries(offload->flowtable)) > + return; > + > err = nf_flow_offload_alloc(offload, flow_rule); > if (err < 0) > - return; > + goto out_alloc; > > err = flow_offload_rule_add(offload, flow_rule); > if (err < 0) > - goto out; > + goto out_add; > > - set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); > + if (test_and_set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status)) > + atomic_dec(&fnet->count_hw); > + nf_flow_offload_destroy(flow_rule); > + return; > > -out: > +out_add: > nf_flow_offload_destroy(flow_rule); > +out_alloc: > + atomic_dec(&fnet->count_hw); > } > > static void flow_offload_work_del(struct flow_offload_work *offload) > { > - clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); > + bool offloaded = test_and_clear_bit(IPS_HW_OFFLOAD_BIT, > + &offload->flow->ct->status); > + struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable); > + > flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); > flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); > + if (offloaded) > + atomic_dec(&fnet->count_hw); > set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); > } > > diff --git a/net/netfilter/nf_flow_table_sysctl.c b/net/netfilter/nf_flow_table_sysctl.c > new file mode 100644 > index 000000000000..ce8c0a2c4bdb > --- /dev/null > +++ b/net/netfilter/nf_flow_table_sysctl.c > @@ -0,0 +1,83 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +#include <linux/kernel.h> > +#include <net/netfilter/nf_flow_table.h> > + > +unsigned int nf_ft_hw_max __read_mostly; You can move this to nf_flow_table_core_offload.c for the Makefile idea. > +#ifdef CONFIG_SYSCTL If you follow the Kconfig+Makefile approach, then this #ifdef can likely go away as the entire file would be on/off thing. > +enum nf_ct_sysctl_index { > + NF_SYSCTL_FLOW_TABLE_MAX_HW, > + NF_SYSCTL_FLOW_TABLE_COUNT_HW, > + > + __NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL, > +}; > + > +#define NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL \ > + (__NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL + 1) > + > +static struct ctl_table nf_flow_table_sysctl_table[] = { > + [NF_SYSCTL_FLOW_TABLE_MAX_HW] = { > + .procname = "nf_flowtable_max_hw", > + .data = &nf_ft_hw_max, > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = proc_dointvec, > + }, > + [NF_SYSCTL_FLOW_TABLE_COUNT_HW] = { > + .procname = "nf_flowtable_count_hw", > + .maxlen = sizeof(int), > + .mode = 0444, > + .proc_handler = proc_dointvec, > + }, > + {} > +}; > + > +int nf_flow_table_init_sysctl(struct net *net) > +{ > + struct nf_ft_net *fnet = nf_ft_pernet(net); > + struct ctl_table *table; > + > + BUILD_BUG_ON(ARRAY_SIZE(nf_flow_table_sysctl_table) != NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL); > + > + table = kmemdup(nf_flow_table_sysctl_table, sizeof(nf_flow_table_sysctl_table), > + GFP_KERNEL); > + if (!table) > + return -ENOMEM; > + > + table[NF_SYSCTL_FLOW_TABLE_COUNT_HW].data = &fnet->count_hw; > + > + /* Don't allow non-init_net ns to alter global sysctls */ > + if (!net_eq(&init_net, net)) > + table[NF_SYSCTL_FLOW_TABLE_MAX_HW].mode = 0444; > + > + fnet->sysctl_header = register_net_sysctl(net, "net/netfilter/ft", table); > + if (!fnet->sysctl_header) > + goto out_unregister_netfilter; > + > + return 0; > + > +out_unregister_netfilter: > + kfree(table); > + return -ENOMEM; > +} > + > +void nf_flow_table_fini_sysctl(struct net *net) > +{ > + struct nf_ft_net *fnet = nf_ft_pernet(net); > + struct ctl_table *table; > + > + table = fnet->sysctl_header->ctl_table_arg; > + unregister_net_sysctl_table(fnet->sysctl_header); > + kfree(table); > +} > + > +#else > +int nf_flow_table_init_sysctl(struct net *net) > +{ > + return 0; > +} > + > +void nf_flow_table_fini_sysctl(struct net *net) > +{ > +} > +#endif /* CONFIG_SYSCTL */ > -- > 2.31.1 >
diff --git a/Documentation/networking/nf_flowtable-sysctl.rst b/Documentation/networking/nf_flowtable-sysctl.rst new file mode 100644 index 000000000000..932b4abeca6c --- /dev/null +++ b/Documentation/networking/nf_flowtable-sysctl.rst @@ -0,0 +1,17 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== +Netfilter Flowtable Sysfs variables +=================================== + +/proc/sys/net/netfilter/ft/nf_flowtable_* Variables: +================================================= + +nf_flowtable_count_hw - INTEGER (read-only) + Number of flowtable entries that are currently offloaded to hardware. + +nf_flowtable_max_hw - INTEGER + - 0 - disabled (default) + - not 0 - enabled + + Cap on maximum total amount of flowtable entries offloaded to hardware. diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 64daafd1fc41..e09c29fa034e 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -335,4 +335,29 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) return 0; } +struct nf_ft_net { + atomic_t count_hw; +#ifdef CONFIG_SYSCTL + struct ctl_table_header *sysctl_header; +#endif +}; + +extern unsigned int nf_ft_hw_max; +extern unsigned int nf_ft_net_id; + +#include <net/netns/generic.h> + +static inline struct nf_ft_net *nf_ft_pernet(const struct net *net) +{ + return net_generic(net, nf_ft_net_id); +} + +static inline struct nf_ft_net *nf_ft_pernet_get(struct nf_flowtable *flow_table) +{ + return nf_ft_pernet(read_pnet(&flow_table->net)); +} + +int nf_flow_table_init_sysctl(struct net *net); +void nf_flow_table_fini_sysctl(struct net *net); + #endif /* _NF_FLOW_TABLE_H */ diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 238b6a620e88..67e281842b61 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -127,7 +127,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ - nf_flow_table_offload.o + nf_flow_table_offload.o \ + nf_flow_table_sysctl.o obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 3db256da919b..e2598f98017c 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -277,6 +277,13 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { .automatic_shrinking = true, }; +static bool flow_max_hw_entries(struct nf_flowtable *flow_table) +{ + struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table); + + return nf_ft_hw_max && atomic_read(&fnet->count_hw) >= nf_ft_hw_max; +} + unsigned long flow_offload_get_timeout(struct flow_offload *flow) { unsigned long timeout = NF_FLOW_TIMEOUT; @@ -320,7 +327,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) nf_ct_offload_timeout(flow->ct); - if (nf_flowtable_hw_offload(flow_table)) { + if (nf_flowtable_hw_offload(flow_table) && + !flow_max_hw_entries(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); } @@ -338,9 +346,11 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, if (READ_ONCE(flow->timeout) != timeout) WRITE_ONCE(flow->timeout, timeout); - if (likely(!nf_flowtable_hw_offload(flow_table))) + if (likely(!nf_flowtable_hw_offload(flow_table) || + flow_max_hw_entries(flow_table))) return; + set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); } EXPORT_SYMBOL_GPL(flow_offload_refresh); @@ -652,14 +662,53 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) } EXPORT_SYMBOL_GPL(nf_flow_table_free); +static int nf_flow_table_pernet_init(struct net *net) +{ + return nf_flow_table_init_sysctl(net); +} + +static void nf_flow_table_pernet_exit(struct list_head *net_exit_list) +{ + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) + nf_flow_table_fini_sysctl(net); +} + +unsigned int nf_ft_net_id __read_mostly; + +static struct pernet_operations nf_flow_table_net_ops = { + .init = nf_flow_table_pernet_init, + .exit_batch = nf_flow_table_pernet_exit, + .id = &nf_ft_net_id, + .size = sizeof(struct nf_ft_net), +}; + static int __init nf_flow_table_module_init(void) { - return nf_flow_table_offload_init(); + int ret; + + nf_ft_hw_max = 0; + + ret = register_pernet_subsys(&nf_flow_table_net_ops); + if (ret < 0) + return ret; + + ret = nf_flow_table_offload_init(); + if (ret) + goto out_offload; + + return 0; + +out_offload: + unregister_pernet_subsys(&nf_flow_table_net_ops); + return ret; } static void __exit nf_flow_table_module_exit(void) { nf_flow_table_offload_exit(); + unregister_pernet_subsys(&nf_flow_table_net_ops); } module_init(nf_flow_table_module_init); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 11b6e1942092..a6e763901eb9 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -903,30 +903,56 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, return 0; } +static bool flow_get_max_hw_entries(struct nf_flowtable *flow_table) +{ + struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table); + int count_hw = atomic_inc_return(&fnet->count_hw); + + if (nf_ft_hw_max && count_hw > nf_ft_hw_max) { + atomic_dec(&fnet->count_hw); + return false; + } + return true; +} + static void flow_offload_work_add(struct flow_offload_work *offload) { + struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable); struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; int err; + if (!flow_get_max_hw_entries(offload->flowtable)) + return; + err = nf_flow_offload_alloc(offload, flow_rule); if (err < 0) - return; + goto out_alloc; err = flow_offload_rule_add(offload, flow_rule); if (err < 0) - goto out; + goto out_add; - set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); + if (test_and_set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status)) + atomic_dec(&fnet->count_hw); + nf_flow_offload_destroy(flow_rule); + return; -out: +out_add: nf_flow_offload_destroy(flow_rule); +out_alloc: + atomic_dec(&fnet->count_hw); } static void flow_offload_work_del(struct flow_offload_work *offload) { - clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); + bool offloaded = test_and_clear_bit(IPS_HW_OFFLOAD_BIT, + &offload->flow->ct->status); + struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable); + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); + if (offloaded) + atomic_dec(&fnet->count_hw); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); } diff --git a/net/netfilter/nf_flow_table_sysctl.c b/net/netfilter/nf_flow_table_sysctl.c new file mode 100644 index 000000000000..ce8c0a2c4bdb --- /dev/null +++ b/net/netfilter/nf_flow_table_sysctl.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <net/netfilter/nf_flow_table.h> + +unsigned int nf_ft_hw_max __read_mostly; + +#ifdef CONFIG_SYSCTL +enum nf_ct_sysctl_index { + NF_SYSCTL_FLOW_TABLE_MAX_HW, + NF_SYSCTL_FLOW_TABLE_COUNT_HW, + + __NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL, +}; + +#define NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL \ + (__NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL + 1) + +static struct ctl_table nf_flow_table_sysctl_table[] = { + [NF_SYSCTL_FLOW_TABLE_MAX_HW] = { + .procname = "nf_flowtable_max_hw", + .data = &nf_ft_hw_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NF_SYSCTL_FLOW_TABLE_COUNT_HW] = { + .procname = "nf_flowtable_count_hw", + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + {} +}; + +int nf_flow_table_init_sysctl(struct net *net) +{ + struct nf_ft_net *fnet = nf_ft_pernet(net); + struct ctl_table *table; + + BUILD_BUG_ON(ARRAY_SIZE(nf_flow_table_sysctl_table) != NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL); + + table = kmemdup(nf_flow_table_sysctl_table, sizeof(nf_flow_table_sysctl_table), + GFP_KERNEL); + if (!table) + return -ENOMEM; + + table[NF_SYSCTL_FLOW_TABLE_COUNT_HW].data = &fnet->count_hw; + + /* Don't allow non-init_net ns to alter global sysctls */ + if (!net_eq(&init_net, net)) + table[NF_SYSCTL_FLOW_TABLE_MAX_HW].mode = 0444; + + fnet->sysctl_header = register_net_sysctl(net, "net/netfilter/ft", table); + if (!fnet->sysctl_header) + goto out_unregister_netfilter; + + return 0; + +out_unregister_netfilter: + kfree(table); + return -ENOMEM; +} + +void nf_flow_table_fini_sysctl(struct net *net) +{ + struct nf_ft_net *fnet = nf_ft_pernet(net); + struct ctl_table *table; + + table = fnet->sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(fnet->sysctl_header); + kfree(table); +} + +#else +int nf_flow_table_init_sysctl(struct net *net) +{ + return 0; +} + +void nf_flow_table_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */