diff mbox series

[net-next,v2,2/3] ipv4: add sysctl for nexthop api compatibility mode

Message ID 1587862128-24319-3-git-send-email-roopa@cumulusnetworks.com
State Changes Requested
Delegated to: David Miller
Headers show
Series New sysctl to turn off nexthop API compat mode | expand

Commit Message

Roopa Prabhu April 26, 2020, 12:48 a.m. UTC
From: Roopa Prabhu <roopa@cumulusnetworks.com>

Current route nexthop API maintains user space compatibility
with old route API by default. Dumps and netlink notifications
support both new and old API format. In systems which have
moved to the new API, this compatibility mode cancels some
of the performance benefits provided by the new nexthop API.

This patch adds new sysctl nexthop_compat_mode which is on
by default but provides the ability to turn off compatibility
mode allowing systems to run entirely with the new routing
API. Old route API behaviour and support is not modified by this
sysctl.

Uses a single sysctl to cover both ipv4 and ipv6 following
other sysctls. Covers dumps and delete notifications as
suggested by David Ahern.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 include/net/netns/ipv4.h   | 2 ++
 net/ipv4/af_inet.c         | 1 +
 net/ipv4/fib_semantics.c   | 3 +++
 net/ipv4/nexthop.c         | 5 +++--
 net/ipv4/sysctl_net_ipv4.c | 7 +++++++
 net/ipv6/route.c           | 3 ++-
 6 files changed, 18 insertions(+), 3 deletions(-)

Comments

Randy Dunlap April 26, 2020, 2:08 a.m. UTC | #1
On 4/25/20 5:48 PM, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
> 
> Current route nexthop API maintains user space compatibility
> with old route API by default. Dumps and netlink notifications
> support both new and old API format. In systems which have
> moved to the new API, this compatibility mode cancels some
> of the performance benefits provided by the new nexthop API.
> 
> This patch adds new sysctl nexthop_compat_mode which is on
> by default but provides the ability to turn off compatibility
> mode allowing systems to run entirely with the new routing
> API. Old route API behaviour and support is not modified by this
> sysctl.
> 
> Uses a single sysctl to cover both ipv4 and ipv6 following
> other sysctls. Covers dumps and delete notifications as
> suggested by David Ahern.
> 
> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
> ---
>  include/net/netns/ipv4.h   | 2 ++
>  net/ipv4/af_inet.c         | 1 +
>  net/ipv4/fib_semantics.c   | 3 +++
>  net/ipv4/nexthop.c         | 5 +++--
>  net/ipv4/sysctl_net_ipv4.c | 7 +++++++
>  net/ipv6/route.c           | 3 ++-
>  6 files changed, 18 insertions(+), 3 deletions(-)

Hi,

Are net sysctls supposed to be documented, e.g. in
Documentation/admin-guide/sysctl/net.rst?

thanks.
Roopa Prabhu April 26, 2020, 4:30 a.m. UTC | #2
On Sat, Apr 25, 2020 at 7:08 PM Randy Dunlap <rdunlap@infradead.org> wrote:
>
> On 4/25/20 5:48 PM, Roopa Prabhu wrote:
> > From: Roopa Prabhu <roopa@cumulusnetworks.com>
> >
> > Current route nexthop API maintains user space compatibility
> > with old route API by default. Dumps and netlink notifications
> > support both new and old API format. In systems which have
> > moved to the new API, this compatibility mode cancels some
> > of the performance benefits provided by the new nexthop API.
> >
> > This patch adds new sysctl nexthop_compat_mode which is on
> > by default but provides the ability to turn off compatibility
> > mode allowing systems to run entirely with the new routing
> > API. Old route API behaviour and support is not modified by this
> > sysctl.
> >
> > Uses a single sysctl to cover both ipv4 and ipv6 following
> > other sysctls. Covers dumps and delete notifications as
> > suggested by David Ahern.
> >
> > Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
> > ---
> >  include/net/netns/ipv4.h   | 2 ++
> >  net/ipv4/af_inet.c         | 1 +
> >  net/ipv4/fib_semantics.c   | 3 +++
> >  net/ipv4/nexthop.c         | 5 +++--
> >  net/ipv4/sysctl_net_ipv4.c | 7 +++++++
> >  net/ipv6/route.c           | 3 ++-
> >  6 files changed, 18 insertions(+), 3 deletions(-)
>
> Hi,
>
> Are net sysctls supposed to be documented, e.g. in
> Documentation/admin-guide/sysctl/net.rst?
>

thanks, good reminder. It was on my TODO and missed it. This one can
go in Documentation/networking/ip-sysctl.txt

will include it in v3.
David Ahern April 26, 2020, 5:26 p.m. UTC | #3
On 4/25/20 6:48 PM, Roopa Prabhu wrote:
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 81b267e..a613ecf 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -711,6 +711,13 @@ static struct ctl_table ipv4_net_table[] = {
>  		.proc_handler   = proc_tcp_early_demux
>  	},
>  	{
> +		.procname       = "nexthop_compat_mode",
> +		.data           = &init_net.ipv4.sysctl_nexthop_compat_mode,
> +		.maxlen         = sizeof(int),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec
> +	},

I think you can make this a bool.

	.proc_handler   = proc_dointvec_minmax,
        .extra1         = SYSCTL_ZERO,
        .extra2         = SYSCTL_ONE,

rest looks good to me.
diff mbox series

Patch

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 154b8f0..5acdb4d 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -111,6 +111,8 @@  struct netns_ipv4 {
 	int sysctl_tcp_early_demux;
 	int sysctl_udp_early_demux;
 
+	int sysctl_nexthop_compat_mode;
+
 	int sysctl_fwmark_reflect;
 	int sysctl_tcp_fwmark_accept;
 #ifdef CONFIG_NET_L3_MASTER_DEV
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c618e24..6177c4b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1835,6 +1835,7 @@  static __net_init int inet_init_net(struct net *net)
 	net->ipv4.sysctl_ip_early_demux = 1;
 	net->ipv4.sysctl_udp_early_demux = 1;
 	net->ipv4.sysctl_tcp_early_demux = 1;
+	net->ipv4.sysctl_nexthop_compat_mode = 1;
 #ifdef CONFIG_SYSCTL
 	net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
 #endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6ed8c93..7546b88 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1780,6 +1780,8 @@  int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			goto nla_put_failure;
 		if (nexthop_is_blackhole(fi->nh))
 			rtm->rtm_type = RTN_BLACKHOLE;
+		if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode)
+			goto offload;
 	}
 
 	if (nhs == 1) {
@@ -1805,6 +1807,7 @@  int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			goto nla_put_failure;
 	}
 
+offload:
 	if (fri->offload)
 		rtm->rtm_flags |= RTM_F_OFFLOAD;
 	if (fri->trap)
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 9999687..3957364 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -784,7 +784,8 @@  static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 	list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
 		/* __ip6_del_rt does a release, so do a hold here */
 		fib6_info_hold(f6i);
-		ipv6_stub->ip6_del_rt(net, f6i, false);
+		ipv6_stub->ip6_del_rt(net, f6i,
+				      !net->ipv4.sysctl_nexthop_compat_mode);
 	}
 }
 
@@ -1041,7 +1042,7 @@  static int insert_nexthop(struct net *net, struct nexthop *new_nh,
 	if (!rc) {
 		nh_base_seq_inc(net);
 		nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
-		if (replace_notify)
+		if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode)
 			nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
 	}
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 81b267e..a613ecf 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -711,6 +711,13 @@  static struct ctl_table ipv4_net_table[] = {
 		.proc_handler   = proc_tcp_early_demux
 	},
 	{
+		.procname       = "nexthop_compat_mode",
+		.data           = &init_net.ipv4.sysctl_nexthop_compat_mode,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec
+	},
+	{
 		.procname	= "ip_default_ttl",
 		.data		= &init_net.ipv4.sysctl_ip_default_ttl,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 486c36a..803212a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5557,7 +5557,8 @@  static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 		if (nexthop_is_blackhole(rt->nh))
 			rtm->rtm_type = RTN_BLACKHOLE;
 
-		if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
+		if (net->ipv4.sysctl_nexthop_compat_mode &&
+		    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
 			goto nla_put_failure;
 
 		rtm->rtm_flags |= nh_flags;