Message ID | 1489116660-4244-1-git-send-email-subashab@codeaurora.org |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On Thu, Mar 9, 2017 at 7:31 PM, Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> wrote: > Certain system process significant unconnected UDP workload. > It would be preferrable to disable UDP early demux for those systems > and enable it for TCP only. > Presumably you want this for performance reasons. Can you provide some before and after numbers? > v1->v2: Change function pointer instead of adding conditional as > suggested by Stephen. > > Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> > Suggested-by: Eric Dumazet <edumazet@google.com> > Cc: Stephen Hemminger <stephen@networkplumber.org> > --- > include/net/netns/ipv4.h | 2 ++ > include/net/tcp.h | 2 ++ > include/net/udp.h | 2 ++ > net/ipv4/af_inet.c | 22 ++++++++++++++++++++-- > net/ipv4/sysctl_net_ipv4.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++ > net/ipv6/tcp_ipv6.c | 10 +++++++++- > 6 files changed, 82 insertions(+), 3 deletions(-) > > diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h > index 0378e88..1e74da23 100644 > --- a/include/net/netns/ipv4.h > +++ b/include/net/netns/ipv4.h > @@ -86,6 +86,8 @@ struct netns_ipv4 { > /* Shall we try to damage output packets if routing dev changes? */ > int sysctl_ip_dynaddr; > int sysctl_ip_early_demux; > + int sysctl_tcp_early_demux; > + int sysctl_udp_early_demux; > > int sysctl_fwmark_reflect; > int sysctl_tcp_fwmark_accept; > diff --git a/include/net/tcp.h b/include/net/tcp.h > index 6061963..3b6446d 100644 > --- a/include/net/tcp.h > +++ b/include/net/tcp.h > @@ -1953,4 +1953,6 @@ static inline void tcp_listendrop(const struct sock *sk) > __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); > } > > +void tcp_v4_early_demux_configure(int enable); > +void tcp_v6_early_demux_configure(int enable); > #endif /* _TCP_H */ > diff --git a/include/net/udp.h b/include/net/udp.h > index 1661791..7de31d5 100644 > --- a/include/net/udp.h > +++ b/include/net/udp.h > @@ -373,4 +373,6 @@ struct udp_iter_state { > #if IS_ENABLED(CONFIG_IPV6) > void udpv6_encap_enable(void); > #endif > + > +void udp_v4_early_demux_configure(int enable); > #endif /* _UDP_H */ > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > index f750698..3e11d74 100644 > --- a/net/ipv4/af_inet.c > +++ b/net/ipv4/af_inet.c > @@ -1579,7 +1579,7 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) > }; > #endif > > -static const struct net_protocol tcp_protocol = { > +static struct net_protocol tcp_protocol = { > .early_demux = tcp_v4_early_demux, > .handler = tcp_v4_rcv, > .err_handler = tcp_v4_err, > @@ -1588,7 +1588,7 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) > .icmp_strict_tag_validation = 1, > }; > > -static const struct net_protocol udp_protocol = { > +static struct net_protocol udp_protocol = { > .early_demux = udp_v4_early_demux, > .handler = udp_rcv, > .err_handler = udp_err, > @@ -1596,6 +1596,22 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) > .netns_ok = 1, > }; > > +void tcp_v4_early_demux_configure(int enable) > +{ > + if (enable) > + tcp_protocol.early_demux = tcp_v4_early_demux; > + else > + tcp_protocol.early_demux = NULL; > +} > + > +void udp_v4_early_demux_configure(int enable) > +{ > + if (enable) > + udp_protocol.early_demux = udp_v4_early_demux; > + else > + udp_protocol.early_demux = NULL; > +} > + > static const struct net_protocol icmp_protocol = { > .handler = icmp_rcv, > .err_handler = icmp_err, > @@ -1700,6 +1716,8 @@ static __net_init int inet_init_net(struct net *net) > net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; > net->ipv4.sysctl_ip_dynaddr = 0; > net->ipv4.sysctl_ip_early_demux = 1; > + net->ipv4.sysctl_udp_early_demux = 1; > + net->ipv4.sysctl_tcp_early_demux = 1; > > return 0; > } > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c > index b2fa498..c61383b 100644 > --- a/net/ipv4/sysctl_net_ipv4.c > +++ b/net/ipv4/sysctl_net_ipv4.c > @@ -253,6 +253,39 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, > return ret; > } > > +static int proc_tcp_early_demux(struct ctl_table *table, int write, > + void __user *buffer, size_t *lenp, loff_t *ppos) > +{ > + int ret = 0; > + > + ret = proc_dointvec(table, write, buffer, lenp, ppos); > + > + if (write && !ret) { > + int enabled = init_net.ipv4.sysctl_tcp_early_demux; > + > + tcp_v4_early_demux_configure(enabled); > + tcp_v6_early_demux_configure(enabled); > + } > + > + return ret; > +} > + > +static int proc_udp_early_demux(struct ctl_table *table, int write, > + void __user *buffer, size_t *lenp, loff_t *ppos) > +{ > + int ret = 0; > + > + ret = proc_dointvec(table, write, buffer, lenp, ppos); > + > + if (write && !ret) { > + int enabled = init_net.ipv4.sysctl_udp_early_demux; > + > + udp_v4_early_demux_configure(enabled); > + } > + > + return ret; > +} > + > static struct ctl_table ipv4_table[] = { > { > .procname = "tcp_timestamps", > @@ -737,6 +770,20 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, > .proc_handler = proc_dointvec > }, > { > + .procname = "udp_early_demux", > + .data = &init_net.ipv4.sysctl_udp_early_demux, > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = proc_udp_early_demux > + }, > + { > + .procname = "tcp_early_demux", > + .data = &init_net.ipv4.sysctl_tcp_early_demux, > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = proc_tcp_early_demux > + }, > + { > .procname = "ip_default_ttl", > .data = &init_net.ipv4.sysctl_ip_default_ttl, > .maxlen = sizeof(int), > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c > index 4c60c6f..0dd761c 100644 > --- a/net/ipv6/tcp_ipv6.c > +++ b/net/ipv6/tcp_ipv6.c > @@ -1926,13 +1926,21 @@ struct proto tcpv6_prot = { > .diag_destroy = tcp_abort, > }; > > -static const struct inet6_protocol tcpv6_protocol = { > +static struct inet6_protocol tcpv6_protocol = { > .early_demux = tcp_v6_early_demux, > .handler = tcp_v6_rcv, > .err_handler = tcp_v6_err, > .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, > }; > > +void tcp_v6_early_demux_configure(int enable) > +{ > + if (enable) > + tcpv6_protocol.early_demux = tcp_v6_early_demux; > + else > + tcpv6_protocol.early_demux = NULL; > +} > + > static struct inet_protosw tcpv6_protosw = { > .type = SOCK_STREAM, > .protocol = IPPROTO_TCP, > -- > 1.9.1 >
On Thu, 2017-03-09 at 20:31 -0700, Subash Abhinov Kasiviswanathan wrote: > Certain system process significant unconnected UDP workload. > It would be preferrable to disable UDP early demux for those systems > and enable it for TCP only. > +void tcp_v4_early_demux_configure(int enable) > +{ > + if (enable) > + tcp_protocol.early_demux = tcp_v4_early_demux; > + else > + tcp_protocol.early_demux = NULL; > +} > + > +void udp_v4_early_demux_configure(int enable) > +{ > + if (enable) > + udp_protocol.early_demux = udp_v4_early_demux; > + else > + udp_protocol.early_demux = NULL; > +} > + Well, then you need to make sure ipprot->early_demux is read once in the callers, like ip_rcv_finish(), otherwise compiler could read it twice and we could deref a NULL pointer. ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->early_demux) { ipprot->early_demux(skb); // crash
On 2017-03-09 20:42, Tom Herbert wrote: > On Thu, Mar 9, 2017 at 7:31 PM, Subash Abhinov Kasiviswanathan > <subashab@codeaurora.org> wrote: >> Certain system process significant unconnected UDP workload. >> It would be preferrable to disable UDP early demux for those systems >> and enable it for TCP only. >> > Presumably you want this for performance reasons. Can you provide some > before and after numbers? Hi Tom We are working on UDPv6 performance issues seen on an Android ARM64 system. Adding an early demux handler (link below) for it helped to increase performance (800Mbps -> 870Mbps). This helps because Android statistics rules do multiple socket lookup when no socket is associated with the skb. https://www.mail-archive.com/netdev@vger.kernel.org/msg157003.html Eric mentioned that server loads usually see more unconnected load and he preferred to turn off early demux for UDP, hence this patch. I don't have numbers for unconnected loads as of now though. -- Qualcomm Innovation Center, Inc. The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project
> Well, then you need to make sure ipprot->early_demux is read once in > the > callers, like ip_rcv_finish(), otherwise compiler could read it twice > and we could deref a NULL pointer. > > ipprot = rcu_dereference(inet_protos[protocol]); > if (ipprot && ipprot->early_demux) { > ipprot->early_demux(skb); // crash Thanks Eric. I'll update this. -- Qualcomm Innovation Center, Inc. The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project
Hi Subash, [auto build test ERROR on v4.9-rc8] [cannot apply to net-next/master net/master next-20170309] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Subash-Abhinov-Kasiviswanathan/net-Add-sysctl-to-toggle-early-demux-for-tcp-and-udp/20170310-184912 config: blackfin-BF561-EZKIT-SMP_defconfig (attached as .config) compiler: bfin-uclinux-gcc (GCC) 6.2.0 reproduce: wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree make.cross ARCH=blackfin All errors (new ones prefixed by >>): net/built-in.o: In function `proc_tcp_early_demux': >> net/ipv4/sysctl_net_ipv4.c:267: undefined reference to `tcp_v6_early_demux_configure' vim +267 net/ipv4/sysctl_net_ipv4.c 261 ret = proc_dointvec(table, write, buffer, lenp, ppos); 262 263 if (write && !ret) { 264 int enabled = init_net.ipv4.sysctl_tcp_early_demux; 265 266 tcp_v4_early_demux_configure(enabled); > 267 tcp_v6_early_demux_configure(enabled); 268 } 269 270 return ret; --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
Hi Subash,
[auto build test ERROR on v4.9-rc8]
[cannot apply to net-next/master net/master next-20170309]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Subash-Abhinov-Kasiviswanathan/net-Add-sysctl-to-toggle-early-demux-for-tcp-and-udp/20170310-184912
config: xtensa-common_defconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 4.9.0
reproduce:
wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=xtensa
All errors (new ones prefixed by >>):
net/built-in.o: In function `genl_unregister_family':
>> (.text+0x3a17c): undefined reference to `tcp_v6_early_demux_configure'
net/built-in.o: In function `proc_tcp_early_demux':
sysctl_net_ipv4.c:(.text+0x7046e): undefined reference to `tcp_v6_early_demux_configure'
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
On Thu, Mar 9, 2017 at 9:26 PM, Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> wrote: > On 2017-03-09 20:42, Tom Herbert wrote: >> >> On Thu, Mar 9, 2017 at 7:31 PM, Subash Abhinov Kasiviswanathan >> <subashab@codeaurora.org> wrote: >>> >>> Certain system process significant unconnected UDP workload. >>> It would be preferrable to disable UDP early demux for those systems >>> and enable it for TCP only. >>> >> Presumably you want this for performance reasons. Can you provide some >> before and after numbers? > > > Hi Tom > > We are working on UDPv6 performance issues seen on an Android ARM64 system. > Adding an early demux handler (link below) for it helped to increase > performance > (800Mbps -> 870Mbps). This helps because Android statistics rules do > multiple > socket lookup when no socket is associated with the skb. > > https://www.mail-archive.com/netdev@vger.kernel.org/msg157003.html > > Eric mentioned that server loads usually see more unconnected load and he > preferred to turn off early demux for UDP, hence this patch. I don't have > numbers > for unconnected loads as of now though. > Subash, Okay, now I'm confused. You're saying that when early demux was added for IPv6 performance improved, but this patch is allowing early demux to be disabled on the basis that it hurts performance for unconnected UDP workloads. While it's true that early demux in the case results in another UDP lookup, Eric's changes to make it lockless have made that lookup very cheap. So we really need numbers to justify this patch. Even if the numbers were to show a benefit, we still have the problem that this creates a bimodal performance characteristic, e.g. what if the work load were 1/2 connected and 1/2 unconnected in real life, or what it the user incorrectly guesses the actual workload. Maybe a deeper solution to investigate is making early demux work with unconnected sockets. Tom > -- > Qualcomm Innovation Center, Inc. > The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a > Linux Foundation Collaborative Project
On Fri, 2017-03-10 at 08:33 -0800, Tom Herbert wrote: > Okay, now I'm confused. You're saying that when early demux was added > for IPv6 performance improved, but this patch is allowing early demux > to be disabled on the basis that it hurts performance for unconnected > UDP workloads. While it's true that early demux in the case results in > another UDP lookup, Eric's changes to make it lockless have made that > lookup very cheap. So we really need numbers to justify this patch. > Fact that the lookup is lockless does not avoid a cache line miss. Early demux computes a hash based on the 4-tuple, and lookups a hash table with does not fit in cpu caches. A cache line miss per packet is expensive, when handling millions of UDP packets per second, (with millions of 4-tuples) > Even if the numbers were to show a benefit, we still have the problem > that this creates a bimodal performance characteristic, e.g. what if > the work load were 1/2 connected and 1/2 unconnected in real life, or > what it the user incorrectly guesses the actual workload. Maybe a > deeper solution to investigate is making early demux work with > unconnected sockets. Sure, but forcing all UDP applications to perform IP early demux is not better.
On Fri, Mar 10, 2017 at 4:22 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > On Fri, 2017-03-10 at 08:33 -0800, Tom Herbert wrote: > >> Okay, now I'm confused. You're saying that when early demux was added >> for IPv6 performance improved, but this patch is allowing early demux >> to be disabled on the basis that it hurts performance for unconnected >> UDP workloads. While it's true that early demux in the case results in >> another UDP lookup, Eric's changes to make it lockless have made that >> lookup very cheap. So we really need numbers to justify this patch. >> > > Fact that the lookup is lockless does not avoid a cache line miss. > > Early demux computes a hash based on the 4-tuple, and lookups a hash > table with does not fit in cpu caches. > > A cache line miss per packet is expensive, when handling millions of UDP > packets per second, (with millions of 4-tuples) > >> Even if the numbers were to show a benefit, we still have the problem >> that this creates a bimodal performance characteristic, e.g. what if >> the work load were 1/2 connected and 1/2 unconnected in real life, or >> what it the user incorrectly guesses the actual workload. Maybe a >> deeper solution to investigate is making early demux work with >> unconnected sockets. > > Sure, but forcing all UDP applications to perform IP early demux is not > better. > All these hypotheses are quite testable, and it should be obvious that if a patch is supposed to improve performance there should be some effort to quantify the impact.
> All these hypotheses are quite testable, and it should be obvious that > if a patch is supposed to improve performance there should be some > effort to quantify the impact. Hi Tom I ran a single stream unconnected UDPv4 test on an ARM64 system and I see a slight increase in performance (782 -> 788Mbps) with udp demux disabled. A rudimentary app which does a sendto/recvfrom along with some statistics collection was used for this. -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project
On Sat, Mar 18, 2017 at 10:32 AM, Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> wrote: >> All these hypotheses are quite testable, and it should be obvious that >> if a patch is supposed to improve performance there should be some >> effort to quantify the impact. > > > Hi Tom > > I ran a single stream unconnected UDPv4 test on an ARM64 system and I see > a slight increase in performance (782 -> 788Mbps) with udp demux disabled. > A rudimentary app which does a sendto/recvfrom along with some > statistics collection was used for this. > Less than 1% performance improvement in a benchmark doesn't justify the complexity of the patch. Eric's hypothesis was that an unconnected UDP socket may show issues because of cache misses in look-ups due to so many different sources. This should be fairly easy to benchmark by randomly setting source address in your test (IP any and routing my need to be set appropriately). > -- > The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, > > a Linux Foundation Collaborative Project
> Less than 1% performance improvement in a benchmark doesn't justify > the complexity of the patch. Eric's hypothesis was that an unconnected > UDP socket may show issues because of cache misses in look-ups due to > so many different sources. This should be fairly easy to benchmark by > randomly setting source address in your test (IP any and routing my > need to be set appropriately). > With different source addresses, a larger increase is seen here (633->654Mbps). -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project
On Sat, 2017-03-18 at 20:07 -0600, Subash Abhinov Kasiviswanathan wrote: > > Less than 1% performance improvement in a benchmark doesn't justify > > the complexity of the patch. Eric's hypothesis was that an unconnected > > UDP socket may show issues because of cache misses in look-ups due to > > so many different sources. This should be fairly easy to benchmark by > > randomly setting source address in your test (IP any and routing my > > need to be set appropriately). > > > > With different source addresses, a larger increase is seen here > (633->654Mbps). Yes, also note the extra cache line might be not noticed if the cpu can keep in its caches the whole UDP hash table (1 MB on 64bit kernels for the portion touched in IP early demux) So the exact slowdown depends on CPU generation / cache sizes.
On Sat, Mar 18, 2017 at 7:07 PM, Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> wrote: >> Less than 1% performance improvement in a benchmark doesn't justify >> the complexity of the patch. Eric's hypothesis was that an unconnected >> UDP socket may show issues because of cache misses in look-ups due to >> so many different sources. This should be fairly easy to benchmark by >> randomly setting source address in your test (IP any and routing my >> need to be set appropriately). >> > > With different source addresses, a larger increase is seen here > (633->654Mbps). > Thanks for running the tests. It's obviously not a huge win at least relative to performance improvement we got from early demux, but I suppose with very specific and engineered loads this might have value. Please include this is next patch sets. Generally, I think a good goal moving forward would be a to apply the 0 or 1 times rule for connection lookup. That is for any transport tuple in a receive packet we want to do at most one connection lookup. So early demux would need to apply to unconnected sockets and then we wouldn't have to do the second lookup in UDP (or TCP for a SYN) receive (note we also do an extra lookup for GRO with UDP encapsulation). A reason we haven't this before might be that the lookup may actually find the wrong socket (for example we go into a different network namespace). Maybe the stack should consider any lookup result outside of the protocol stack to be provisional (and it would be super nice if we could somehow cache a dst with an unconnected socket also ;-) ) Tom > > -- > The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, > a Linux Foundation Collaborative Project
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 0378e88..1e74da23 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -86,6 +86,8 @@ struct netns_ipv4 { /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; int sysctl_ip_early_demux; + int sysctl_tcp_early_demux; + int sysctl_udp_early_demux; int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6061963..3b6446d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1953,4 +1953,6 @@ static inline void tcp_listendrop(const struct sock *sk) __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } +void tcp_v4_early_demux_configure(int enable); +void tcp_v6_early_demux_configure(int enable); #endif /* _TCP_H */ diff --git a/include/net/udp.h b/include/net/udp.h index 1661791..7de31d5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -373,4 +373,6 @@ struct udp_iter_state { #if IS_ENABLED(CONFIG_IPV6) void udpv6_encap_enable(void); #endif + +void udp_v4_early_demux_configure(int enable); #endif /* _UDP_H */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f750698..3e11d74 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1579,7 +1579,7 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) }; #endif -static const struct net_protocol tcp_protocol = { +static struct net_protocol tcp_protocol = { .early_demux = tcp_v4_early_demux, .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, @@ -1588,7 +1588,7 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) .icmp_strict_tag_validation = 1, }; -static const struct net_protocol udp_protocol = { +static struct net_protocol udp_protocol = { .early_demux = udp_v4_early_demux, .handler = udp_rcv, .err_handler = udp_err, @@ -1596,6 +1596,22 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) .netns_ok = 1, }; +void tcp_v4_early_demux_configure(int enable) +{ + if (enable) + tcp_protocol.early_demux = tcp_v4_early_demux; + else + tcp_protocol.early_demux = NULL; +} + +void udp_v4_early_demux_configure(int enable) +{ + if (enable) + udp_protocol.early_demux = udp_v4_early_demux; + else + udp_protocol.early_demux = NULL; +} + static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, @@ -1700,6 +1716,8 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; + net->ipv4.sysctl_udp_early_demux = 1; + net->ipv4.sysctl_tcp_early_demux = 1; return 0; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b2fa498..c61383b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -253,6 +253,39 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return ret; } +static int proc_tcp_early_demux(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (write && !ret) { + int enabled = init_net.ipv4.sysctl_tcp_early_demux; + + tcp_v4_early_demux_configure(enabled); + tcp_v6_early_demux_configure(enabled); + } + + return ret; +} + +static int proc_udp_early_demux(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (write && !ret) { + int enabled = init_net.ipv4.sysctl_udp_early_demux; + + udp_v4_early_demux_configure(enabled); + } + + return ret; +} + static struct ctl_table ipv4_table[] = { { .procname = "tcp_timestamps", @@ -737,6 +770,20 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, .proc_handler = proc_dointvec }, { + .procname = "udp_early_demux", + .data = &init_net.ipv4.sysctl_udp_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_udp_early_demux + }, + { + .procname = "tcp_early_demux", + .data = &init_net.ipv4.sysctl_tcp_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tcp_early_demux + }, + { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, .maxlen = sizeof(int), diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4c60c6f..0dd761c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1926,13 +1926,21 @@ struct proto tcpv6_prot = { .diag_destroy = tcp_abort, }; -static const struct inet6_protocol tcpv6_protocol = { +static struct inet6_protocol tcpv6_protocol = { .early_demux = tcp_v6_early_demux, .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; +void tcp_v6_early_demux_configure(int enable) +{ + if (enable) + tcpv6_protocol.early_demux = tcp_v6_early_demux; + else + tcpv6_protocol.early_demux = NULL; +} + static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_TCP,
Certain system process significant unconnected UDP workload. It would be preferrable to disable UDP early demux for those systems and enable it for TCP only. v1->v2: Change function pointer instead of adding conditional as suggested by Stephen. Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> Suggested-by: Eric Dumazet <edumazet@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 2 ++ include/net/udp.h | 2 ++ net/ipv4/af_inet.c | 22 ++++++++++++++++++++-- net/ipv4/sysctl_net_ipv4.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 10 +++++++++- 6 files changed, 82 insertions(+), 3 deletions(-)