diff mbox

[net-next,v2] net: Add sysctl to toggle early demux for tcp and udp

Message ID 1489116660-4244-1-git-send-email-subashab@codeaurora.org
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Subash Abhinov Kasiviswanathan March 10, 2017, 3:31 a.m. UTC
Certain system process significant unconnected UDP workload.
It would be preferrable to disable UDP early demux for those systems
and enable it for TCP only.

v1->v2: Change function pointer instead of adding conditional as
suggested by Stephen.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Suggested-by: Eric Dumazet <edumazet@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
---
 include/net/netns/ipv4.h   |  2 ++
 include/net/tcp.h          |  2 ++
 include/net/udp.h          |  2 ++
 net/ipv4/af_inet.c         | 22 ++++++++++++++++++++--
 net/ipv4/sysctl_net_ipv4.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/tcp_ipv6.c        | 10 +++++++++-
 6 files changed, 82 insertions(+), 3 deletions(-)

Comments

Tom Herbert March 10, 2017, 3:42 a.m. UTC | #1
On Thu, Mar 9, 2017 at 7:31 PM, Subash Abhinov Kasiviswanathan
<subashab@codeaurora.org> wrote:
> Certain system process significant unconnected UDP workload.
> It would be preferrable to disable UDP early demux for those systems
> and enable it for TCP only.
>
Presumably you want this for performance reasons. Can you provide some
before and after numbers?

> v1->v2: Change function pointer instead of adding conditional as
> suggested by Stephen.
>
> Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
> Suggested-by: Eric Dumazet <edumazet@google.com>
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> ---
>  include/net/netns/ipv4.h   |  2 ++
>  include/net/tcp.h          |  2 ++
>  include/net/udp.h          |  2 ++
>  net/ipv4/af_inet.c         | 22 ++++++++++++++++++++--
>  net/ipv4/sysctl_net_ipv4.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++
>  net/ipv6/tcp_ipv6.c        | 10 +++++++++-
>  6 files changed, 82 insertions(+), 3 deletions(-)
>
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index 0378e88..1e74da23 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -86,6 +86,8 @@ struct netns_ipv4 {
>         /* Shall we try to damage output packets if routing dev changes? */
>         int sysctl_ip_dynaddr;
>         int sysctl_ip_early_demux;
> +       int sysctl_tcp_early_demux;
> +       int sysctl_udp_early_demux;
>
>         int sysctl_fwmark_reflect;
>         int sysctl_tcp_fwmark_accept;
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 6061963..3b6446d 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1953,4 +1953,6 @@ static inline void tcp_listendrop(const struct sock *sk)
>         __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
>  }
>
> +void tcp_v4_early_demux_configure(int enable);
> +void tcp_v6_early_demux_configure(int enable);
>  #endif /* _TCP_H */
> diff --git a/include/net/udp.h b/include/net/udp.h
> index 1661791..7de31d5 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -373,4 +373,6 @@ struct udp_iter_state {
>  #if IS_ENABLED(CONFIG_IPV6)
>  void udpv6_encap_enable(void);
>  #endif
> +
> +void udp_v4_early_demux_configure(int enable);
>  #endif /* _UDP_H */
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index f750698..3e11d74 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1579,7 +1579,7 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
>  };
>  #endif
>
> -static const struct net_protocol tcp_protocol = {
> +static struct net_protocol tcp_protocol = {
>         .early_demux    =       tcp_v4_early_demux,
>         .handler        =       tcp_v4_rcv,
>         .err_handler    =       tcp_v4_err,
> @@ -1588,7 +1588,7 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
>         .icmp_strict_tag_validation = 1,
>  };
>
> -static const struct net_protocol udp_protocol = {
> +static struct net_protocol udp_protocol = {
>         .early_demux =  udp_v4_early_demux,
>         .handler =      udp_rcv,
>         .err_handler =  udp_err,
> @@ -1596,6 +1596,22 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
>         .netns_ok =     1,
>  };
>
> +void tcp_v4_early_demux_configure(int enable)
> +{
> +       if (enable)
> +               tcp_protocol.early_demux = tcp_v4_early_demux;
> +       else
> +               tcp_protocol.early_demux = NULL;
> +}
> +
> +void udp_v4_early_demux_configure(int enable)
> +{
> +       if (enable)
> +               udp_protocol.early_demux = udp_v4_early_demux;
> +       else
> +               udp_protocol.early_demux = NULL;
> +}
> +
>  static const struct net_protocol icmp_protocol = {
>         .handler =      icmp_rcv,
>         .err_handler =  icmp_err,
> @@ -1700,6 +1716,8 @@ static __net_init int inet_init_net(struct net *net)
>         net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
>         net->ipv4.sysctl_ip_dynaddr = 0;
>         net->ipv4.sysctl_ip_early_demux = 1;
> +       net->ipv4.sysctl_udp_early_demux = 1;
> +       net->ipv4.sysctl_tcp_early_demux = 1;
>
>         return 0;
>  }
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index b2fa498..c61383b 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -253,6 +253,39 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
>         return ret;
>  }
>
> +static int proc_tcp_early_demux(struct ctl_table *table, int write,
> +                               void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +       int ret = 0;
> +
> +       ret = proc_dointvec(table, write, buffer, lenp, ppos);
> +
> +       if (write && !ret) {
> +               int enabled = init_net.ipv4.sysctl_tcp_early_demux;
> +
> +               tcp_v4_early_demux_configure(enabled);
> +               tcp_v6_early_demux_configure(enabled);
> +       }
> +
> +       return ret;
> +}
> +
> +static int proc_udp_early_demux(struct ctl_table *table, int write,
> +                               void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +       int ret = 0;
> +
> +       ret = proc_dointvec(table, write, buffer, lenp, ppos);
> +
> +       if (write && !ret) {
> +               int enabled = init_net.ipv4.sysctl_udp_early_demux;
> +
> +               udp_v4_early_demux_configure(enabled);
> +       }
> +
> +       return ret;
> +}
> +
>  static struct ctl_table ipv4_table[] = {
>         {
>                 .procname       = "tcp_timestamps",
> @@ -737,6 +770,20 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
>                 .proc_handler   = proc_dointvec
>         },
>         {
> +               .procname       = "udp_early_demux",
> +               .data           = &init_net.ipv4.sysctl_udp_early_demux,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = proc_udp_early_demux
> +       },
> +       {
> +               .procname       = "tcp_early_demux",
> +               .data           = &init_net.ipv4.sysctl_tcp_early_demux,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = proc_tcp_early_demux
> +       },
> +       {
>                 .procname       = "ip_default_ttl",
>                 .data           = &init_net.ipv4.sysctl_ip_default_ttl,
>                 .maxlen         = sizeof(int),
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 4c60c6f..0dd761c 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1926,13 +1926,21 @@ struct proto tcpv6_prot = {
>         .diag_destroy           = tcp_abort,
>  };
>
> -static const struct inet6_protocol tcpv6_protocol = {
> +static struct inet6_protocol tcpv6_protocol = {
>         .early_demux    =       tcp_v6_early_demux,
>         .handler        =       tcp_v6_rcv,
>         .err_handler    =       tcp_v6_err,
>         .flags          =       INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
>  };
>
> +void tcp_v6_early_demux_configure(int enable)
> +{
> +       if (enable)
> +               tcpv6_protocol.early_demux = tcp_v6_early_demux;
> +       else
> +               tcpv6_protocol.early_demux = NULL;
> +}
> +
>  static struct inet_protosw tcpv6_protosw = {
>         .type           =       SOCK_STREAM,
>         .protocol       =       IPPROTO_TCP,
> --
> 1.9.1
>
Eric Dumazet March 10, 2017, 4:25 a.m. UTC | #2
On Thu, 2017-03-09 at 20:31 -0700, Subash Abhinov Kasiviswanathan wrote:
> Certain system process significant unconnected UDP workload.
> It would be preferrable to disable UDP early demux for those systems
> and enable it for TCP only.

> +void tcp_v4_early_demux_configure(int enable)
> +{
> +	if (enable)
> +		tcp_protocol.early_demux = tcp_v4_early_demux;
> +	else
> +		tcp_protocol.early_demux = NULL;
> +}
> +
> +void udp_v4_early_demux_configure(int enable)
> +{
> +	if (enable)
> +		udp_protocol.early_demux = udp_v4_early_demux;
> +	else
> +		udp_protocol.early_demux = NULL;
> +}
> +


Well, then you need to make sure ipprot->early_demux is read once in the
callers, like ip_rcv_finish(), otherwise compiler could read it twice
and we could deref a NULL pointer.

ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->early_demux) {
    ipprot->early_demux(skb);   // crash
Subash Abhinov Kasiviswanathan March 10, 2017, 5:26 a.m. UTC | #3
On 2017-03-09 20:42, Tom Herbert wrote:
> On Thu, Mar 9, 2017 at 7:31 PM, Subash Abhinov Kasiviswanathan
> <subashab@codeaurora.org> wrote:
>> Certain system process significant unconnected UDP workload.
>> It would be preferrable to disable UDP early demux for those systems
>> and enable it for TCP only.
>> 
> Presumably you want this for performance reasons. Can you provide some
> before and after numbers?

Hi Tom

We are working on UDPv6 performance issues seen on an Android ARM64 
system.
Adding an early demux handler (link below) for it helped to increase 
performance
(800Mbps -> 870Mbps). This helps because Android statistics rules do 
multiple
socket lookup when no socket is associated with the skb.

https://www.mail-archive.com/netdev@vger.kernel.org/msg157003.html

Eric mentioned that server loads usually see more unconnected load and 
he
preferred to turn off early demux for UDP, hence this patch. I don't 
have numbers
for unconnected loads as of now though.

--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a 
Linux Foundation Collaborative Project
Subash Abhinov Kasiviswanathan March 10, 2017, 7:34 a.m. UTC | #4
> Well, then you need to make sure ipprot->early_demux is read once in 
> the
> callers, like ip_rcv_finish(), otherwise compiler could read it twice
> and we could deref a NULL pointer.
> 
> ipprot = rcu_dereference(inet_protos[protocol]);
> if (ipprot && ipprot->early_demux) {
>     ipprot->early_demux(skb);   // crash

Thanks Eric. I'll update this.

--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a 
Linux Foundation Collaborative Project
kernel test robot March 10, 2017, 12:42 p.m. UTC | #5
Hi Subash,

[auto build test ERROR on v4.9-rc8]
[cannot apply to net-next/master net/master next-20170309]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Subash-Abhinov-Kasiviswanathan/net-Add-sysctl-to-toggle-early-demux-for-tcp-and-udp/20170310-184912
config: blackfin-BF561-EZKIT-SMP_defconfig (attached as .config)
compiler: bfin-uclinux-gcc (GCC) 6.2.0
reproduce:
        wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=blackfin 

All errors (new ones prefixed by >>):

   net/built-in.o: In function `proc_tcp_early_demux':
>> net/ipv4/sysctl_net_ipv4.c:267: undefined reference to `tcp_v6_early_demux_configure'

vim +267 net/ipv4/sysctl_net_ipv4.c

   261		ret = proc_dointvec(table, write, buffer, lenp, ppos);
   262	
   263		if (write && !ret) {
   264			int enabled = init_net.ipv4.sysctl_tcp_early_demux;
   265	
   266			tcp_v4_early_demux_configure(enabled);
 > 267			tcp_v6_early_demux_configure(enabled);
   268		}
   269	
   270		return ret;

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot March 10, 2017, 12:44 p.m. UTC | #6
Hi Subash,

[auto build test ERROR on v4.9-rc8]
[cannot apply to net-next/master net/master next-20170309]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Subash-Abhinov-Kasiviswanathan/net-Add-sysctl-to-toggle-early-demux-for-tcp-and-udp/20170310-184912
config: xtensa-common_defconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 4.9.0
reproduce:
        wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

   net/built-in.o: In function `genl_unregister_family':
>> (.text+0x3a17c): undefined reference to `tcp_v6_early_demux_configure'
   net/built-in.o: In function `proc_tcp_early_demux':
   sysctl_net_ipv4.c:(.text+0x7046e): undefined reference to `tcp_v6_early_demux_configure'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Tom Herbert March 10, 2017, 4:33 p.m. UTC | #7
On Thu, Mar 9, 2017 at 9:26 PM, Subash Abhinov Kasiviswanathan
<subashab@codeaurora.org> wrote:
> On 2017-03-09 20:42, Tom Herbert wrote:
>>
>> On Thu, Mar 9, 2017 at 7:31 PM, Subash Abhinov Kasiviswanathan
>> <subashab@codeaurora.org> wrote:
>>>
>>> Certain system process significant unconnected UDP workload.
>>> It would be preferrable to disable UDP early demux for those systems
>>> and enable it for TCP only.
>>>
>> Presumably you want this for performance reasons. Can you provide some
>> before and after numbers?
>
>
> Hi Tom
>
> We are working on UDPv6 performance issues seen on an Android ARM64 system.
> Adding an early demux handler (link below) for it helped to increase
> performance
> (800Mbps -> 870Mbps). This helps because Android statistics rules do
> multiple
> socket lookup when no socket is associated with the skb.
>
> https://www.mail-archive.com/netdev@vger.kernel.org/msg157003.html
>
> Eric mentioned that server loads usually see more unconnected load and he
> preferred to turn off early demux for UDP, hence this patch. I don't have
> numbers
> for unconnected loads as of now though.
>
Subash,

Okay, now I'm confused. You're saying that when early demux was added
for IPv6 performance improved, but this patch is allowing early demux
to be disabled on the basis that it hurts performance for unconnected
UDP workloads. While it's true that early demux in the case results in
another UDP lookup, Eric's changes to make it lockless have made that
lookup very cheap. So we really need numbers to justify this patch.

Even if the numbers were to show a benefit, we still have the problem
that this creates a bimodal performance characteristic, e.g. what if
the work load were 1/2 connected and 1/2 unconnected in real life, or
what it the user incorrectly guesses the actual workload. Maybe a
deeper solution to investigate is making early demux work with
unconnected sockets.

Tom

> --
> Qualcomm Innovation Center, Inc.
> The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a
> Linux Foundation Collaborative Project
Eric Dumazet March 11, 2017, 12:22 a.m. UTC | #8
On Fri, 2017-03-10 at 08:33 -0800, Tom Herbert wrote:

> Okay, now I'm confused. You're saying that when early demux was added
> for IPv6 performance improved, but this patch is allowing early demux
> to be disabled on the basis that it hurts performance for unconnected
> UDP workloads. While it's true that early demux in the case results in
> another UDP lookup, Eric's changes to make it lockless have made that
> lookup very cheap. So we really need numbers to justify this patch.
> 

Fact that the lookup is lockless does not avoid a cache line miss.

Early demux computes a hash based on the 4-tuple, and lookups a hash
table with does not fit in cpu caches.

A cache line miss per packet is expensive, when handling millions of UDP
packets per second, (with millions of 4-tuples)

> Even if the numbers were to show a benefit, we still have the problem
> that this creates a bimodal performance characteristic, e.g. what if
> the work load were 1/2 connected and 1/2 unconnected in real life, or
> what it the user incorrectly guesses the actual workload. Maybe a
> deeper solution to investigate is making early demux work with
> unconnected sockets.

Sure, but forcing all UDP applications to perform IP early demux is not
better.
Tom Herbert March 11, 2017, 12:49 a.m. UTC | #9
On Fri, Mar 10, 2017 at 4:22 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2017-03-10 at 08:33 -0800, Tom Herbert wrote:
>
>> Okay, now I'm confused. You're saying that when early demux was added
>> for IPv6 performance improved, but this patch is allowing early demux
>> to be disabled on the basis that it hurts performance for unconnected
>> UDP workloads. While it's true that early demux in the case results in
>> another UDP lookup, Eric's changes to make it lockless have made that
>> lookup very cheap. So we really need numbers to justify this patch.
>>
>
> Fact that the lookup is lockless does not avoid a cache line miss.
>
> Early demux computes a hash based on the 4-tuple, and lookups a hash
> table with does not fit in cpu caches.
>
> A cache line miss per packet is expensive, when handling millions of UDP
> packets per second, (with millions of 4-tuples)
>
>> Even if the numbers were to show a benefit, we still have the problem
>> that this creates a bimodal performance characteristic, e.g. what if
>> the work load were 1/2 connected and 1/2 unconnected in real life, or
>> what it the user incorrectly guesses the actual workload. Maybe a
>> deeper solution to investigate is making early demux work with
>> unconnected sockets.
>
> Sure, but forcing all UDP applications to perform IP early demux is not
> better.
>
All these hypotheses are quite testable, and it should be obvious that
if a patch is supposed to improve performance there should be some
effort to quantify the impact.
Subash Abhinov Kasiviswanathan March 18, 2017, 5:32 p.m. UTC | #10
> All these hypotheses are quite testable, and it should be obvious that
> if a patch is supposed to improve performance there should be some
> effort to quantify the impact.

Hi Tom

I ran a single stream unconnected UDPv4 test on an ARM64 system and I 
see
a slight increase in performance (782 -> 788Mbps) with udp demux 
disabled.
A rudimentary app which does a sendto/recvfrom along with some
statistics collection was used for this.

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora 
Forum,
a Linux Foundation Collaborative Project
Tom Herbert March 18, 2017, 5:44 p.m. UTC | #11
On Sat, Mar 18, 2017 at 10:32 AM, Subash Abhinov Kasiviswanathan
<subashab@codeaurora.org> wrote:
>> All these hypotheses are quite testable, and it should be obvious that
>> if a patch is supposed to improve performance there should be some
>> effort to quantify the impact.
>
>
> Hi Tom
>
> I ran a single stream unconnected UDPv4 test on an ARM64 system and I see
> a slight increase in performance (782 -> 788Mbps) with udp demux disabled.
> A rudimentary app which does a sendto/recvfrom along with some
> statistics collection was used for this.
>
Less than 1% performance improvement in a benchmark doesn't justify
the complexity of the patch. Eric's hypothesis was that an unconnected
UDP socket may show issues because of cache misses in look-ups due to
so many different sources. This should be fairly easy to benchmark by
randomly setting source address in your test (IP any and routing my
need to be set appropriately).

> --
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
>
> a Linux Foundation Collaborative Project
Subash Abhinov Kasiviswanathan March 19, 2017, 2:07 a.m. UTC | #12
> Less than 1% performance improvement in a benchmark doesn't justify
> the complexity of the patch. Eric's hypothesis was that an unconnected
> UDP socket may show issues because of cache misses in look-ups due to
> so many different sources. This should be fairly easy to benchmark by
> randomly setting source address in your test (IP any and routing my
> need to be set appropriately).
> 

With different source addresses, a larger increase is seen here
(633->654Mbps).

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora 
Forum,
a Linux Foundation Collaborative Project
Eric Dumazet March 19, 2017, 7:18 p.m. UTC | #13
On Sat, 2017-03-18 at 20:07 -0600, Subash Abhinov Kasiviswanathan wrote:
> > Less than 1% performance improvement in a benchmark doesn't justify
> > the complexity of the patch. Eric's hypothesis was that an unconnected
> > UDP socket may show issues because of cache misses in look-ups due to
> > so many different sources. This should be fairly easy to benchmark by
> > randomly setting source address in your test (IP any and routing my
> > need to be set appropriately).
> > 
> 
> With different source addresses, a larger increase is seen here
> (633->654Mbps).

Yes, also note the extra cache line might be not noticed if the cpu can
keep in its caches the whole UDP hash table (1 MB on 64bit kernels for
the portion touched in IP early demux)

So the exact slowdown depends on CPU generation / cache sizes.
Tom Herbert March 21, 2017, 10:49 p.m. UTC | #14
On Sat, Mar 18, 2017 at 7:07 PM, Subash Abhinov Kasiviswanathan
<subashab@codeaurora.org> wrote:
>> Less than 1% performance improvement in a benchmark doesn't justify
>> the complexity of the patch. Eric's hypothesis was that an unconnected
>> UDP socket may show issues because of cache misses in look-ups due to
>> so many different sources. This should be fairly easy to benchmark by
>> randomly setting source address in your test (IP any and routing my
>> need to be set appropriately).
>>
>
> With different source addresses, a larger increase is seen here
> (633->654Mbps).
>
Thanks for running the tests. It's obviously not a huge win at least
relative to performance improvement we got from early demux, but I
suppose with very specific and engineered loads this might have value.
Please include this is next patch sets.

Generally, I think a good goal moving forward would be a to apply the
0 or 1 times rule for connection lookup. That is for any transport
tuple in a receive packet we want to do at most one connection lookup.
So early demux would need to apply to unconnected sockets and then we
wouldn't have to do the second lookup in UDP (or TCP for a SYN)
receive (note we also do an extra lookup for GRO with UDP
encapsulation). A reason we haven't this before might be that the
lookup may actually find the wrong socket (for example we go into a
different network namespace). Maybe the stack should consider any
lookup result outside of the protocol stack to be provisional (and it
would be super nice if we could somehow cache a dst with an
unconnected socket also ;-) )

Tom

>
> --
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
diff mbox

Patch

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 0378e88..1e74da23 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -86,6 +86,8 @@  struct netns_ipv4 {
 	/* Shall we try to damage output packets if routing dev changes? */
 	int sysctl_ip_dynaddr;
 	int sysctl_ip_early_demux;
+	int sysctl_tcp_early_demux;
+	int sysctl_udp_early_demux;
 
 	int sysctl_fwmark_reflect;
 	int sysctl_tcp_fwmark_accept;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6061963..3b6446d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1953,4 +1953,6 @@  static inline void tcp_listendrop(const struct sock *sk)
 	__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
 }
 
+void tcp_v4_early_demux_configure(int enable);
+void tcp_v6_early_demux_configure(int enable);
 #endif	/* _TCP_H */
diff --git a/include/net/udp.h b/include/net/udp.h
index 1661791..7de31d5 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -373,4 +373,6 @@  struct udp_iter_state {
 #if IS_ENABLED(CONFIG_IPV6)
 void udpv6_encap_enable(void);
 #endif
+
+void udp_v4_early_demux_configure(int enable);
 #endif	/* _UDP_H */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f750698..3e11d74 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1579,7 +1579,7 @@  u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
 };
 #endif
 
-static const struct net_protocol tcp_protocol = {
+static struct net_protocol tcp_protocol = {
 	.early_demux	=	tcp_v4_early_demux,
 	.handler	=	tcp_v4_rcv,
 	.err_handler	=	tcp_v4_err,
@@ -1588,7 +1588,7 @@  u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
 	.icmp_strict_tag_validation = 1,
 };
 
-static const struct net_protocol udp_protocol = {
+static struct net_protocol udp_protocol = {
 	.early_demux =	udp_v4_early_demux,
 	.handler =	udp_rcv,
 	.err_handler =	udp_err,
@@ -1596,6 +1596,22 @@  u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
 	.netns_ok =	1,
 };
 
+void tcp_v4_early_demux_configure(int enable)
+{
+	if (enable)
+		tcp_protocol.early_demux = tcp_v4_early_demux;
+	else
+		tcp_protocol.early_demux = NULL;
+}
+
+void udp_v4_early_demux_configure(int enable)
+{
+	if (enable)
+		udp_protocol.early_demux = udp_v4_early_demux;
+	else
+		udp_protocol.early_demux = NULL;
+}
+
 static const struct net_protocol icmp_protocol = {
 	.handler =	icmp_rcv,
 	.err_handler =	icmp_err,
@@ -1700,6 +1716,8 @@  static __net_init int inet_init_net(struct net *net)
 	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
 	net->ipv4.sysctl_ip_dynaddr = 0;
 	net->ipv4.sysctl_ip_early_demux = 1;
+	net->ipv4.sysctl_udp_early_demux = 1;
+	net->ipv4.sysctl_tcp_early_demux = 1;
 
 	return 0;
 }
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b2fa498..c61383b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -253,6 +253,39 @@  static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
 	return ret;
 }
 
+static int proc_tcp_early_demux(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (write && !ret) {
+		int enabled = init_net.ipv4.sysctl_tcp_early_demux;
+
+		tcp_v4_early_demux_configure(enabled);
+		tcp_v6_early_demux_configure(enabled);
+	}
+
+	return ret;
+}
+
+static int proc_udp_early_demux(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (write && !ret) {
+		int enabled = init_net.ipv4.sysctl_udp_early_demux;
+
+		udp_v4_early_demux_configure(enabled);
+	}
+
+	return ret;
+}
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname	= "tcp_timestamps",
@@ -737,6 +770,20 @@  static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname       = "udp_early_demux",
+		.data           = &init_net.ipv4.sysctl_udp_early_demux,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_udp_early_demux
+	},
+	{
+		.procname       = "tcp_early_demux",
+		.data           = &init_net.ipv4.sysctl_tcp_early_demux,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_tcp_early_demux
+	},
+	{
 		.procname	= "ip_default_ttl",
 		.data		= &init_net.ipv4.sysctl_ip_default_ttl,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4c60c6f..0dd761c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1926,13 +1926,21 @@  struct proto tcpv6_prot = {
 	.diag_destroy		= tcp_abort,
 };
 
-static const struct inet6_protocol tcpv6_protocol = {
+static struct inet6_protocol tcpv6_protocol = {
 	.early_demux	=	tcp_v6_early_demux,
 	.handler	=	tcp_v6_rcv,
 	.err_handler	=	tcp_v6_err,
 	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
 };
 
+void tcp_v6_early_demux_configure(int enable)
+{
+	if (enable)
+		tcpv6_protocol.early_demux = tcp_v6_early_demux;
+	else
+		tcpv6_protocol.early_demux = NULL;
+}
+
 static struct inet_protosw tcpv6_protosw = {
 	.type		=	SOCK_STREAM,
 	.protocol	=	IPPROTO_TCP,