Message ID | 1444318627-27883-2-git-send-email-edumazet@google.com |
---|---|
State | Superseded, archived |
Delegated to: | David Miller |
Headers | show |
On Thu, Oct 8, 2015 at 8:37 AM, Eric Dumazet <edumazet@google.com> wrote: > SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command > to fetch incoming cpu handling a particular TCP flow after accept() > > This commits adds setsockopt() support and extends SO_REUSEPORT selection > logic : If a TCP listener or UDP socket has this option set, a packet is > delivered to this socket only if CPU handling the packet matches the specified one. > > This allows to build very efficient TCP servers, using one thread per cpu, > as the associated TCP listener should only accept flows handled in softirq > by the same cpu. This provides optimal NUMA/SMP behavior and keep cpu caches hot. > Please look again at my SO_INCOMING_CPU_MASK patches to see it these will work. I believe SO_INCOMING_CPU setsockopt is probably a subset of that functionality. A single CPU assigned to socket forces an application design to have one thread per CPU-- this may be overkill. It's probably be sufficient in many cases to have just one listener thread per NUMA node. Thanks, Tom > Note that __inet_lookup_listener() still has to iterate over the list of > all listeners. Following patch puts sk_refcnt in a different cache line > to let this iteration hit only shared and read mostly cache lines. > > Signed-off-by: Eric Dumazet <edumazet@google.com> > --- > include/net/sock.h | 11 +++++------ > net/core/sock.c | 5 +++++ > net/ipv4/inet_hashtables.c | 5 +++++ > net/ipv4/udp.c | 12 +++++++++++- > net/ipv6/inet6_hashtables.c | 5 +++++ > net/ipv6/udp.c | 11 +++++++++++ > 6 files changed, 42 insertions(+), 7 deletions(-) > > diff --git a/include/net/sock.h b/include/net/sock.h > index dfe2eb8e1132..00f60bea983b 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -150,6 +150,7 @@ typedef __u64 __bitwise __addrpair; > * @skc_node: main hash linkage for various protocol lookup tables > * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol > * @skc_tx_queue_mapping: tx queue number for this connection > + * @skc_incoming_cpu: record/match cpu processing incoming packets > * @skc_refcnt: reference count > * > * This is the minimal network layer representation of sockets, the header > @@ -212,6 +213,8 @@ struct sock_common { > struct hlist_nulls_node skc_nulls_node; > }; > int skc_tx_queue_mapping; > + int skc_incoming_cpu; > + > atomic_t skc_refcnt; > /* private: */ > int skc_dontcopy_end[0]; > @@ -274,7 +277,7 @@ struct cg_proto; > * @sk_rcvtimeo: %SO_RCVTIMEO setting > * @sk_sndtimeo: %SO_SNDTIMEO setting > * @sk_rxhash: flow hash received from netif layer > - * @sk_incoming_cpu: record cpu processing incoming packets > + * @sk_incoming_cpu: record/match cpu processing incoming packets > * @sk_txhash: computed flow hash for use on transmit > * @sk_filter: socket filtering instructions > * @sk_timer: sock cleanup timer > @@ -331,6 +334,7 @@ struct sock { > #define sk_v6_daddr __sk_common.skc_v6_daddr > #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr > #define sk_cookie __sk_common.skc_cookie > +#define sk_incoming_cpu __sk_common.skc_incoming_cpu > > socket_lock_t sk_lock; > struct sk_buff_head sk_receive_queue; > @@ -353,11 +357,6 @@ struct sock { > #ifdef CONFIG_RPS > __u32 sk_rxhash; > #endif > - u16 sk_incoming_cpu; > - /* 16bit hole > - * Warned : sk_incoming_cpu can be set from softirq, > - * Do not use this hole without fully understanding possible issues. > - */ > > __u32 sk_txhash; > #ifdef CONFIG_NET_RX_BUSY_POLL > diff --git a/net/core/sock.c b/net/core/sock.c > index 7dd1263e4c24..1071f9380250 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -988,6 +988,10 @@ set_rcvbuf: > sk->sk_max_pacing_rate); > break; > > + case SO_INCOMING_CPU: > + sk->sk_incoming_cpu = val; > + break; > + > default: > ret = -ENOPROTOOPT; > break; > @@ -2353,6 +2357,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) > > sk->sk_max_pacing_rate = ~0U; > sk->sk_pacing_rate = ~0U; > + sk->sk_incoming_cpu = -1; > /* > * Before updating sk_refcnt, we must commit prior changes to memory > * (Documentation/RCU/rculist_nulls.txt for details) > diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c > index bed8886a4b6c..eabcfbc13afb 100644 > --- a/net/ipv4/inet_hashtables.c > +++ b/net/ipv4/inet_hashtables.c > @@ -185,6 +185,11 @@ static inline int compute_score(struct sock *sk, struct net *net, > return -1; > score += 4; > } > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > } > return score; > } > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c > index e1fc129099ea..de675b796f78 100644 > --- a/net/ipv4/udp.c > +++ b/net/ipv4/udp.c > @@ -375,7 +375,11 @@ static inline int compute_score(struct sock *sk, struct net *net, > return -1; > score += 4; > } > - > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > return score; > } > > @@ -419,6 +423,12 @@ static inline int compute_score2(struct sock *sk, struct net *net, > score += 4; > } > > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > + > return score; > } > > diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c > index 6ac8dad0138a..af3d7f826bff 100644 > --- a/net/ipv6/inet6_hashtables.c > +++ b/net/ipv6/inet6_hashtables.c > @@ -114,6 +114,11 @@ static inline int compute_score(struct sock *sk, struct net *net, > return -1; > score++; > } > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > } > return score; > } > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c > index 0aba654f5b91..222fdc780405 100644 > --- a/net/ipv6/udp.c > +++ b/net/ipv6/udp.c > @@ -182,6 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net, > score++; > } > > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > + > return score; > } > > @@ -223,6 +229,11 @@ static inline int compute_score2(struct sock *sk, struct net *net, > score++; > } > > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > return score; > } > > -- > 2.6.0.rc2.230.g3dd15c0 > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Eric,
[auto build test WARNING on net-next/master -- if it's inappropriate base, please ignore]
reproduce: make htmldocs
All warnings (new ones prefixed by >>):
include/linux/skbuff.h:833: warning: No description found for parameter 'sk'
>> include/net/sock.h:442: warning: Excess struct/union/enum/typedef member 'sk_incoming_cpu' description in 'sock'
net/core/gen_stats.c:155: warning: No description found for parameter 'cpu'
net/core/gen_estimator.c:212: warning: No description found for parameter 'cpu_bstats'
net/core/gen_estimator.c:303: warning: No description found for parameter 'cpu_bstats'
net/core/dev.c:6102: warning: No description found for parameter 'len'
include/linux/netdevice.h:1291: warning: Enum value 'IFF_XMIT_DST_RELEASE_PERM' not described in enum 'netdev_priv_flags'
include/linux/netdevice.h:1291: warning: Enum value 'IFF_IPVLAN_MASTER' not described in enum 'netdev_priv_flags'
include/linux/netdevice.h:1291: warning: Enum value 'IFF_IPVLAN_SLAVE' not described in enum 'netdev_priv_flags'
include/linux/netdevice.h:1791: warning: No description found for parameter 'ptype_all'
include/linux/netdevice.h:1791: warning: No description found for parameter 'ptype_specific'
vim +442 include/net/sock.h
^1da177e4 Linus Torvalds 2005-04-16 426 int sk_write_pending;
d5f642384 Alexey Dobriyan 2008-11-04 427 #ifdef CONFIG_SECURITY
^1da177e4 Linus Torvalds 2005-04-16 428 void *sk_security;
d5f642384 Alexey Dobriyan 2008-11-04 429 #endif
4a19ec580 Laszlo Attila Toth 2008-01-30 430 __u32 sk_mark;
e181a5430 Mathias Krause 2015-07-19 431 #ifdef CONFIG_CGROUP_NET_CLASSID
f84517253 Herbert Xu 2010-05-24 432 u32 sk_classid;
e181a5430 Mathias Krause 2015-07-19 433 #endif
e1aab161e Glauber Costa 2011-12-11 434 struct cg_proto *sk_cgrp;
^1da177e4 Linus Torvalds 2005-04-16 435 void (*sk_state_change)(struct sock *sk);
676d23690 David S. Miller 2014-04-11 436 void (*sk_data_ready)(struct sock *sk);
^1da177e4 Linus Torvalds 2005-04-16 437 void (*sk_write_space)(struct sock *sk);
^1da177e4 Linus Torvalds 2005-04-16 438 void (*sk_error_report)(struct sock *sk);
^1da177e4 Linus Torvalds 2005-04-16 439 int (*sk_backlog_rcv)(struct sock *sk,
^1da177e4 Linus Torvalds 2005-04-16 440 struct sk_buff *skb);
^1da177e4 Linus Torvalds 2005-04-16 441 void (*sk_destruct)(struct sock *sk);
^1da177e4 Linus Torvalds 2005-04-16 @442 };
^1da177e4 Linus Torvalds 2005-04-16 443
559835ea7 Pravin B Shelar 2013-09-24 444 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
559835ea7 Pravin B Shelar 2013-09-24 445
559835ea7 Pravin B Shelar 2013-09-24 446 #define rcu_dereference_sk_user_data(sk) rcu_dereference(__sk_user_data((sk)))
559835ea7 Pravin B Shelar 2013-09-24 447 #define rcu_assign_sk_user_data(sk, ptr) rcu_assign_pointer(__sk_user_data((sk)), ptr)
559835ea7 Pravin B Shelar 2013-09-24 448
4a17fd522 Pavel Emelyanov 2012-04-19 449 /*
4a17fd522 Pavel Emelyanov 2012-04-19 450 * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
:::::: The code at line 442 was first introduced by commit
:::::: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Linux-2.6.12-rc2
:::::: TO: Linus Torvalds <torvalds@ppc970.osdl.org>
:::::: CC: Linus Torvalds <torvalds@ppc970.osdl.org>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
On Thu, 2015-10-08 at 09:03 -0700, Tom Herbert wrote: > On Thu, Oct 8, 2015 at 8:37 AM, Eric Dumazet <edumazet@google.com> wrote: > > SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command > > to fetch incoming cpu handling a particular TCP flow after accept() > > > > This commits adds setsockopt() support and extends SO_REUSEPORT selection > > logic : If a TCP listener or UDP socket has this option set, a packet is > > delivered to this socket only if CPU handling the packet matches the specified one. > > > > This allows to build very efficient TCP servers, using one thread per cpu, > > as the associated TCP listener should only accept flows handled in softirq > > by the same cpu. This provides optimal NUMA/SMP behavior and keep cpu caches hot. > > > Please look again at my SO_INCOMING_CPU_MASK patches to see it these > will work. I believe SO_INCOMING_CPU setsockopt is probably a subset > of that functionality. A single CPU assigned to socket forces an > application design to have one thread per CPU-- this may be overkill. > It's probably be sufficient in many cases to have just one listener > thread per NUMA node. I think you misunderstood my patch. For optimal behavior against DDOS, you need one TCP _listener_ per RX queue on the NIC. Not one listener per cpu ! You can then have multiple threads servicing requests coming for a particular listener. Or one thread (cpu) servicing requests coming from multiple TCP listeners. It's as you said an application choice/design. Say you have 32 cpu (hyper threads) and 8 RX queues, you can create groups of 4 threads per RX queue. Or one thread servicing all accept() If you use 2 listeners (one per NUMA node), performance is not so good, and SO_INCOMING_CPU_MASK adds an extra indirection, scaling poorly with say 64 listeners. Ying Cai has a patch to add an array selection (instead of having to parse the list, yet to be rebased) I am afraid your choice of SO_INCOMING_CPU_MASK might have been driven by the necessity of keeping the list short, but this is about to be a non issue. Thanks. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Oct 8, 2015 at 9:29 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > On Thu, 2015-10-08 at 09:03 -0700, Tom Herbert wrote: >> On Thu, Oct 8, 2015 at 8:37 AM, Eric Dumazet <edumazet@google.com> wrote: >> > SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command >> > to fetch incoming cpu handling a particular TCP flow after accept() >> > >> > This commits adds setsockopt() support and extends SO_REUSEPORT selection >> > logic : If a TCP listener or UDP socket has this option set, a packet is >> > delivered to this socket only if CPU handling the packet matches the specified one. >> > >> > This allows to build very efficient TCP servers, using one thread per cpu, >> > as the associated TCP listener should only accept flows handled in softirq >> > by the same cpu. This provides optimal NUMA/SMP behavior and keep cpu caches hot. >> > >> Please look again at my SO_INCOMING_CPU_MASK patches to see it these >> will work. I believe SO_INCOMING_CPU setsockopt is probably a subset >> of that functionality. A single CPU assigned to socket forces an >> application design to have one thread per CPU-- this may be overkill. >> It's probably be sufficient in many cases to have just one listener >> thread per NUMA node. > > > I think you misunderstood my patch. > > For optimal behavior against DDOS, you need one TCP _listener_ per RX > queue on the NIC. > I see. We are not using SO_INCOMING_CPU_MASK as a defense against DDOS. It's used ensure affinity in application connection processing between CPUs. For instance, if we have two NUMA nodes we can start two instances of the application bound to each node and then use SO_REUSEPORT and SO_INCOMING_CPU_MASK to ensure connections are processed on the the same NUMA node. Packets crossing NUMA boundaries even with RFS is painful. Tom -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2015-10-08 at 09:29 -0700, Eric Dumazet wrote: > For optimal behavior against DDOS, you need one TCP _listener_ per RX > queue on the NIC. I will reword the changelog to make this clear, and fix the htmldocs warning reported by kbuild in V2. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2015-10-08 at 09:44 -0700, Tom Herbert wrote: > I see. We are not using SO_INCOMING_CPU_MASK as a defense against > DDOS. It's used ensure affinity in application connection processing > between CPUs. For instance, if we have two NUMA nodes we can start two > instances of the application bound to each node and then use > SO_REUSEPORT and SO_INCOMING_CPU_MASK to ensure connections are > processed on the the same NUMA node. Packets crossing NUMA boundaries > even with RFS is painful. Then maybe you need something simpler than a mask of cpus, like SO_INCOMING_NODE ? Note that this could also be automatically tuned. If you have a bunch of listeners on one port (could be 2), then the underlying node on which TCP socket was allocated could be used as a score modifier in compute_score() -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Oct 8, 2015 at 10:00 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > On Thu, 2015-10-08 at 09:44 -0700, Tom Herbert wrote: > >> I see. We are not using SO_INCOMING_CPU_MASK as a defense against >> DDOS. It's used ensure affinity in application connection processing >> between CPUs. For instance, if we have two NUMA nodes we can start two >> instances of the application bound to each node and then use >> SO_REUSEPORT and SO_INCOMING_CPU_MASK to ensure connections are >> processed on the the same NUMA node. Packets crossing NUMA boundaries >> even with RFS is painful. > > Then maybe you need something simpler than a mask of cpus, like > SO_INCOMING_NODE ? > > Note that this could also be automatically tuned. > > If you have a bunch of listeners on one port (could be 2), then the > underlying node on which TCP socket was allocated could be used as a > score modifier in compute_score() > Maybe, but I really like the simplicity of SO_INCOMING_CPU_MASK in that it doesn't depend on NUMA configuration or RSS configuration. It's very easy for the application to just set the same incoming CPU mask to be the same CPU mask that the listener thread is bound to. We've seen some nice performance benefits in doing with memcached, that is to run an instances bound to the set of CPUs for each set NUMA node and bind listener threads to the same set. Tom > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Oct 8, 2015 at 8:37 AM, Eric Dumazet <edumazet@google.com> wrote: > SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command > to fetch incoming cpu handling a particular TCP flow after accept() > > This commits adds setsockopt() support and extends SO_REUSEPORT selection > logic : If a TCP listener or UDP socket has this option set, a packet is > delivered to this socket only if CPU handling the packet matches the specified one. > > This allows to build very efficient TCP servers, using one thread per cpu, > as the associated TCP listener should only accept flows handled in softirq > by the same cpu. This provides optimal NUMA/SMP behavior and keep cpu caches hot. > > Note that __inet_lookup_listener() still has to iterate over the list of > all listeners. Following patch puts sk_refcnt in a different cache line > to let this iteration hit only shared and read mostly cache lines. > > Signed-off-by: Eric Dumazet <edumazet@google.com> > --- > include/net/sock.h | 11 +++++------ > net/core/sock.c | 5 +++++ > net/ipv4/inet_hashtables.c | 5 +++++ > net/ipv4/udp.c | 12 +++++++++++- > net/ipv6/inet6_hashtables.c | 5 +++++ > net/ipv6/udp.c | 11 +++++++++++ > 6 files changed, 42 insertions(+), 7 deletions(-) > > diff --git a/include/net/sock.h b/include/net/sock.h > index dfe2eb8e1132..00f60bea983b 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -150,6 +150,7 @@ typedef __u64 __bitwise __addrpair; > * @skc_node: main hash linkage for various protocol lookup tables > * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol > * @skc_tx_queue_mapping: tx queue number for this connection > + * @skc_incoming_cpu: record/match cpu processing incoming packets > * @skc_refcnt: reference count > * > * This is the minimal network layer representation of sockets, the header > @@ -212,6 +213,8 @@ struct sock_common { > struct hlist_nulls_node skc_nulls_node; > }; > int skc_tx_queue_mapping; > + int skc_incoming_cpu; > + > atomic_t skc_refcnt; > /* private: */ > int skc_dontcopy_end[0]; > @@ -274,7 +277,7 @@ struct cg_proto; > * @sk_rcvtimeo: %SO_RCVTIMEO setting > * @sk_sndtimeo: %SO_SNDTIMEO setting > * @sk_rxhash: flow hash received from netif layer > - * @sk_incoming_cpu: record cpu processing incoming packets > + * @sk_incoming_cpu: record/match cpu processing incoming packets > * @sk_txhash: computed flow hash for use on transmit > * @sk_filter: socket filtering instructions > * @sk_timer: sock cleanup timer > @@ -331,6 +334,7 @@ struct sock { > #define sk_v6_daddr __sk_common.skc_v6_daddr > #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr > #define sk_cookie __sk_common.skc_cookie > +#define sk_incoming_cpu __sk_common.skc_incoming_cpu > > socket_lock_t sk_lock; > struct sk_buff_head sk_receive_queue; > @@ -353,11 +357,6 @@ struct sock { > #ifdef CONFIG_RPS > __u32 sk_rxhash; > #endif > - u16 sk_incoming_cpu; > - /* 16bit hole > - * Warned : sk_incoming_cpu can be set from softirq, > - * Do not use this hole without fully understanding possible issues. > - */ > > __u32 sk_txhash; > #ifdef CONFIG_NET_RX_BUSY_POLL > diff --git a/net/core/sock.c b/net/core/sock.c > index 7dd1263e4c24..1071f9380250 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -988,6 +988,10 @@ set_rcvbuf: > sk->sk_max_pacing_rate); > break; > > + case SO_INCOMING_CPU: > + sk->sk_incoming_cpu = val; > + break; > + > default: > ret = -ENOPROTOOPT; > break; > @@ -2353,6 +2357,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) > > sk->sk_max_pacing_rate = ~0U; > sk->sk_pacing_rate = ~0U; > + sk->sk_incoming_cpu = -1; > /* > * Before updating sk_refcnt, we must commit prior changes to memory > * (Documentation/RCU/rculist_nulls.txt for details) > diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c > index bed8886a4b6c..eabcfbc13afb 100644 > --- a/net/ipv4/inet_hashtables.c > +++ b/net/ipv4/inet_hashtables.c > @@ -185,6 +185,11 @@ static inline int compute_score(struct sock *sk, struct net *net, > return -1; > score += 4; > } > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } If the incoming CPU is set for a connected UDP via sk_incoming_cpu_update, wouldn't this check subsequently _only_ allow packets for that socket to come from the same CPU? Also, the check seems a little austere. Why not do something like: if (sk->sk_incoming_cpu != -1) { if (sk->sk_incoming_cpu != raw_smp_processor_id()) score += 4; } My worry is that the packet steering configuration may change without the application's knowledge, so it's possible packets may come in on CPUs that the are unexpected to the application and then they would be dropped without matching a socket. I suppose that this could work with the original patch if a socket is bound to every CPU or there is at least one listener socket that is not bound to any CPU. > } > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c > index e1fc129099ea..de675b796f78 100644 > --- a/net/ipv4/udp.c > +++ b/net/ipv4/udp.c > @@ -375,7 +375,11 @@ static inline int compute_score(struct sock *sk, struct net *net, > return -1; > score += 4; > } > - > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > return score; > } > > @@ -419,6 +423,12 @@ static inline int compute_score2(struct sock *sk, struct net *net, > score += 4; > } > > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > + > return score; > } > > diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c > index 6ac8dad0138a..af3d7f826bff 100644 > --- a/net/ipv6/inet6_hashtables.c > +++ b/net/ipv6/inet6_hashtables.c > @@ -114,6 +114,11 @@ static inline int compute_score(struct sock *sk, struct net *net, > return -1; > score++; > } > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > } > return score; > } > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c > index 0aba654f5b91..222fdc780405 100644 > --- a/net/ipv6/udp.c > +++ b/net/ipv6/udp.c > @@ -182,6 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net, > score++; > } > > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > + > return score; > } > > @@ -223,6 +229,11 @@ static inline int compute_score2(struct sock *sk, struct net *net, > score++; > } > > + if (sk->sk_incoming_cpu != -1) { > + if (sk->sk_incoming_cpu != raw_smp_processor_id()) > + return -1; > + score++; > + } > return score; > } > > -- > 2.6.0.rc2.230.g3dd15c0 > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2015-10-08 at 13:53 -0700, Tom Herbert wrote: > If the incoming CPU is set for a connected UDP via > sk_incoming_cpu_update, wouldn't this check subsequently _only_ allow > packets for that socket to come from the same CPU? > Hmm, I thought the SO_REUSEPORT path would be taken only for non connected UDP sockets (like TCP listeners.). But you might be right ! > Also, the check seems a little austere. Why not do something like: > > if (sk->sk_incoming_cpu != -1) { > if (sk->sk_incoming_cpu != raw_smp_processor_id()) > score += 4; > } > > My worry is that the packet steering configuration may change without > the application's knowledge, so it's possible packets may come in on > CPUs that the are unexpected to the application and then they would be > dropped without matching a socket. I suppose that this could work with > the original patch if a socket is bound to every CPU or there is at > least one listener socket that is not bound to any CPU. This is what I initially wrote, then I attempted a short cut, (abort full list scan), then forgot to re-instate the first try, when I decided to let this for future patch (Ying patch) if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; (Note we do not even have to test for sk_incoming_cpu == -1 in this variant) I'll include this in v2. Thanks. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/net/sock.h b/include/net/sock.h index dfe2eb8e1132..00f60bea983b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -150,6 +150,7 @@ typedef __u64 __bitwise __addrpair; * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection + * @skc_incoming_cpu: record/match cpu processing incoming packets * @skc_refcnt: reference count * * This is the minimal network layer representation of sockets, the header @@ -212,6 +213,8 @@ struct sock_common { struct hlist_nulls_node skc_nulls_node; }; int skc_tx_queue_mapping; + int skc_incoming_cpu; + atomic_t skc_refcnt; /* private: */ int skc_dontcopy_end[0]; @@ -274,7 +277,7 @@ struct cg_proto; * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_rxhash: flow hash received from netif layer - * @sk_incoming_cpu: record cpu processing incoming packets + * @sk_incoming_cpu: record/match cpu processing incoming packets * @sk_txhash: computed flow hash for use on transmit * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer @@ -331,6 +334,7 @@ struct sock { #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_cookie __sk_common.skc_cookie +#define sk_incoming_cpu __sk_common.skc_incoming_cpu socket_lock_t sk_lock; struct sk_buff_head sk_receive_queue; @@ -353,11 +357,6 @@ struct sock { #ifdef CONFIG_RPS __u32 sk_rxhash; #endif - u16 sk_incoming_cpu; - /* 16bit hole - * Warned : sk_incoming_cpu can be set from softirq, - * Do not use this hole without fully understanding possible issues. - */ __u32 sk_txhash; #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/net/core/sock.c b/net/core/sock.c index 7dd1263e4c24..1071f9380250 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -988,6 +988,10 @@ set_rcvbuf: sk->sk_max_pacing_rate); break; + case SO_INCOMING_CPU: + sk->sk_incoming_cpu = val; + break; + default: ret = -ENOPROTOOPT; break; @@ -2353,6 +2357,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; + sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index bed8886a4b6c..eabcfbc13afb 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -185,6 +185,11 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } + if (sk->sk_incoming_cpu != -1) { + if (sk->sk_incoming_cpu != raw_smp_processor_id()) + return -1; + score++; + } } return score; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e1fc129099ea..de675b796f78 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -375,7 +375,11 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - + if (sk->sk_incoming_cpu != -1) { + if (sk->sk_incoming_cpu != raw_smp_processor_id()) + return -1; + score++; + } return score; } @@ -419,6 +423,12 @@ static inline int compute_score2(struct sock *sk, struct net *net, score += 4; } + if (sk->sk_incoming_cpu != -1) { + if (sk->sk_incoming_cpu != raw_smp_processor_id()) + return -1; + score++; + } + return score; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 6ac8dad0138a..af3d7f826bff 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -114,6 +114,11 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } + if (sk->sk_incoming_cpu != -1) { + if (sk->sk_incoming_cpu != raw_smp_processor_id()) + return -1; + score++; + } } return score; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0aba654f5b91..222fdc780405 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -182,6 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net, score++; } + if (sk->sk_incoming_cpu != -1) { + if (sk->sk_incoming_cpu != raw_smp_processor_id()) + return -1; + score++; + } + return score; } @@ -223,6 +229,11 @@ static inline int compute_score2(struct sock *sk, struct net *net, score++; } + if (sk->sk_incoming_cpu != -1) { + if (sk->sk_incoming_cpu != raw_smp_processor_id()) + return -1; + score++; + } return score; }
SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command to fetch incoming cpu handling a particular TCP flow after accept() This commits adds setsockopt() support and extends SO_REUSEPORT selection logic : If a TCP listener or UDP socket has this option set, a packet is delivered to this socket only if CPU handling the packet matches the specified one. This allows to build very efficient TCP servers, using one thread per cpu, as the associated TCP listener should only accept flows handled in softirq by the same cpu. This provides optimal NUMA/SMP behavior and keep cpu caches hot. Note that __inet_lookup_listener() still has to iterate over the list of all listeners. Following patch puts sk_refcnt in a different cache line to let this iteration hit only shared and read mostly cache lines. Signed-off-by: Eric Dumazet <edumazet@google.com> --- include/net/sock.h | 11 +++++------ net/core/sock.c | 5 +++++ net/ipv4/inet_hashtables.c | 5 +++++ net/ipv4/udp.c | 12 +++++++++++- net/ipv6/inet6_hashtables.c | 5 +++++ net/ipv6/udp.c | 11 +++++++++++ 6 files changed, 42 insertions(+), 7 deletions(-)