diff mbox

[net-next,1/4] net: SO_INCOMING_CPU setsockopt() support

Message ID 1444318627-27883-2-git-send-email-edumazet@google.com
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet Oct. 8, 2015, 3:37 p.m. UTC
SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command
to fetch incoming cpu handling a particular TCP flow after accept()

This commit adds setsockopt() support and extends SO_REUSEPORT selection
logic : If a TCP listener or UDP socket has this option set, a packet is
delivered to this socket only if CPU handling the packet matches the specified one.

This allows to build very efficient TCP servers, using one thread per cpu,
as the associated TCP listener should only accept flows handled in softirq
by the same cpu. This provides optimal NUMA/SMP behavior and keeps cpu caches hot.

Note that __inet_lookup_listener() still has to iterate over the list of
all listeners. Following patch puts sk_refcnt in a different cache line
to let this iteration hit only shared and read mostly cache lines.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/sock.h          | 11 +++++------
 net/core/sock.c             |  5 +++++
 net/ipv4/inet_hashtables.c  |  5 +++++
 net/ipv4/udp.c              | 12 +++++++++++-
 net/ipv6/inet6_hashtables.c |  5 +++++
 net/ipv6/udp.c              | 11 +++++++++++
 6 files changed, 42 insertions(+), 7 deletions(-)

Comments

Tom Herbert Oct. 8, 2015, 4:03 p.m. UTC | #1
On Thu, Oct 8, 2015 at 8:37 AM, Eric Dumazet <edumazet@google.com> wrote:
> SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command
> to fetch incoming cpu handling a particular TCP flow after accept()
>
> This commits adds setsockopt() support and extends SO_REUSEPORT selection
> logic : If a TCP listener or UDP socket has this option set, a packet is
> delivered to this socket only if CPU handling the packet matches the specified one.
>
> This allows to build very efficient TCP servers, using one thread per cpu,
> as the associated TCP listener should only accept flows handled in softirq
> by the same cpu. This provides optimal NUMA/SMP behavior and keep cpu caches hot.
>
Please look again at my SO_INCOMING_CPU_MASK patches to see if these
will work. I believe SO_INCOMING_CPU setsockopt is probably a subset
of that functionality. A single CPU assigned to socket forces an
application design to have one thread per CPU-- this may be overkill.
It's probably sufficient in many cases to have just one listener
thread per NUMA node.

Thanks,
Tom

> Note that __inet_lookup_listener() still has to iterate over the list of
> all listeners. Following patch puts sk_refcnt in a different cache line
> to let this iteration hit only shared and read mostly cache lines.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/net/sock.h          | 11 +++++------
>  net/core/sock.c             |  5 +++++
>  net/ipv4/inet_hashtables.c  |  5 +++++
>  net/ipv4/udp.c              | 12 +++++++++++-
>  net/ipv6/inet6_hashtables.c |  5 +++++
>  net/ipv6/udp.c              | 11 +++++++++++
>  6 files changed, 42 insertions(+), 7 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dfe2eb8e1132..00f60bea983b 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -150,6 +150,7 @@ typedef __u64 __bitwise __addrpair;
>   *     @skc_node: main hash linkage for various protocol lookup tables
>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>   *     @skc_tx_queue_mapping: tx queue number for this connection
> + *     @skc_incoming_cpu: record/match cpu processing incoming packets
>   *     @skc_refcnt: reference count
>   *
>   *     This is the minimal network layer representation of sockets, the header
> @@ -212,6 +213,8 @@ struct sock_common {
>                 struct hlist_nulls_node skc_nulls_node;
>         };
>         int                     skc_tx_queue_mapping;
> +       int                     skc_incoming_cpu;
> +
>         atomic_t                skc_refcnt;
>         /* private: */
>         int                     skc_dontcopy_end[0];
> @@ -274,7 +277,7 @@ struct cg_proto;
>    *    @sk_rcvtimeo: %SO_RCVTIMEO setting
>    *    @sk_sndtimeo: %SO_SNDTIMEO setting
>    *    @sk_rxhash: flow hash received from netif layer
> -  *    @sk_incoming_cpu: record cpu processing incoming packets
> +  *    @sk_incoming_cpu: record/match cpu processing incoming packets
>    *    @sk_txhash: computed flow hash for use on transmit
>    *    @sk_filter: socket filtering instructions
>    *    @sk_timer: sock cleanup timer
> @@ -331,6 +334,7 @@ struct sock {
>  #define sk_v6_daddr            __sk_common.skc_v6_daddr
>  #define sk_v6_rcv_saddr        __sk_common.skc_v6_rcv_saddr
>  #define sk_cookie              __sk_common.skc_cookie
> +#define sk_incoming_cpu                __sk_common.skc_incoming_cpu
>
>         socket_lock_t           sk_lock;
>         struct sk_buff_head     sk_receive_queue;
> @@ -353,11 +357,6 @@ struct sock {
>  #ifdef CONFIG_RPS
>         __u32                   sk_rxhash;
>  #endif
> -       u16                     sk_incoming_cpu;
> -       /* 16bit hole
> -        * Warned : sk_incoming_cpu can be set from softirq,
> -        * Do not use this hole without fully understanding possible issues.
> -        */
>
>         __u32                   sk_txhash;
>  #ifdef CONFIG_NET_RX_BUSY_POLL
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 7dd1263e4c24..1071f9380250 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -988,6 +988,10 @@ set_rcvbuf:
>                                          sk->sk_max_pacing_rate);
>                 break;
>
> +       case SO_INCOMING_CPU:
> +               sk->sk_incoming_cpu = val;
> +               break;
> +
>         default:
>                 ret = -ENOPROTOOPT;
>                 break;
> @@ -2353,6 +2357,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>
>         sk->sk_max_pacing_rate = ~0U;
>         sk->sk_pacing_rate = ~0U;
> +       sk->sk_incoming_cpu = -1;
>         /*
>          * Before updating sk_refcnt, we must commit prior changes to memory
>          * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index bed8886a4b6c..eabcfbc13afb 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -185,6 +185,11 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                                 return -1;
>                         score += 4;
>                 }
> +               if (sk->sk_incoming_cpu != -1) {
> +                       if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                               return -1;
> +                       score++;
> +               }
>         }
>         return score;
>  }
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index e1fc129099ea..de675b796f78 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -375,7 +375,11 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                         return -1;
>                 score += 4;
>         }
> -
> +       if (sk->sk_incoming_cpu != -1) {
> +               if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                       return -1;
> +               score++;
> +       }
>         return score;
>  }
>
> @@ -419,6 +423,12 @@ static inline int compute_score2(struct sock *sk, struct net *net,
>                 score += 4;
>         }
>
> +       if (sk->sk_incoming_cpu != -1) {
> +               if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                       return -1;
> +               score++;
> +       }
> +
>         return score;
>  }
>
> diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
> index 6ac8dad0138a..af3d7f826bff 100644
> --- a/net/ipv6/inet6_hashtables.c
> +++ b/net/ipv6/inet6_hashtables.c
> @@ -114,6 +114,11 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                                 return -1;
>                         score++;
>                 }
> +               if (sk->sk_incoming_cpu != -1) {
> +                       if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                               return -1;
> +                       score++;
> +               }
>         }
>         return score;
>  }
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 0aba654f5b91..222fdc780405 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -182,6 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                 score++;
>         }
>
> +       if (sk->sk_incoming_cpu != -1) {
> +               if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                       return -1;
> +               score++;
> +       }
> +
>         return score;
>  }
>
> @@ -223,6 +229,11 @@ static inline int compute_score2(struct sock *sk, struct net *net,
>                 score++;
>         }
>
> +       if (sk->sk_incoming_cpu != -1) {
> +               if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                       return -1;
> +               score++;
> +       }
>         return score;
>  }
>
> --
> 2.6.0.rc2.230.g3dd15c0
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
kernel test robot Oct. 8, 2015, 4:07 p.m. UTC | #2
Hi Eric,

[auto build test WARNING on net-next/master -- if it's inappropriate base, please ignore]

reproduce: make htmldocs

All warnings (new ones prefixed by >>):

   include/linux/skbuff.h:833: warning: No description found for parameter 'sk'
>> include/net/sock.h:442: warning: Excess struct/union/enum/typedef member 'sk_incoming_cpu' description in 'sock'
   net/core/gen_stats.c:155: warning: No description found for parameter 'cpu'
   net/core/gen_estimator.c:212: warning: No description found for parameter 'cpu_bstats'
   net/core/gen_estimator.c:303: warning: No description found for parameter 'cpu_bstats'
   net/core/dev.c:6102: warning: No description found for parameter 'len'
   include/linux/netdevice.h:1291: warning: Enum value 'IFF_XMIT_DST_RELEASE_PERM' not described in enum 'netdev_priv_flags'
   include/linux/netdevice.h:1291: warning: Enum value 'IFF_IPVLAN_MASTER' not described in enum 'netdev_priv_flags'
   include/linux/netdevice.h:1291: warning: Enum value 'IFF_IPVLAN_SLAVE' not described in enum 'netdev_priv_flags'
   include/linux/netdevice.h:1791: warning: No description found for parameter 'ptype_all'
   include/linux/netdevice.h:1791: warning: No description found for parameter 'ptype_specific'

vim +442 include/net/sock.h

^1da177e4 Linus Torvalds     2005-04-16  426  	int			sk_write_pending;
d5f642384 Alexey Dobriyan    2008-11-04  427  #ifdef CONFIG_SECURITY
^1da177e4 Linus Torvalds     2005-04-16  428  	void			*sk_security;
d5f642384 Alexey Dobriyan    2008-11-04  429  #endif
4a19ec580 Laszlo Attila Toth 2008-01-30  430  	__u32			sk_mark;
e181a5430 Mathias Krause     2015-07-19  431  #ifdef CONFIG_CGROUP_NET_CLASSID
f84517253 Herbert Xu         2010-05-24  432  	u32			sk_classid;
e181a5430 Mathias Krause     2015-07-19  433  #endif
e1aab161e Glauber Costa      2011-12-11  434  	struct cg_proto		*sk_cgrp;
^1da177e4 Linus Torvalds     2005-04-16  435  	void			(*sk_state_change)(struct sock *sk);
676d23690 David S. Miller    2014-04-11  436  	void			(*sk_data_ready)(struct sock *sk);
^1da177e4 Linus Torvalds     2005-04-16  437  	void			(*sk_write_space)(struct sock *sk);
^1da177e4 Linus Torvalds     2005-04-16  438  	void			(*sk_error_report)(struct sock *sk);
^1da177e4 Linus Torvalds     2005-04-16  439  	int			(*sk_backlog_rcv)(struct sock *sk,
^1da177e4 Linus Torvalds     2005-04-16  440  						  struct sk_buff *skb);
^1da177e4 Linus Torvalds     2005-04-16  441  	void                    (*sk_destruct)(struct sock *sk);
^1da177e4 Linus Torvalds     2005-04-16 @442  };
^1da177e4 Linus Torvalds     2005-04-16  443  
559835ea7 Pravin B Shelar    2013-09-24  444  #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
559835ea7 Pravin B Shelar    2013-09-24  445  
559835ea7 Pravin B Shelar    2013-09-24  446  #define rcu_dereference_sk_user_data(sk)	rcu_dereference(__sk_user_data((sk)))
559835ea7 Pravin B Shelar    2013-09-24  447  #define rcu_assign_sk_user_data(sk, ptr)	rcu_assign_pointer(__sk_user_data((sk)), ptr)
559835ea7 Pravin B Shelar    2013-09-24  448  
4a17fd522 Pavel Emelyanov    2012-04-19  449  /*
4a17fd522 Pavel Emelyanov    2012-04-19  450   * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK

:::::: The code at line 442 was first introduced by commit
:::::: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Linux-2.6.12-rc2

:::::: TO: Linus Torvalds <torvalds@ppc970.osdl.org>
:::::: CC: Linus Torvalds <torvalds@ppc970.osdl.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Eric Dumazet Oct. 8, 2015, 4:29 p.m. UTC | #3
On Thu, 2015-10-08 at 09:03 -0700, Tom Herbert wrote:
> On Thu, Oct 8, 2015 at 8:37 AM, Eric Dumazet <edumazet@google.com> wrote:
> > SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command
> > to fetch incoming cpu handling a particular TCP flow after accept()
> >
> > This commits adds setsockopt() support and extends SO_REUSEPORT selection
> > logic : If a TCP listener or UDP socket has this option set, a packet is
> > delivered to this socket only if CPU handling the packet matches the specified one.
> >
> > This allows to build very efficient TCP servers, using one thread per cpu,
> > as the associated TCP listener should only accept flows handled in softirq
> > by the same cpu. This provides optimal NUMA/SMP behavior and keep cpu caches hot.
> >
> Please look again at my SO_INCOMING_CPU_MASK patches to see it these
> will work. I believe SO_INCOMING_CPU setsockopt is probably a subset
> of that functionality. A single CPU assigned to socket forces an
> application design to have one thread per CPU-- this may be overkill.
> It's probably be sufficient in many cases to have just one listener
> thread per NUMA node.


I think you misunderstood my patch.

For optimal behavior against DDOS, you need one TCP _listener_ per RX
queue on the NIC.

Not one listener per cpu !

You can then have multiple threads servicing requests coming for a
particular listener. Or one thread (cpu) servicing requests coming from
multiple TCP listeners. It's as you said an application choice/design.

Say you have 32 cpu (hyper threads) and 8 RX queues, you can create
groups of 4 threads per RX queue. Or one thread servicing all accept()

If you use 2 listeners (one per NUMA node), performance is not so good,
and SO_INCOMING_CPU_MASK adds an extra indirection, scaling poorly with
say 64 listeners.

Ying Cai has a patch to add an array selection (instead of having to
parse the list, yet to be rebased) 

I am afraid your choice of SO_INCOMING_CPU_MASK might have been driven
by the necessity of keeping the list short, but this is about to be a
non issue.

Thanks.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert Oct. 8, 2015, 4:44 p.m. UTC | #4
On Thu, Oct 8, 2015 at 9:29 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2015-10-08 at 09:03 -0700, Tom Herbert wrote:
>> On Thu, Oct 8, 2015 at 8:37 AM, Eric Dumazet <edumazet@google.com> wrote:
>> > SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command
>> > to fetch incoming cpu handling a particular TCP flow after accept()
>> >
>> > This commits adds setsockopt() support and extends SO_REUSEPORT selection
>> > logic : If a TCP listener or UDP socket has this option set, a packet is
>> > delivered to this socket only if CPU handling the packet matches the specified one.
>> >
>> > This allows to build very efficient TCP servers, using one thread per cpu,
>> > as the associated TCP listener should only accept flows handled in softirq
>> > by the same cpu. This provides optimal NUMA/SMP behavior and keep cpu caches hot.
>> >
>> Please look again at my SO_INCOMING_CPU_MASK patches to see it these
>> will work. I believe SO_INCOMING_CPU setsockopt is probably a subset
>> of that functionality. A single CPU assigned to socket forces an
>> application design to have one thread per CPU-- this may be overkill.
>> It's probably be sufficient in many cases to have just one listener
>> thread per NUMA node.
>
>
> I think you misunderstood my patch.
>
> For optimal behavior against DDOS, you need one TCP _listener_ per RX
> queue on the NIC.
>
I see. We are not using SO_INCOMING_CPU_MASK as a defense against
DDOS. It's used to ensure affinity in application connection processing
between CPUs. For instance, if we have two NUMA nodes we can start two
instances of the application bound to each node and then use
SO_REUSEPORT and SO_INCOMING_CPU_MASK to ensure connections are
processed on the same NUMA node. Packets crossing NUMA boundaries
even with RFS is painful.

Tom
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Oct. 8, 2015, 4:50 p.m. UTC | #5
On Thu, 2015-10-08 at 09:29 -0700, Eric Dumazet wrote:

> For optimal behavior against DDOS, you need one TCP _listener_ per RX
> queue on the NIC.

I will reword the changelog to make this clear, and fix the htmldocs
warning reported by kbuild in V2.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Oct. 8, 2015, 5 p.m. UTC | #6
On Thu, 2015-10-08 at 09:44 -0700, Tom Herbert wrote:

> I see. We are not using SO_INCOMING_CPU_MASK as a defense against
> DDOS. It's used ensure affinity in application connection processing
> between CPUs. For instance, if we have two NUMA nodes we can start two
> instances of the application bound to each node and then use
> SO_REUSEPORT and SO_INCOMING_CPU_MASK to ensure connections are
> processed on the the same NUMA node. Packets crossing NUMA boundaries
> even with RFS is painful.

Then maybe you need something simpler than a mask of cpus, like
SO_INCOMING_NODE ?

Note that this could also be automatically tuned.

If you have a bunch of listeners on one port (could be 2), then the
underlying node on which TCP socket was allocated could be used as a
score modifier in compute_score()


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert Oct. 8, 2015, 5:10 p.m. UTC | #7
On Thu, Oct 8, 2015 at 10:00 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2015-10-08 at 09:44 -0700, Tom Herbert wrote:
>
>> I see. We are not using SO_INCOMING_CPU_MASK as a defense against
>> DDOS. It's used ensure affinity in application connection processing
>> between CPUs. For instance, if we have two NUMA nodes we can start two
>> instances of the application bound to each node and then use
>> SO_REUSEPORT and SO_INCOMING_CPU_MASK to ensure connections are
>> processed on the the same NUMA node. Packets crossing NUMA boundaries
>> even with RFS is painful.
>
> Then maybe you need something simpler than a mask of cpus, like
> SO_INCOMING_NODE ?
>
> Note that this could also be automatically tuned.
>
> If you have a bunch of listeners on one port (could be 2), then the
> underlying node on which TCP socket was allocated could be used as a
> score modifier in compute_score()
>
Maybe, but I really like the simplicity of SO_INCOMING_CPU_MASK in
that it doesn't depend on NUMA configuration or RSS configuration.
It's very easy for the application to just set the same incoming CPU
mask to be the same CPU mask that the listener thread is bound to.
We've seen some nice performance benefits in doing this with memcached,
that is to run an instance bound to the set of CPUs for each NUMA
node and bind listener threads to the same set.

Tom

>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert Oct. 8, 2015, 8:53 p.m. UTC | #8
On Thu, Oct 8, 2015 at 8:37 AM, Eric Dumazet <edumazet@google.com> wrote:
> SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command
> to fetch incoming cpu handling a particular TCP flow after accept()
>
> This commits adds setsockopt() support and extends SO_REUSEPORT selection
> logic : If a TCP listener or UDP socket has this option set, a packet is
> delivered to this socket only if CPU handling the packet matches the specified one.
>
> This allows to build very efficient TCP servers, using one thread per cpu,
> as the associated TCP listener should only accept flows handled in softirq
> by the same cpu. This provides optimal NUMA/SMP behavior and keep cpu caches hot.
>
> Note that __inet_lookup_listener() still has to iterate over the list of
> all listeners. Following patch puts sk_refcnt in a different cache line
> to let this iteration hit only shared and read mostly cache lines.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/net/sock.h          | 11 +++++------
>  net/core/sock.c             |  5 +++++
>  net/ipv4/inet_hashtables.c  |  5 +++++
>  net/ipv4/udp.c              | 12 +++++++++++-
>  net/ipv6/inet6_hashtables.c |  5 +++++
>  net/ipv6/udp.c              | 11 +++++++++++
>  6 files changed, 42 insertions(+), 7 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dfe2eb8e1132..00f60bea983b 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -150,6 +150,7 @@ typedef __u64 __bitwise __addrpair;
>   *     @skc_node: main hash linkage for various protocol lookup tables
>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>   *     @skc_tx_queue_mapping: tx queue number for this connection
> + *     @skc_incoming_cpu: record/match cpu processing incoming packets
>   *     @skc_refcnt: reference count
>   *
>   *     This is the minimal network layer representation of sockets, the header
> @@ -212,6 +213,8 @@ struct sock_common {
>                 struct hlist_nulls_node skc_nulls_node;
>         };
>         int                     skc_tx_queue_mapping;
> +       int                     skc_incoming_cpu;
> +
>         atomic_t                skc_refcnt;
>         /* private: */
>         int                     skc_dontcopy_end[0];
> @@ -274,7 +277,7 @@ struct cg_proto;
>    *    @sk_rcvtimeo: %SO_RCVTIMEO setting
>    *    @sk_sndtimeo: %SO_SNDTIMEO setting
>    *    @sk_rxhash: flow hash received from netif layer
> -  *    @sk_incoming_cpu: record cpu processing incoming packets
> +  *    @sk_incoming_cpu: record/match cpu processing incoming packets
>    *    @sk_txhash: computed flow hash for use on transmit
>    *    @sk_filter: socket filtering instructions
>    *    @sk_timer: sock cleanup timer
> @@ -331,6 +334,7 @@ struct sock {
>  #define sk_v6_daddr            __sk_common.skc_v6_daddr
>  #define sk_v6_rcv_saddr        __sk_common.skc_v6_rcv_saddr
>  #define sk_cookie              __sk_common.skc_cookie
> +#define sk_incoming_cpu                __sk_common.skc_incoming_cpu
>
>         socket_lock_t           sk_lock;
>         struct sk_buff_head     sk_receive_queue;
> @@ -353,11 +357,6 @@ struct sock {
>  #ifdef CONFIG_RPS
>         __u32                   sk_rxhash;
>  #endif
> -       u16                     sk_incoming_cpu;
> -       /* 16bit hole
> -        * Warned : sk_incoming_cpu can be set from softirq,
> -        * Do not use this hole without fully understanding possible issues.
> -        */
>
>         __u32                   sk_txhash;
>  #ifdef CONFIG_NET_RX_BUSY_POLL
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 7dd1263e4c24..1071f9380250 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -988,6 +988,10 @@ set_rcvbuf:
>                                          sk->sk_max_pacing_rate);
>                 break;
>
> +       case SO_INCOMING_CPU:
> +               sk->sk_incoming_cpu = val;
> +               break;
> +
>         default:
>                 ret = -ENOPROTOOPT;
>                 break;
> @@ -2353,6 +2357,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>
>         sk->sk_max_pacing_rate = ~0U;
>         sk->sk_pacing_rate = ~0U;
> +       sk->sk_incoming_cpu = -1;
>         /*
>          * Before updating sk_refcnt, we must commit prior changes to memory
>          * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index bed8886a4b6c..eabcfbc13afb 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -185,6 +185,11 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                                 return -1;
>                         score += 4;
>                 }
> +               if (sk->sk_incoming_cpu != -1) {
> +                       if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                               return -1;
> +                       score++;
> +               }

If the incoming CPU is set for a connected UDP via
sk_incoming_cpu_update, wouldn't this check subsequently _only_ allow
packets for that socket to come from the same CPU?

Also, the check seems a little austere. Why not do something like:

               if (sk->sk_incoming_cpu != -1) {
                       if (sk->sk_incoming_cpu != raw_smp_processor_id())
                            score += 4;
               }

My worry is that the packet steering configuration may change without
the application's knowledge, so it's possible packets may come in on
CPUs that are unexpected to the application and then they would be
dropped without matching a socket. I suppose that this could work with
the original patch if a socket is bound to every CPU or there is at
least one listener socket that is not bound to any CPU.

>  }
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index e1fc129099ea..de675b796f78 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -375,7 +375,11 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                         return -1;
>                 score += 4;
>         }
> -
> +       if (sk->sk_incoming_cpu != -1) {
> +               if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                       return -1;
> +               score++;
> +       }
>         return score;
>  }
>
> @@ -419,6 +423,12 @@ static inline int compute_score2(struct sock *sk, struct net *net,
>                 score += 4;
>         }
>
> +       if (sk->sk_incoming_cpu != -1) {
> +               if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                       return -1;
> +               score++;
> +       }
> +
>         return score;
>  }
>
> diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
> index 6ac8dad0138a..af3d7f826bff 100644
> --- a/net/ipv6/inet6_hashtables.c
> +++ b/net/ipv6/inet6_hashtables.c
> @@ -114,6 +114,11 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                                 return -1;
>                         score++;
>                 }
> +               if (sk->sk_incoming_cpu != -1) {
> +                       if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                               return -1;
> +                       score++;
> +               }
>         }
>         return score;
>  }
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 0aba654f5b91..222fdc780405 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -182,6 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                 score++;
>         }
>
> +       if (sk->sk_incoming_cpu != -1) {
> +               if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                       return -1;
> +               score++;
> +       }
> +
>         return score;
>  }
>
> @@ -223,6 +229,11 @@ static inline int compute_score2(struct sock *sk, struct net *net,
>                 score++;
>         }
>
> +       if (sk->sk_incoming_cpu != -1) {
> +               if (sk->sk_incoming_cpu != raw_smp_processor_id())
> +                       return -1;
> +               score++;
> +       }
>         return score;
>  }
>
> --
> 2.6.0.rc2.230.g3dd15c0
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Oct. 8, 2015, 9:17 p.m. UTC | #9
On Thu, 2015-10-08 at 13:53 -0700, Tom Herbert wrote:

> If the incoming CPU is set for a connected UDP via
> sk_incoming_cpu_update, wouldn't this check subsequently _only_ allow
> packets for that socket to come from the same CPU?
> 

Hmm, I thought the SO_REUSEPORT path would be taken only for non
connected UDP sockets (like TCP listeners.).

But you might be right !

> Also, the check seems a little austere. Why not do something like:
> 
>                if (sk->sk_incoming_cpu != -1) {
>                        if (sk->sk_incoming_cpu != raw_smp_processor_id())
>                             score += 4;
>                }
> 
> My worry is that the packet steering configuration may change without
> the application's knowledge, so it's possible packets may come in on
> CPUs that the are unexpected to the application and then they would be
> dropped without matching a socket. I suppose that this could work with
> the original patch if a socket is bound to every CPU or there is at
> least one listener socket that is not bound to any CPU.

This is what I initially wrote, then I attempted a short cut, (abort
full list scan), then forgot to re-instate the first try, when I decided
to let this for future patch (Ying patch)

if (sk->sk_incoming_cpu == raw_smp_processor_id())
    score++;

(Note we do not even have to test for sk_incoming_cpu == -1 in this
variant)

I'll include this in v2.

Thanks.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index dfe2eb8e1132..00f60bea983b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -150,6 +150,7 @@  typedef __u64 __bitwise __addrpair;
  *	@skc_node: main hash linkage for various protocol lookup tables
  *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  *	@skc_tx_queue_mapping: tx queue number for this connection
+ *	@skc_incoming_cpu: record/match cpu processing incoming packets
  *	@skc_refcnt: reference count
  *
  *	This is the minimal network layer representation of sockets, the header
@@ -212,6 +213,8 @@  struct sock_common {
 		struct hlist_nulls_node skc_nulls_node;
 	};
 	int			skc_tx_queue_mapping;
+	int			skc_incoming_cpu;
+
 	atomic_t		skc_refcnt;
 	/* private: */
 	int                     skc_dontcopy_end[0];
@@ -274,7 +277,7 @@  struct cg_proto;
   *	@sk_rcvtimeo: %SO_RCVTIMEO setting
   *	@sk_sndtimeo: %SO_SNDTIMEO setting
   *	@sk_rxhash: flow hash received from netif layer
-  *	@sk_incoming_cpu: record cpu processing incoming packets
+  *	@sk_incoming_cpu: record/match cpu processing incoming packets
   *	@sk_txhash: computed flow hash for use on transmit
   *	@sk_filter: socket filtering instructions
   *	@sk_timer: sock cleanup timer
@@ -331,6 +334,7 @@  struct sock {
 #define sk_v6_daddr		__sk_common.skc_v6_daddr
 #define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
 #define sk_cookie		__sk_common.skc_cookie
+#define sk_incoming_cpu		__sk_common.skc_incoming_cpu
 
 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
@@ -353,11 +357,6 @@  struct sock {
 #ifdef CONFIG_RPS
 	__u32			sk_rxhash;
 #endif
-	u16			sk_incoming_cpu;
-	/* 16bit hole
-	 * Warned : sk_incoming_cpu can be set from softirq,
-	 * Do not use this hole without fully understanding possible issues.
-	 */
 
 	__u32			sk_txhash;
 #ifdef CONFIG_NET_RX_BUSY_POLL
diff --git a/net/core/sock.c b/net/core/sock.c
index 7dd1263e4c24..1071f9380250 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -988,6 +988,10 @@  set_rcvbuf:
 					 sk->sk_max_pacing_rate);
 		break;
 
+	case SO_INCOMING_CPU:
+		sk->sk_incoming_cpu = val;
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -2353,6 +2357,7 @@  void sock_init_data(struct socket *sock, struct sock *sk)
 
 	sk->sk_max_pacing_rate = ~0U;
 	sk->sk_pacing_rate = ~0U;
+	sk->sk_incoming_cpu = -1;
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index bed8886a4b6c..eabcfbc13afb 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -185,6 +185,11 @@  static inline int compute_score(struct sock *sk, struct net *net,
 				return -1;
 			score += 4;
 		}
+		if (sk->sk_incoming_cpu != -1) {
+			if (sk->sk_incoming_cpu != raw_smp_processor_id())
+				return -1;
+			score++;
+		}
 	}
 	return score;
 }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e1fc129099ea..de675b796f78 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -375,7 +375,11 @@  static inline int compute_score(struct sock *sk, struct net *net,
 			return -1;
 		score += 4;
 	}
-
+	if (sk->sk_incoming_cpu != -1) {
+		if (sk->sk_incoming_cpu != raw_smp_processor_id())
+			return -1;
+		score++;
+	}
 	return score;
 }
 
@@ -419,6 +423,12 @@  static inline int compute_score2(struct sock *sk, struct net *net,
 		score += 4;
 	}
 
+	if (sk->sk_incoming_cpu != -1) {
+		if (sk->sk_incoming_cpu != raw_smp_processor_id())
+			return -1;
+		score++;
+	}
+
 	return score;
 }
 
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 6ac8dad0138a..af3d7f826bff 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -114,6 +114,11 @@  static inline int compute_score(struct sock *sk, struct net *net,
 				return -1;
 			score++;
 		}
+		if (sk->sk_incoming_cpu != -1) {
+			if (sk->sk_incoming_cpu != raw_smp_processor_id())
+				return -1;
+			score++;
+		}
 	}
 	return score;
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0aba654f5b91..222fdc780405 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -182,6 +182,12 @@  static inline int compute_score(struct sock *sk, struct net *net,
 		score++;
 	}
 
+	if (sk->sk_incoming_cpu != -1) {
+		if (sk->sk_incoming_cpu != raw_smp_processor_id())
+			return -1;
+		score++;
+	}
+
 	return score;
 }
 
@@ -223,6 +229,11 @@  static inline int compute_score2(struct sock *sk, struct net *net,
 		score++;
 	}
 
+	if (sk->sk_incoming_cpu != -1) {
+		if (sk->sk_incoming_cpu != raw_smp_processor_id())
+			return -1;
+		score++;
+	}
 	return score;
 }