
[v3,net-next,1/4] net: SO_INCOMING_CPU setsockopt() support

Message ID 1444358004-26483-2-git-send-email-edumazet@google.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet Oct. 9, 2015, 2:33 a.m. UTC
SO_INCOMING_CPU, as added in commit 2c8c56e15df3, was a getsockopt() command
to fetch the incoming cpu handling a particular TCP flow after accept().

This commit adds setsockopt() support and extends the SO_REUSEPORT selection
logic: if a TCP listener or UDP socket has this option set, a packet is
delivered to the socket only if the CPU handling the packet matches the
specified one.

This allows building very efficient TCP servers, using one listener per
RX queue, as the associated TCP listener should only accept flows handled
in softirq by the same cpu.
This provides optimal NUMA behavior and keeps cpu caches hot.
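
For reference, a minimal userspace sketch of the intended usage, assuming one
SO_REUSEPORT listener per RX queue, each tagged with the cpu that services
that queue. make_listener() and the fixed backlog are illustrative and not
part of this patch; error handling is omitted:

/* Hedged sketch: create one listener per RX queue and tag it with the cpu
 * expected to process that queue's packets in softirq.  Requires kernel and
 * libc headers that define SO_INCOMING_CPU and SO_REUSEPORT.
 */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int make_listener(int cpu, unsigned short port)
{
	struct sockaddr_in addr;
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	/* Prefer this listener only when the packet was processed on @cpu
	 * (see the compute_score() changes below).
	 */
	setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, sizeof(cpu));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	listen(fd, 128);
	return fd;
}

Each such listener would typically be serviced by a thread pinned to the same
cpu, so accept() and softirq processing share caches.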

Note that __inet_lookup_listener() still has to iterate over the list of
all listeners. A following patch puts sk_refcnt in a different cache line
to let this iteration hit only shared and read-mostly cache lines.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/sock.h          | 10 ++++------
 net/core/sock.c             |  5 +++++
 net/ipv4/inet_hashtables.c  |  2 ++
 net/ipv4/udp.c              |  6 +++++-
 net/ipv6/inet6_hashtables.c |  2 ++
 net/ipv6/udp.c              | 11 +++++++----
 6 files changed, 25 insertions(+), 11 deletions(-)

Comments

Tom Herbert Oct. 9, 2015, 3:40 a.m. UTC | #1
On Thu, Oct 8, 2015 at 7:33 PM, Eric Dumazet <edumazet@google.com> wrote:
> SO_INCOMING_CPU, as added in commit 2c8c56e15df3, was a getsockopt() command
> to fetch the incoming cpu handling a particular TCP flow after accept().
>
> This commit adds setsockopt() support and extends the SO_REUSEPORT selection
> logic: if a TCP listener or UDP socket has this option set, a packet is
> delivered to the socket only if the CPU handling the packet matches the
> specified one.
>
> This allows building very efficient TCP servers, using one listener per
> RX queue, as the associated TCP listener should only accept flows handled
> in softirq by the same cpu.
> This provides optimal NUMA behavior and keeps cpu caches hot.
>
> Note that __inet_lookup_listener() still has to iterate over the list of
> all listeners. A following patch puts sk_refcnt in a different cache line
> to let this iteration hit only shared and read-mostly cache lines.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/net/sock.h          | 10 ++++------
>  net/core/sock.c             |  5 +++++
>  net/ipv4/inet_hashtables.c  |  2 ++
>  net/ipv4/udp.c              |  6 +++++-
>  net/ipv6/inet6_hashtables.c |  2 ++
>  net/ipv6/udp.c              | 11 +++++++----
>  6 files changed, 25 insertions(+), 11 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dfe2eb8e1132..08abffe32236 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -150,6 +150,7 @@ typedef __u64 __bitwise __addrpair;
>   *     @skc_node: main hash linkage for various protocol lookup tables
>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>   *     @skc_tx_queue_mapping: tx queue number for this connection
> + *     @skc_incoming_cpu: record/match cpu processing incoming packets
>   *     @skc_refcnt: reference count
>   *
>   *     This is the minimal network layer representation of sockets, the header
> @@ -212,6 +213,8 @@ struct sock_common {
>                 struct hlist_nulls_node skc_nulls_node;
>         };
>         int                     skc_tx_queue_mapping;
> +       int                     skc_incoming_cpu;
> +
>         atomic_t                skc_refcnt;
>         /* private: */
>         int                     skc_dontcopy_end[0];
> @@ -274,7 +277,6 @@ struct cg_proto;
>    *    @sk_rcvtimeo: %SO_RCVTIMEO setting
>    *    @sk_sndtimeo: %SO_SNDTIMEO setting
>    *    @sk_rxhash: flow hash received from netif layer
> -  *    @sk_incoming_cpu: record cpu processing incoming packets
>    *    @sk_txhash: computed flow hash for use on transmit
>    *    @sk_filter: socket filtering instructions
>    *    @sk_timer: sock cleanup timer
> @@ -331,6 +333,7 @@ struct sock {
>  #define sk_v6_daddr            __sk_common.skc_v6_daddr
>  #define sk_v6_rcv_saddr        __sk_common.skc_v6_rcv_saddr
>  #define sk_cookie              __sk_common.skc_cookie
> +#define sk_incoming_cpu                __sk_common.skc_incoming_cpu
>
>         socket_lock_t           sk_lock;
>         struct sk_buff_head     sk_receive_queue;
> @@ -353,11 +356,6 @@ struct sock {
>  #ifdef CONFIG_RPS
>         __u32                   sk_rxhash;
>  #endif
> -       u16                     sk_incoming_cpu;
> -       /* 16bit hole
> -        * Warned : sk_incoming_cpu can be set from softirq,
> -        * Do not use this hole without fully understanding possible issues.
> -        */
>
>         __u32                   sk_txhash;
>  #ifdef CONFIG_NET_RX_BUSY_POLL
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 7dd1263e4c24..1071f9380250 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -988,6 +988,10 @@ set_rcvbuf:
>                                          sk->sk_max_pacing_rate);
>                 break;
>
> +       case SO_INCOMING_CPU:
> +               sk->sk_incoming_cpu = val;
> +               break;
> +
>         default:
>                 ret = -ENOPROTOOPT;
>                 break;
> @@ -2353,6 +2357,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>
>         sk->sk_max_pacing_rate = ~0U;
>         sk->sk_pacing_rate = ~0U;
> +       sk->sk_incoming_cpu = -1;
>         /*
>          * Before updating sk_refcnt, we must commit prior changes to memory
>          * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index bed8886a4b6c..08643a3616af 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -185,6 +185,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                                 return -1;
>                         score += 4;
>                 }
> +               if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +                       score++;
>         }
>         return score;
>  }
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index e1fc129099ea..24ec14f9825c 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -375,7 +375,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                         return -1;
>                 score += 4;
>         }
> -
> +       if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +               score++;
>         return score;
>  }
>
> @@ -419,6 +420,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
>                 score += 4;
>         }
>
> +       if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +               score++;
> +
>         return score;
>  }
>
> diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
> index 6ac8dad0138a..21ace5a2bf7c 100644
> --- a/net/ipv6/inet6_hashtables.c
> +++ b/net/ipv6/inet6_hashtables.c
> @@ -114,6 +114,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                                 return -1;
>                         score++;
>                 }
> +               if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +                       score++;
>         }
>         return score;
>  }
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 0aba654f5b91..01bcb49619ee 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -182,10 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                 score++;
>         }
>
> +       if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +               score++;
> +
>         return score;
>  }
>
> -#define SCORE2_MAX (1 + 1 + 1)
>  static inline int compute_score2(struct sock *sk, struct net *net,
>                                  const struct in6_addr *saddr, __be16 sport,
>                                  const struct in6_addr *daddr,
> @@ -223,6 +225,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
>                 score++;
>         }
>
> +       if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +               score++;
> +
>         return score;
>  }
>
> @@ -251,8 +256,7 @@ begin:
>                                 hash = udp6_ehashfn(net, daddr, hnum,
>                                                     saddr, sport);
>                                 matches = 1;
> -                       } else if (score == SCORE2_MAX)
> -                               goto exact_match;
> +                       }

Do we care about losing this optimization? It's not done in IPv4, but I
can imagine there are arguments that address comparisons in IPv6 are more
expensive, hence this might make sense...

>                 } else if (score == badness && reuseport) {
>                         matches++;
>                         if (reciprocal_scale(hash, matches) == 0)
> @@ -269,7 +273,6 @@ begin:
>                 goto begin;
>
>         if (result) {
> -exact_match:
>                 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
>                         result = NULL;
>                 else if (unlikely(compute_score2(result, net, saddr, sport,
> --
> 2.6.0.rc2.230.g3dd15c0
>
Eric Dumazet Oct. 9, 2015, 9:45 a.m. UTC | #2
On Thu, 2015-10-08 at 20:40 -0700, Tom Herbert wrote:

> Do we care about losing this optimization? It's not done in IPv4 but I
> can imagine that there is some arguments that address comparisons in
> IPv6 are more expensive hence this might make sense...

I do not think we care. You removed the 'optimization' in IPv4 in commit
ba418fa357a7b ("soreuseport: UDP/IPv4 implementation") back in 2013 and
really no one noticed.

The important factor here is the number of cache lines taken to traverse
the list...
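
For illustration only, a simplified, userspace-style sketch of that
score-based walk (not the kernel code; the real logic is the compute_score2()
loop in the diff): every candidate on the chain is scored, so the cost is
dominated by the cache lines touched per candidate, and an exact-score early
exit would only trim the tail of the walk.

/* Simplified sketch of score-based selection; 'struct candidate' and
 * pick_best() are illustrative, not kernel structures.
 */
struct candidate {
	int base_score;		/* from address/port matching */
	int incoming_cpu;	/* -1 when SO_INCOMING_CPU is not set */
};

static int pick_best(const struct candidate *c, int n, int this_cpu)
{
	int best = -1, badness = -1;
	int i;

	for (i = 0; i < n; i++) {	/* full walk; each step touches new cache lines */
		int score = c[i].base_score;

		if (c[i].incoming_cpu == this_cpu)	/* SO_INCOMING_CPU bonus */
			score++;
		if (score > badness) {
			badness = score;
			best = i;
		}
	}
	return best;
}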



Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index dfe2eb8e1132..08abffe32236 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -150,6 +150,7 @@  typedef __u64 __bitwise __addrpair;
  *	@skc_node: main hash linkage for various protocol lookup tables
  *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  *	@skc_tx_queue_mapping: tx queue number for this connection
+ *	@skc_incoming_cpu: record/match cpu processing incoming packets
  *	@skc_refcnt: reference count
  *
  *	This is the minimal network layer representation of sockets, the header
@@ -212,6 +213,8 @@  struct sock_common {
 		struct hlist_nulls_node skc_nulls_node;
 	};
 	int			skc_tx_queue_mapping;
+	int			skc_incoming_cpu;
+
 	atomic_t		skc_refcnt;
 	/* private: */
 	int                     skc_dontcopy_end[0];
@@ -274,7 +277,6 @@  struct cg_proto;
   *	@sk_rcvtimeo: %SO_RCVTIMEO setting
   *	@sk_sndtimeo: %SO_SNDTIMEO setting
   *	@sk_rxhash: flow hash received from netif layer
-  *	@sk_incoming_cpu: record cpu processing incoming packets
   *	@sk_txhash: computed flow hash for use on transmit
   *	@sk_filter: socket filtering instructions
   *	@sk_timer: sock cleanup timer
@@ -331,6 +333,7 @@  struct sock {
 #define sk_v6_daddr		__sk_common.skc_v6_daddr
 #define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
 #define sk_cookie		__sk_common.skc_cookie
+#define sk_incoming_cpu		__sk_common.skc_incoming_cpu
 
 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
@@ -353,11 +356,6 @@  struct sock {
 #ifdef CONFIG_RPS
 	__u32			sk_rxhash;
 #endif
-	u16			sk_incoming_cpu;
-	/* 16bit hole
-	 * Warned : sk_incoming_cpu can be set from softirq,
-	 * Do not use this hole without fully understanding possible issues.
-	 */
 
 	__u32			sk_txhash;
 #ifdef CONFIG_NET_RX_BUSY_POLL
diff --git a/net/core/sock.c b/net/core/sock.c
index 7dd1263e4c24..1071f9380250 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -988,6 +988,10 @@  set_rcvbuf:
 					 sk->sk_max_pacing_rate);
 		break;
 
+	case SO_INCOMING_CPU:
+		sk->sk_incoming_cpu = val;
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -2353,6 +2357,7 @@  void sock_init_data(struct socket *sock, struct sock *sk)
 
 	sk->sk_max_pacing_rate = ~0U;
 	sk->sk_pacing_rate = ~0U;
+	sk->sk_incoming_cpu = -1;
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index bed8886a4b6c..08643a3616af 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -185,6 +185,8 @@  static inline int compute_score(struct sock *sk, struct net *net,
 				return -1;
 			score += 4;
 		}
+		if (sk->sk_incoming_cpu == raw_smp_processor_id())
+			score++;
 	}
 	return score;
 }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e1fc129099ea..24ec14f9825c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -375,7 +375,8 @@  static inline int compute_score(struct sock *sk, struct net *net,
 			return -1;
 		score += 4;
 	}
-
+	if (sk->sk_incoming_cpu == raw_smp_processor_id())
+		score++;
 	return score;
 }
 
@@ -419,6 +420,9 @@  static inline int compute_score2(struct sock *sk, struct net *net,
 		score += 4;
 	}
 
+	if (sk->sk_incoming_cpu == raw_smp_processor_id())
+		score++;
+
 	return score;
 }
 
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 6ac8dad0138a..21ace5a2bf7c 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -114,6 +114,8 @@  static inline int compute_score(struct sock *sk, struct net *net,
 				return -1;
 			score++;
 		}
+		if (sk->sk_incoming_cpu == raw_smp_processor_id())
+			score++;
 	}
 	return score;
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0aba654f5b91..01bcb49619ee 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -182,10 +182,12 @@  static inline int compute_score(struct sock *sk, struct net *net,
 		score++;
 	}
 
+	if (sk->sk_incoming_cpu == raw_smp_processor_id())
+		score++;
+
 	return score;
 }
 
-#define SCORE2_MAX (1 + 1 + 1)
 static inline int compute_score2(struct sock *sk, struct net *net,
 				 const struct in6_addr *saddr, __be16 sport,
 				 const struct in6_addr *daddr,
@@ -223,6 +225,9 @@  static inline int compute_score2(struct sock *sk, struct net *net,
 		score++;
 	}
 
+	if (sk->sk_incoming_cpu == raw_smp_processor_id())
+		score++;
+
 	return score;
 }
 
@@ -251,8 +256,7 @@  begin:
 				hash = udp6_ehashfn(net, daddr, hnum,
 						    saddr, sport);
 				matches = 1;
-			} else if (score == SCORE2_MAX)
-				goto exact_match;
+			}
 		} else if (score == badness && reuseport) {
 			matches++;
 			if (reciprocal_scale(hash, matches) == 0)
@@ -269,7 +273,6 @@  begin:
 		goto begin;
 
 	if (result) {
-exact_match:
 		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
 			result = NULL;
 		else if (unlikely(compute_score2(result, net, saddr, sport,