
[net-next] net: introduce SO_INCOMING_CPU

Message ID 1415393472.13896.119.camel@edumazet-glaptop2.roam.corp.google.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet Nov. 7, 2014, 8:51 p.m. UTC
From: Eric Dumazet <edumazet@google.com>

An alternative to RPS/RFS is to use hardware support for multiple queues,
then split a set of millions of sockets among worker threads, each
one using epoll() to manage events on its own socket pool.

Ideally, we want one thread per RX/TX queue/cpu, but we have no way to
know, after accept() or connect(), on which queue/cpu a socket is managed.

We normally use one cpu per RX queue (IRQ smp_affinity being properly
set), so remembering in the socket structure which cpu delivered the last
packet is enough to solve the problem.

After accept(), connect(), or even file descriptor passing between
processes, applications can use :

 int cpu;
 socklen_t len = sizeof(cpu);

 getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);

and use this information to put the socket into the right silo
for optimal performance: the whole networking stack then runs
on the appropriate cpu, without needing to send IPIs (RPS/RFS).

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 arch/alpha/include/uapi/asm/socket.h   |    2 ++
 arch/avr32/include/uapi/asm/socket.h   |    2 ++
 arch/cris/include/uapi/asm/socket.h    |    2 ++
 arch/frv/include/uapi/asm/socket.h     |    2 ++
 arch/ia64/include/uapi/asm/socket.h    |    2 ++
 arch/m32r/include/uapi/asm/socket.h    |    2 ++
 arch/mips/include/uapi/asm/socket.h    |    2 ++
 arch/mn10300/include/uapi/asm/socket.h |    2 ++
 arch/parisc/include/uapi/asm/socket.h  |    2 ++
 arch/powerpc/include/uapi/asm/socket.h |    2 ++
 arch/s390/include/uapi/asm/socket.h    |    2 ++
 arch/sparc/include/uapi/asm/socket.h   |    2 ++
 arch/xtensa/include/uapi/asm/socket.h  |    2 ++
 include/net/sock.h                     |   12 ++++++++++++
 include/uapi/asm-generic/socket.h      |    2 ++
 net/core/sock.c                        |    5 +++++
 net/ipv4/tcp_ipv4.c                    |    1 +
 net/ipv4/udp.c                         |    1 +
 net/ipv6/tcp_ipv6.c                    |    1 +
 net/ipv6/udp.c                         |    1 +
 net/sctp/ulpqueue.c                    |    5 +++--
 21 files changed, 52 insertions(+), 2 deletions(-)
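
For illustration only (not part of the patch): a minimal userspace sketch
of the "silo" model described in the commit message. The per-cpu table of
worker epoll fds is a hypothetical application structure, and the fallback
define uses the asm-generic value 49 introduced by this patch.

    #include <sys/socket.h>
    #include <sys/epoll.h>
    #include <unistd.h>

    #ifndef SO_INCOMING_CPU
    #define SO_INCOMING_CPU 49  /* asm-generic value from this patch */
    #endif

    /* Hypothetical: one epoll fd per worker thread, worker i pinned to cpu i. */
    extern int worker_epfd[];

    static void dispatch(int listen_fd)
    {
            int fd = accept(listen_fd, NULL, NULL);
            int cpu = 0;
            socklen_t len = sizeof(cpu);

            if (fd < 0)
                    return;

            /* Ask which cpu last processed incoming packets for this flow. */
            if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len) < 0)
                    cpu = 0;  /* older kernel: fall back to worker 0 */

            /* Hand the socket to the worker on that cpu, so softirq and
             * application processing share caches.
             */
            struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };
            if (epoll_ctl(worker_epfd[cpu], EPOLL_CTL_ADD, fd, &ev) < 0)
                    close(fd);
    }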




Comments

Tom Herbert Nov. 7, 2014, 11:02 p.m. UTC | #1
On Fri, Nov 7, 2014 at 12:51 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> [commit message and diffstat quoted above; trimmed]
> diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> index 3de1394bcab821984674e89a3ee022cc6dd5f0f2..e2fe0700b3b442bffc1f606b1b8b0bb7759aa157 100644
> --- a/arch/alpha/include/uapi/asm/socket.h
> +++ b/arch/alpha/include/uapi/asm/socket.h
> @@ -87,4 +87,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
> index 6e6cd159924b1855aa5f1811ad4e4c60b403c431..92121b0f5b989a61c008e0be24030725bab88e36 100644
> --- a/arch/avr32/include/uapi/asm/socket.h
> +++ b/arch/avr32/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI__ASM_AVR32_SOCKET_H */
> diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
> index ed94e5ed0a238c2750e677ccb806a6bc0a94041a..60f60f5b9b35bd219d7a9834fe5394e8ac5fdbab 100644
> --- a/arch/cris/include/uapi/asm/socket.h
> +++ b/arch/cris/include/uapi/asm/socket.h
> @@ -82,6 +82,8 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
>
>
> diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
> index ca2c6e6f31c6817780d31a246652adcc9847e373..2c6890209ea60c149bf097c2a1b369519cb8c301 100644
> --- a/arch/frv/include/uapi/asm/socket.h
> +++ b/arch/frv/include/uapi/asm/socket.h
> @@ -80,5 +80,7 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
>
> diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
> index a1b49bac7951929127ed08db549218c2c16ccf89..09a93fb566f6c6c6fe29c10c95b931881843d1cd 100644
> --- a/arch/ia64/include/uapi/asm/socket.h
> +++ b/arch/ia64/include/uapi/asm/socket.h
> @@ -89,4 +89,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_IA64_SOCKET_H */
> diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
> index 6c9a24b3aefa3a4f3048c17a7fa06d97b585ec14..e8589819c2743c6e112b15a245fc3ebd146e6313 100644
> --- a/arch/m32r/include/uapi/asm/socket.h
> +++ b/arch/m32r/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_M32R_SOCKET_H */
> diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
> index a14baa218c76f14de988ef106bdac5dadc48aceb..2e9ee8c55a103a0337d9f80f71fe9ef28be1154b 100644
> --- a/arch/mips/include/uapi/asm/socket.h
> +++ b/arch/mips/include/uapi/asm/socket.h
> @@ -98,4 +98,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
> index 6aa3ce1854aa9523d46bc28851eddabd59edeb37..f3492e8c9f7009c33e07168df916f7337bef3929 100644
> --- a/arch/mn10300/include/uapi/asm/socket.h
> +++ b/arch/mn10300/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
> diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
> index fe35ceacf0e72cad69a43d9b1ce7b8f5ec3da98a..7984a1cab3da980f1f810827967b4b67616eb89b 100644
> --- a/arch/parisc/include/uapi/asm/socket.h
> +++ b/arch/parisc/include/uapi/asm/socket.h
> @@ -79,4 +79,6 @@
>
>  #define SO_BPF_EXTENSIONS      0x4029
>
> +#define SO_INCOMING_CPU                0x402A
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
> index a9c3e2e18c054a1e952fe33599401de57c6a6544..3474e4ef166df4a573773916b325d0fa9f3b45d0 100644
> --- a/arch/powerpc/include/uapi/asm/socket.h
> +++ b/arch/powerpc/include/uapi/asm/socket.h
> @@ -87,4 +87,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_POWERPC_SOCKET_H */
> diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
> index e031332096d7c7b23b5953680289e8f3bcc3b378..8457636c33e1b67a9b7804daa05627839035a8fb 100644
> --- a/arch/s390/include/uapi/asm/socket.h
> +++ b/arch/s390/include/uapi/asm/socket.h
> @@ -86,4 +86,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
> diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
> index 54d9608681b6947ae25dab008f808841d96125c0..4a8003a9416348006cfa85d5bcdf7553c8d23958 100644
> --- a/arch/sparc/include/uapi/asm/socket.h
> +++ b/arch/sparc/include/uapi/asm/socket.h
> @@ -76,6 +76,8 @@
>
>  #define SO_BPF_EXTENSIONS      0x0032
>
> +#define SO_INCOMING_CPU                0x0033
> +
>  /* Security levels - as per NRL IPv6 - don't actually do anything */
>  #define SO_SECURITY_AUTHENTICATION             0x5001
>  #define SO_SECURITY_ENCRYPTION_TRANSPORT       0x5002
> diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
> index 39acec0cf0b1d500c1c40f9b523ef3a9a142c2f1..c46f6a696849c6f7f8a34b2cc522b48e04b17380 100644
> --- a/arch/xtensa/include/uapi/asm/socket.h
> +++ b/arch/xtensa/include/uapi/asm/socket.h
> @@ -91,4 +91,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _XTENSA_SOCKET_H */
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 6767d75ecb17693eb59a99b8218da4319854ccc0..7789b59c0c400eb99f65d1f0e03cd9773664cf93 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -273,6 +273,7 @@ struct cg_proto;
>    *    @sk_rcvtimeo: %SO_RCVTIMEO setting
>    *    @sk_sndtimeo: %SO_SNDTIMEO setting
>    *    @sk_rxhash: flow hash received from netif layer
> +  *    @sk_incoming_cpu: record cpu processing incoming packets
>    *    @sk_txhash: computed flow hash for use on transmit
>    *    @sk_filter: socket filtering instructions
>    *    @sk_protinfo: private area, net family specific, when not using slab
> @@ -350,6 +351,12 @@ struct sock {
>  #ifdef CONFIG_RPS
>         __u32                   sk_rxhash;
>  #endif
> +       u16                     sk_incoming_cpu;
> +       /* 16bit hole
> +        * Warned : sk_incoming_cpu can be set from softirq,
> +        * Do not use this hole without fully understanding possible issues.
> +        */
> +
>         __u32                   sk_txhash;
>  #ifdef CONFIG_NET_RX_BUSY_POLL
>         unsigned int            sk_napi_id;
> @@ -833,6 +840,11 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
>         return sk->sk_backlog_rcv(sk, skb);
>  }
>
> +static inline void sk_incoming_cpu_update(struct sock *sk)
> +{
> +       sk->sk_incoming_cpu = raw_smp_processor_id();
> +}
> +
>  static inline void sock_rps_record_flow_hash(__u32 hash)
>  {
>  #ifdef CONFIG_RPS
> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
> index ea0796bdcf88404ef0f127eb6e64ba00c16ea856..f541ccefd4acbeb4ad757be9dbf4b67f204bf21d 100644
> --- a/include/uapi/asm-generic/socket.h
> +++ b/include/uapi/asm-generic/socket.h
> @@ -82,4 +82,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* __ASM_GENERIC_SOCKET_H */
> diff --git a/net/core/sock.c b/net/core/sock.c
> index ac56dd06c306f3712e57ce8e4724c79565589499..0725cf0cb685787b2122606437da53299fb24621 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1213,6 +1213,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>                 v.val = sk->sk_max_pacing_rate;
>                 break;
>
> +       case SO_INCOMING_CPU:
> +               v.val = sk->sk_incoming_cpu;
> +               break;
> +
>         default:
>                 return -ENOPROTOOPT;
>         }
> @@ -1517,6 +1521,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
>
>                 newsk->sk_err      = 0;
>                 newsk->sk_priority = 0;
> +               newsk->sk_incoming_cpu = raw_smp_processor_id();
>                 /*
>                  * Before updating sk_refcnt, we must commit prior changes to memory
>                  * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 9c7d7621466b1241f404a5ca11de809dcff2d02a..3893f51972f28271a6d27a763c05495c5c2554f7 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1662,6 +1662,7 @@ process:
>                 goto discard_and_relse;
>
>         sk_mark_napi_id(sk, skb);
> +       sk_incoming_cpu_update(sk);

Would it be better to get the incoming CPU before packet steering? Or
maybe return this in addition to the CPU where the ULP processes the packet?

>         skb->dev = NULL;
>
>         bh_lock_sock_nested(sk);
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index df19027f44f3d6fbe13dec78d3b085968dbf2329..f52b6081158e87caa5df32e8e5d27dbf314a01b1 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1445,6 +1445,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>         if (inet_sk(sk)->inet_daddr) {
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_incoming_cpu_update(sk);
>         }
>
>         rc = sock_queue_rcv_skb(sk, skb);
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index ace29b60813cf8a1d7182ad2262cbcbd21810fa7..ac40d23204b5e55da5172c80dafd1d4854b370d5 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1455,6 +1455,7 @@ process:
>                 goto discard_and_relse;
>
>         sk_mark_napi_id(sk, skb);
> +       sk_incoming_cpu_update(sk);
>         skb->dev = NULL;
>
>         bh_lock_sock_nested(sk);
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 9b6809232b178c16d699ce3d152196b8c4cb096b..0125ca3daf47a4a3333e7462a11550d3e2f96875 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -577,6 +577,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>         if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_incoming_cpu_update(sk);
>         }
>
>         rc = sock_queue_rcv_skb(sk, skb);
> diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
> index d49dc2ed30adb97a809eb37902b9956c366a2862..ce469d648ffbe166f9ae1c5650f481256f31a7f8 100644
> --- a/net/sctp/ulpqueue.c
> +++ b/net/sctp/ulpqueue.c
> @@ -205,9 +205,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
>         if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
>                 goto out_free;
>
> -       if (!sctp_ulpevent_is_notification(event))
> +       if (!sctp_ulpevent_is_notification(event)) {
>                 sk_mark_napi_id(sk, skb);
> -
> +               sk_incoming_cpu_update(sk);
> +       }
>         /* Check if the user wishes to receive this event.  */
>         if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe))
>                 goto out_free;
Eric Dumazet Nov. 7, 2014, 11:26 p.m. UTC | #2
On Fri, 2014-11-07 at 15:02 -0800, Tom Herbert wrote:

> Would it be better to get the incoming CPU before packet steering? Or
> maybe return this in addition to the CPU where ULP processes packet?

No packet steering: RFS is off.

If for some reason RPS is on (say, the NIC has a limited number of RX
queues), we'd like to report the cpu that processes the packet in the
TCP stack, for better affinities.



David Miller Nov. 10, 2014, 8:08 p.m. UTC | #3
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 07 Nov 2014 12:51:12 -0800

> @@ -1455,6 +1455,7 @@ process:
>  		goto discard_and_relse;
>  
>  	sk_mark_napi_id(sk, skb);
> +	sk_incoming_cpu_update(sk);

Just make sk_mark_napi_id() call sk_incoming_cpu_update().

You've matched up the calls precisely in this patch, and I can't think
of any situation where we'd add an sk_mark_napi_id() call and not want to
do an sk_incoming_cpu_update().

If, alternatively, you want to have a helper function that does both
things and has a special name, that's fine too.
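
For concreteness, the combined helper suggested above might look like the
sketch below; the name is invented, while sk_mark_napi_id() already exists
in the tree and sk_incoming_cpu_update() comes from this patch.

    /* Hypothetical combined helper; the name is illustrative only. */
    static inline void sk_mark_napi_id_cpu(struct sock *sk, struct sk_buff *skb)
    {
            sk_mark_napi_id(sk, skb);       /* record NAPI id for busy polling */
            sk_incoming_cpu_update(sk);     /* record softirq cpu for SO_INCOMING_CPU */
    }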
Michael Kerrisk (man-pages) Nov. 14, 2014, 8:05 a.m. UTC | #4
Hi Eric,

Since this is an API change (see Documentation/SubmitChecklist),
linux-api@ should be CCed.

Thanks,

Michael



Eric Dumazet Nov. 14, 2014, 5 p.m. UTC | #5
On Fri, 2014-11-14 at 09:05 +0100, Michael Kerrisk wrote:
> Hi Eric,
> 
> Since this is an API change ( Documentation/SubmitChecklist),
> linux-api@ should be CCed.

Right, I am sorry!

Thanks.


Andy Lutomirski Nov. 14, 2014, 5:17 p.m. UTC | #6
On Fri, Nov 14, 2014 at 12:05 AM, Michael Kerrisk
<mtk.manpages@gmail.com> wrote:
> Hi Eric,
>
> Since this is an API change ( Documentation/SubmitChecklist),
> linux-api@ should be CCed.
>
> Thanks,
>
> Michael
>
>
>
> On Fri, Nov 7, 2014 at 9:51 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> [commit message quoted above; trimmed]
>> And use this information to put the socket into the right silo
>> for optimal performance, as all networking stack should run
>> on the appropriate cpu, without need to send IPI (RPS/RFS).

As a heavy user of RFS (and finder of bugs in it, too), here's my
question about this API:

How does an application tell whether the socket represents a
non-actively-steered flow?  If the flow is subject to RFS, then moving
the application handling to the socket's CPU seems problematic, as the
socket's CPU might move as well.  The current implementation in this
patch seems to tell me which CPU the most recent packet came in on,
which is not necessarily very useful.

Some possibilities:

1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.

2. Change the interface a bit to report the socket's preferred CPU
(where it would go without RFS, for example) and then let the
application use setsockopt to tell the socket to stay put (i.e. turn
off RFS and RPS for that flow).

3. Report the preferred CPU as in (2) but let the application ask for
something different.

For example, I have flows for which I know which CPU I want.  A nice
API to put the flow there would be quite useful.
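
A sketch of what option 2 above might look like from userspace;
SO_PIN_INCOMING_CPU is an invented name, and no such option exists:

    /* Hypothetical: read the preferred cpu, then pin the flow there. */
    int cpu;
    socklen_t len = sizeof(cpu);

    if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len) == 0)
            setsockopt(fd, SOL_SOCKET, SO_PIN_INCOMING_CPU, &cpu, sizeof(cpu));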


Also, it may be worth changing the naming to indicate that these are
about the rx cpu (they are, right?).  For some applications (sparse,
low-latency flows, for example), it can be useful to keep the tx
completion handling on a different CPU.

--Andy

Eric Dumazet Nov. 14, 2014, 7:33 p.m. UTC | #7
On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:

> As a heavy user of RFS (and finder of bugs in it, too), here's my
> question about this API:
> 
> How does an application tell whether the socket represents a
> non-actively-steered flow?  If the flow is subject to RFS, then moving
> the application handling to the socket's CPU seems problematic, as the
> socket's CPU might move as well.  The current implementation in this
> patch seems to tell me which CPU the most recent packet came in on,
> which is not necessarily very useful.

It's the cpu that hit the TCP stack, bringing dozens of cache lines into
its cache. That is all that matters.

> 
> Some possibilities:
> 
> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.

Well, the idea is to not use RFS at all; otherwise, it is useless.

RFS is the other way around: you want the flow to follow your thread.

RPS won't be a problem if you have sensible RPS settings.

> 
> 2. Change the interface a bit to report the socket's preferred CPU
> (where it would go without RFS, for example) and then let the
> application use setsockopt to tell the socket to stay put (i.e. turn
> off RFS and RPS for that flow).
> 
> 3. Report the preferred CPU as in (2) but let the application ask for
> something different.
> 
> For example, I have flows for which I know which CPU I want.  A nice
> API to put the flow there would be quite useful.
> 
> 
> Also, it may be worth changing the naming to indicate that these are
> about the rx cpu (they are, right?).  For some applications (sparse,
> low-latency flows, for example), it can be useful to keep the tx
> completion handling on a different CPU.

SO_INCOMING_CPU is rx, like incoming ;)


Tom Herbert Nov. 14, 2014, 7:52 p.m. UTC | #8
On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>
>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>> question about this API:
>>
>> How does an application tell whether the socket represents a
>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>> the application handling to the socket's CPU seems problematic, as the
>> socket's CPU might move as well.  The current implementation in this
>> patch seems to tell me which CPU the most recent packet came in on,
>> which is not necessarily very useful.
>
> Its the cpu that hit the TCP stack, bringing dozens of cache lines in
> its cache. This is all that matters,
>
>>
>> Some possibilities:
>>
>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>
> Well, idea is to not use RFS at all. Otherwise, it is useless.
>
Bear in mind this is only an interface to report the RX CPU; in itself it
doesn't provide any functionality for changing scheduling. There would
obviously need to be logic in user space that acts on the information.

If we track the interrupting CPU in the skb, the interface could easily be
extended to report the interrupting CPU, the RPS CPU (calculated at query
time), and the CPU processing the transport layer (post steering, which is
what is currently returned). That would give userspace the complete picture
needed to control scheduling of a flow, and an interface to selectively
turn off RFS for a socket would then make sense.
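
Concretely, such an extended report might be a struct returned by a new
option; everything below is hypothetical, sketching the three cpus
mentioned above, and none of it exists in the kernel:

    /* Hypothetical SO_INCOMING_CPU_INFO payload. */
    struct so_incoming_cpu_info {
            __u32   irq_cpu;        /* cpu the NIC interrupted on */
            __u32   rps_cpu;        /* cpu RPS would pick, computed at query time */
            __u32   xport_cpu;      /* cpu that ran the transport layer; what
                                     * SO_INCOMING_CPU returns today
                                     */
    };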

Andy Lutomirski Nov. 14, 2014, 8:16 p.m. UTC | #9
On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>
>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>> question about this API:
>>>
>>> How does an application tell whether the socket represents a
>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>> the application handling to the socket's CPU seems problematic, as the
>>> socket's CPU might move as well.  The current implementation in this
>>> patch seems to tell me which CPU the most recent packet came in on,
>>> which is not necessarily very useful.
>>
>> Its the cpu that hit the TCP stack, bringing dozens of cache lines in
>> its cache. This is all that matters,
>>
>>>
>>> Some possibilities:
>>>
>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>
>> Well, idea is to not use RFS at all. Otherwise, it is useless.

Sure, but how do I know that it'll be the same CPU next time?

>>
> Bear in mind this is only an interface to report RX CPU and in itself
> doesn't provide any functionality for changing scheduling, there is
> obviously logic needed in user space that would need to do something.
>
> If we track the interrupting CPU in skb, the interface could be easily
> extended to provide the interrupting CPU, the RPS CPU (calculated at
> reported time), and the CPU processing transport (post steering which
> is what is currently returned). That would provide the complete
> picture to control scheduling a flow from userspace, and an interface
> to selectively turn off RFS for a socket would make sense then.

I think that a turn-off-RFS interface would also want a way to figure
out where the flow would go without RFS.  Can the network stack do
that (e.g. evaluate the rx indirection hash or whatever happens these
days)?

>
>> RFS is the other way around : You want the flow to follow your thread.
>>
>> RPS wont be a problem if you have sensible RPS settings.
>>
>>>
>>> 2. Change the interface a bit to report the socket's preferred CPU
>>> (where it would go without RFS, for example) and then let the
>>> application use setsockopt to tell the socket to stay put (i.e. turn
>>> off RFS and RPS for that flow).
>>>
>>> 3. Report the preferred CPU as in (2) but let the application ask for
>>> something different.
>>>
>>> For example, I have flows for which I know which CPU I want.  A nice
>>> API to put the flow there would be quite useful.
>>>
>>>
>>> Also, it may be worth changing the naming to indicate that these are
>>> about the rx cpu (they are, right?).  For some applications (sparse,
>>> low-latency flows, for example), it can be useful to keep the tx
>>> completion handling on a different CPU.
>>
>> SO_INCOMING_CPU is rx, like incoming ;)
>>
>>

Duh :)

Tom Herbert Nov. 14, 2014, 8:25 p.m. UTC | #10
On Fri, Nov 14, 2014 at 12:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
>> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>>
>>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>>> question about this API:
>>>>
>>>> How does an application tell whether the socket represents a
>>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>>> the application handling to the socket's CPU seems problematic, as the
>>>> socket's CPU might move as well.  The current implementation in this
>>>> patch seems to tell me which CPU the most recent packet came in on,
>>>> which is not necessarily very useful.
>>>
>>> Its the cpu that hit the TCP stack, bringing dozens of cache lines in
>>> its cache. This is all that matters,
>>>
>>>>
>>>> Some possibilities:
>>>>
>>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>>
>>> Well, idea is to not use RFS at all. Otherwise, it is useless.
>
> Sure, but how do I know that it'll be the same CPU next time?
>
>>>
>> Bear in mind this is only an interface to report RX CPU and in itself
>> doesn't provide any functionality for changing scheduling, there is
>> obviously logic needed in user space that would need to do something.
>>
>> If we track the interrupting CPU in skb, the interface could be easily
>> extended to provide the interrupting CPU, the RPS CPU (calculated at
>> reported time), and the CPU processing transport (post steering which
>> is what is currently returned). That would provide the complete
>> picture to control scheduling a flow from userspace, and an interface
>> to selectively turn off RFS for a socket would make sense then.
>
> I think that a turn-off-RFS interface would also want a way to figure
> out where the flow would go without RFS.  Can the network stack do
> that (e.g. evaluate the rx indirection hash or whatever happens these
> days)?
>
Yes. We need the rxhash and the CPU that packets are received on from
the device for the socket. The former we already have; the latter
might be done by adding a field to the skbuff recording the receiving
CPU. Given the L4 hash and the interrupting CPU, we can calculate the
RPS CPU, which is where the packet would have landed with RFS off.
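
For reference, a rough model of that calculation (a sketch patterned on
the 3.18-era get_rps_cpu()/rps_map logic in net/core/dev.c; the struct
below is a stand-in for the kernel's rps_map, not an existing API):

#include <stdint.h>

/* Stand-in for the kernel's per-RX-queue rps_map, configured via
 * /sys/class/net/<dev>/queues/rx-<n>/rps_cpus.
 */
struct rps_map_model {
        unsigned int len;       /* number of CPUs enabled for this queue */
        uint16_t cpus[64];      /* the enabled CPU ids */
};

/* Where RPS would deliver a packet with this L4 hash, RFS aside. */
static int rps_cpu_for(const struct rps_map_model *map, uint32_t hash)
{
        if (!map->len)
                return -1;      /* RPS disabled on this RX queue */
        /* reciprocal_scale(): maps hash uniformly into [0, len) */
        return map->cpus[((uint64_t)hash * map->len) >> 32];
}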

>>
>>> RFS is the other way around : You want the flow to follow your thread.
>>>
>>> RPS wont be a problem if you have sensible RPS settings.
>>>
>>>>
>>>> 2. Change the interface a bit to report the socket's preferred CPU
>>>> (where it would go without RFS, for example) and then let the
>>>> application use setsockopt to tell the socket to stay put (i.e. turn
>>>> off RFS and RPS for that flow).
>>>>
>>>> 3. Report the preferred CPU as in (2) but let the application ask for
>>>> something different.
>>>>
>>>> For example, I have flows for which I know which CPU I want.  A nice
>>>> API to put the flow there would be quite useful.
>>>>
>>>>
>>>> Also, it may be worth changing the naming to indicate that these are
>>>> about the rx cpu (they are, right?).  For some applications (sparse,
>>>> low-latency flows, for example), it can be useful to keep the tx
>>>> completion handling on a different CPU.
>>>
>>> SO_INCOMING_CPU is rx, like incoming ;)
>>>
>>>
>
> Duh :)
>
>
>
>
> --
> Andy Lutomirski
> AMA Capital Management, LLC
Andy Lutomirski Nov. 14, 2014, 8:34 p.m. UTC | #11
On Fri, Nov 14, 2014 at 12:25 PM, Tom Herbert <therbert@google.com> wrote:
> On Fri, Nov 14, 2014 at 12:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
>>> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>>>
>>>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>>>> question about this API:
>>>>>
>>>>> How does an application tell whether the socket represents a
>>>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>>>> the application handling to the socket's CPU seems problematic, as the
>>>>> socket's CPU might move as well.  The current implementation in this
>>>>> patch seems to tell me which CPU the most recent packet came in on,
>>>>> which is not necessarily very useful.
>>>>
>>>> Its the cpu that hit the TCP stack, bringing dozens of cache lines in
>>>> its cache. This is all that matters,
>>>>
>>>>>
>>>>> Some possibilities:
>>>>>
>>>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>>>
>>>> Well, idea is to not use RFS at all. Otherwise, it is useless.
>>
>> Sure, but how do I know that it'll be the same CPU next time?
>>
>>>>
>>> Bear in mind this is only an interface to report RX CPU and in itself
>>> doesn't provide any functionality for changing scheduling, there is
>>> obviously logic needed in user space that would need to do something.
>>>
>>> If we track the interrupting CPU in skb, the interface could be easily
>>> extended to provide the interrupting CPU, the RPS CPU (calculated at
>>> reported time), and the CPU processing transport (post steering which
>>> is what is currently returned). That would provide the complete
>>> picture to control scheduling a flow from userspace, and an interface
>>> to selectively turn off RFS for a socket would make sense then.
>>
>> I think that a turn-off-RFS interface would also want a way to figure
>> out where the flow would go without RFS.  Can the network stack do
>> that (e.g. evaluate the rx indirection hash or whatever happens these
>> days)?
>>
> Yes,. We need the rxhash and the CPU that packets are received on from
> the device for the socket. The former we already have, the latter
> might be done by adding a field to skbuff to set received CPU. Given
> the L4 hash and interrupting CPU we can calculated the RPS CPU which
> is where packet would have landed with RFS off.

Hmm.  I think this would be useful for me.  It would *definitely* be
useful for me if I could pin an RFS flow to a cpu of my choice.

With SO_INCOMING_CPU as described, I'm worried that people will write
programs that perform very well if RFS is off, but that once that code
runs with RFS on, weird things could happen.

(On a side note: the RFS flow hash stuff seems to be rather buggy.
Some Solarflare engineers know about this, but a fix seems to be
rather slow in the works.  I think that some of the bugs are in core
code, though.)

--Andy

>
>>>
>>>> RFS is the other way around : You want the flow to follow your thread.
>>>>
>>>> RPS wont be a problem if you have sensible RPS settings.
>>>>
>>>>>
>>>>> 2. Change the interface a bit to report the socket's preferred CPU
>>>>> (where it would go without RFS, for example) and then let the
>>>>> application use setsockopt to tell the socket to stay put (i.e. turn
>>>>> off RFS and RPS for that flow).
>>>>>
>>>>> 3. Report the preferred CPU as in (2) but let the application ask for
>>>>> something different.
>>>>>
>>>>> For example, I have flows for which I know which CPU I want.  A nice
>>>>> API to put the flow there would be quite useful.
>>>>>
>>>>>
>>>>> Also, it may be worth changing the naming to indicate that these are
>>>>> about the rx cpu (they are, right?).  For some applications (sparse,
>>>>> low-latency flows, for example), it can be useful to keep the tx
>>>>> completion handling on a different CPU.
>>>>
>>>> SO_INCOMING_CPU is rx, like incoming ;)
>>>>
>>>>
>>
>> Duh :)
>>
>>
>>
>>
>> --
>> Andy Lutomirski
>> AMA Capital Management, LLC
Tom Herbert Nov. 14, 2014, 9:36 p.m. UTC | #12
On Fri, Nov 14, 2014 at 12:34 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Nov 14, 2014 at 12:25 PM, Tom Herbert <therbert@google.com> wrote:
>> On Fri, Nov 14, 2014 at 12:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
>>>> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>>>>
>>>>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>>>>> question about this API:
>>>>>>
>>>>>> How does an application tell whether the socket represents a
>>>>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>>>>> the application handling to the socket's CPU seems problematic, as the
>>>>>> socket's CPU might move as well.  The current implementation in this
>>>>>> patch seems to tell me which CPU the most recent packet came in on,
>>>>>> which is not necessarily very useful.
>>>>>
>>>>> Its the cpu that hit the TCP stack, bringing dozens of cache lines in
>>>>> its cache. This is all that matters,
>>>>>
>>>>>>
>>>>>> Some possibilities:
>>>>>>
>>>>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>>>>
>>>>> Well, idea is to not use RFS at all. Otherwise, it is useless.
>>>
>>> Sure, but how do I know that it'll be the same CPU next time?
>>>
>>>>>
>>>> Bear in mind this is only an interface to report RX CPU and in itself
>>>> doesn't provide any functionality for changing scheduling, there is
>>>> obviously logic needed in user space that would need to do something.
>>>>
>>>> If we track the interrupting CPU in skb, the interface could be easily
>>>> extended to provide the interrupting CPU, the RPS CPU (calculated at
>>>> reported time), and the CPU processing transport (post steering which
>>>> is what is currently returned). That would provide the complete
>>>> picture to control scheduling a flow from userspace, and an interface
>>>> to selectively turn off RFS for a socket would make sense then.
>>>
>>> I think that a turn-off-RFS interface would also want a way to figure
>>> out where the flow would go without RFS.  Can the network stack do
>>> that (e.g. evaluate the rx indirection hash or whatever happens these
>>> days)?
>>>
>> Yes,. We need the rxhash and the CPU that packets are received on from
>> the device for the socket. The former we already have, the latter
>> might be done by adding a field to skbuff to set received CPU. Given
>> the L4 hash and interrupting CPU we can calculated the RPS CPU which
>> is where packet would have landed with RFS off.
>
> Hmm.  I think this would be useful for me.  It would *definitely* be
> useful for me if I could pin an RFS flow to a cpu of my choice.
>
Andy, can you elaborate a little more on your use case? I've thought
several times about an interface to program the flow table from
userspace, but never quite came up with a compelling use case, and
there is the security concern that a user could "steal" cycles from
arbitrary CPUs.

> With SO_INCOMING_CPU as described, I'm worried that people will write
> programs that perform very well if RFS is off, but that once that code
> runs with RFS on, weird things could happen.
>
> (On a side note: the RFS flow hash stuff seems to be rather buggy.
> Some Solarflare engineers know about this, but a fix seems to be
> rather slow in the works.  I think that some of the bugs are in core
> code, though.)

Are these problems with accelerated RFS, or just with getting the flow hash for packets?

Thanks,
Tom

>
> --Andy
>
>>
>>>>
>>>>> RFS is the other way around : You want the flow to follow your thread.
>>>>>
>>>>> RPS wont be a problem if you have sensible RPS settings.
>>>>>
>>>>>>
>>>>>> 2. Change the interface a bit to report the socket's preferred CPU
>>>>>> (where it would go without RFS, for example) and then let the
>>>>>> application use setsockopt to tell the socket to stay put (i.e. turn
>>>>>> off RFS and RPS for that flow).
>>>>>>
>>>>>> 3. Report the preferred CPU as in (2) but let the application ask for
>>>>>> something different.
>>>>>>
>>>>>> For example, I have flows for which I know which CPU I want.  A nice
>>>>>> API to put the flow there would be quite useful.
>>>>>>
>>>>>>
>>>>>> Also, it may be worth changing the naming to indicate that these are
>>>>>> about the rx cpu (they are, right?).  For some applications (sparse,
>>>>>> low-latency flows, for example), it can be useful to keep the tx
>>>>>> completion handling on a different CPU.
>>>>>
>>>>> SO_INCOMING_CPU is rx, like incoming ;)
>>>>>
>>>>>
>>>
>>> Duh :)
>>>
>>>
>>>
>>>
>>> --
>>> Andy Lutomirski
>>> AMA Capital Management, LLC
>
>
>
> --
> Andy Lutomirski
> AMA Capital Management, LLC
Andy Lutomirski Nov. 14, 2014, 10:10 p.m. UTC | #13
On Fri, Nov 14, 2014 at 1:36 PM, Tom Herbert <therbert@google.com> wrote:
> On Fri, Nov 14, 2014 at 12:34 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Nov 14, 2014 at 12:25 PM, Tom Herbert <therbert@google.com> wrote:
>>> On Fri, Nov 14, 2014 at 12:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>> On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
>>>>> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>>>>>
>>>>>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>>>>>> question about this API:
>>>>>>>
>>>>>>> How does an application tell whether the socket represents a
>>>>>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>>>>>> the application handling to the socket's CPU seems problematic, as the
>>>>>>> socket's CPU might move as well.  The current implementation in this
>>>>>>> patch seems to tell me which CPU the most recent packet came in on,
>>>>>>> which is not necessarily very useful.
>>>>>>
>>>>>> Its the cpu that hit the TCP stack, bringing dozens of cache lines in
>>>>>> its cache. This is all that matters,
>>>>>>
>>>>>>>
>>>>>>> Some possibilities:
>>>>>>>
>>>>>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>>>>>
>>>>>> Well, idea is to not use RFS at all. Otherwise, it is useless.
>>>>
>>>> Sure, but how do I know that it'll be the same CPU next time?
>>>>
>>>>>>
>>>>> Bear in mind this is only an interface to report RX CPU and in itself
>>>>> doesn't provide any functionality for changing scheduling, there is
>>>>> obviously logic needed in user space that would need to do something.
>>>>>
>>>>> If we track the interrupting CPU in skb, the interface could be easily
>>>>> extended to provide the interrupting CPU, the RPS CPU (calculated at
>>>>> reported time), and the CPU processing transport (post steering which
>>>>> is what is currently returned). That would provide the complete
>>>>> picture to control scheduling a flow from userspace, and an interface
>>>>> to selectively turn off RFS for a socket would make sense then.
>>>>
>>>> I think that a turn-off-RFS interface would also want a way to figure
>>>> out where the flow would go without RFS.  Can the network stack do
>>>> that (e.g. evaluate the rx indirection hash or whatever happens these
>>>> days)?
>>>>
>>> Yes,. We need the rxhash and the CPU that packets are received on from
>>> the device for the socket. The former we already have, the latter
>>> might be done by adding a field to skbuff to set received CPU. Given
>>> the L4 hash and interrupting CPU we can calculated the RPS CPU which
>>> is where packet would have landed with RFS off.
>>
>> Hmm.  I think this would be useful for me.  It would *definitely* be
>> useful for me if I could pin an RFS flow to a cpu of my choice.
>>
> Andy, can you elaborate a little more on your use case. I've thought
> several times about an interface to program the flow table from
> userspace, but never quite came up with a compelling use case and
> there is the security concern that a user could "steal" cycles from
> arbitrary CPUs.

I have a bunch of threads that are pinned to various CPUs or groups of
CPUs.  Each thread is responsible for a fixed set of flows.  I'd like
those flows to go to those CPUs.

RFS will eventually do it, but it would be nice if I could
deterministically ask for a flow to be routed to the right CPU.  Also,
if my thread bounces temporarily to another CPU, I don't really need
the flow to follow it -- I'd like it to stay put.

This has a significant benefit over using automatic steering: with
automatic steering, I have to make all of the hash tables have a size
around the square of the total number of flows in order to make it
reliable (the birthday bound for keeping hash collisions unlikely).

Something like SO_STEER_TO_THIS_CPU would be fine, as long as it
reported whether it worked (for my diagnostics).

>
>> With SO_INCOMING_CPU as described, I'm worried that people will write
>> programs that perform very well if RFS is off, but that once that code
>> runs with RFS on, weird things could happen.
>>
>> (On a side note: the RFS flow hash stuff seems to be rather buggy.
>> Some Solarflare engineers know about this, but a fix seems to be
>> rather slow in the works.  I think that some of the bugs are in core
>> code, though.)
>
> This is problems with accelerated RFS or just getting the flow hash for packets?

Accelerated RFS.

Digging through my email: I thought that setting
net.core.rps_sock_flow_entries != the per-queue rps_flow_cnt would
make no sense, although I haven't configured it that way.

More importantly, though, I think that some of the hash table stuff is
problematic.  My understanding is:

get_rps_cpu may call set_rps_cpu, passing rflow =
&flow_table->flows[hash & flow_table->mask];

set_rps_cpu will compute flow_id = hash & flow_table->mask, which
looks to me like it has the property that rflow ==
&flow_table->flows[flow_id] (unless we race with a hash table resize).

Now set_rps_cpu tries to steer the new flow to the right CPU (all good
so far), but then it gets weird.  We have a very high probability of
old_rflow == rflow.  rflow->filter gets overwritten with the filter id,
the if condition doesn't execute, and nothing gets set to
RPS_NO_FILTER.
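
For reference, the path being described looks roughly like this
(trimmed from set_rps_cpu() in 3.18-era net/core/dev.c, comments
added; not a verbatim quote):

/* rxqueue here is the queue we are steering *to*, found via the CPU
 * rmap for next_cpu; rflow is the entry the caller looked up in the
 * old queue's table.
 */
rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
if (rxq_index == skb_get_rx_queue(skb))
        goto out;

rxqueue = dev->_rx + rxq_index;
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
        goto out;
flow_id = skb_get_hash(skb) & flow_table->mask;
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id);
if (rc < 0)
        goto out;
old_rflow = rflow;
rflow = &flow_table->flows[flow_id];    /* same index, target table */
rflow->filter = rc;
if (old_rflow->filter == rflow->filter)
        old_rflow->filter = RPS_NO_FILTER;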

This is technically all correct, but if there are two active flows
with the same hash, they'll each keep getting steered to the same
place.  This wastes cycles and seems to anger the sfc driver (the
latter is presumably an sfc bug).  It also means that some of the
filters are likely to get expired for no good reason.

Also, shouldn't ndo_rx_flow_steer be passed the full hash, not just
the index into the hash table?  What's the point of cutting off the
high bits?

--Andy

Eric Dumazet Nov. 14, 2014, 10:16 p.m. UTC | #14
On Fri, 2014-11-14 at 12:16 -0800, Andy Lutomirski wrote:

> Sure, but how do I know that it'll be the same CPU next time?

Because the NIC always uses the same RX queue for a given flow.

So if you set up your IRQ affinities properly, the same CPU will drain
packets from this RX queue. And since RFS is off, you have the
guarantee that the same CPU will be used to process packets in the TCP
stack.

This SO_INCOMING_CPU info is a hint; there is no guarantee, e.g. if you
use bonding and some load balancer or switch decides to send packets on
different links.

Most NICs use the Toeplitz hash, so given the 4-tuple and the RSS key
(40 bytes), you can actually compute the hash in software and know on
which RX queue the traffic should land.
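
For the curious, a sketch of that software computation (standard
Toeplitz as used by RSS; the key is what ethtool -x reports, and the
input layout assumed here is the usual IPv4/TCP one: saddr, daddr,
sport, dport, all big-endian):

#include <stdint.h>

static uint32_t toeplitz_hash(const uint8_t *key, const uint8_t *in, int len)
{
        uint32_t hash = 0;
        /* 32-bit window into the key, advanced one bit per input bit */
        uint32_t window = ((uint32_t)key[0] << 24) | (key[1] << 16) |
                          (key[2] << 8) | key[3];
        int i, b;

        for (i = 0; i < len; i++) {
                for (b = 7; b >= 0; b--) {
                        if (in[i] & (1 << b))
                                hash ^= window;
                        window <<= 1;
                        window |= (key[i + 4] >> b) & 1; /* next key bit */
                }
        }
        return hash;
}

The NIC then typically picks the RX queue by indexing its RSS
indirection table with the low bits of this hash (e.g. hash & 127).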


Andy Lutomirski Nov. 14, 2014, 10:18 p.m. UTC | #15
On Fri, Nov 14, 2014 at 2:16 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2014-11-14 at 12:16 -0800, Andy Lutomirski wrote:
>
>> Sure, but how do I know that it'll be the same CPU next time?
>
> Because the NIC always use same RX queue for a given flow.
>
> So if you setup your IRQ affinities properly, the same CPU will drain
> packets from this RX queue. And since RFS is off, you have the guarantee
> the same CPU will be used to process packets in TCP stack.

Right.  My concern is that if RFS is off, everything works fine, but
if RFS is on (and the app has no good way to know), then silly results
will happen.  I think I'd rather have the getsockopt fail if RFS is on
or at least give some indication so that the app can react
accordingly.

--Andy

>
> This SO_INCOMING_CPU info is a hint, there is no guarantee eg if you use
> bonding and some load balancer or switch decides to send packets on
> different links.
>
> Most NIC use Toeplitz hash, so given the 4-tuple, and rss key (40
> bytes), you can actually compute the hash in software and know on which
> RX queue traffic should land.
>
>
Eric Dumazet Nov. 14, 2014, 10:24 p.m. UTC | #16
On Fri, 2014-11-14 at 14:10 -0800, Andy Lutomirski wrote:

> I have a bunch of threads that are pinned to various CPUs or groups of
> CPUs.  Each thread is responsible for a fixed set of flows.  I'd like
> those flows to go to those CPUs.
> 
> RFS will eventually do it, but it would be nice if I could
> deterministically ask for a flow to be routed to the right CPU.  Also,
> if my thread bounces temporarily to another CPU, I don't really need
> the flow to follow it -- I'd like it to stay put.
> 
> This has a significant benefit over using automatic steering: with
> automatic steering, I have to make all of the hash tables have a size
> around the square of the total number of the flows in order to make it
> reliable.
> 
> Something like SO_STEER_TO_THIS_CPU would be fine, as long as it
> reported whether it worked (for my diagnostics).

This requires some kind of hardware support, and unfortunately this is
not generic.

With SO_INCOMING_CPU, you can simply pass socket fds around threads, so
a dumb RSS multiqueue NIC is OK (assuming you are not using some
encapsulation the NIC is unable to parse to find the L4 information).

Steering is a dream; I really think it's easier to build flows so that
their RX queue matches your requirements.

We can usually pick at least one element of the 4-tuple, so it's
actually possible to arrange this before connect().


Two cases:

1) Passive connections.

   After accept(), get SO_INCOMING_CPU, then pass the fd to the
appropriate thread of your pool (see the sketch below).

2) Active connections.

   Find a proper 4-tuple, bind(), then connect(). Afterwards, check
SO_INCOMING_CPU to verify your expectations.
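
A minimal sketch of case 1, assuming an application-side hand-off
mechanism (worker_for_cpu(), any_worker() and hand_off_fd() are made-up
helpers, e.g. backed by a per-worker queue or pipe):

int cfd = accept(listen_fd, NULL, NULL);

if (cfd >= 0) {
        int cpu = -1;
        socklen_t optlen = sizeof(cpu);

        if (getsockopt(cfd, SOL_SOCKET, SO_INCOMING_CPU,
                       &cpu, &optlen) == 0 && cpu >= 0)
                hand_off_fd(worker_for_cpu(cpu), cfd); /* right silo */
        else
                hand_off_fd(any_worker(), cfd);        /* no hint yet */
}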



Andy Lutomirski Nov. 14, 2014, 10:27 p.m. UTC | #17
On Fri, Nov 14, 2014 at 2:24 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2014-11-14 at 14:10 -0800, Andy Lutomirski wrote:
>
>> I have a bunch of threads that are pinned to various CPUs or groups of
>> CPUs.  Each thread is responsible for a fixed set of flows.  I'd like
>> those flows to go to those CPUs.
>>
>> RFS will eventually do it, but it would be nice if I could
>> deterministically ask for a flow to be routed to the right CPU.  Also,
>> if my thread bounces temporarily to another CPU, I don't really need
>> the flow to follow it -- I'd like it to stay put.
>>
>> This has a significant benefit over using automatic steering: with
>> automatic steering, I have to make all of the hash tables have a size
>> around the square of the total number of the flows in order to make it
>> reliable.
>>
>> Something like SO_STEER_TO_THIS_CPU would be fine, as long as it
>> reported whether it worked (for my diagnostics).
>
> This requires some kind of hardware support, and unfortunately this is
> not generic.
>
> With SO_INCOMING_CPU, you simply can pass fd of sockets around threads,
> so that a dumb RSS multiqueue NIC is OK (assuming you are not using some
> encapsulation that NIC is not able to parse to find L4 information)

I can't really do this.  It means that the performance of my system
will be wildly different every time I restart it.  I don't have enough
connections for everything to average out.

>
> Steering is a dream, I really think its easier to build flows so that
> their RX queue matches your requirements.

I have supporting hardware :)  I just want it to work without
programming the ntuple table myself.

>
> We usually can pick at least one element of the 4-tuple, so its actually
> possible to get this before connect().
>

Hmm.  An API for that would be quite nice :)

>
> Two cases :
>
> 1) Passive connections.
>
>    After accept(), get SO_INCOMING_CPU, then pass the fd to appropriate
> thread of your pool.
>
> 2) Active connections .
>    find a proper 4-tuple, bind() then connect(). Eventually check
> SO_INCOMING_CPU to verify your expectations.

The people at the other end will be really pissed if that results in
lots of reconnections.

--Andy

>
>
>
Eric Dumazet Nov. 14, 2014, 10:58 p.m. UTC | #18
On Fri, 2014-11-14 at 14:27 -0800, Andy Lutomirski wrote:

> The people at the other end will be really pissed if that results in
> lots of reconnections.

No reconnections necessary.

I believe you misunderstood: on the 4-tuple (SADDR,SPORT,DADDR,DPORT),
you can pick, for example, SPORT so that hash(SADDR,SPORT,DADDR,DPORT)
maps to a known and wanted RX queue number.

Once you know that, you use bind(SADDR, SPORT), then
connect(DADDR,DPORT).
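
A sketch of that search, reusing the software Toeplitz hash sketched
earlier; the queue-selection model (indirection table indexed by the
hash) and all names here are assumptions:

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* indir[] models the NIC's RSS indirection table (see ethtool -x). */
static uint16_t pick_sport(const uint8_t key[40], const uint8_t *indir,
                           unsigned int indir_len, uint32_t saddr_be,
                           uint32_t daddr_be, uint16_t dport_be,
                           unsigned int wanted_queue)
{
        uint8_t in[12];         /* saddr, daddr, sport, dport, big-endian */
        uint16_t sport;

        memcpy(in, &saddr_be, 4);
        memcpy(in + 4, &daddr_be, 4);
        memcpy(in + 10, &dport_be, 2);

        for (sport = 32768; sport != 0; sport++) {  /* ephemeral range */
                uint16_t sport_be = htons(sport);
                uint32_t hash;

                memcpy(in + 8, &sport_be, 2);
                hash = toeplitz_hash(key, in, sizeof(in));
                if (indir[hash % indir_len] == wanted_queue)
                        return sport;  /* bind() this, then connect() */
        }
        return 0;       /* no ephemeral port maps to that queue */
}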

Anyway, if your hardware is able to cope with your small number of
flows, just use the hardware and be happy.

Here we want about 10 million sockets; there is little hope of
hardware being helpful.


Andy Lutomirski Nov. 14, 2014, 11:03 p.m. UTC | #19
On Fri, Nov 14, 2014 at 2:58 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2014-11-14 at 14:27 -0800, Andy Lutomirski wrote:
>
>> The people at the other end will be really pissed if that results in
>> lots of reconnections.
>
> No reconnections necessary.
>
> I believe you misunderstood : On the 4-tuple (SADDR,SPORT,DADDR,DPORT),
> you can pick for example SPORT so that hash(SADDR,SPORT,DADDR,DPORT)
> maps to a known and wanted RX queue number.
>
> Once you know that, you use bind(SADDR, SPORT), then
> connect(DADDR,DPORT).

If the kernel had an API for this, I'd be all for using it.

>
> Anyway, if your hardware is able to cope with the few number of flows,
> just use the hardware and be happy.
>
> Here we want about 10 millions sockets, there is little hope for
> hardware being helpful.
>

It's the intermediate numbers that are bad.

With ten flows, the current accelerated RFS works fine.  With 10M
flows, RFS is a lost cause and this solution is much nicer.  With,
say, 1k flows, accelerated RFS *deserves* to work perfectly, because
the hardware has enough filter slots.  But making it work reliably
requires a ridiculously large hash table, and collisions cause silent
bad behavior.

--Andy

>
Eric Dumazet Nov. 14, 2014, 11:32 p.m. UTC | #20
On Fri, 2014-11-14 at 15:03 -0800, Andy Lutomirski wrote:

> If the kernel had an API for this, I'd be all for using it.

It would be userland code, not kernel.

Why do a system call when you can avoid it? ;)

Doing it in the kernel would be quite complex, actually.

In userland, you can even implement this with machine learning.

For every connection you make, you get the 4-tuple and
SO_INCOMING_CPU, then store them in a cache.

Next time you need to connect to the same remote peer, you can look up
the cache to find a good candidate.
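
A minimal sketch of such a cache (names and sizing are assumptions; a
real one would age entries and handle collisions):

#include <stdint.h>

struct peer_hint {
        uint32_t daddr;         /* peer address, network order */
        uint16_t dport;         /* peer port, network order */
        uint16_t sport;         /* local port that worked well */
        int      cpu;           /* SO_INCOMING_CPU observed for it */
};

static struct peer_hint hint_cache[4096];

static struct peer_hint *hint_for(uint32_t daddr, uint16_t dport)
{
        return &hint_cache[(daddr ^ dport) & 4095];
}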

It would actually be good if we could do, in a single socket verb, a
bind_and_connect(fd, &source, &destination).

Andy Lutomirski Nov. 14, 2014, 11:40 p.m. UTC | #21
On Fri, Nov 14, 2014 at 3:32 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2014-11-14 at 15:03 -0800, Andy Lutomirski wrote:
>
>> If the kernel had an API for this, I'd be all for using it.
>
> It would be user land code, not kernel.
>
> Why doing a system call when you can avoid it ? ;)
>
> Doing it in the kernel would be quite complex actually.

A semi-official library would work, too.  As long as it kept working.

>
> In userland, you can even implement this by machine learning.
>
> For every connection you made, you get the 4-tuple and INCOMING_CPU,
> then you store it in a cache.

Eww :(

>
> Next time you need to connect to same remote peer, you can lookup in the
> cache to find a good candidate.
>
> It would actually be good if we could do in a single socket verb a
> bind_and_connect(fd, &source, &destination)

Fair enough.  Although for my use case, the time it takes to connect
is completely irrelevant.

--Andy

>
>
Tom Herbert Nov. 15, 2014, 12:06 a.m. UTC | #22
On Fri, Nov 14, 2014 at 2:10 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Nov 14, 2014 at 1:36 PM, Tom Herbert <therbert@google.com> wrote:
>> On Fri, Nov 14, 2014 at 12:34 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Fri, Nov 14, 2014 at 12:25 PM, Tom Herbert <therbert@google.com> wrote:
>>>> On Fri, Nov 14, 2014 at 12:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>> On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
>>>>>> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>>>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>>>>>>
>>>>>>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>>>>>>> question about this API:
>>>>>>>>
>>>>>>>> How does an application tell whether the socket represents a
>>>>>>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>>>>>>> the application handling to the socket's CPU seems problematic, as the
>>>>>>>> socket's CPU might move as well.  The current implementation in this
>>>>>>>> patch seems to tell me which CPU the most recent packet came in on,
>>>>>>>> which is not necessarily very useful.
>>>>>>>
>>>>>>> Its the cpu that hit the TCP stack, bringing dozens of cache lines in
>>>>>>> its cache. This is all that matters,
>>>>>>>
>>>>>>>>
>>>>>>>> Some possibilities:
>>>>>>>>
>>>>>>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>>>>>>
>>>>>>> Well, idea is to not use RFS at all. Otherwise, it is useless.
>>>>>
>>>>> Sure, but how do I know that it'll be the same CPU next time?
>>>>>
>>>>>>>
>>>>>> Bear in mind this is only an interface to report RX CPU and in itself
>>>>>> doesn't provide any functionality for changing scheduling, there is
>>>>>> obviously logic needed in user space that would need to do something.
>>>>>>
>>>>>> If we track the interrupting CPU in skb, the interface could be easily
>>>>>> extended to provide the interrupting CPU, the RPS CPU (calculated at
>>>>>> reported time), and the CPU processing transport (post steering which
>>>>>> is what is currently returned). That would provide the complete
>>>>>> picture to control scheduling a flow from userspace, and an interface
>>>>>> to selectively turn off RFS for a socket would make sense then.
>>>>>
>>>>> I think that a turn-off-RFS interface would also want a way to figure
>>>>> out where the flow would go without RFS.  Can the network stack do
>>>>> that (e.g. evaluate the rx indirection hash or whatever happens these
>>>>> days)?
>>>>>
>>>> Yes,. We need the rxhash and the CPU that packets are received on from
>>>> the device for the socket. The former we already have, the latter
>>>> might be done by adding a field to skbuff to set received CPU. Given
>>>> the L4 hash and interrupting CPU we can calculated the RPS CPU which
>>>> is where packet would have landed with RFS off.
>>>
>>> Hmm.  I think this would be useful for me.  It would *definitely* be
>>> useful for me if I could pin an RFS flow to a cpu of my choice.
>>>
>> Andy, can you elaborate a little more on your use case. I've thought
>> several times about an interface to program the flow table from
>> userspace, but never quite came up with a compelling use case and
>> there is the security concern that a user could "steal" cycles from
>> arbitrary CPUs.
>
> I have a bunch of threads that are pinned to various CPUs or groups of
> CPUs.  Each thread is responsible for a fixed set of flows.  I'd like
> those flows to go to those CPUs.
>
> RFS will eventually do it, but it would be nice if I could
> deterministically ask for a flow to be routed to the right CPU.  Also,
> if my thread bounces temporarily to another CPU, I don't really need
> the flow to follow it -- I'd like it to stay put.
>
Okay, how about we add a SO_RFS_LOCK_FLOW sockopt? When this is
called on a socket, we lock the socket-to-CPU binding to the current
CPU it is called from. It could be unlocked at a later point.
Would this satisfy your requirements?
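
Hypothetical usage of the proposed option (SO_RFS_LOCK_FLOW is only a
proposal here, not an existing sockopt; the semantics below are just
the ones suggested above):

int on = 1;

/* The worker thread is already pinned to its CPU, e.g. via
 * pthread_setaffinity_np(); lock the flow's RFS target to it here.
 */
if (setsockopt(fd, SOL_SOCKET, SO_RFS_LOCK_FLOW, &on, sizeof(on)) < 0)
        perror("SO_RFS_LOCK_FLOW");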

> This has a significant benefit over using automatic steering: with
> automatic steering, I have to make all of the hash tables have a size
> around the square of the total number of the flows in order to make it
> reliable.
>
> Something like SO_STEER_TO_THIS_CPU would be fine, as long as it
> reported whether it worked (for my diagnostics).
>
>>
>>> With SO_INCOMING_CPU as described, I'm worried that people will write
>>> programs that perform very well if RFS is off, but that once that code
>>> runs with RFS on, weird things could happen.
>>>
>>> (On a side note: the RFS flow hash stuff seems to be rather buggy.
>>> Some Solarflare engineers know about this, but a fix seems to be
>>> rather slow in the works.  I think that some of the bugs are in core
>>> code, though.)
>>
>> This is problems with accelerated RFS or just getting the flow hash for packets?
>
> Accelerated RFS.
>
> Digging through my email, I thought that
> net.core.rps_sock_flow_entries != the per-queue rps_flow_cnt would
> make no sense, although I haven't configured it that way.
>
> More importantly, though, I think that some of the has table stuff is
> problematic.  My understanding is:
>
> get_rps_cpu may call set_rps_cpu, passing rflow =
> flow_table->flows[hash & flow_table->mask];
>
> set_rps_cpu will compute flow_id = hash & flow_table->mask, which
> looks to me like it has the property that rflow == flow_table[hash]
> (unless we race with a hash table resize).
>
> Now set_rps_cpu tries to steer the new flow to the right CPU (all good
> so far), but then it gets weird.  We have a very high probability of
> old_flow == rflow.  rflow->filter gets overwritten with the filter id,
> the if condition doesn't execute, and nothing gets set to
> RPS_NO_FILTER.
>
> This is technically all correct, but if there are two active flows
> with the same hash, they'll each keep getting steered to the same
> place.  This wastes cycles and seems to anger the sfc driver (the
> latter is presumably an sfc bug).  It also means that some of the
> filters are likely to get expired for no good reason.
>
Yes, I could see where persistent hash collisions could cause an issue
with aRFS. IIRC, I saw programming the filters to be quite an
expensive operation. Minimally, it seems like there should be some
rate limiting to avoid hysteresis.

> Also, shouldn't ndo_rx_flow_steer be passed the full hash, not just
> the index into the hash table?  What's the point of cutting off the
> high bits?
>
In order to make aRFS transparent to the user, it was implemented to
be driven from the RFS table, which hashes based on the mask; that is
why it is limited. An alternative would be to program the filters
directly from a socket using the full hash; that interface might
already exist in ntuple filtering, I suppose.

> --Andy
Tom Herbert Nov. 15, 2014, 12:15 a.m. UTC | #23
On Fri, Nov 14, 2014 at 2:18 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Nov 14, 2014 at 2:16 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On Fri, 2014-11-14 at 12:16 -0800, Andy Lutomirski wrote:
>>
>>> Sure, but how do I know that it'll be the same CPU next time?
>>
>> Because the NIC always use same RX queue for a given flow.
>>
>> So if you setup your IRQ affinities properly, the same CPU will drain
>> packets from this RX queue. And since RFS is off, you have the guarantee
>> the same CPU will be used to process packets in TCP stack.
>
> Right.  My concern is that if RFS is off, everything works fine, but
> if RFS is on (and the app has no good way to know), then silly results
> will happen.  I think I'd rather have the getsockopt fail if RFS is on
> or at least give some indication so that the app can react
> accordingly.
>
As I mentioned, there is no material functionality in this patch and
it should be independent of RFS. It simply returns the CPU where the
stack processed the packet. Whether or not this is meaningful
information to the algorithm being implemented in userspace is
completely up to the caller to decide.

Tom

> --Andy
>
>>
>> This SO_INCOMING_CPU info is a hint, there is no guarantee eg if you use
>> bonding and some load balancer or switch decides to send packets on
>> different links.
>>
>> Most NIC use Toeplitz hash, so given the 4-tuple, and rss key (40
>> bytes), you can actually compute the hash in software and know on which
>> RX queue traffic should land.
>>
>>
>
>
>
> --
> Andy Lutomirski
> AMA Capital Management, LLC
Andy Lutomirski Nov. 15, 2014, 12:24 a.m. UTC | #24
On Fri, Nov 14, 2014 at 4:06 PM, Tom Herbert <therbert@google.com> wrote:
> On Fri, Nov 14, 2014 at 2:10 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Nov 14, 2014 at 1:36 PM, Tom Herbert <therbert@google.com> wrote:
>>> On Fri, Nov 14, 2014 at 12:34 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>> On Fri, Nov 14, 2014 at 12:25 PM, Tom Herbert <therbert@google.com> wrote:
>>>>> On Fri, Nov 14, 2014 at 12:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>> On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
>>>>>>> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>>>>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>>>>>>>
>>>>>>>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>>>>>>>> question about this API:
>>>>>>>>>
>>>>>>>>> How does an application tell whether the socket represents a
>>>>>>>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>>>>>>>> the application handling to the socket's CPU seems problematic, as the
>>>>>>>>> socket's CPU might move as well.  The current implementation in this
>>>>>>>>> patch seems to tell me which CPU the most recent packet came in on,
>>>>>>>>> which is not necessarily very useful.
>>>>>>>>
>>>>>>>> Its the cpu that hit the TCP stack, bringing dozens of cache lines in
>>>>>>>> its cache. This is all that matters,
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Some possibilities:
>>>>>>>>>
>>>>>>>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>>>>>>>
>>>>>>>> Well, idea is to not use RFS at all. Otherwise, it is useless.
>>>>>>
>>>>>> Sure, but how do I know that it'll be the same CPU next time?
>>>>>>
>>>>>>>>
>>>>>>> Bear in mind this is only an interface to report RX CPU and in itself
>>>>>>> doesn't provide any functionality for changing scheduling, there is
>>>>>>> obviously logic needed in user space that would need to do something.
>>>>>>>
>>>>>>> If we track the interrupting CPU in skb, the interface could be easily
>>>>>>> extended to provide the interrupting CPU, the RPS CPU (calculated at
>>>>>>> reported time), and the CPU processing transport (post steering which
>>>>>>> is what is currently returned). That would provide the complete
>>>>>>> picture to control scheduling a flow from userspace, and an interface
>>>>>>> to selectively turn off RFS for a socket would make sense then.
>>>>>>
>>>>>> I think that a turn-off-RFS interface would also want a way to figure
>>>>>> out where the flow would go without RFS.  Can the network stack do
>>>>>> that (e.g. evaluate the rx indirection hash or whatever happens these
>>>>>> days)?
>>>>>>
>>>>> Yes,. We need the rxhash and the CPU that packets are received on from
>>>>> the device for the socket. The former we already have, the latter
>>>>> might be done by adding a field to skbuff to set received CPU. Given
>>>>> the L4 hash and interrupting CPU we can calculated the RPS CPU which
>>>>> is where packet would have landed with RFS off.
>>>>
>>>> Hmm.  I think this would be useful for me.  It would *definitely* be
>>>> useful for me if I could pin an RFS flow to a cpu of my choice.
>>>>
>>> Andy, can you elaborate a little more on your use case. I've thought
>>> several times about an interface to program the flow table from
>>> userspace, but never quite came up with a compelling use case and
>>> there is the security concern that a user could "steal" cycles from
>>> arbitrary CPUs.
>>
>> I have a bunch of threads that are pinned to various CPUs or groups of
>> CPUs.  Each thread is responsible for a fixed set of flows.  I'd like
>> those flows to go to those CPUs.
>>
>> RFS will eventually do it, but it would be nice if I could
>> deterministically ask for a flow to be routed to the right CPU.  Also,
>> if my thread bounces temporarily to another CPU, I don't really need
>> the flow to follow it -- I'd like it to stay put.
>>
> Okay, how about it we have a SO_RFS_LOCK_FLOW sockopt. When this is
> called on a socket we can lock the socket to CPU binding to the
> current CPU it is called from. It could be unlocked at a later point.
> Would this satisfy your requirements?

Yes, I think.  Especially if it bypassed the hash table.

>
>> This has a significant benefit over using automatic steering: with
>> automatic steering, I have to make all of the hash tables have a size
>> around the square of the total number of the flows in order to make it
>> reliable.
>>
>> Something like SO_STEER_TO_THIS_CPU would be fine, as long as it
>> reported whether it worked (for my diagnostics).
>>
>>>
>>>> With SO_INCOMING_CPU as described, I'm worried that people will write
>>>> programs that perform very well if RFS is off, but that once that code
>>>> runs with RFS on, weird things could happen.
>>>>
>>>> (On a side note: the RFS flow hash stuff seems to be rather buggy.
>>>> Some Solarflare engineers know about this, but a fix seems to be
>>>> rather slow in the works.  I think that some of the bugs are in core
>>>> code, though.)
>>>
>>> This is problems with accelerated RFS or just getting the flow hash for packets?
>>
>> Accelerated RFS.
>>
>> Digging through my email, I thought that
>> net.core.rps_sock_flow_entries != the per-queue rps_flow_cnt would
>> make no sense, although I haven't configured it that way.
>>
>> More importantly, though, I think that some of the has table stuff is
>> problematic.  My understanding is:
>>
>> get_rps_cpu may call set_rps_cpu, passing rflow =
>> flow_table->flows[hash & flow_table->mask];
>>
>> set_rps_cpu will compute flow_id = hash & flow_table->mask, which
>> looks to me like it has the property that rflow == flow_table[hash]
>> (unless we race with a hash table resize).
>>
>> Now set_rps_cpu tries to steer the new flow to the right CPU (all good
>> so far), but then it gets weird.  We have a very high probability of
>> old_flow == rflow.  rflow->filter gets overwritten with the filter id,
>> the if condition doesn't execute, and nothing gets set to
>> RPS_NO_FILTER.
>>
>> This is technically all correct, but if there are two active flows
>> with the same hash, they'll each keep getting steered to the same
>> place.  This wastes cycles and seems to anger the sfc driver (the
>> latter is presumably an sfc bug).  It also means that some of the
>> filters are likely to get expired for no good reason.
>>
> Yes, I could see where persistent hash collisions could cause an issue
> with aRFS. IIRC, I saw programming the filters to be quite an
> expensive operation. Minimally if seems like there some rate limiting
> change to avoid hysteresis.

They're especially expensive, given that they often generate printks
for me due to the sfc bug.


> As I mentioned, there is no material functionality in this patch and
> it should be independent of RFS. It simply returns the CPU where the
> stack processed the packet. Whether or not this is meaningful
> information to the algorithm being implemented in userspace is
> completely up to the caller to decide.

Agreed.

My only concern is that writing that userspace algorithm might result
in surprises if RFS is on.  Having the user program notice the problem
early and alert the admin might help keep Murphy's Law at bay here.

--Andy
Tom Herbert Nov. 15, 2014, 12:40 a.m. UTC | #25
On Fri, Nov 14, 2014 at 4:24 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Nov 14, 2014 at 4:06 PM, Tom Herbert <therbert@google.com> wrote:
>> On Fri, Nov 14, 2014 at 2:10 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Fri, Nov 14, 2014 at 1:36 PM, Tom Herbert <therbert@google.com> wrote:
>>>> On Fri, Nov 14, 2014 at 12:34 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>> On Fri, Nov 14, 2014 at 12:25 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>> On Fri, Nov 14, 2014 at 12:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>>> On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
>>>>>>>> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>>>>>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>>>>>>>>
>>>>>>>>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>>>>>>>>> question about this API:
>>>>>>>>>>
>>>>>>>>>> How does an application tell whether the socket represents a
>>>>>>>>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>>>>>>>>> the application handling to the socket's CPU seems problematic, as the
>>>>>>>>>> socket's CPU might move as well.  The current implementation in this
>>>>>>>>>> patch seems to tell me which CPU the most recent packet came in on,
>>>>>>>>>> which is not necessarily very useful.
>>>>>>>>>
>>>>>>>>> Its the cpu that hit the TCP stack, bringing dozens of cache lines in
>>>>>>>>> its cache. This is all that matters,
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Some possibilities:
>>>>>>>>>>
>>>>>>>>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>>>>>>>>
>>>>>>>>> Well, idea is to not use RFS at all. Otherwise, it is useless.
>>>>>>>
>>>>>>> Sure, but how do I know that it'll be the same CPU next time?
>>>>>>>
>>>>>>>>>
>>>>>>>> Bear in mind this is only an interface to report RX CPU and in itself
>>>>>>>> doesn't provide any functionality for changing scheduling, there is
>>>>>>>> obviously logic needed in user space that would need to do something.
>>>>>>>>
>>>>>>>> If we track the interrupting CPU in skb, the interface could be easily
>>>>>>>> extended to provide the interrupting CPU, the RPS CPU (calculated at
>>>>>>>> reported time), and the CPU processing transport (post steering which
>>>>>>>> is what is currently returned). That would provide the complete
>>>>>>>> picture to control scheduling a flow from userspace, and an interface
>>>>>>>> to selectively turn off RFS for a socket would make sense then.
>>>>>>>
>>>>>>> I think that a turn-off-RFS interface would also want a way to figure
>>>>>>> out where the flow would go without RFS.  Can the network stack do
>>>>>>> that (e.g. evaluate the rx indirection hash or whatever happens these
>>>>>>> days)?
>>>>>>>
>>>>>> Yes,. We need the rxhash and the CPU that packets are received on from
>>>>>> the device for the socket. The former we already have, the latter
>>>>>> might be done by adding a field to skbuff to set received CPU. Given
>>>>>> the L4 hash and interrupting CPU we can calculated the RPS CPU which
>>>>>> is where packet would have landed with RFS off.
>>>>>
>>>>> Hmm.  I think this would be useful for me.  It would *definitely* be
>>>>> useful for me if I could pin an RFS flow to a cpu of my choice.
>>>>>
>>>> Andy, can you elaborate a little more on your use case. I've thought
>>>> several times about an interface to program the flow table from
>>>> userspace, but never quite came up with a compelling use case and
>>>> there is the security concern that a user could "steal" cycles from
>>>> arbitrary CPUs.
>>>
>>> I have a bunch of threads that are pinned to various CPUs or groups of
>>> CPUs.  Each thread is responsible for a fixed set of flows.  I'd like
>>> those flows to go to those CPUs.
>>>
>>> RFS will eventually do it, but it would be nice if I could
>>> deterministically ask for a flow to be routed to the right CPU.  Also,
>>> if my thread bounces temporarily to another CPU, I don't really need
>>> the flow to follow it -- I'd like it to stay put.
>>>
>> Okay, how about it we have a SO_RFS_LOCK_FLOW sockopt. When this is
>> called on a socket we can lock the socket to CPU binding to the
>> current CPU it is called from. It could be unlocked at a later point.
>> Would this satisfy your requirements?
>
> Yes, I think.  Especially if it bypassed the hash table.

Unfortunately we can't easily bypass the hash table. The only way I
know of to do that is to perform the socket lookup to do steering
(I tried that early on, but it was pretty costly).

>
>>
>>> This has a significant benefit over using automatic steering: with
>>> automatic steering, I have to make all of the hash tables have a size
>>> around the square of the total number of the flows in order to make it
>>> reliable.
>>>
>>> Something like SO_STEER_TO_THIS_CPU would be fine, as long as it
>>> reported whether it worked (for my diagnostics).
>>>
>>>>
>>>>> With SO_INCOMING_CPU as described, I'm worried that people will write
>>>>> programs that perform very well if RFS is off, but that once that code
>>>>> runs with RFS on, weird things could happen.
>>>>>
>>>>> (On a side note: the RFS flow hash stuff seems to be rather buggy.
>>>>> Some Solarflare engineers know about this, but a fix seems to be
>>>>> rather slow in the works.  I think that some of the bugs are in core
>>>>> code, though.)
>>>>
>>>> This is problems with accelerated RFS or just getting the flow hash for packets?
>>>
>>> Accelerated RFS.
>>>
>>> Digging through my email, I thought that
>>> net.core.rps_sock_flow_entries != the per-queue rps_flow_cnt would
>>> make no sense, although I haven't configured it that way.
>>>
>>> More importantly, though, I think that some of the has table stuff is
>>> problematic.  My understanding is:
>>>
>>> get_rps_cpu may call set_rps_cpu, passing rflow =
>>> flow_table->flows[hash & flow_table->mask];
>>>
>>> set_rps_cpu will compute flow_id = hash & flow_table->mask, which
>>> looks to me like it has the property that rflow == flow_table[hash]
>>> (unless we race with a hash table resize).
>>>
>>> Now set_rps_cpu tries to steer the new flow to the right CPU (all good
>>> so far), but then it gets weird.  We have a very high probability of
>>> old_flow == rflow.  rflow->filter gets overwritten with the filter id,
>>> the if condition doesn't execute, and nothing gets set to
>>> RPS_NO_FILTER.
>>>
>>> This is technically all correct, but if there are two active flows
>>> with the same hash, they'll each keep getting steered to the same
>>> place.  This wastes cycles and seems to anger the sfc driver (the
>>> latter is presumably an sfc bug).  It also means that some of the
>>> filters are likely to get expired for no good reason.
>>>
>> Yes, I could see where persistent hash collisions could cause an issue
>> with aRFS. IIRC, I saw programming the filters to be quite an
>> expensive operation. Minimally if seems like there some rate limiting
>> change to avoid hysteresis.
>
> They're especially expensive, given that they often generate printks
> for me due to the sfc bug.
>
:-)

>
>> As I mentioned, there is no material functionality in this patch and
>> it should be independent of RFS. It simply returns the CPU where the
>> stack processed the packet. Whether or not this is meaningful
>> information to the algorithm being implemented in userspace is
>> completely up to the caller to decide.
>
> Agreed.
>
> My only concern is that writing that userspace algorithm might result
> in surprises if RFS is on.  Having the user program notice the problem
> early and alert the admin might help keep Murphy's Law at bay here.
>
By Murphy's law we'd also have to consider that the flow hash could
change after reading the results, so that the scheduling done in
userspace is completely wrong until the CPU is read again.
Synchronizing kernel and device state with userspace state is not
always so easy. One way to mitigate this is to use ancillary data,
which would provide real-time information and obviate the need for
another system call.
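
A sketch of what the ancillary-data variant could look like from
userspace (hypothetical: SCM_INCOMING_CPU is an assumed name, not an
existing cmsg type):

struct msghdr msg = { 0 };
char buf[2048], cbuf[CMSG_SPACE(sizeof(int))];
struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
struct cmsghdr *cmsg;
int cpu = -1;

msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = cbuf;
msg.msg_controllen = sizeof(cbuf);

if (recvmsg(fd, &msg, 0) >= 0) {
        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
             cmsg = CMSG_NXTHDR(&msg, cmsg)) {
                if (cmsg->cmsg_level == SOL_SOCKET &&
                    cmsg->cmsg_type == SCM_INCOMING_CPU)
                        memcpy(&cpu, CMSG_DATA(cmsg), sizeof(cpu));
        }
}
/* cpu now holds the CPU that serviced this datagram, if reported */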


> --Andy
>
>
> --
> Andy Lutomirski
> AMA Capital Management, LLC
Andy Lutomirski Nov. 15, 2014, 12:50 a.m. UTC | #26
On Fri, Nov 14, 2014 at 4:40 PM, Tom Herbert <therbert@google.com> wrote:
> On Fri, Nov 14, 2014 at 4:24 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Nov 14, 2014 at 4:06 PM, Tom Herbert <therbert@google.com> wrote:
>>> On Fri, Nov 14, 2014 at 2:10 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>> On Fri, Nov 14, 2014 at 1:36 PM, Tom Herbert <therbert@google.com> wrote:
>>>>> On Fri, Nov 14, 2014 at 12:34 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>> On Fri, Nov 14, 2014 at 12:25 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>>> On Fri, Nov 14, 2014 at 12:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>>>> On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
>>>>>>>>> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>>>>>>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>>>>>>>>>
>>>>>>>>>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>>>>>>>>>> question about this API:
>>>>>>>>>>>
>>>>>>>>>>> How does an application tell whether the socket represents a
>>>>>>>>>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>>>>>>>>>> the application handling to the socket's CPU seems problematic, as the
>>>>>>>>>>> socket's CPU might move as well.  The current implementation in this
>>>>>>>>>>> patch seems to tell me which CPU the most recent packet came in on,
>>>>>>>>>>> which is not necessarily very useful.
>>>>>>>>>>
>>>>>>>>>> Its the cpu that hit the TCP stack, bringing dozens of cache lines in
>>>>>>>>>> its cache. This is all that matters,
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Some possibilities:
>>>>>>>>>>>
>>>>>>>>>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>>>>>>>>>
>>>>>>>>>> Well, idea is to not use RFS at all. Otherwise, it is useless.
>>>>>>>>
>>>>>>>> Sure, but how do I know that it'll be the same CPU next time?
>>>>>>>>
>>>>>>>>>>
>>>>>>>>> Bear in mind this is only an interface to report RX CPU and in itself
>>>>>>>>> doesn't provide any functionality for changing scheduling, there is
>>>>>>>>> obviously logic needed in user space that would need to do something.
>>>>>>>>>
>>>>>>>>> If we track the interrupting CPU in skb, the interface could be easily
>>>>>>>>> extended to provide the interrupting CPU, the RPS CPU (calculated at
>>>>>>>>> reported time), and the CPU processing transport (post steering which
>>>>>>>>> is what is currently returned). That would provide the complete
>>>>>>>>> picture to control scheduling a flow from userspace, and an interface
>>>>>>>>> to selectively turn off RFS for a socket would make sense then.
>>>>>>>>
>>>>>>>> I think that a turn-off-RFS interface would also want a way to figure
>>>>>>>> out where the flow would go without RFS.  Can the network stack do
>>>>>>>> that (e.g. evaluate the rx indirection hash or whatever happens these
>>>>>>>> days)?
>>>>>>>>
>>>>>>> Yes. We need the rxhash and the CPU that packets are received on from
>>>>>>> the device for the socket. The former we already have; the latter
>>>>>>> might be done by adding a field to skbuff to set the received CPU. Given
>>>>>>> the L4 hash and interrupting CPU we can calculate the RPS CPU, which
>>>>>>> is where the packet would have landed with RFS off.
>>>>>>
>>>>>> Hmm.  I think this would be useful for me.  It would *definitely* be
>>>>>> useful for me if I could pin an RFS flow to a cpu of my choice.
>>>>>>
>>>>> Andy, can you elaborate a little more on your use case. I've thought
>>>>> several times about an interface to program the flow table from
>>>>> userspace, but never quite came up with a compelling use case and
>>>>> there is the security concern that a user could "steal" cycles from
>>>>> arbitrary CPUs.
>>>>
>>>> I have a bunch of threads that are pinned to various CPUs or groups of
>>>> CPUs.  Each thread is responsible for a fixed set of flows.  I'd like
>>>> those flows to go to those CPUs.
>>>>
>>>> RFS will eventually do it, but it would be nice if I could
>>>> deterministically ask for a flow to be routed to the right CPU.  Also,
>>>> if my thread bounces temporarily to another CPU, I don't really need
>>>> the flow to follow it -- I'd like it to stay put.
>>>>
>>> Okay, how about we have a SO_RFS_LOCK_FLOW sockopt? When this is
>>> called on a socket we can lock the socket-to-CPU binding to the
>>> current CPU it is called from. It could be unlocked at a later point.
>>> Would this satisfy your requirements?
>>
>> Yes, I think.  Especially if it bypassed the hash table.
>
> Unfortunately we can't easily bypass the hash table. The only way I
> know of to do that is to perform the socket lookup to do steering
> (I tried that early on, but it was pretty costly).

What happens if you just call ndo_rx_flow_steer and do something to
keep the result from expiring?
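
For reference, the aRFS driver hook in question, as declared in struct
net_device_ops (include/linux/netdevice.h, under CONFIG_RFS_ACCEL); it
returns the hardware filter id on success or a negative error:

 #ifdef CONFIG_RFS_ACCEL
 	int	(*ndo_rx_flow_steer)(struct net_device *dev,
 				     const struct sk_buff *skb,
 				     u16 rxq_index,
 				     u32 flow_id);
 #endif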

>>
>>> As I mentioned, there is no material functionality in this patch and
>>> it should be independent of RFS. It simply returns the CPU where the
>>> stack processed the packet. Whether or not this is meaningful
>>> information to the algorithm being implemented in userspace is
>>> completely up to the caller to decide.
>>
>> Agreed.
>>
>> My only concern is that writing that userspace algorithm might result
>> in surprises if RFS is on.  Having the user program notice the problem
>> early and alert the admin might help keep Murphy's Law at bay here.
>>
> By Murphy's law we'd also have to consider that the flow hash could
> change after reading the results so that the scheduling done in
> userspace is completely wrong until the CPU is read again.
> Synchronizing kernel and device state with userspace state is not
> always so easy. One way to mitigate this is to use ancillary data, which
> would provide real-time information and obviate the need for another
> system call.

Hmm.  That would work, too.  I don't know how annoyed user code would
be at having to read ancillary data, though.

The flow hash really shouldn't change much, though, right?
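
On the earlier point about having the program notice the problem and
alert the admin: a minimal sketch, assuming the standard sysctl from
Documentation/networking/scaling.txt (rps_sock_flow_entries stays zero
unless RFS has been configured):

 #include <stdio.h>

 /* Warn the admin if RFS is configured system-wide, since the CPU
  * reported by SO_INCOMING_CPU may then follow the application around.
  */
 static void warn_if_rfs_enabled(void)
 {
 	unsigned long entries = 0;
 	FILE *f = fopen("/proc/sys/net/core/rps_sock_flow_entries", "r");

 	if (f) {
 		if (fscanf(f, "%lu", &entries) != 1)
 			entries = 0;
 		fclose(f);
 	}
 	if (entries)
 		fprintf(stderr, "warning: RFS enabled (%lu entries); "
 			"SO_INCOMING_CPU may not be stable\n", entries);
 }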

--Andy
Tom Herbert Nov. 15, 2014, 6:41 p.m. UTC | #27
On Fri, Nov 14, 2014 at 4:50 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Nov 14, 2014 at 4:40 PM, Tom Herbert <therbert@google.com> wrote:
>> On Fri, Nov 14, 2014 at 4:24 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Fri, Nov 14, 2014 at 4:06 PM, Tom Herbert <therbert@google.com> wrote:
>>>> On Fri, Nov 14, 2014 at 2:10 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>> On Fri, Nov 14, 2014 at 1:36 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>> On Fri, Nov 14, 2014 at 12:34 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>>> On Fri, Nov 14, 2014 at 12:25 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>>>> On Fri, Nov 14, 2014 at 12:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>>>>> On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
>>>>>>>>>> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>>>>>>>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>>>>>>>>>>
>>>>>>>>>>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>>>>>>>>>>> question about this API:
>>>>>>>>>>>>
>>>>>>>>>>>> How does an application tell whether the socket represents a
>>>>>>>>>>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>>>>>>>>>>> the application handling to the socket's CPU seems problematic, as the
>>>>>>>>>>>> socket's CPU might move as well.  The current implementation in this
>>>>>>>>>>>> patch seems to tell me which CPU the most recent packet came in on,
>>>>>>>>>>>> which is not necessarily very useful.
>>>>>>>>>>>
>>>>>>>>>>> It's the cpu that hit the TCP stack, bringing dozens of cache lines in
>>>>>>>>>>> its cache. This is all that matters.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Some possibilities:
>>>>>>>>>>>>
>>>>>>>>>>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>>>>>>>>>>
>>>>>>>>>>> Well, the idea is to not use RFS at all. Otherwise, it is useless.
>>>>>>>>>
>>>>>>>>> Sure, but how do I know that it'll be the same CPU next time?
>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>> Bear in mind this is only an interface to report the RX CPU, and in itself
>>>>>>>>>> doesn't provide any functionality for changing scheduling; there is
>>>>>>>>>> obviously logic needed in user space that would need to do something.
>>>>>>>>>>
>>>>>>>>>> If we track the interrupting CPU in skb, the interface could be easily
>>>>>>>>>> extended to provide the interrupting CPU, the RPS CPU (calculated at
>>>>>>>>>> reported time), and the CPU processing transport (post-steering, which
>>>>>>>>>> is what is currently returned). That would provide the complete
>>>>>>>>>> picture to control scheduling a flow from userspace, and an interface
>>>>>>>>>> to selectively turn off RFS for a socket would make sense then.
>>>>>>>>>
>>>>>>>>> I think that a turn-off-RFS interface would also want a way to figure
>>>>>>>>> out where the flow would go without RFS.  Can the network stack do
>>>>>>>>> that (e.g. evaluate the rx indirection hash or whatever happens these
>>>>>>>>> days)?
>>>>>>>>>
>>>>>>>> Yes. We need the rxhash and the CPU that packets are received on from
>>>>>>>> the device for the socket. The former we already have; the latter
>>>>>>>> might be done by adding a field to skbuff to set the received CPU. Given
>>>>>>>> the L4 hash and interrupting CPU we can calculate the RPS CPU, which
>>>>>>>> is where the packet would have landed with RFS off.
>>>>>>>
>>>>>>> Hmm.  I think this would be useful for me.  It would *definitely* be
>>>>>>> useful for me if I could pin an RFS flow to a cpu of my choice.
>>>>>>>
>>>>>> Andy, can you elaborate a little more on your use case. I've thought
>>>>>> several times about an interface to program the flow table from
>>>>>> userspace, but never quite came up with a compelling use case and
>>>>>> there is the security concern that a user could "steal" cycles from
>>>>>> arbitrary CPUs.
>>>>>
>>>>> I have a bunch of threads that are pinned to various CPUs or groups of
>>>>> CPUs.  Each thread is responsible for a fixed set of flows.  I'd like
>>>>> those flows to go to those CPUs.
>>>>>
>>>>> RFS will eventually do it, but it would be nice if I could
>>>>> deterministically ask for a flow to be routed to the right CPU.  Also,
>>>>> if my thread bounces temporarily to another CPU, I don't really need
>>>>> the flow to follow it -- I'd like it to stay put.
>>>>>
>>>> Okay, how about we have a SO_RFS_LOCK_FLOW sockopt? When this is
>>>> called on a socket we can lock the socket-to-CPU binding to the
>>>> current CPU it is called from. It could be unlocked at a later point.
>>>> Would this satisfy your requirements?
>>>
>>> Yes, I think.  Especially if it bypassed the hash table.
>>
>> Unfortunately we can't easily bypass the hash table. The only way I
>> know of to do that is to perform the socket lookup to do steering
>> (I tried that early on, but it was pretty costly).
>
> What happens if you just call ndo_rx_flow_steer and do something to
> keep the result from expiring?
>
Okay, I will look at that. Do you know how many flows we are talking
about, both in the number you need and the number that can be put in
the HW without collision?

Thanks,
Tom

>>>
>>>> As I mentioned, there is no material functionality in this patch and
>>>> it should be independent of RFS. It simply returns the CPU where the
>>>> stack processed the packet. Whether or not this is meaningful
>>>> information to the algorithm being implemented in userspace is
>>>> completely up to the caller to decide.
>>>
>>> Agreed.
>>>
>>> My only concern is that writing that userspace algorithm might result
>>> in surprises if RFS is on.  Having the user program notice the problem
>>> early and alert the admin might help keep Murphy's Law at bay here.
>>>
>> By Murphy's law we'd also have to consider that the flow hash could
>> change after reading the results so that the scheduling done in
>> userspace is completely wrong until the CPU is read again.
>> Synchronizing kernel and device state with userspace state is not
>> always so easy. One way to mitigate this is to use ancillary data, which
>> would provide real-time information and obviate the need for another
>> system call.
>
> Hmm.  That would work, too.  I don't know how annoyed user code would
> be at having to read ancillary data, though.
>
> The flow hash really shouldn't change much, though, right?
>
> --Andy
Andy Lutomirski Nov. 15, 2014, 6:47 p.m. UTC | #28
On Sat, Nov 15, 2014 at 10:41 AM, Tom Herbert <therbert@google.com> wrote:
> On Fri, Nov 14, 2014 at 4:50 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Nov 14, 2014 at 4:40 PM, Tom Herbert <therbert@google.com> wrote:
>>> On Fri, Nov 14, 2014 at 4:24 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>> On Fri, Nov 14, 2014 at 4:06 PM, Tom Herbert <therbert@google.com> wrote:
>>>>> On Fri, Nov 14, 2014 at 2:10 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>> On Fri, Nov 14, 2014 at 1:36 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>>> On Fri, Nov 14, 2014 at 12:34 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>>>> On Fri, Nov 14, 2014 at 12:25 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>>>>> On Fri, Nov 14, 2014 at 12:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>>>>>> On Fri, Nov 14, 2014 at 11:52 AM, Tom Herbert <therbert@google.com> wrote:
>>>>>>>>>>> On Fri, Nov 14, 2014 at 11:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>>>>>>>>> On Fri, 2014-11-14 at 09:17 -0800, Andy Lutomirski wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>> As a heavy user of RFS (and finder of bugs in it, too), here's my
>>>>>>>>>>>>> question about this API:
>>>>>>>>>>>>>
>>>>>>>>>>>>> How does an application tell whether the socket represents a
>>>>>>>>>>>>> non-actively-steered flow?  If the flow is subject to RFS, then moving
>>>>>>>>>>>>> the application handling to the socket's CPU seems problematic, as the
>>>>>>>>>>>>> socket's CPU might move as well.  The current implementation in this
>>>>>>>>>>>>> patch seems to tell me which CPU the most recent packet came in on,
>>>>>>>>>>>>> which is not necessarily very useful.
>>>>>>>>>>>>
>>>>>>>>>>>> It's the cpu that hit the TCP stack, bringing dozens of cache lines in
>>>>>>>>>>>> its cache. This is all that matters.
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Some possibilities:
>>>>>>>>>>>>>
>>>>>>>>>>>>> 1. Let SO_INCOMING_CPU fail if RFS or RPS are in play.
>>>>>>>>>>>>
>>>>>>>>>>>> Well, the idea is to not use RFS at all. Otherwise, it is useless.
>>>>>>>>>>
>>>>>>>>>> Sure, but how do I know that it'll be the same CPU next time?
>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>> Bear in mind this is only an interface to report the RX CPU, and in itself
>>>>>>>>>>> doesn't provide any functionality for changing scheduling; there is
>>>>>>>>>>> obviously logic needed in user space that would need to do something.
>>>>>>>>>>>
>>>>>>>>>>> If we track the interrupting CPU in skb, the interface could be easily
>>>>>>>>>>> extended to provide the interrupting CPU, the RPS CPU (calculated at
>>>>>>>>>>> reported time), and the CPU processing transport (post-steering, which
>>>>>>>>>>> is what is currently returned). That would provide the complete
>>>>>>>>>>> picture to control scheduling a flow from userspace, and an interface
>>>>>>>>>>> to selectively turn off RFS for a socket would make sense then.
>>>>>>>>>>
>>>>>>>>>> I think that a turn-off-RFS interface would also want a way to figure
>>>>>>>>>> out where the flow would go without RFS.  Can the network stack do
>>>>>>>>>> that (e.g. evaluate the rx indirection hash or whatever happens these
>>>>>>>>>> days)?
>>>>>>>>>>
>>>>>>>>> Yes. We need the rxhash and the CPU that packets are received on from
>>>>>>>>> the device for the socket. The former we already have; the latter
>>>>>>>>> might be done by adding a field to skbuff to set the received CPU. Given
>>>>>>>>> the L4 hash and interrupting CPU we can calculate the RPS CPU, which
>>>>>>>>> is where the packet would have landed with RFS off.
>>>>>>>>
>>>>>>>> Hmm.  I think this would be useful for me.  It would *definitely* be
>>>>>>>> useful for me if I could pin an RFS flow to a cpu of my choice.
>>>>>>>>
>>>>>>> Andy, can you elaborate a little more on your use case. I've thought
>>>>>>> several times about an interface to program the flow table from
>>>>>>> userspace, but never quite came up with a compelling use case and
>>>>>>> there is the security concern that a user could "steal" cycles from
>>>>>>> arbitrary CPUs.
>>>>>>
>>>>>> I have a bunch of threads that are pinned to various CPUs or groups of
>>>>>> CPUs.  Each thread is responsible for a fixed set of flows.  I'd like
>>>>>> those flows to go to those CPUs.
>>>>>>
>>>>>> RFS will eventually do it, but it would be nice if I could
>>>>>> deterministically ask for a flow to be routed to the right CPU.  Also,
>>>>>> if my thread bounces temporarily to another CPU, I don't really need
>>>>>> the flow to follow it -- I'd like it to stay put.
>>>>>>
>>>>> Okay, how about we have a SO_RFS_LOCK_FLOW sockopt? When this is
>>>>> called on a socket we can lock the socket-to-CPU binding to the
>>>>> current CPU it is called from. It could be unlocked at a later point.
>>>>> Would this satisfy your requirements?
>>>>
>>>> Yes, I think.  Especially if it bypassed the hash table.
>>>
>>> Unfortunately we can't easily bypass the hash table. The only way I
>>> know of to do that is to perform the socket lookup to do steering
>>> (I tried that early on, but it was pretty costly).
>>
>> What happens if you just call ndo_rx_flow_steer and do something to
>> keep the result from expiring?
>>
> Okay, I will look at that. Do you know how many flows we are talking
> about, both in the number you need and the number that can be put in
> the HW without collision?

I think I have on the order of 100 flows.  Maybe 400.  It's been a few
months since I checked, and this particular metric is much easier to
measure on a weekday.

I think the HW has several thousand slots.

--Andy
diff mbox

Patch

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 3de1394bcab821984674e89a3ee022cc6dd5f0f2..e2fe0700b3b442bffc1f606b1b8b0bb7759aa157 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -87,4 +87,6 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 6e6cd159924b1855aa5f1811ad4e4c60b403c431..92121b0f5b989a61c008e0be24030725bab88e36 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -80,4 +80,6 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
index ed94e5ed0a238c2750e677ccb806a6bc0a94041a..60f60f5b9b35bd219d7a9834fe5394e8ac5fdbab 100644
--- a/arch/cris/include/uapi/asm/socket.h
+++ b/arch/cris/include/uapi/asm/socket.h
@@ -82,6 +82,8 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif /* _ASM_SOCKET_H */
 
 
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index ca2c6e6f31c6817780d31a246652adcc9847e373..2c6890209ea60c149bf097c2a1b369519cb8c301 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -80,5 +80,7 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index a1b49bac7951929127ed08db549218c2c16ccf89..09a93fb566f6c6c6fe29c10c95b931881843d1cd 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -89,4 +89,6 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 6c9a24b3aefa3a4f3048c17a7fa06d97b585ec14..e8589819c2743c6e112b15a245fc3ebd146e6313 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -80,4 +80,6 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index a14baa218c76f14de988ef106bdac5dadc48aceb..2e9ee8c55a103a0337d9f80f71fe9ef28be1154b 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -98,4 +98,6 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index 6aa3ce1854aa9523d46bc28851eddabd59edeb37..f3492e8c9f7009c33e07168df916f7337bef3929 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -80,4 +80,6 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index fe35ceacf0e72cad69a43d9b1ce7b8f5ec3da98a..7984a1cab3da980f1f810827967b4b67616eb89b 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -79,4 +79,6 @@ 
 
 #define SO_BPF_EXTENSIONS	0x4029
 
+#define SO_INCOMING_CPU		0x402A
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index a9c3e2e18c054a1e952fe33599401de57c6a6544..3474e4ef166df4a573773916b325d0fa9f3b45d0 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -87,4 +87,6 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index e031332096d7c7b23b5953680289e8f3bcc3b378..8457636c33e1b67a9b7804daa05627839035a8fb 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -86,4 +86,6 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 54d9608681b6947ae25dab008f808841d96125c0..4a8003a9416348006cfa85d5bcdf7553c8d23958 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -76,6 +76,8 @@ 
 
 #define SO_BPF_EXTENSIONS	0x0032
 
+#define SO_INCOMING_CPU		0x0033
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index 39acec0cf0b1d500c1c40f9b523ef3a9a142c2f1..c46f6a696849c6f7f8a34b2cc522b48e04b17380 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -91,4 +91,6 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 6767d75ecb17693eb59a99b8218da4319854ccc0..7789b59c0c400eb99f65d1f0e03cd9773664cf93 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -273,6 +273,7 @@  struct cg_proto;
   *	@sk_rcvtimeo: %SO_RCVTIMEO setting
   *	@sk_sndtimeo: %SO_SNDTIMEO setting
   *	@sk_rxhash: flow hash received from netif layer
+  *	@sk_incoming_cpu: record cpu processing incoming packets
   *	@sk_txhash: computed flow hash for use on transmit
   *	@sk_filter: socket filtering instructions
   *	@sk_protinfo: private area, net family specific, when not using slab
@@ -350,6 +351,12 @@  struct sock {
 #ifdef CONFIG_RPS
 	__u32			sk_rxhash;
 #endif
+	u16			sk_incoming_cpu;
+	/* 16bit hole
+	 * Warning: sk_incoming_cpu can be set from softirq,
+	 * Do not use this hole without fully understanding possible issues.
+	 */
+
 	__u32			sk_txhash;
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	unsigned int		sk_napi_id;
@@ -833,6 +840,11 @@  static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 	return sk->sk_backlog_rcv(sk, skb);
 }
 
+static inline void sk_incoming_cpu_update(struct sock *sk)
+{
+	sk->sk_incoming_cpu = raw_smp_processor_id();
+}
+
 static inline void sock_rps_record_flow_hash(__u32 hash)
 {
 #ifdef CONFIG_RPS
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index ea0796bdcf88404ef0f127eb6e64ba00c16ea856..f541ccefd4acbeb4ad757be9dbf4b67f204bf21d 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -82,4 +82,6 @@ 
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_INCOMING_CPU		49
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index ac56dd06c306f3712e57ce8e4724c79565589499..0725cf0cb685787b2122606437da53299fb24621 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1213,6 +1213,10 @@  int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sk->sk_max_pacing_rate;
 		break;
 
+	case SO_INCOMING_CPU:
+		v.val = sk->sk_incoming_cpu;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -1517,6 +1521,7 @@  struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
+		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 9c7d7621466b1241f404a5ca11de809dcff2d02a..3893f51972f28271a6d27a763c05495c5c2554f7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1662,6 +1662,7 @@  process:
 		goto discard_and_relse;
 
 	sk_mark_napi_id(sk, skb);
+	sk_incoming_cpu_update(sk);
 	skb->dev = NULL;
 
 	bh_lock_sock_nested(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index df19027f44f3d6fbe13dec78d3b085968dbf2329..f52b6081158e87caa5df32e8e5d27dbf314a01b1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1445,6 +1445,7 @@  static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (inet_sk(sk)->inet_daddr) {
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
+		sk_incoming_cpu_update(sk);
 	}
 
 	rc = sock_queue_rcv_skb(sk, skb);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ace29b60813cf8a1d7182ad2262cbcbd21810fa7..ac40d23204b5e55da5172c80dafd1d4854b370d5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1455,6 +1455,7 @@  process:
 		goto discard_and_relse;
 
 	sk_mark_napi_id(sk, skb);
+	sk_incoming_cpu_update(sk);
 	skb->dev = NULL;
 
 	bh_lock_sock_nested(sk);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9b6809232b178c16d699ce3d152196b8c4cb096b..0125ca3daf47a4a3333e7462a11550d3e2f96875 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -577,6 +577,7 @@  static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
+		sk_incoming_cpu_update(sk);
 	}
 
 	rc = sock_queue_rcv_skb(sk, skb);
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index d49dc2ed30adb97a809eb37902b9956c366a2862..ce469d648ffbe166f9ae1c5650f481256f31a7f8 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -205,9 +205,10 @@  int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
 	if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
 		goto out_free;
 
-	if (!sctp_ulpevent_is_notification(event))
+	if (!sctp_ulpevent_is_notification(event)) {
 		sk_mark_napi_id(sk, skb);
-
+		sk_incoming_cpu_update(sk);
+	}
 	/* Check if the user wishes to receive this event.  */
 	if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe))
 		goto out_free;
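
As a usage note for the patch above: a minimal sketch of the dispatch
step the changelog describes, putting each accepted socket into the
silo matching its reported CPU. queue_push() is a hypothetical
application-side helper, not part of this patch:

 #include <sys/types.h>
 #include <sys/socket.h>

 #ifndef SO_INCOMING_CPU
 #define SO_INCOMING_CPU 49	/* from this patch; absent in older headers */
 #endif

 /* Hypothetical helper: hands fd to the worker thread pinned to cpu. */
 void queue_push(int cpu, int fd);

 /* Route each accepted socket to the silo matching the CPU that last
  * processed it in the stack.
  */
 void dispatch_loop(int listen_fd, int nr_cpus)
 {
 	for (;;) {
 		int cpu = 0;
 		socklen_t len = sizeof(cpu);
 		int fd = accept(listen_fd, NULL, NULL);

 		if (fd < 0)
 			continue;
 		if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len) ||
 		    cpu < 0 || cpu >= nr_cpus)
 			cpu = 0;	/* fall back to the first silo */
 		queue_push(cpu, fd);
 	}
 }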