
[RFC,v2,1/3] UDP: enable GRO by default.

Message ID 20190128085025.14532-2-steffen.klassert@secunet.com
State RFC
Delegated to: David Miller
Series: Support fraglist GRO/GSO

Commit Message

Steffen Klassert Jan. 28, 2019, 8:50 a.m. UTC
This patch enables UDP GRO regardless of whether a GRO capable
socket is present. With this, GRO is done by default for the
local input and forwarding paths.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/udp.h      |  2 +-
 net/ipv4/udp_offload.c | 33 ++++++++++++++-------------------
 net/ipv6/udp_offload.c |  8 +++++++-
 3 files changed, 22 insertions(+), 21 deletions(-)

Comments

Paolo Abeni Jan. 28, 2019, 3:30 p.m. UTC | #1
Hi,

On Mon, 2019-01-28 at 09:50 +0100, Steffen Klassert wrote:
> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
> index 64f9715173ac..584635db9231 100644
> --- a/net/ipv4/udp_offload.c
> +++ b/net/ipv4/udp_offload.c
> @@ -392,35 +392,24 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
>  	return NULL;
>  }
>  
> -INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
> -						   __be16 sport, __be16 dport));
>  struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
> -				struct udphdr *uh, udp_lookup_t lookup)
> +				struct udphdr *uh, struct sock *sk)
>  {
>  	struct sk_buff *pp = NULL;
>  	struct sk_buff *p;
>  	struct udphdr *uh2;
>  	unsigned int off = skb_gro_offset(skb);
>  	int flush = 1;
> -	struct sock *sk;
> -
> -	rcu_read_lock();
> -	sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
> -				udp4_lib_lookup_skb, skb, uh->source, uh->dest);
> -	if (!sk)
> -		goto out_unlock;
>  
> -	if (udp_sk(sk)->gro_enabled) {
> +	if (!sk || !udp_sk(sk)->gro_receive) {
>  		pp = call_gro_receive(udp_gro_receive_segment, head, skb);
> -		rcu_read_unlock();
>  		return pp;
>  	}
>  
>  	if (NAPI_GRO_CB(skb)->encap_mark ||
>  	    (skb->ip_summed != CHECKSUM_PARTIAL &&
>  	     NAPI_GRO_CB(skb)->csum_cnt == 0 &&
> -	     !NAPI_GRO_CB(skb)->csum_valid) ||
> -	    !udp_sk(sk)->gro_receive)
> +	     !NAPI_GRO_CB(skb)->csum_valid))
>  		goto out_unlock;

Here I think an additional chunk is missing: the caller is holding the
RCU lock, so we should drop the rcu_read_unlock() from this function
(and likely rename the associated label).
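
For illustration, a minimal sketch of what that extra chunk might look
like (an assumption about the shape of a respin, not the actual
follow-up; only the affected tail of udp_gro_receive() is shown):

	/* caller (udp{4,6}_gro_receive) now holds the RCU read lock */
	if (NAPI_GRO_CB(skb)->encap_mark ||
	    (skb->ip_summed != CHECKSUM_PARTIAL &&
	     NAPI_GRO_CB(skb)->csum_cnt == 0 &&
	     !NAPI_GRO_CB(skb)->csum_valid))
		goto out;		/* label renamed from out_unlock */

	/* ... tunnel gro_receive handling unchanged ... */

out:
	/* rcu_read_unlock() dropped here; the existing flush handling
	 * and "return pp" follow unchanged.
	 */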
 
>  	/* mark that this skb passed once through the tunnel gro layer */
> @@ -459,8 +448,10 @@ INDIRECT_CALLABLE_SCOPE
>  struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
>  {
>  	struct udphdr *uh = udp_gro_udphdr(skb);
> +	struct sk_buff *pp;
> +	struct sock *sk;
>  
> -	if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key))
> +	if (unlikely(!uh))
>  		goto flush;
>  
>  	/* Don't bother verifying checksum if we're going to flush anyway. */
> @@ -475,7 +466,11 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
>  					     inet_gro_compute_pseudo);
>  skip:
>  	NAPI_GRO_CB(skb)->is_ipv6 = 0;
> -	return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
> +	rcu_read_lock();
> +	sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
> +	pp = udp_gro_receive(head, skb, uh, sk);
> +	rcu_read_unlock();
> +	return pp;
>  
>  flush:
>  	NAPI_GRO_CB(skb)->flush = 1;

_Unrelated_ to this patch, but IIRC the RCU lock is already held by
dev_gro_receive(), so IMHO a follow-up patch could possibly remove the
lock here and make the code smaller. 
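
As an illustration of that possible follow-up (a sketch only, assuming
dev_gro_receive() really does run its callbacks under rcu_read_lock()),
the skip tail added by this patch to udp4_gro_receive() would shrink
back to a plain call:

skip:
	NAPI_GRO_CB(skb)->is_ipv6 = 0;
	sk = static_branch_unlikely(&udp_encap_needed_key) ?
	     udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
	return udp_gro_receive(head, skb, uh, sk);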

Apart from the first point above, I like this patch a lot!

Paolo
Steffen Klassert Feb. 13, 2019, 10:52 a.m. UTC | #2
Move discussion back to the list...

On Wed, Feb 13, 2019 at 10:50:53AM +0100, Paolo Abeni wrote:
> Hi,
> 
> I'm unsure if the off-list reply was intentional; anyhow, I'll keep the
> reply off-list. Please feel free to move the discussion back to the ML
> as it fits you better.

Your last mail was already off-list; I've just replied to everyone in Cc :-)

> 
> On Wed, 2019-02-13 at 10:04 +0100, Steffen Klassert wrote:
> > On Tue, Feb 12, 2019 at 05:18:38PM +0100, Paolo Abeni wrote:
> > > Hi,
> > > 
> > > On Mon, 2019-01-28 at 09:50 +0100, Steffen Klassert wrote: 
> > > > diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
> > > > index 83b11d0ac091..5f7937a4f71a 100644
> > > > --- a/net/ipv6/udp_offload.c
> > > > +++ b/net/ipv6/udp_offload.c
> > > > @@ -119,6 +119,8 @@ INDIRECT_CALLABLE_SCOPE
> > > >  struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
> > > >  {
> > > >  	struct udphdr *uh = udp_gro_udphdr(skb);
> > > > +	struct sk_buff *pp;
> > > > +	struct sock *sk;
> > > >  
> > > >  	if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key))
> > > >  		goto flush;
> > > 
> > > This                         ^^^^
> > > should be dropped, otherwise GRO is not enabled for ipv6.
> > 
> > Yes, will change this in the next patchset.
> > 
> > > I finally had some time to benchmark this series.
> > > 
> > > I used a pktgen sender b2b connected to the DUT via a 10Gb link. The
> > > sender generates 64 byte UDP packets, and the receiver runs a UDP
> > > sink _without_ UDP_GRO.
> > > 
> > > With aggregation (pktgen uses a single flow):
> > > 
> > >                      vanilla    patched    delta
> > >                      (kpps)     (kpps)     (%)
> > > sink with rcvmsg     2000       2050       2.5
> > > sink with rcvmmsg    1260       2900       130
> > 
> > Do you know the bottleneck with the patched kernel? In particular
> > with rcvmsg, I guess the bottleneck is a full socket receive queue.
> 
> Yes, with the patched kernel the bottleneck is the user-space sink:
> especially due to PTI, the US/KS transition is very costly. This is why
> rcvmmsg gives such a great improvement.
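
For context, a minimal recvmmsg()-based UDP sink of the kind discussed
here could look like the sketch below (this is not Paolo's actual test
program; the port number and batch size are arbitrary). Draining up to
BATCH datagrams per syscall is what cuts the number of user/kernel
transitions:

#define _GNU_SOURCE		/* for recvmmsg() */
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define BATCH	64
#define PKT_LEN	2048

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port = htons(9999),	/* arbitrary test port */
	};
	static char bufs[BATCH][PKT_LEN];
	struct mmsghdr msgs[BATCH];
	struct iovec iov[BATCH];
	int fd, i;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("socket/bind");
		return 1;
	}

	memset(msgs, 0, sizeof(msgs));
	for (i = 0; i < BATCH; i++) {
		iov[i].iov_base = bufs[i];
		iov[i].iov_len = PKT_LEN;
		msgs[i].msg_hdr.msg_iov = &iov[i];
		msgs[i].msg_hdr.msg_iovlen = 1;
	}

	for (;;) {
		/* one syscall receives up to BATCH datagrams */
		int n = recvmmsg(fd, msgs, BATCH, 0, NULL);

		if (n < 0) {
			perror("recvmmsg");
			return 1;
		}
	}
}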
> 
> IIRC the bottleneck is still in user space even with rcvmmsg, because GRO
> (even with segmentation) gives a great speedup vs non-GRO processing.
> 
> Additional note: all the above is without any nf/ct related module
> loaded - that is, with the less favorable conditions for GRO.
> 
> > > Note: with the unpatched kernel, the bottle-neck is in ksoftirq
> > > processing: making the receiver faster (with rcvmmsg) reduces the
> > > "goodput", as the kernel processing needs to spend more time to wake up
> > > the user-space process.
> > > 
> > > When aggregation does not take place (pktgen changes the source port
> > > for each packet, creating 64K different flows):
> > 
> > I tried this with random source ports on my forwarding setup.
> > Here the NIC distributes the packets to different RX queues,
> > so I could easily fill a 40G link with this workload.
> 
> I'm sorry, I omitted another piece of information: in my tests I forced
> the NIC to use a single queue (and 64 byte packets). Otherwise the link
> speed (or the bus bandwidth) easily becomes the bottleneck and I could
> not measure any difference. I have only 10Gb/s links handy.

Ok, so this is likely the worst case we can get.

> 
> > >                      vanilla    patched    delta
> > >                      (kpps)     (kpps)     (%)
> > > sink with rcvmsg     2020       1620       -20
> > > sink with rcvmmsg    1260       1110       -12
> > > 
> > > Here there is a measurable regression. I haven't tracked it down fully,
> > > but it looks like it depends mostly on less cache-friendly execution: 64
> > > packets are stored in the GRO hash and then flushed altogether. When
> > > that happens, most packet headers are cold in the cache.
> > 
> > With 64 packets in the gro hashtable, an L1D cache miss is rather
> > likely. Btw. why can we hold 64 flows in the gro hashtable now?
> > The old gro_list could hold up to 8 flows. Seems like
> > 
> > commit 6312fe77751f57d4fa2b28abeef84c6a95c28136
> > net: limit each hash list length to MAX_GRO_SKBS
> > 
> > changed this. 
> 
> AFAICS, yes, each list can have at most 8 entries, but the hash has
> GRO_HASH_BUCKETS (8) buckets/lists for a possible total of 64 flows.
> Likely we will hit the MAX_GRO_SKBS limit before completely filling the
> hash; I have to instrument to get some real numbers, but likely at each
> napi_poll() completion we will still have a lot of flows there (almost
> 64). 
> 
> I'm using HZ == 1000, so I hit a napi_gro_flush() at each napi_poll()
> 
> > > I'm wondering if a prefetch(skb->data) in __napi_gro_flush_chain()
> > > could help?!? (even for TCP workload)
> > 
> > Yes, the problem should exist for TCP too. Not sure if a prefetch
> > will help, but you can try.
> 
> Indeed, I would be happy for any better solution!
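
To make the prefetch idea a bit more concrete, here is a rough sketch
(the helper name and the exact loop shape are assumptions, not the real
__napi_gro_flush_chain() body; prefetch() comes from <linux/prefetch.h>):

static void gro_flush_bucket_prefetch(struct napi_struct *napi, u32 index)
{
	struct list_head *head = &napi->gro_hash[index].list;
	struct sk_buff *skb, *next;

	list_for_each_entry_safe(skb, next, head, list) {
		/* warm up the next packet's headers before the stack
		 * processes the current one
		 */
		if (!list_is_last(&skb->list, head))
			prefetch(next->data);
		skb_list_del_init(skb);
		napi_gro_complete(skb);	/* per-skb completion as today */
		napi->gro_hash[index].count--;
	}
}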

Maybe we could adjust napi_gro_complete() etc. so that we can use
netif_receive_skb_list() on a GRO flush. The cache would still be
cold, but we would not need to run the full stack for each
packet in the gro hashtable.
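
A minimal sketch of that idea (helper name and structure assumed; the
real change would have to split napi_gro_complete() so the protocol
gro_complete callbacks still run per skb while the final delivery is
batched):

static void gro_flush_to_list(struct list_head *gro_list)
{
	struct sk_buff *skb, *next;
	LIST_HEAD(rx_list);

	list_for_each_entry_safe(skb, next, gro_list, list) {
		skb_list_del_init(skb);
		/* run the per-skb gro_complete handling here, as
		 * napi_gro_complete() does today, but do not deliver yet
		 */
		list_add_tail(&skb->list, &rx_list);
	}

	/* one pass up the stack for the whole flushed batch */
	netif_receive_skb_list(&rx_list);
}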

> 
> The following is likely overkill, but... a long time ago, I tried to
> implement early demux for unconnected sockets:
> 
> https://www.spinics.net/lists/netdev/msg456384.html
> 
> One idea behind that series is that, if there is no policy routing
> configuration in place, we can detect if a packet is locally terminated
> without any route lookup, with a way cheaper address lookup. *Perhaps*
> a similar idea could be used here -> in the absence of UDP_GRO sockets, do
> UDP GRO only if the packet is not locally terminated (using an address
> lookup). 

That would not help TCP, and we cannot use listified GRO for the
local UDP input path (still not sure if we want to).

Patch

diff --git a/include/net/udp.h b/include/net/udp.h
index fd6d948755c8..2b8a0119264d 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -171,7 +171,7 @@  typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport,
 				     __be16 dport);
 
 struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
-				struct udphdr *uh, udp_lookup_t lookup);
+				struct udphdr *uh, struct sock *sk);
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 64f9715173ac..584635db9231 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -392,35 +392,24 @@  static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
 	return NULL;
 }
 
-INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
-						   __be16 sport, __be16 dport));
 struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
-				struct udphdr *uh, udp_lookup_t lookup)
+				struct udphdr *uh, struct sock *sk)
 {
 	struct sk_buff *pp = NULL;
 	struct sk_buff *p;
 	struct udphdr *uh2;
 	unsigned int off = skb_gro_offset(skb);
 	int flush = 1;
-	struct sock *sk;
-
-	rcu_read_lock();
-	sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
-				udp4_lib_lookup_skb, skb, uh->source, uh->dest);
-	if (!sk)
-		goto out_unlock;
 
-	if (udp_sk(sk)->gro_enabled) {
+	if (!sk || !udp_sk(sk)->gro_receive) {
 		pp = call_gro_receive(udp_gro_receive_segment, head, skb);
-		rcu_read_unlock();
 		return pp;
 	}
 
 	if (NAPI_GRO_CB(skb)->encap_mark ||
 	    (skb->ip_summed != CHECKSUM_PARTIAL &&
 	     NAPI_GRO_CB(skb)->csum_cnt == 0 &&
-	     !NAPI_GRO_CB(skb)->csum_valid) ||
-	    !udp_sk(sk)->gro_receive)
+	     !NAPI_GRO_CB(skb)->csum_valid))
 		goto out_unlock;
 
 	/* mark that this skb passed once through the tunnel gro layer */
@@ -459,8 +448,10 @@  INDIRECT_CALLABLE_SCOPE
 struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
+	struct sk_buff *pp;
+	struct sock *sk;
 
-	if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key))
+	if (unlikely(!uh))
 		goto flush;
 
 	/* Don't bother verifying checksum if we're going to flush anyway. */
@@ -475,7 +466,11 @@  struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
 					     inet_gro_compute_pseudo);
 skip:
 	NAPI_GRO_CB(skb)->is_ipv6 = 0;
-	return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
+	rcu_read_lock();
+	sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
+	pp = udp_gro_receive(head, skb, uh, sk);
+	rcu_read_unlock();
+	return pp;
 
 flush:
 	NAPI_GRO_CB(skb)->flush = 1;
@@ -508,9 +503,7 @@  int udp_gro_complete(struct sk_buff *skb, int nhoff,
 	rcu_read_lock();
 	sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
 				udp4_lib_lookup_skb, skb, uh->source, uh->dest);
-	if (sk && udp_sk(sk)->gro_enabled) {
-		err = udp_gro_complete_segment(skb);
-	} else if (sk && udp_sk(sk)->gro_complete) {
+	if (sk && udp_sk(sk)->gro_complete) {
 		skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
 					: SKB_GSO_UDP_TUNNEL;
 
@@ -520,6 +513,8 @@  int udp_gro_complete(struct sk_buff *skb, int nhoff,
 		skb->encapsulation = 1;
 		err = udp_sk(sk)->gro_complete(sk, skb,
 				nhoff + sizeof(struct udphdr));
+	} else {
+		err = udp_gro_complete_segment(skb);
 	}
 	rcu_read_unlock();
 
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 83b11d0ac091..5f7937a4f71a 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -119,6 +119,8 @@  INDIRECT_CALLABLE_SCOPE
 struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
+	struct sk_buff *pp;
+	struct sock *sk;
 
 	if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key))
 		goto flush;
@@ -136,7 +138,11 @@  struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
 
 skip:
 	NAPI_GRO_CB(skb)->is_ipv6 = 1;
-	return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb);
+	rcu_read_lock();
+	sk = static_branch_unlikely(&udp_encap_needed_key) ? udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
+	pp = udp_gro_receive(head, skb, uh, sk);
+	rcu_read_unlock();
+	return pp;
 
 flush:
 	NAPI_GRO_CB(skb)->flush = 1;