diff mbox

[net-next] tcp: refine tcp_prune_ofo_queue() to not drop all packets

Message ID 1471468629.29842.39.camel@edumazet-glaptop3.roam.corp.google.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet Aug. 17, 2016, 9:17 p.m. UTC
From: Eric Dumazet <edumazet@google.com>

Over the years, TCP BDP has increased a lot, and is typically
in the order of ~10 Mbytes with help of clever Congestion Control
modules.

In presence of packet losses, TCP stores incoming packets into an out of
order queue, and number of skbs sitting there waiting for the missing
packets to be received can match the BDP (~10 Mbytes)

In some cases, TCP needs to make room for incoming skbs, and current
strategy can simply remove all skbs in the out of order queue as a last
resort, incurring a huge penalty, both for receiver and sender.

Unfortunately these 'last resort events' are quite frequent, forcing
sender to send all packets again, stalling the flow and wasting a lot of
resources.

This patch cleans only a part of the out of order queue in order
to meet the memory constraints.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: C. Stephen Gun <csg@google.com>
Cc: Van Jacobson <vanj@google.com>
---
 net/ipv4/tcp_input.c |   47 ++++++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 19 deletions(-)

Comments

Soheil Hassas Yeganeh Aug. 18, 2016, 1:10 a.m. UTC | #1
On Wed, Aug 17, 2016 at 5:17 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> Over the years, TCP BDP has increased a lot, and is typically
> in the order of ~10 Mbytes with help of clever Congestion Control
> modules.
>
> In presence of packet losses, TCP stores incoming packets into an out of
> order queue, and number of skbs sitting there waiting for the missing
> packets to be received can match the BDP (~10 Mbytes)
>
> In some cases, TCP needs to make room for incoming skbs, and current
> strategy can simply remove all skbs in the out of order queue as a last
> resort, incurring a huge penalty, both for receiver and sender.
>
> Unfortunately these 'last resort events' are quite frequent, forcing
> sender to send all packets again, stalling the flow and wasting a lot of
> resources.
>
> This patch cleans only a part of the out of order queue in order
> to meet the memory constraints.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Neal Cardwell <ncardwell@google.com>
> Cc: Yuchung Cheng <ycheng@google.com>
> Cc: Soheil Hassas Yeganeh <soheil@google.com>
> Cc: C. Stephen Gun <csg@google.com>
> Cc: Van Jacobson <vanj@google.com>

Acked-by: Soheil Hassas Yeganeh <soheil@google.com>

> ---
>  net/ipv4/tcp_input.c |   47 ++++++++++++++++++++++++-----------------
>  1 file changed, 28 insertions(+), 19 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 3ebf45b38bc309f448dbc4f27fe8722cefabaf19..8cd02c0b056cbc22e2e4a4fe8530b74f7bd25419 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4392,12 +4392,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
>                 if (tcp_prune_queue(sk) < 0)
>                         return -1;
>
> -               if (!sk_rmem_schedule(sk, skb, size)) {
> +               while (!sk_rmem_schedule(sk, skb, size)) {
>                         if (!tcp_prune_ofo_queue(sk))
>                                 return -1;
> -
> -                       if (!sk_rmem_schedule(sk, skb, size))
> -                               return -1;
>                 }
>         }
>         return 0;
> @@ -4874,29 +4871,41 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
>  }
>
>  /*
> - * Purge the out-of-order queue.
> - * Return true if queue was pruned.
> + * Clean the out-of-order queue to make room.
> + * We drop high sequences packets to :
> + * 1) Let a chance for holes to be filled.
> + * 2) not add too big latencies if thousands of packets sit there.
> + *    (But if application shrinks SO_RCVBUF, we could still end up
> + *     freeing whole queue here)
> + *
> + * Return true if queue has shrunk.
>   */
>  static bool tcp_prune_ofo_queue(struct sock *sk)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
> -       bool res = false;
> +       struct sk_buff *skb;
>
> -       if (!skb_queue_empty(&tp->out_of_order_queue)) {
> -               NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> -               __skb_queue_purge(&tp->out_of_order_queue);
> +       if (skb_queue_empty(&tp->out_of_order_queue))
> +               return false;
>
> -               /* Reset SACK state.  A conforming SACK implementation will
> -                * do the same at a timeout based retransmit.  When a connection
> -                * is in a sad state like this, we care only about integrity
> -                * of the connection not performance.
> -                */
> -               if (tp->rx_opt.sack_ok)
> -                       tcp_sack_reset(&tp->rx_opt);
> +       NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> +
> +       while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
> +               tcp_drop(sk, skb);
>                 sk_mem_reclaim(sk);
> -               res = true;
> +               if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
> +                   !tcp_under_memory_pressure(sk))
> +                       break;
>         }
> -       return res;
> +
> +       /* Reset SACK state.  A conforming SACK implementation will
> +        * do the same at a timeout based retransmit.  When a connection
> +        * is in a sad state like this, we care only about integrity
> +        * of the connection not performance.
> +        */
> +       if (tp->rx_opt.sack_ok)
> +               tcp_sack_reset(&tp->rx_opt);
> +       return true;
>  }
>
>  /* Reduce allocated memory if we can, trying to get
>
>

Very nice patch, Eric! Thanks.
Yuchung Cheng Aug. 18, 2016, 5:55 p.m. UTC | #2
On Wed, Aug 17, 2016 at 2:17 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> From: Eric Dumazet <edumazet@google.com>
>
> Over the years, TCP BDP has increased a lot, and is typically
> in the order of ~10 Mbytes with help of clever Congestion Control
> modules.
>
> In presence of packet losses, TCP stores incoming packets into an out of
> order queue, and number of skbs sitting there waiting for the missing
> packets to be received can match the BDP (~10 Mbytes)
>
> In some cases, TCP needs to make room for incoming skbs, and current
> strategy can simply remove all skbs in the out of order queue as a last
> resort, incurring a huge penalty, both for receiver and sender.
>
> Unfortunately these 'last resort events' are quite frequent, forcing
> sender to send all packets again, stalling the flow and wasting a lot of
> resources.
>
> This patch cleans only a part of the out of order queue in order
> to meet the memory constraints.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Neal Cardwell <ncardwell@google.com>
> Cc: Yuchung Cheng <ycheng@google.com>
> Cc: Soheil Hassas Yeganeh <soheil@google.com>
> Cc: C. Stephen Gun <csg@google.com>
> Cc: Van Jacobson <vanj@google.com>
> ---
Acked-by: Yuchung Cheng <ycheng@google.com>

Nice patch


>  net/ipv4/tcp_input.c |   47 ++++++++++++++++++++++++-----------------
>  1 file changed, 28 insertions(+), 19 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 3ebf45b38bc309f448dbc4f27fe8722cefabaf19..8cd02c0b056cbc22e2e4a4fe8530b74f7bd25419 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4392,12 +4392,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
>                 if (tcp_prune_queue(sk) < 0)
>                         return -1;
>
> -               if (!sk_rmem_schedule(sk, skb, size)) {
> +               while (!sk_rmem_schedule(sk, skb, size)) {
>                         if (!tcp_prune_ofo_queue(sk))
>                                 return -1;
> -
> -                       if (!sk_rmem_schedule(sk, skb, size))
> -                               return -1;
>                 }
>         }
>         return 0;
> @@ -4874,29 +4871,41 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
>  }
>
>  /*
> - * Purge the out-of-order queue.
> - * Return true if queue was pruned.
> + * Clean the out-of-order queue to make room.
> + * We drop high sequences packets to :
> + * 1) Let a chance for holes to be filled.
> + * 2) not add too big latencies if thousands of packets sit there.
> + *    (But if application shrinks SO_RCVBUF, we could still end up
> + *     freeing whole queue here)
> + *
> + * Return true if queue has shrunk.
>   */
>  static bool tcp_prune_ofo_queue(struct sock *sk)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
> -       bool res = false;
> +       struct sk_buff *skb;
>
> -       if (!skb_queue_empty(&tp->out_of_order_queue)) {
> -               NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> -               __skb_queue_purge(&tp->out_of_order_queue);
> +       if (skb_queue_empty(&tp->out_of_order_queue))
> +               return false;
>
> -               /* Reset SACK state.  A conforming SACK implementation will
> -                * do the same at a timeout based retransmit.  When a connection
> -                * is in a sad state like this, we care only about integrity
> -                * of the connection not performance.
> -                */
> -               if (tp->rx_opt.sack_ok)
> -                       tcp_sack_reset(&tp->rx_opt);
> +       NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> +
> +       while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
> +               tcp_drop(sk, skb);
>                 sk_mem_reclaim(sk);
> -               res = true;
> +               if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
> +                   !tcp_under_memory_pressure(sk))
> +                       break;
>         }
> -       return res;
> +
> +       /* Reset SACK state.  A conforming SACK implementation will
> +        * do the same at a timeout based retransmit.  When a connection
> +        * is in a sad state like this, we care only about integrity
> +        * of the connection not performance.
> +        */
> +       if (tp->rx_opt.sack_ok)
> +               tcp_sack_reset(&tp->rx_opt);
I am curious what if we don't reset. It seems SACK will continue to
function properly (at least for Linux sender). But this of course
belongs  to a different patch / discussion.

> +       return true;
>  }
>
>  /* Reduce allocated memory if we can, trying to get
>
>
Eric Dumazet Aug. 18, 2016, 6:05 p.m. UTC | #3
On Thu, 2016-08-18 at 10:55 -0700, Yuchung Cheng wrote:

> > +       /* Reset SACK state.  A conforming SACK implementation will
> > +        * do the same at a timeout based retransmit.  When a connection
> > +        * is in a sad state like this, we care only about integrity
> > +        * of the connection not performance.
> > +        */
> > +       if (tp->rx_opt.sack_ok)
> > +               tcp_sack_reset(&tp->rx_opt);
> I am curious what if we don't reset. It seems SACK will continue to
> function properly (at least for Linux sender). But this of course
> belongs  to a different patch / discussion.
> 

Agreed, we might try to be smarter here and update sack block if
possible.

BTW, here is the packetdrill test I used to test this patch :

# cat sack-prune.pkt

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [2048], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0

+0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 0>
+0.100 < . 1:1(0) ack 1 win 1024
+0 accept(3, ..., ...) = 4

+0.01 < . 2:3(1) ack 1 win 1024
+0    > . 1:1(0) ack 1 <nop,nop, sack 2:3>

+0.01 < . 4:5(1) ack 1 win 1024
+0    > . 1:1(0) ack 1 <nop,nop, sack 4:5 2:3>

+0.01 < . 6:7(1) ack 1 win 1024
+0    > . 1:1(0) ack 1 <nop,nop, sack 6:7 4:5 2:3>

+0.01 < . 8:9(1) ack 1 win 1024
+0    > . 1:1(0) ack 1 <nop,nop, sack 8:9 6:7 4:5 2:3>

+0.01 < . 10:11(1) ack 1 win 1024
// Alas, we had to clean OFO queue, last skb (sequence 8:9) was dropped.
+0    > . 1:1(0) ack 1 <nop,nop, sack 10:11>

+0.01 < . 1:2(1) ack 1 win 1024
// Make sure kernel did not drop 2:3 sequence
+0    > . 1:1(0) ack 3 <nop,nop, sack 10:11>

+0 read(4, ..., 2) = 2

+0.01 < . 3:4(1) ack 1 win 1024
+0    > . 1:1(0) ack 5 <nop,nop, sack 10:11>

+0.01 < . 5:6(1) ack 1 win 1024
+0    > . 1:1(0) ack 7 <nop,nop, sack 10:11>

+0 read(4, ..., 4) = 4

+0.01 < . 7:8(1) ack 1 win 1024
+0    > . 1:1(0) ack 8 <nop,nop, sack 10:11>

+0.01 < . 8:9(1) ack 1 win 1024
+0    > . 1:1(0) ack 9 <nop,nop, sack 10:11>

+0.01 < . 9:10(1) ack 1 win 1024
+0    > . 1:1(0) ack 11
Neal Cardwell Aug. 18, 2016, 7:08 p.m. UTC | #4
On Wed, Aug 17, 2016 at 5:17 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> From: Eric Dumazet <edumazet@google.com>
>
> Over the years, TCP BDP has increased a lot, and is typically
> in the order of ~10 Mbytes with help of clever Congestion Control
> modules.
>
> In presence of packet losses, TCP stores incoming packets into an out of
> order queue, and number of skbs sitting there waiting for the missing
> packets to be received can match the BDP (~10 Mbytes)
>
> In some cases, TCP needs to make room for incoming skbs, and current
> strategy can simply remove all skbs in the out of order queue as a last
> resort, incurring a huge penalty, both for receiver and sender.
>
> Unfortunately these 'last resort events' are quite frequent, forcing
> sender to send all packets again, stalling the flow and wasting a lot of
> resources.
>
> This patch cleans only a part of the out of order queue in order
> to meet the memory constraints.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Neal Cardwell <ncardwell@google.com>
> Cc: Yuchung Cheng <ycheng@google.com>
> Cc: Soheil Hassas Yeganeh <soheil@google.com>
> Cc: C. Stephen Gun <csg@google.com>
> Cc: Van Jacobson <vanj@google.com>
> ---

Nice.

Acked-by: Neal Cardwell <ncardwell@google.com>

neal
David Miller Aug. 19, 2016, 6:36 a.m. UTC | #5
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 17 Aug 2016 14:17:09 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> Over the years, TCP BDP has increased a lot, and is typically
> in the order of ~10 Mbytes with help of clever Congestion Control
> modules.
> 
> In presence of packet losses, TCP stores incoming packets into an out of
> order queue, and number of skbs sitting there waiting for the missing
> packets to be received can match the BDP (~10 Mbytes)
> 
> In some cases, TCP needs to make room for incoming skbs, and current
> strategy can simply remove all skbs in the out of order queue as a last
> resort, incurring a huge penalty, both for receiver and sender.
> 
> Unfortunately these 'last resort events' are quite frequent, forcing
> sender to send all packets again, stalling the flow and wasting a lot of
> resources.
> 
> This patch cleans only a part of the out of order queue in order
> to meet the memory constraints.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Looks great, applied.
diff mbox

Patch

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3ebf45b38bc309f448dbc4f27fe8722cefabaf19..8cd02c0b056cbc22e2e4a4fe8530b74f7bd25419 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4392,12 +4392,9 @@  static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 		if (tcp_prune_queue(sk) < 0)
 			return -1;
 
-		if (!sk_rmem_schedule(sk, skb, size)) {
+		while (!sk_rmem_schedule(sk, skb, size)) {
 			if (!tcp_prune_ofo_queue(sk))
 				return -1;
-
-			if (!sk_rmem_schedule(sk, skb, size))
-				return -1;
 		}
 	}
 	return 0;
@@ -4874,29 +4871,41 @@  static void tcp_collapse_ofo_queue(struct sock *sk)
 }
 
 /*
- * Purge the out-of-order queue.
- * Return true if queue was pruned.
+ * Clean the out-of-order queue to make room.
+ * We drop high sequences packets to :
+ * 1) Let a chance for holes to be filled.
+ * 2) not add too big latencies if thousands of packets sit there.
+ *    (But if application shrinks SO_RCVBUF, we could still end up
+ *     freeing whole queue here)
+ *
+ * Return true if queue has shrunk.
  */
 static bool tcp_prune_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	bool res = false;
+	struct sk_buff *skb;
 
-	if (!skb_queue_empty(&tp->out_of_order_queue)) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
-		__skb_queue_purge(&tp->out_of_order_queue);
+	if (skb_queue_empty(&tp->out_of_order_queue))
+		return false;
 
-		/* Reset SACK state.  A conforming SACK implementation will
-		 * do the same at a timeout based retransmit.  When a connection
-		 * is in a sad state like this, we care only about integrity
-		 * of the connection not performance.
-		 */
-		if (tp->rx_opt.sack_ok)
-			tcp_sack_reset(&tp->rx_opt);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+
+	while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
+		tcp_drop(sk, skb);
 		sk_mem_reclaim(sk);
-		res = true;
+		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+		    !tcp_under_memory_pressure(sk))
+			break;
 	}
-	return res;
+
+	/* Reset SACK state.  A conforming SACK implementation will
+	 * do the same at a timeout based retransmit.  When a connection
+	 * is in a sad state like this, we care only about integrity
+	 * of the connection not performance.
+	 */
+	if (tp->rx_opt.sack_ok)
+		tcp_sack_reset(&tp->rx_opt);
+	return true;
 }
 
 /* Reduce allocated memory if we can, trying to get