diff mbox

[v2,2/3] RFC tcp: early retransmit

Message ID 1335638781-960-2-git-send-email-ycheng@google.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Yuchung Cheng April 28, 2012, 6:46 p.m. UTC
This patch implements RFC 5827 early retransmit (ER) for TCP.
It reduces DUPACK threshold (dupthresh) if outstanding packets are
less than 4 to recover losses by fast recovery instead of timeout.

While the algorithm is simple, small but frequent network reordering
makes this feature dangerous: the connection repeatedly enter
false recovery and degrade performance. Therefore we implement
a mitigation suggested in the appendix of the RFC that delays
entering fast recovery by a small interval, i.e., RTT/4. But when
the network reordering degree is too large, i.e., 3 packets,
ER is disabled to avoid false fast recoveries for the rest of
the connection. The performance impact of ER is summarized in
section 6 of the paper "Proportional Rate Reduction for TCP”,
IMC 2011. http://conferences.sigcomm.org/imc/2011/docs/p155.pdf

Note that Linux has a similar feature called THIN_DUPACK. The
differences are THIN_DUPACK do not mitigate reorderings and is only
used after slow start. Currently ER is disabled if THIN_DUPACK is
enabled. I would be happy to merge THIN_DUPACK feature with ER if
people think it's a good idea.

ER is enabled by sysctl_tcp_early_retrans:
  0: Disables ER

  1: Reduce dupthresh to packets_out - 1 when outstanding packets < 4.

  2: (Default) reduce dupthresh like mode 1. In addition, delay
     entering fast recovery by RTT/4.

Note: mode 2 is implemented in the third part of this patch series.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 Documentation/networking/ip-sysctl.txt |   14 ++++++++++++++
 include/linux/tcp.h                    |    7 ++++---
 include/net/tcp.h                      |   15 +++++++++++++++
 net/ipv4/sysctl_net_ipv4.c             |   10 ++++++++++
 net/ipv4/tcp.c                         |    3 +++
 net/ipv4/tcp_input.c                   |   13 +++++++++++++
 net/ipv4/tcp_minisocks.c               |    1 +
 7 files changed, 60 insertions(+), 3 deletions(-)

Comments

Neal Cardwell May 1, 2012, 5:53 p.m. UTC | #1
On Sat, Apr 28, 2012 at 2:46 PM, Yuchung Cheng <ycheng@google.com> wrote:
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -202,6 +202,20 @@ tcp_ecn - INTEGER
>                  not support ECN, behavior is like with ECN disabled.
>        Default: 2
>
> +tcp_early_retrans - INTEGER
> +       Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold

I get a trailing-whitespace error in the documentation when I apply this patch:

> git am /tmp/er2.patch
Applying: RFC tcp: early retransmit
.../.git/rebase-apply/patch:20:
trailing whitespace.
       Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
warning: 1 line adds whitespace errors.

Also, to keep the sysctls alphabetical, tcp_early_retrans should probably go
before tcp_ecn.

> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -365,12 +365,13 @@ struct tcp_sock {
>
>        u32     frto_highmark;  /* snd_nxt when RTO occurred */
>        u16     advmss;         /* Advertised MSS                       */
> -       u8      frto_counter;   /* Number of new acks after RTO */
> -       u8      nonagle     : 4,/* Disable Nagle algorithm?             */
> +       u16     nonagle     : 4,/* Disable Nagle algorithm?             */
>                thin_lto    : 1,/* Use linear timeouts for thin streams */
>                thin_dupack : 1,/* Fast retransmit on first dupack      */
>                repair      : 1,
> -               unused      : 1;
> +               do_early_retrans: 1;/* Enable RFC5827 early-retransmit  */
> +
> +       u8      frto_counter;   /* Number of new acks after RTO */
>        u8      repair_queue;

To keep the change minimal and reduce the risk of mysterious
performance regressions from cache effects, I'd suggest keeping the
frto_counter and nonagle u8 bytes as u8 bytes in their current
location, and add a new u8 for the two ER bits. Same amount of space
as the scheme in the patch, just less shuffling.

> @@ -987,6 +989,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
>                       tp->undo_marker ? tp->undo_retrans : 0);
>  #endif
>                tcp_disable_fack(tp);
> +               tcp_disable_early_retrans(tp);
>        }
>  }

I think we should stick with the behavior where we disable early
retransmit any time tcp_update_reordering() is called with a non-zero
reordering metric. This is what we've tested and measured, and my
sense is that we could risk a significant number of spurious ER
firings if instead we relax this so that only reordering >3 causes us
to disable ER. I know the delayed ER should help avoid spurious ER
firings when there is a small degree of reordering, but my guess would
be that the max(RTT/4, 2ms) is perhaps not big enough if we're
allowing delayed ER for connections that have already witnessed small
degrees of reordering. So until we have more experimental data, I'd
recommend sticking with:

	if (metric > 0)
		tcp_disable_early_retrans(tp);

Otherwise, looks good to me.

neal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 9b569a2..23ebeae 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -202,6 +202,20 @@  tcp_ecn - INTEGER
 		  not support ECN, behavior is like with ECN disabled.
 	Default: 2
 
+tcp_early_retrans - INTEGER
+	Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold 
+	for triggering fast retransmit when the amount of outstanding data is
+	small and when no previously unsent data can be transmitted (such
+	that limited transmit could be used).
+	Possible values:
+		0 disables ER
+		1 enables ER
+		2 enables ER but delays fast recovery and fast retransmit
+		  by a fourth of RTT. This mitigates connection falsely
+		  recovers when network has a small degree of reordering
+		  (less than 3 packets).
+	Default: 2
+
 tcp_fack - BOOLEAN
 	Enable FACK congestion avoidance and fast retransmission.
 	The value is not used, if tcp_sack is not enabled.
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 278af9e..7d08a79 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -365,12 +365,13 @@  struct tcp_sock {
 
 	u32	frto_highmark;	/* snd_nxt when RTO occurred */
 	u16	advmss;		/* Advertised MSS			*/
-	u8	frto_counter;	/* Number of new acks after RTO */
-	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
+	u16	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		thin_dupack : 1,/* Fast retransmit on first dupack      */
 		repair      : 1,
-		unused      : 1;
+		do_early_retrans: 1;/* Enable RFC5827 early-retransmit  */
+
+	u8	frto_counter;	/* Number of new acks after RTO */
 	u8	repair_queue;
 
 /* RTT measurement */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0fb84de..685437a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -252,6 +252,7 @@  extern int sysctl_tcp_max_ssthresh;
 extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
+extern int sysctl_tcp_early_retrans;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -797,6 +798,20 @@  static inline void tcp_enable_fack(struct tcp_sock *tp)
 	tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
 }
 
+/* TCP early-retransmit (ER) is similar to but more conservative than
+ * the thin-dupack feature.  Enable ER only if thin-dupack is disabled.
+ */
+static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
+{
+	tp->do_early_retrans = sysctl_tcp_early_retrans &&
+		!sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3;
+}
+
+static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
+{
+	tp->do_early_retrans = 0;
+}
+
 static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
 {
 	return tp->sacked_out + tp->lost_out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 33417f8..ef32956 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -27,6 +27,7 @@ 
 #include <net/tcp_memcontrol.h>
 
 static int zero;
+static int two = 2;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -677,6 +678,15 @@  static struct ctl_table ipv4_table[] = {
 		.proc_handler   = proc_dointvec
 	},
 	{
+		.procname	= "tcp_early_retrans",
+		.data		= &sysctl_tcp_early_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
+	},
+	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9670af3..6802c89 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -395,6 +395,7 @@  void tcp_init_sock(struct sock *sk)
 	tp->mss_cache = TCP_MSS_DEFAULT;
 
 	tp->reordering = sysctl_tcp_reordering;
+	tcp_enable_early_retrans(tp);
 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
 
 	sk->sk_state = TCP_CLOSE;
@@ -2495,6 +2496,8 @@  static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		else
 			tp->thin_dupack = val;
+			if (tp->thin_dupack)
+				tcp_disable_early_retrans(tp);
 		break;
 
 	case TCP_REPAIR:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 22df826..98c586d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -99,6 +99,7 @@  int sysctl_tcp_thin_dupack __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_abc __read_mostly;
+int sysctl_tcp_early_retrans __read_mostly = 2;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
@@ -906,6 +907,7 @@  static void tcp_init_metrics(struct sock *sk)
 	if (dst_metric(dst, RTAX_REORDERING) &&
 	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
 		tcp_disable_fack(tp);
+		tcp_disable_early_retrans(tp);
 		tp->reordering = dst_metric(dst, RTAX_REORDERING);
 	}
 
@@ -987,6 +989,7 @@  static void tcp_update_reordering(struct sock *sk, const int metric,
 		       tp->undo_marker ? tp->undo_retrans : 0);
 #endif
 		tcp_disable_fack(tp);
+		tcp_disable_early_retrans(tp);
 	}
 }
 
@@ -2492,6 +2495,16 @@  static int tcp_time_to_recover(struct sock *sk)
 	    tcp_is_sack(tp) && !tcp_send_head(sk))
 		return 1;
 
+	/* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious
+	 * retransmissions due to small network reorderings, we implement
+	 * Mitigation A.3 in the RFC and delay the retransmission for a short
+	 * interval if appropriate.
+	 */
+	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
+	    (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+	    !tcp_may_send_now(sk))
+		return 1;
+
 	return 0;
 }
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3cabafb..6f6a918 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -482,6 +482,7 @@  struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->sacked_out = 0;
 		newtp->fackets_out = 0;
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+		tcp_enable_early_retrans(newtp);
 
 		/* So many TCP implementations out there (incorrectly) count the
 		 * initial SYN frame in their delayed-ACK and congestion control