diff mbox series

[net] tcp: invalidate rate samples during SACK reneging

Message ID 20171207214134.90015-1-ysseung@google.com
State Accepted, archived
Delegated to: David Miller
Headers show
Series [net] tcp: invalidate rate samples during SACK reneging | expand

Commit Message

Yousuk Seung Dec. 7, 2017, 9:41 p.m. UTC
Mark tcp_sock during a SACK reneging event and invalidate rate samples
while marked. Such rate samples may overestimate bw by including packets
that were SACKed before reneging.

< ack 6001 win 10000 sack 7001:38001
< ack 7001 win 0 sack 8001:38001 // Reneg detected
> seq 7001:8001 // RTO, SACK cleared.
< ack 38001 win 10000

In above example the rate sample taken after the last ack will count
7001-38001 as delivered while the actual delivery rate likely could
be much lower i.e. 7001-8001.

This patch adds a new field tcp_sock.sack_reneg and marks it when we
declare SACK reneging and entering TCP_CA_Loss, and unmarks it after
the last rate sample was taken before moving back to TCP_CA_Open. This
patch also invalidates rate samples taken while tcp_sock.is_sack_reneg
is set.

Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection")
Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Priyaranjan Jha <priyarjha@google.com>
---
 include/linux/tcp.h  |  3 ++-
 include/net/tcp.h    |  2 +-
 net/ipv4/tcp.c       |  1 +
 net/ipv4/tcp_input.c | 10 ++++++++--
 net/ipv4/tcp_rate.c  | 10 +++++++---
 5 files changed, 19 insertions(+), 7 deletions(-)

Comments

David Miller Dec. 8, 2017, 3:07 p.m. UTC | #1
From: Yousuk Seung <ysseung@google.com>
Date: Thu,  7 Dec 2017 13:41:34 -0800

> Mark tcp_sock during a SACK reneging event and invalidate rate samples
> while marked. Such rate samples may overestimate bw by including packets
> that were SACKed before reneging.
> 
> < ack 6001 win 10000 sack 7001:38001
> < ack 7001 win 0 sack 8001:38001 // Reneg detected
>> seq 7001:8001 // RTO, SACK cleared.
> < ack 38001 win 10000
> 
> In above example the rate sample taken after the last ack will count
> 7001-38001 as delivered while the actual delivery rate likely could
> be much lower i.e. 7001-8001.
> 
> This patch adds a new field tcp_sock.sack_reneg and marks it when we
> declare SACK reneging and entering TCP_CA_Loss, and unmarks it after
> the last rate sample was taken before moving back to TCP_CA_Open. This
> patch also invalidates rate samples taken while tcp_sock.is_sack_reneg
> is set.
> 
> Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection")
> Signed-off-by: Yousuk Seung <ysseung@google.com>
> Signed-off-by: Neal Cardwell <ncardwell@google.com>
> Signed-off-by: Yuchung Cheng <ycheng@google.com>
> Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
> Acked-by: Eric Dumazet <edumazet@google.com>
> Acked-by: Priyaranjan Jha <priyarjha@google.com>

Applied and queued up for -stable.
diff mbox series

Patch

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index df5d97a85e1a..ca4a6361389b 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -224,7 +224,8 @@  struct tcp_sock {
 		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
 		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
 		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
-		unused:3;
+		is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
+		unused:2;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		unused1	    : 1,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6998707e81f3..6da880d2f022 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1055,7 +1055,7 @@  void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 			    struct rate_sample *rs);
 void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
-		  struct rate_sample *rs);
+		  bool is_sack_reneg, struct rate_sample *rs);
 void tcp_rate_check_app_limited(struct sock *sk);
 
 /* These functions determine how the current flow behaves in respect of SACK
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bf97317e6c97..f08eebe60446 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2412,6 +2412,7 @@  int tcp_disconnect(struct sock *sk, int flags)
 	tp->snd_cwnd_cnt = 0;
 	tp->window_clamp = 0;
 	tcp_set_ca_state(sk, TCP_CA_Open);
+	tp->is_sack_reneg = 0;
 	tcp_clear_retrans(tp);
 	inet_csk_delack_init(sk);
 	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 734cfc8ff76e..7b9e8b99b28d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1941,6 +1941,8 @@  void tcp_enter_loss(struct sock *sk)
 	if (is_reneg) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
 		tp->sacked_out = 0;
+		/* Mark SACK reneging until we recover from this loss event. */
+		tp->is_sack_reneg = 1;
 	}
 	tcp_clear_all_retrans_hints(tp);
 
@@ -2364,6 +2366,7 @@  static bool tcp_try_undo_recovery(struct sock *sk)
 		return true;
 	}
 	tcp_set_ca_state(sk, TCP_CA_Open);
+	tp->is_sack_reneg = 0;
 	return false;
 }
 
@@ -2397,8 +2400,10 @@  static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
 			NET_INC_STATS(sock_net(sk),
 					LINUX_MIB_TCPSPURIOUSRTOS);
 		inet_csk(sk)->icsk_retransmits = 0;
-		if (frto_undo || tcp_is_sack(tp))
+		if (frto_undo || tcp_is_sack(tp)) {
 			tcp_set_ca_state(sk, TCP_CA_Open);
+			tp->is_sack_reneg = 0;
+		}
 		return true;
 	}
 	return false;
@@ -3495,6 +3500,7 @@  static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	struct tcp_sacktag_state sack_state;
 	struct rate_sample rs = { .prior_delivered = 0 };
 	u32 prior_snd_una = tp->snd_una;
+	bool is_sack_reneg = tp->is_sack_reneg;
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
@@ -3611,7 +3617,7 @@  static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
-	tcp_rate_gen(sk, delivered, lost, sack_state.rate);
+	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
 	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 	tcp_xmit_recovery(sk, rexmit);
 	return 1;
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index 3330a370d306..c61240e43923 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -106,7 +106,7 @@  void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 
 /* Update the connection delivery information and generate a rate sample. */
 void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
-		  struct rate_sample *rs)
+		  bool is_sack_reneg, struct rate_sample *rs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 snd_us, ack_us;
@@ -124,8 +124,12 @@  void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 
 	rs->acked_sacked = delivered;	/* freshly ACKed or SACKed */
 	rs->losses = lost;		/* freshly marked lost */
-	/* Return an invalid sample if no timing information is available. */
-	if (!rs->prior_mstamp) {
+	/* Return an invalid sample if no timing information is available or
+	 * in recovery from loss with SACK reneging. Rate samples taken during
+	 * a SACK reneging event may overestimate bw by including packets that
+	 * were SACKed before the reneg.
+	 */
+	if (!rs->prior_mstamp || is_sack_reneg) {
 		rs->delivered = -1;
 		rs->interval_us = -1;
 		return;