From patchwork Fri Jul 3 01:21:24 2015
X-Patchwork-Submitter: Lawrence Brakmo
X-Patchwork-Id: 490896
X-Patchwork-Delegate: davem@davemloft.net
From: Lawrence Brakmo
To: netdev
CC: Kernel Team
Subject: [RFC PATCH net-next] tcp: add NV congestion control
Date: Thu, 2 Jul 2015 18:21:24 -0700
Message-ID: <1435886484-1709996-1-git-send-email-brakmo@fb.com>
X-Mailing-List: netdev@vger.kernel.org

This is a request for comments.

TCP-NV (New Vegas) is a major update to TCP-Vegas. An earlier version of
NV was presented at the 2010 LPC (slides). It is a delay-based congestion
avoidance algorithm for the data center. This version has been tested
within a 10G rack where the HW RTTs are 20-50us.

A description of TCP-NV, including implementation details and experimental
results, can be found at:
http://www.brakmo.org/networking/tcp-nv/TCPNV.html

The current version exposes many module parameters to support
experimentation.
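For reviewers, the following stand-alone user-space sketch (not part of
the patch) illustrates the once-per-RTT decision NV makes. It mirrors the
fixed-point arithmetic in tcpnv_acked() below; the constants are the
module defaults, and the helper names and traffic numbers are purely
illustrative:

#include <stdint.h>
#include <stdio.h>

/* rate in units of 100 bits/sec, as in tcpnv_acked():
 *   rate = in_flight * 8000000 / (avg_rtt_us * 100)
 */
static uint32_t nv_rate(uint32_t in_flight_bytes, uint32_t avg_rtt_us)
{
        return (uint32_t)((uint64_t)in_flight_bytes * 8000000 /
                          ((uint64_t)avg_rtt_us * 100));
}

/* cwnd predicted by the measured max rate:
 *   slope = 80000 * mss / min_rtt;  cwnd_by_slope = max_rate / slope
 */
static uint32_t nv_cwnd_by_slope(uint32_t max_rate, uint32_t min_rtt_us,
                                 uint32_t mss)
{
        return (uint32_t)((uint64_t)max_rate * min_rtt_us /
                          (80000ULL * mss));
}

int main(void)
{
        uint32_t nv_pad = 8;                    /* module default */
        uint32_t mss = 1448;
        uint32_t min_rtt = 40, avg_rtt = 45;    /* microseconds */
        uint32_t in_flight = 40 * mss;          /* 40 packets outstanding */

        uint32_t max_rate = nv_rate(in_flight, avg_rtt);
        uint32_t max_win = nv_cwnd_by_slope(max_rate, min_rtt, mss) + nv_pad;

        /* cwnd > max_win       -> congestion, shrink cwnd toward max_win
         * cwnd <= max_win - 2  -> allow cwnd growth
         * otherwise            -> hold cwnd
         */
        printf("rate = %u (100 bps units), max_win = %u packets\n",
               max_rate, max_win);
        return 0;
}

With these numbers the measured rate is ~10.3 Gbps and the 40us min RTT
predicts a cwnd of 35 packets, so max_win = 35 + nv_pad = 43; a cwnd above
43 is treated as queue buildup and is cut back before any loss occurs.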
Signed-off-by: Lawrence Brakmo
---
 include/linux/skbuff.h     |   2 +-
 include/linux/tcp.h        |   4 +
 include/net/tcp.h          |   5 +-
 net/ipv4/Kconfig           |  16 ++
 net/ipv4/Makefile          |   1 +
 net/ipv4/sysctl_net_ipv4.c |   9 +
 net/ipv4/tcp_input.c       |   5 +
 net/ipv4/tcp_nv.c          | 477 +++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_output.c      |   4 +-
 9 files changed, 520 insertions(+), 3 deletions(-)
 create mode 100644 net/ipv4/tcp_nv.c

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d6cdd6e..96a131d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -547,7 +547,7 @@ struct sk_buff {
         * want to keep them across layers you have to do a skb_clone()
         * first. This is owned by whoever has the skb queued ATM.
         */
-       char                    cb[48] __aligned(8);
+       char                    cb[52] __aligned(8);

        unsigned long           _skb_refdst;
        void                    (*destructor)(struct sk_buff *skb);
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 48c3696..05e0da5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -254,6 +254,10 @@ struct tcp_sock {
        u32     lost_out;       /* Lost packets                 */
        u32     sacked_out;     /* SACK'd packets               */
        u32     fackets_out;    /* FACK'd packets               */
+       u32     ack_in_flight;  /* This field is populated when new acks
+                                * are received. It contains the number of
+                                * bytes in flight when the last packet
+                                * acked was sent. Used by tcp-nv. */

        /* from STCP, retrans queue hinting */
        struct sk_buff* lost_skb_hint;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 950cfec..3e385c1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,7 @@ extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
+extern int sysctl_tcp_nv_enable;

 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -720,12 +721,14 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 /* This is what the send packet queuing engine uses to pass
  * TCP per-packet control information to the transmission code.
  * We also store the host-order sequence numbers in here too.
- * This is 44 bytes if IPV6 is enabled.
+ * This is 48 bytes if IPV6 is enabled.
  * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
  */
 struct tcp_skb_cb {
        __u32           seq;            /* Starting sequence number     */
        __u32           end_seq;        /* SEQ + FIN + SYN + datalen    */
+       __u32           in_flight;      /* bytes in flight when this packet
+                                        * was sent. */
        union {
                /* Note : tcp_tw_isn is used in input path only
                 *        (isn chosen by tcp_timewait_state_process())
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6fb3c90..c21f85d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -539,6 +539,22 @@ config TCP_CONG_VEGAS
        window. TCP Vegas should provide less packet loss, but it is
        not as aggressive as TCP Reno.

+config TCP_CONG_NV
+       tristate "TCP NV"
+       default m
+       ---help---
+       TCP NV is a follow-up to TCP Vegas. It has been modified to deal with
+       10G networks and the measurement noise introduced by LRO, GRO and
+       interrupt coalescence. In addition, it decreases its cwnd
+       multiplicatively instead of linearly.
+
+       Note that in general congestion avoidance (cwnd decreased when # packets
+       queued grows) cannot coexist with congestion control (cwnd decreased only
+       when there is packet loss) due to fairness issues. One scenario where they
+       can coexist safely is when the CA flows have RTTs << the CC flows' RTTs.
+
+       For further details see http://www.brakmo.org/networking/tcp-nv/TCPNV.html
+
 config TCP_CONG_SCALABLE
        tristate "Scalable TCP"
        default n
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index efc43f3..06f335f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
 obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
 obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 433231c..31846d5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -730,6 +730,15 @@ static struct ctl_table ipv4_table[] = {
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
+               .procname       = "tcp_nv_enable",
+               .data           = &sysctl_tcp_nv_enable,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+       {
                .procname       = "icmp_msgs_per_sec",
                .data           = &sysctl_icmp_msgs_per_sec,
                .maxlen         = sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 684f095..2a3c413 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -101,6 +101,8 @@ int sysctl_tcp_thin_dupack __read_mostly;
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
+int sysctl_tcp_nv_enable __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_nv_enable);

 #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
 #define FLAG_WIN_UPDATE        0x02 /* Incoming ACK was a window update.       */
@@ -3063,6 +3065,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
        long ca_rtt_us = -1L;
        struct sk_buff *skb;
        u32 pkts_acked = 0;
+       u32 last_in_flight = 0;
        bool rtt_update;
        int flag = 0;

@@ -3102,6 +3105,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                        if (!first_ackt.v64)
                                first_ackt = last_ackt;

+                       last_in_flight = TCP_SKB_CB(skb)->in_flight;
                        reord = min(pkts_acked, reord);
                        if (!after(scb->end_seq, tp->high_seq))
                                flag |= FLAG_ORIG_SACK_ACKED;
@@ -3190,6 +3194,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                tcp_rearm_rto(sk);
        }

+       tp->ack_in_flight = last_in_flight;
        if (icsk->icsk_ca_ops->pkts_acked)
                icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);

diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
new file mode 100644
index 0000000..585f1dd
--- /dev/null
+++ b/net/ipv4/tcp_nv.c
@@ -0,0 +1,477 @@
+/*
+ * TCP NV: TCP with Congestion Avoidance
+ *
+ * TCP-NV is a successor of TCP-Vegas that has been developed to
+ * deal with the issues that occur in modern networks.
+ * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
+ * the ability to detect congestion before packet losses occur.
+ * When congestion (queue buildup) starts to occur, TCP-NV
+ * predicts what the cwnd size should be for the current
+ * throughput and it reduces the cwnd proportionally to
+ * the difference between the current cwnd and the predicted cwnd.
+ * TCP-NV behaves like Reno when no congestion is detected, or when
+ * recovering from packet losses.
+ *
+ * More information on the design, implementation and experimental
+ * results at http://www.brakmo.org/networking/tcp-nv/TCPNV.html
+ *
+ * TODO:
+ * 1) Add mechanism to deal with reverse congestion.
+ */
+
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <linux/random.h>
+#include <linux/inet_diag.h>
+#include <net/tcp.h>
+
+/* TCP NV parameters */
+static int nv_pad __read_mostly = 8;
+static int nv_reset_period __read_mostly = 5;
+static int nv_min_cwnd = 10;
+static int nv_dec_eval_min_calls = 100;
+static int nv_ssthresh_eval_min_calls = 30;
+static int nv_rtt_min_cnt = 2;
+static int nv_cong_decrease_mult = 30*128/100;
+static int nv_ssthresh_factor = 8;
+static int nv_rtt_factor = 128;
+static int nv_rtt_cnt_inc_delta = 32;  /* dec cwnd by this many RTTs */
+static int nv_dec_factor = 4;          /* actual value is factor/8 */
+static int nv_loss_dec_factor = 820;   /* on loss reduce cwnd by 20% */
+static int nv_cwnd_growth_factor = 2;  /* larger => cwnd grows slower */
+
+module_param(nv_pad, int, 0644);
+MODULE_PARM_DESC(nv_pad, "extra packets above congestion level");
+module_param(nv_reset_period, int, 0644);
+MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
+module_param(nv_min_cwnd, int, 0644);
+MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
+                " without losses");
+module_param(nv_dec_eval_min_calls, int, 0644);
+MODULE_PARM_DESC(nv_dec_eval_min_calls, "Wait for this many data points "
+                "before declaring congestion (< 256)");
+module_param(nv_ssthresh_eval_min_calls, int, 0644);
+MODULE_PARM_DESC(nv_ssthresh_eval_min_calls, "Wait for this many data points "
+                "before declaring congestion during initial slow-start");
+module_param(nv_rtt_min_cnt, int, 0644);
+MODULE_PARM_DESC(nv_rtt_min_cnt, "Wait for this many RTTs before declaring"
+                " congestion (<64)");
+module_param(nv_cong_decrease_mult, int, 0644);
+MODULE_PARM_DESC(nv_cong_decrease_mult, "Congestion decrease factor");
+module_param(nv_ssthresh_factor, int, 0644);
+MODULE_PARM_DESC(nv_ssthresh_factor, "ssthresh factor");
+module_param(nv_rtt_factor, int, 0644);
+MODULE_PARM_DESC(nv_rtt_factor, "rtt averaging factor (0-256)");
+module_param(nv_rtt_cnt_inc_delta, int, 0644);
+MODULE_PARM_DESC(nv_rtt_cnt_inc_delta, "decrease cwnd for this many RTTs "
+                "every 100 RTTs");
+module_param(nv_dec_factor, int, 0644);
+MODULE_PARM_DESC(nv_dec_factor, "decrease cwnd every ~192 RTTs by factor/8");
+module_param(nv_loss_dec_factor, int, 0644);
+MODULE_PARM_DESC(nv_loss_dec_factor, "on loss new cwnd = cwnd * this / 1024");
+module_param(nv_cwnd_growth_factor, int, 0644);
+MODULE_PARM_DESC(nv_cwnd_growth_factor, "larger => cwnd grows slower");
+
+/* TCP NV Parameters */
+struct tcpnv {
+       unsigned long nv_min_rtt_reset_jiffies; /* when to switch to
+                                                * nv_min_rtt_new */
+       u32 cnt;                /* increase cwnd by 1 after this many ACKs */
+       u32 loss_cwnd;          /* cwnd at last loss */
+       u8  nv_enable:1,
+           nv_allow_cwnd_growth:1,     /* whether cwnd can grow */
+           nv_rtt_cnt:6;       /* RTTs without making ca decision */
+       u8  nv_rtt_cnt_dec;     /* RTTs since last temporary cwnd decrease */
+       u8  nv_eval_call_cnt;   /* call count since last eval */
+       u8  nv_min_cwnd;        /* nv won't make a ca decision if cwnd is
+                                * smaller than this. It may grow to handle
+                                * TSO, LRO and interrupt coalescence because
+                                * with these a small cwnd cannot saturate
+                                * the link. Note that this is different from
+                                * the nv_min_cwnd module parameter */
+       u32 nv_last_rtt;        /* last rtt */
+       u32 nv_min_rtt;         /* active min rtt. Used to determine slope */
+       u32 nv_min_rtt_new;     /* min rtt for future use */
+       u32 nv_rtt_max_rate;    /* max rate seen during current RTT */
+       u32 nv_rtt_start_seq;   /* current RTT ends when packet arrives
+                                * acking beyond nv_rtt_start_seq */
+       u32 nv_last_snd_una;    /* Previous value of tp->snd_una.
+                                * It is used to determine bytes acked
+                                * since the last call to tcpnv_acked */
+       u32 nv_no_cong_cnt;     /* Consecutive no congestion decisions */
+};
+
+#define NV_INIT_RTT            0xffffffff
+#define NV_MIN_CWND            4
+#define NV_MIN_CWND_GROW       2
+#define NV_TSO_CWND_BOUND      80
+
+static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       ca->loss_cwnd = 0;
+       ca->nv_no_cong_cnt = 0;
+       ca->cnt = 0;
+       ca->nv_rtt_cnt = 0;
+       ca->nv_rtt_cnt_dec = 0;
+       ca->nv_allow_cwnd_growth = 1;
+       ca->nv_last_rtt = 0;
+       ca->nv_rtt_max_rate = 0;
+       ca->nv_rtt_start_seq = tp->snd_una;
+       ca->nv_eval_call_cnt = 0;
+       ca->nv_last_snd_una = tp->snd_una;
+}
+
+static void tcpnv_init(struct sock *sk)
+{
+       struct tcpnv *ca = inet_csk_ca(sk);
+
+       tcpnv_reset(ca, sk);
+
+       ca->nv_min_rtt_reset_jiffies = jiffies + 2*HZ;
+       ca->nv_min_rtt = NV_INIT_RTT;
+       ca->nv_min_rtt_new = NV_INIT_RTT;
+       ca->nv_enable = sysctl_tcp_nv_enable;
+       ca->nv_min_cwnd = NV_MIN_CWND;
+       if (nv_dec_eval_min_calls > 255)
+               nv_dec_eval_min_calls = 255;
+       if (nv_rtt_min_cnt > 63)
+               nv_rtt_min_cnt = 63;
+}
+
+static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct tcpnv *ca = inet_csk_ca(sk);
+
+       if (!tcp_is_cwnd_limited(sk))
+               return;
+
+       /* Only grow cwnd if NV has not detected congestion */
+       if (sysctl_tcp_nv_enable && ca->nv_enable &&
+           !ca->nv_allow_cwnd_growth)
+               return;
+
+       if (tp->snd_cwnd <= tp->snd_ssthresh) {
+               acked = tcp_slow_start(tp, acked);
+               if (!acked)
+                       return;
+       }
+       if (ca->cnt == 0)
+               ca->cnt = tp->snd_cwnd;
+
+       tcp_cong_avoid_ai(tp, ca->cnt, acked);
+}
+
+static u32 tcpnv_recalc_ssthresh(struct sock *sk)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       struct tcpnv *ca = inet_csk_ca(sk);
+
+       ca->loss_cwnd = tp->snd_cwnd;
+       return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
+}
+
+static u32 tcpnv_undo_cwnd(struct sock *sk)
+{
+       struct tcpnv *ca = inet_csk_ca(sk);
+
+       return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
+static void tcpnv_state(struct sock *sk, u8 new_state)
+{
+       struct tcpnv *ca = inet_csk_ca(sk);
+
+       if (new_state == TCP_CA_Open) {
+               ca->nv_enable = 1;
+               tcpnv_reset(ca, sk);
+       } else if (new_state == TCP_CA_Loss) {
+               ca->nv_enable = 0;
+       }
+}
+
+/* Do congestion avoidance calculations for TCP-NV
+ */
+static void tcpnv_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+{
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct tcpnv *ca = inet_csk_ca(sk);
+       unsigned long now = jiffies;
+       s64 rate64 = 0;
+       u32 rate, max_win, cwnd_by_slope;
+       u32 avg_rtt;
+       u32 bytes_acked = 0;
+
+       /* Some calls are for duplicates without timestamps */
+       if (rtt_us < 0)
+               return;
+
+       /* If not in TCP_CA_Open state, skip.
+        */
+       if (icsk->icsk_ca_state != TCP_CA_Open)
+               return;
+
+       /* If NV mode is not enabled, behave like Reno */
+       if (!sysctl_tcp_nv_enable || !ca->nv_enable) {
+               ca->nv_allow_cwnd_growth = 1;
+               return;
+       }
+
+       bytes_acked = tp->snd_una - ca->nv_last_snd_una;
+       ca->nv_last_snd_una = tp->snd_una;
+
+       if (tp->ack_in_flight == 0)
+               return;
+
+       /* Calculate moving average of RTT */
+       if (nv_rtt_factor > 0) {
+               if (ca->nv_last_rtt > 0) {
+                       avg_rtt = (((u64)rtt_us) * nv_rtt_factor +
+                                  ((u64)ca->nv_last_rtt)
+                                  * (256 - nv_rtt_factor)) >> 8;
+               } else {
+                       avg_rtt = rtt_us;
+                       ca->nv_min_rtt = avg_rtt << 1;
+               }
+               ca->nv_last_rtt = avg_rtt;
+       } else {
+               avg_rtt = rtt_us;
+       }
+
+       /* rate in 100's bits per second */
+       rate64 = ((u64)tp->ack_in_flight) * 8000000;
+       rate = (u32)div64_u64(rate64, (u64)(avg_rtt*100));
+
+       /* Remember the maximum rate seen during this RTT
+        * Note: It may be more than one RTT. This function should be
+        * called at least nv_dec_eval_min_calls times.
+        */
+       if (ca->nv_rtt_max_rate < rate)
+               ca->nv_rtt_max_rate = rate;
+
+       /* We have valid information, increment counter */
+       if (ca->nv_eval_call_cnt < 255)
+               ca->nv_eval_call_cnt++;
+
+       /* update min rtt if necessary */
+       if (avg_rtt < ca->nv_min_rtt)
+               ca->nv_min_rtt = avg_rtt;
+
+       /* update future min_rtt if necessary */
+       if (avg_rtt < ca->nv_min_rtt_new)
+               ca->nv_min_rtt_new = avg_rtt;
+
+       /* nv_min_rtt is updated with the minimum (possibly averaged) rtt
+        * seen in the last nv_reset_period seconds (i.e. a
+        * warm reset). This new nv_min_rtt will continue to be updated
+        * and be used for another nv_reset_period seconds,
+        * when it will be updated again.
+        * In practice we introduce some randomness, so the actual period used
+        * is chosen randomly from the range:
+        * [nv_reset_period*3/4, nv_reset_period*5/4)
+        */
+       if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
+               unsigned char rand;
+
+               ca->nv_min_rtt = ca->nv_min_rtt_new;
+               ca->nv_min_rtt_new = NV_INIT_RTT;
+               get_random_bytes(&rand, 1);
+               ca->nv_min_rtt_reset_jiffies =
+                       now + ((nv_reset_period*(384 + rand)*HZ)>>9);
+               /* Every so often we decrease nv_min_cwnd in case previous
+                * value is no longer accurate.
+                */
+               ca->nv_min_cwnd = max(ca->nv_min_cwnd/2, NV_MIN_CWND);
+       }
+
+       /* Once per RTT check if we need to do congestion avoidance */
+       if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
+               ca->nv_rtt_start_seq = tp->snd_nxt;
+               if (ca->nv_rtt_cnt < 63)
+                       /* Increase counter for RTTs without CA decision */
+                       ca->nv_rtt_cnt++;
+               if (ca->nv_rtt_cnt_dec < 255)
+                       /* Increase counter for temporary cwnd decrease */
+                       ca->nv_rtt_cnt_dec++;
+
+               /* If this function is only called once within an RTT
+                * the cwnd is probably too small (in some cases due to
+                * tso, lro or interrupt coalescence), so we increase
+                * nv_min_cwnd.
+                */
+               if (ca->nv_eval_call_cnt == 1
+                   && bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache
+                   && ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)
+                   && ca->nv_rtt_cnt_dec < 192) {
+                       ca->nv_min_cwnd = min(ca->nv_min_cwnd
+                                             + NV_MIN_CWND_GROW,
+                                             NV_TSO_CWND_BOUND + 1);
+                       ca->nv_rtt_start_seq = tp->snd_nxt +
+                               ca->nv_min_cwnd*tp->mss_cache;
+                       ca->nv_eval_call_cnt = 0;
+                       ca->nv_allow_cwnd_growth = 1;
+                       return;
+               }
+
+               /* Every 64 to 192 RTTs decrease cwnd to get better min RTT
+                * measurement. In practice we accomplish this by initializing
+                * nv_rtt_cnt_dec randomly from the range [0, 128) and
+                * stopping at 192.
+                * We keep the value low for nv_rtt_cnt_inc_delta RTTs and then
+                * we restore cwnd to its previous value (by setting
+                * ssthresh to the previous value).
+                */
+               if (ca->nv_rtt_cnt_dec == 192) {
+                       /* decrease cwnd and ssthresh */
+                       tp->snd_cwnd =
+                               max((unsigned int)nv_min_cwnd,
+                                   ((tp->snd_cwnd * nv_dec_factor) >> 3));
+                       tp->snd_ssthresh =
+                               max(tp->snd_cwnd,
+                                   ((tp->snd_ssthresh * nv_dec_factor) >> 3));
+                       ca->nv_allow_cwnd_growth = 0;
+                       return;
+               } else if (ca->nv_rtt_cnt_dec > 192) {
+                       if (ca->nv_rtt_cnt_dec - 192 >= nv_rtt_cnt_inc_delta) {
+                               /* Restore ssthresh to restore cwnd */
+                               unsigned char rand;
+
+                               get_random_bytes(&rand, 1);
+                               ca->nv_rtt_cnt_dec = rand >> 1;
+                               tp->snd_ssthresh = (tp->snd_ssthresh << 3)
+                                       / nv_dec_factor;
+                               ca->nv_allow_cwnd_growth = 1;
+                               ca->nv_no_cong_cnt = 0;
+                       }
+                       return;
+               }
+
+               /* Find the ideal cwnd for current rate from slope
+                * slope = 80000.0 * mss / nv_min_rtt
+                * cwnd_by_slope = nv_rtt_max_rate / slope
+                */
+               cwnd_by_slope = (u32)
+                       div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
+                                 (u64)(80000 * tp->mss_cache));
+               max_win = cwnd_by_slope + nv_pad;
+
+               /* If cwnd > max_win, decrease cwnd
+                * if cwnd < max_win, grow cwnd
+                * else leave the same
+                */
+               if (tp->snd_cwnd > max_win) {
+                       /* there is congestion, check that it is ok
+                        * to make a CA decision
+                        * 1. We should have at least nv_dec_eval_min_calls
+                        *    data points before making a CA decision
+                        * 2. We only make a congestion decision after
+                        *    nv_rtt_min_cnt RTTs
+                        */
+                       if (ca->nv_rtt_cnt < nv_rtt_min_cnt)
+                               return;
+                       else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
+                               if (ca->nv_eval_call_cnt <
+                                   nv_ssthresh_eval_min_calls)
+                                       return;
+                       } else if (ca->nv_eval_call_cnt <
+                                  nv_dec_eval_min_calls) {
+                               return;
+                       }
+
+                       /* We have enough data to determine we are congested */
+                       ca->nv_allow_cwnd_growth = 0;
+                       tp->snd_ssthresh =
+                               (nv_ssthresh_factor * max_win) >> 3;
+                       if (tp->snd_cwnd - max_win > 2) {
+                               /* gap > 2, we do exponential cwnd decrease */
+                               int dec;
+
+                               dec = max(2U, ((tp->snd_cwnd - max_win) *
+                                              nv_cong_decrease_mult) >> 7);
+                               tp->snd_cwnd -= dec;
+                       } else if (nv_cong_decrease_mult > 0) {
+                               tp->snd_cwnd = max_win;
+                       }
+                       ca->cnt = tp->snd_cwnd;
+                       ca->nv_no_cong_cnt = 0;
+               } else if (tp->snd_cwnd <= max_win - 2) {
+                       /* We allow growth of cwnd every RTT since we would
+                        * have grown even if we waited (just slower)
+                        */
+                       ca->nv_allow_cwnd_growth = 1;
+                       ca->nv_no_cong_cnt++;
+                       if (nv_cwnd_growth_factor > 0 &&
+                           ca->nv_no_cong_cnt > nv_cwnd_growth_factor) {
+                               ca->cnt = max(ca->cnt >> 1, (u32) 4);
+                               ca->nv_no_cong_cnt = 0;
+                       }
+               } else {
+                       ca->nv_allow_cwnd_growth = 0;
+               }
+
+               /* update state */
+               ca->nv_eval_call_cnt = 0;
+               ca->nv_rtt_cnt = 0;
+               ca->nv_rtt_max_rate = 0;
+
+               /* Don't want to make cwnd < nv_min_cwnd
+                * (it wasn't before; if it is now, it is because nv
+                * decreased it).
+                */
+               if (tp->snd_cwnd < nv_min_cwnd)
+                       tp->snd_cwnd = nv_min_cwnd;
+       }
+}
+
+/* Extract info for TCP socket info provided via netlink */
+size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
+                     union tcp_cc_info *info)
+{
+       const struct tcpnv *ca = inet_csk_ca(sk);
+
+       if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+               info->vegas.tcpv_enabled = ca->nv_enable
+                                          && sysctl_tcp_nv_enable;
+               info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
+               info->vegas.tcpv_rtt = ca->nv_last_rtt;
+               info->vegas.tcpv_minrtt = ca->nv_min_rtt;
+
+               *attr = INET_DIAG_VEGASINFO;
+               return sizeof(struct tcpvegas_info);
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(tcpnv_get_info);
+
+static struct tcp_congestion_ops tcpnv __read_mostly = {
+       .init           = tcpnv_init,
+       .ssthresh       = tcpnv_recalc_ssthresh,
+       .cong_avoid     = tcpnv_cong_avoid,
+       .set_state      = tcpnv_state,
+       .undo_cwnd      = tcpnv_undo_cwnd,
+       .pkts_acked     = tcpnv_acked,
+       .get_info       = tcpnv_get_info,
+
+       .owner          = THIS_MODULE,
+       .name           = "nv",
+};
+
+static int __init tcpnv_register(void)
+{
+       BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
+
+       return tcp_register_congestion_control(&tcpnv);
+}
+
+static void __exit tcpnv_unregister(void)
+{
+       tcp_unregister_congestion_control(&tcpnv);
+}
+
+module_init(tcpnv_register);
+module_exit(tcpnv_unregister);
+
+MODULE_AUTHOR("Lawrence Brakmo");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP NV");
+MODULE_VERSION("1.0");
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b1c218d..97b02f1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -923,8 +923,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,

        BUG_ON(!skb || !tcp_skb_pcount(skb));

+       tp = tcp_sk(sk);
        if (clone_it) {
                skb_mstamp_get(&skb->skb_mstamp);
+               TCP_SKB_CB(skb)->in_flight = TCP_SKB_CB(skb)->end_seq
+                       - tp->snd_una;

                if (unlikely(skb_cloned(skb)))
                        skb = pskb_copy(skb, gfp_mask);
@@ -935,7 +938,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
        }

        inet = inet_sk(sk);
-       tp = tcp_sk(sk);
        tcb = TCP_SKB_CB(skb);
        memset(&opts, 0, sizeof(opts));
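
A usage note (not part of the patch): with a kernel built with
CONFIG_TCP_CONG_NV, the module loaded, and the tcp_nv_enable sysctl left
at its default of 1, an application can opt into NV per socket through
the standard TCP_CONGESTION socket option. A minimal sketch:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

int main(void)
{
        char name[16] = { 0 };
        socklen_t len = sizeof(name);
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }
        /* Select the "nv" congestion control registered by this patch;
         * this fails (e.g. ENOENT) if the module is not available.
         */
        if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "nv", 2) < 0)
                perror("setsockopt TCP_CONGESTION");
        /* Read the name back to confirm which CC the socket is using. */
        if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
                printf("congestion control in use: %s\n", name);
        close(fd);
        return 0;
}

Reading the value back with getsockopt() confirms the switch took effect.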