Message ID:   1421720216.11734.188.camel@edumazet-glaptop2.roam.corp.google.com
State:        RFC, archived
Delegated to: David Miller
On Mon, Jan 19, 2015 at 6:16 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Sun, 2015-01-18 at 23:40 +0200, Eyal Perry wrote:
>
>> So indeed, interrupt mitigation (tx-usecs 1 tx-frames 1) improves things
>> for the "refined TSO autosizing" kernel (from 18.4Gbps to 19.7Gbps), but
>> in the other kernel, the BW remains the same with and without the
>> coalescing.
>
> OK, thanks for testing.
>
> I believe the regression comes from the inability of the cc to cope with
> stretch acks.
>
> Nowadays on fast networks, each ACK packet acknowledges ~45 MSS, but
> CUBIC (and other ccs) got support for this only during slow start, with
> commit 9f9843a751d0a2057f9f3d313886e7e5e6ebaac9
> ("tcp: properly handle stretch acks in slow start").
>
> I guess it is time to also handle the congestion avoidance phase.

Are you saying that at long last, delayed acks as we knew them are
dead, dead, dead?

> With the following patch (very close to what we use here at Google) I
> reached 37Gbps instead of 20Gbps:
>
> ethtool -C eth1 tx-usecs 4 tx-frames 4

What is the default here? What happens with the default here?

> DUMP_TCP_INFO=1 ./netperf -H remote -T2,2 -t TCP_STREAM -l 20
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to remote () port 0 AF_INET : cpu bind
> rto=201000 ato=0 pmtu=1500 rcv_ssthresh=29200 rtt=67 rttvar=6 snd_ssthresh=263 cwnd=265 reordering=3 total_retrans=4569 ca_state=0

The above statistics are not dumped by my netperf, and they look extremely
desirable to capture in netperf-wrapper. Is this a script parsing some
other kernel data at the conclusion of the run, or a better netperf?

If ECN was on the bottleneck link, I imagine total_retrans would be 0,
or are packets getting dropped in the kernel?

> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
>
>  87380  16384  16384    20.00    37213.05
>
> I guess this is a world record; my previous one was 34Gbps.
>
>  include/net/tcp.h    |    2
>  net/ipv4/tcp_cong.c  |    4 +
>  net/ipv4/tcp_cubic.c |   91 +++++++++++++++++++----------------------
>  3 files changed, 47 insertions(+), 50 deletions(-)
>
> [quoted patch snipped; the full patch appears at the end of this page]
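For readers following the bookkeeping, the heart of the tcp_slow_start() change in the patch (quoted in full at the end of this page) can be modeled in a few lines of standalone C. This is an illustrative userspace sketch, not the kernel code: the struct, the min_u32() helper, and the example numbers are made up for the demonstration.

#include <stdio.h>

typedef unsigned int u32;

struct tp {
        u32 snd_cwnd;
        u32 snd_ssthresh;
        u32 snd_cwnd_clamp;
};

static u32 min_u32(u32 a, u32 b)
{
        return a < b ? a : b;
}

/* Grow cwnd by at most 'acked' segments, capped at ssthresh + 1, and
 * return how many acked segments were left unconsumed, so the caller
 * can spend them in congestion avoidance mode.
 */
static u32 slow_start(struct tp *tp, u32 acked)
{
        u32 cwnd = tp->snd_cwnd + acked;

        if (cwnd > tp->snd_ssthresh)
                cwnd = tp->snd_ssthresh + 1;
        acked -= cwnd - tp->snd_cwnd;   /* acks consumed by slow start */
        tp->snd_cwnd = min_u32(cwnd, tp->snd_cwnd_clamp);
        return acked;                   /* leftover for CA mode */
}

int main(void)
{
        /* One GRO stretch ACK covering 45 segments arrives near ssthresh. */
        struct tp tp = { .snd_cwnd = 95, .snd_ssthresh = 100,
                         .snd_cwnd_clamp = 0xFFFFFFFFu };
        u32 leftover = slow_start(&tp, 45);

        /* Prints cwnd=101 leftover=39: slow start consumed 6 acks to
         * reach ssthresh + 1, and 39 remain for congestion avoidance.
         */
        printf("cwnd=%u leftover=%u\n", tp.snd_cwnd, leftover);
        return 0;
}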
On Mon, 2015-01-19 at 18:37 -0800, Dave Taht wrote:
> On Mon, Jan 19, 2015 at 6:16 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > I guess it is time to also handle the congestion avoidance phase.
>
> Are you saying that at long last, delayed acks as we knew them are
> dead, dead, dead?

Sorry, I cannot parse what you are saying.

In case you missed it, it has nothing to do with delayed ACKs but with
GRO on the receiver.

> > With the following patch (very close to what we use here at Google) I
> > reached 37Gbps instead of 20Gbps:
> >
> > ethtool -C eth1 tx-usecs 4 tx-frames 4
>
> What is the default here?

16 & 16, see my prior answer in this thread.

> What happens with the default here?

ethtool -C eth1 tx-usecs 16 tx-frames 16

DUMP_TCP_INFO=1 ./netperf -H remote -T2,2 -t TCP_STREAM -l 20
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to remote () port 0 AF_INET : cpu bind
rto=201000 ato=0 pmtu=1500 rcv_ssthresh=29200 rtt=60 rttvar=2 snd_ssthresh=179 cwnd=243 reordering=3 total_retrans=23 ca_state=0

Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    20.00    22923.74

> > DUMP_TCP_INFO=1 ./netperf -H remote -T2,2 -t TCP_STREAM -l 20
> > MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to remote () port 0 AF_INET : cpu bind
> > rto=201000 ato=0 pmtu=1500 rcv_ssthresh=29200 rtt=67 rttvar=6 snd_ssthresh=263 cwnd=265 reordering=3 total_retrans=4569 ca_state=0
>
> The above statistics are not dumped by my netperf, and they look extremely
> desirable to capture in netperf-wrapper. Is this a script parsing some
> other kernel data at the conclusion of the run, or a better netperf?

That's a 3-line patch in netperf, actually.

> If ECN was on the bottleneck link, I imagine total_retrans would be 0,
> or are packets getting dropped in the kernel?

The receiver drops frames, because we are at the limit of what the NIC
can do on a single RX queue.
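(For scale: GRO merges at most 64KB into one aggregate skb, so with a 1500-byte MTU and TCP timestamps the payload per segment is 1448 bytes, and a receiver can coalesce roughly 65535 / 1448, about 45 segments, into a single aggregate. That is presumably where the ~45 MSS per ACK figure earlier in the thread comes from.)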
>> Are you saying that at long last, delayed acks as we knew them are
>> dead, dead, dead?
>
> Sorry, I cannot parse what you are saying.
>
> In case you missed it, it has nothing to do with delayed ACKs but with
> GRO on the receiver.

Dave - assuming I've interpreted Eric's comments correctly, I believe
the answer to your question is no. Your desire for a world brimming
with ack-every-other purity has not been fulfilled :)

However, the engineers formerly at Mentat are probably pleased that a
functional near-equivalent to their ACK-avoidance heuristic has ended
up being implemented and tacitly accepted, albeit by the back door :)

>>> DUMP_TCP_INFO=1 ./netperf -H remote -T2,2 -t TCP_STREAM -l 20
>>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to remote () port 0 AF_INET : cpu bind
>>> rto=201000 ato=0 pmtu=1500 rcv_ssthresh=29200 rtt=67 rttvar=6 snd_ssthresh=263 cwnd=265 reordering=3 total_retrans=4569 ca_state=0
>>
>> The above statistics are not dumped by my netperf, and they look extremely
>> desirable to capture in netperf-wrapper. Is this a script parsing some
>> other kernel data at the conclusion of the run, or a better netperf?
>
> That's a 3-line patch in netperf, actually.

More stuff to pull from a TCP_INFO call, I presume? Feel free to drop
me a patch, though I'd probably want it to be in the guise of the omni
output selectors.

happy benchmarking,

rick
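Eric's three-line netperf patch is not shown in the thread, but the fields in the DUMP_TCP_INFO lines above map one-to-one onto Linux's struct tcp_info, which any program can read with getsockopt(TCP_INFO). The sketch below illustrates that kernel interface only; it is not netperf's actual change, and dump_tcp_info() plus the throwaway socket in main() are invented for the example.

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Print the same fields as the DUMP_TCP_INFO lines above for a TCP
 * socket 'fd'.
 */
static void dump_tcp_info(int fd)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0) {
                perror("getsockopt(TCP_INFO)");
                return;
        }
        printf("rto=%u ato=%u pmtu=%u rcv_ssthresh=%u rtt=%u rttvar=%u "
               "snd_ssthresh=%u cwnd=%u reordering=%u total_retrans=%u "
               "ca_state=%u\n",
               info.tcpi_rto, info.tcpi_ato, info.tcpi_pmtu,
               info.tcpi_rcv_ssthresh, info.tcpi_rtt, info.tcpi_rttvar,
               info.tcpi_snd_ssthresh, info.tcpi_snd_cwnd,
               info.tcpi_reordering, info.tcpi_total_retrans,
               info.tcpi_ca_state);
}

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0); /* stand-in for a test's data socket */

        if (fd < 0) {
                perror("socket");
                return 1;
        }
        dump_tcp_info(fd);
        close(fd);
        return 0;
}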
On Wed, 2015-01-21 at 12:26 +0000, David Laight wrote:
> From: Of Rick Jones
> > Dave - assuming I've interpreted Eric's comments correctly, I believe
> > the answer to your question is no. Your desire for a world brimming
> > with ack-every-other purity has not been fulfilled :)
> >
> > However, the engineers formerly at Mentat are probably pleased that a
> > functional near-equivalent to their ACK-avoidance heuristic has ended
> > up being implemented and tacitly accepted, albeit by the back door :)
>
> I must recheck something I discovered a while back with more recent kernels.
> There has been a bad interaction between 'slow start' and 'delayed acks'
> when Nagle is disabled on 0-RTT local links with unidirectional traffic.
>
> 'Slow start' would refuse to send more than 4 messages until it received
> an ack (rather than 4 MSS of data).
> The receiving system wouldn't send an ack until the timer expired
> (or several MSS of data were received), by which time the sender could
> have a lot of data queued.
>
> Due to the 0 RTT and bursty nature of the data, 'slow start' happened
> all the time.

The following packetdrill test suggests that the current kernel sends up
to 10 messages without having to wait for any ACK (IW10):

// Set up production and experiment configs
`../common/defaults.sh`

// Establish a connection.
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0

0.100 < S 0:0(0) win 32792 <mss 1460,nop,wscale 7>
0.100 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 6>
0.110 < . 1:1(0) ack 1 win 257
0.110 accept(3, ..., ...) = 4

0.200 %{ assert tcpi_snd_cwnd == 10 }%

+0    setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0

+0.01 write(4, ..., 100) = 100
+0    > P. 1:101(100) ack 1
+0.01 write(4, ..., 100) = 100
+0    > P. 101:201(100) ack 1
+0.01 write(4, ..., 100) = 100
+0    > P. 201:301(100) ack 1
+0.01 write(4, ..., 100) = 100
+0    > P. 301:401(100) ack 1
+0.01 write(4, ..., 100) = 100
+0    > P. 401:501(100) ack 1
+0.01 write(4, ..., 100) = 100
+0    > P. 501:601(100) ack 1
+0.01 write(4, ..., 100) = 100
+0    > P. 601:701(100) ack 1
+0.01 write(4, ..., 100) = 100
+0    > P. 701:801(100) ack 1
+0.01 write(4, ..., 100) = 100
+0    > P. 801:901(100) ack 1
+0.01 write(4, ..., 100) = 100
+0    > P. 901:1001(100) ack 1
diff --git a/include/net/tcp.h b/include/net/tcp.h
index b8fdc6bab3f3..05815fbb490f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -843,7 +843,7 @@ void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
 int tcp_set_congestion_control(struct sock *sk, const char *name);
-void tcp_slow_start(struct tcp_sock *tp, u32 acked);
+int tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
 
 u32 tcp_reno_ssthresh(struct sock *sk);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 63c29dba68a8..f0fc696b9333 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -360,13 +360,15 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
  * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
  * returns the leftover acks to adjust cwnd in congestion avoidance mode.
  */
-void tcp_slow_start(struct tcp_sock *tp, u32 acked)
+int tcp_slow_start(struct tcp_sock *tp, u32 acked)
 {
         u32 cwnd = tp->snd_cwnd + acked;
 
         if (cwnd > tp->snd_ssthresh)
                 cwnd = tp->snd_ssthresh + 1;
+        acked -= cwnd - tp->snd_cwnd;
         tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+        return acked;
 }
 EXPORT_SYMBOL_GPL(tcp_slow_start);
 
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 6b6002416a73..c0e048929b74 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -81,7 +81,6 @@ MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (mse
 
 /* BIC TCP Parameters */
 struct bictcp {
-        u32     cnt;            /* increase cwnd by 1 after ACKs */
         u32     last_max_cwnd;  /* last maximum snd_cwnd */
         u32     loss_cwnd;      /* congestion window at last loss */
         u32     last_cwnd;      /* the last snd_cwnd */
@@ -93,20 +92,18 @@ struct bictcp {
         u32     epoch_start;    /* beginning of an epoch */
         u32     ack_cnt;        /* number of acks */
         u32     tcp_cwnd;       /* estimated tcp cwnd */
-#define ACK_RATIO_SHIFT 4
-#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
-        u16     delayed_ack;    /* estimate the ratio of Packets/ACKs << 4 */
         u8      sample_cnt;     /* number of samples to decide curr_rtt */
         u8      found;          /* the exit point is found? */
         u32     round_start;    /* beginning of each round */
         u32     end_seq;        /* end_seq of the round */
         u32     last_ack;       /* last time when the ACK spacing is close */
         u32     curr_rtt;      /* the minimum rtt of current round */
+        u32     last_bic_target;/* last target cwnd computed by cubic
+                                 * (not tcp_friendliness mode) */
 };
 
 static inline void bictcp_reset(struct bictcp *ca)
 {
-        ca->cnt = 0;
         ca->last_max_cwnd = 0;
         ca->last_cwnd = 0;
         ca->last_time = 0;
@@ -114,7 +111,6 @@ static inline void bictcp_reset(struct bictcp *ca)
         ca->bic_K = 0;
         ca->delay_min = 0;
         ca->epoch_start = 0;
-        ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
         ca->ack_cnt = 0;
         ca->tcp_cwnd = 0;
         ca->found = 0;
@@ -205,12 +201,14 @@ static u32 cubic_root(u64 a)
 /*
  * Compute congestion window to use.
  */
-static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+static inline void bictcp_update(struct bictcp *ca, u32 pkts_acked, u32 cwnd)
 {
-        u32 delta, bic_target, max_cnt;
+        u32 delta, bic_target;
         u64 offs, t;
 
-        ca->ack_cnt++;  /* count the number of ACKs */
+        ca->ack_cnt += pkts_acked;      /* count the number of packets that
+                                         * have been ACKed
+                                         */
 
         if (ca->last_cwnd == cwnd &&
             (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
@@ -221,7 +219,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 
         if (ca->epoch_start == 0) {
                 ca->epoch_start = tcp_time_stamp;       /* record beginning */
-                ca->ack_cnt = 1;                        /* start counting */
+                ca->ack_cnt = pkts_acked;               /* start counting */
                 ca->tcp_cwnd = cwnd;                    /* syn with cubic */
 
                 if (ca->last_max_cwnd <= cwnd) {
@@ -269,19 +267,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
         else                                            /* above origin*/
                 bic_target = ca->bic_origin_point + delta;
 
-        /* cubic function - calc bictcp_cnt*/
-        if (bic_target > cwnd) {
-                ca->cnt = cwnd / (bic_target - cwnd);
-        } else {
-                ca->cnt = 100 * cwnd;                   /* very small increment*/
-        }
-
-        /*
-         * The initial growth of cubic function may be too conservative
-         * when the available bandwidth is still unknown.
-         */
-        if (ca->last_max_cwnd == 0 && ca->cnt > 20)
-                ca->cnt = 20;                           /* increase cwnd 5% per RTT */
+        ca->last_bic_target = bic_target;
 
         /* TCP Friendly */
         if (tcp_friendliness) {
@@ -292,18 +278,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
                         ca->ack_cnt -= delta;
                         ca->tcp_cwnd++;
                 }
-
-                if (ca->tcp_cwnd > cwnd) {      /* if bic is slower than tcp */
-                        delta = ca->tcp_cwnd - cwnd;
-                        max_cnt = cwnd / delta;
-                        if (ca->cnt > max_cnt)
-                                ca->cnt = max_cnt;
-                }
         }
-
-        ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
-        if (ca->cnt == 0)                       /* cannot be zero */
-                ca->cnt = 1;
 }
 
 static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
@@ -314,13 +289,43 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
         if (!tcp_is_cwnd_limited(sk))
                 return;
 
+        /* cwnd may first advance in slow start then move on to congestion
+         * control mode on a stretch ACK.
+         */
         if (tp->snd_cwnd <= tp->snd_ssthresh) {
                 if (hystart && after(ack, ca->end_seq))
                         bictcp_hystart_reset(sk);
-                tcp_slow_start(tp, acked);
-        } else {
-                bictcp_update(ca, tp->snd_cwnd);
-                tcp_cong_avoid_ai(tp, ca->cnt);
+                acked = tcp_slow_start(tp, acked);
+        }
+
+        if (acked && tp->snd_cwnd > tp->snd_ssthresh) {
+                u32 target, cnt;
+
+                bictcp_update(ca, acked, tp->snd_cwnd);
+                /* Compute target cwnd based on bic_target and tcp_cwnd
+                 * (whichever is faster)
+                 */
+                target = (ca->last_bic_target >= ca->tcp_cwnd) ?
+                         ca->last_bic_target : ca->tcp_cwnd;
+                while (acked > 0) {
+                        if (target > tp->snd_cwnd)
+                                cnt = tp->snd_cwnd / (target - tp->snd_cwnd);
+                        else
+                                cnt = 100 * tp->snd_cwnd;
+
+                        /* The initial growth of cubic function may be
+                         * too conservative when the available
+                         * bandwidth is still unknown.
+                         */
+                        if (ca->last_max_cwnd == 0 && cnt > 20)
+                                cnt = 20;       /* increase cwnd 5% per RTT */
+
+                        if (cnt == 0)           /* cannot be zero */
+                                cnt = 1;
+
+                        tcp_cong_avoid_ai(tp, cnt);
+                        acked--;
+                }
         }
 }
 
@@ -411,20 +416,10 @@ static void hystart_update(struct sock *sk, u32 delay)
  */
 static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
 {
-        const struct inet_connection_sock *icsk = inet_csk(sk);
         const struct tcp_sock *tp = tcp_sk(sk);
         struct bictcp *ca = inet_csk_ca(sk);
         u32 delay;
 
-        if (icsk->icsk_ca_state == TCP_CA_Open) {
-                u32 ratio = ca->delayed_ack;
-
-                ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
-                ratio += cnt;
-
-                ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
-        }
-
         /* Some calls are for duplicates without timetamps */
         if (rtt_us < 0)
                 return;
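To get a concrete feel for what the patched congestion-avoidance path does with a stretch ACK, the loop at the heart of the new bictcp_cong_avoid() can be modeled in userspace. This sketch makes simplifying assumptions: 'target' is held constant instead of being recomputed by bictcp_update(), the snd_cwnd_clamp and last_max_cwnd checks are dropped, and the starting numbers are arbitrary.

#include <stdio.h>

typedef unsigned int u32;

struct tp {
        u32 snd_cwnd;
        u32 snd_cwnd_cnt;
};

/* Model of tcp_cong_avoid_ai() as it looked at the time (minus the
 * snd_cwnd_clamp check): bump cwnd by one once 'w' per-ack credits
 * have accumulated.
 */
static void cong_avoid_ai(struct tp *tp, u32 w)
{
        if (tp->snd_cwnd_cnt >= w) {
                tp->snd_cwnd++;
                tp->snd_cwnd_cnt = 0;
        } else {
                tp->snd_cwnd_cnt++;
        }
}

int main(void)
{
        struct tp tp = { .snd_cwnd = 265, .snd_cwnd_cnt = 0 };
        u32 target = 300;       /* stand-in for max(last_bic_target, tcp_cwnd) */
        u32 acked = 45;         /* one GRO stretch ACK worth of segments */
        u32 cnt;

        /* The patched loop: every segment covered by the stretch ACK
         * earns the same credit it would have earned as an individual
         * ACK, instead of the whole ACK counting as one.
         */
        while (acked > 0) {
                if (target > tp.snd_cwnd)
                        cnt = tp.snd_cwnd / (target - tp.snd_cwnd);
                else
                        cnt = 100 * tp.snd_cwnd;
                if (cnt == 0)   /* cannot be zero */
                        cnt = 1;
                cong_avoid_ai(&tp, cnt);
                acked--;
        }
        /* With these numbers, cwnd grows from 265 to 270 in one call. */
        printf("cwnd after one 45-segment stretch ACK: %u\n", tp.snd_cwnd);
        return 0;
}

Without the fix, this single stretch ACK would have advanced snd_cwnd_cnt by one instead of forty-five, which is exactly the kind of under-counting that kept the flow stuck near 20Gbps in the measurements above.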