Message ID | 20160614191841.21496-1-dmetz@mytum.de |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On Tue, Jun 14, 2016 at 12:18 PM, Daniel Metz <dmetz@mytum.de> wrote: > From: Daniel Metz <daniel.metz@rohde-schwarz.com> > > This patch adjusts Linux RTO calculation to be RFC6298 Standard > compliant. MinRTO is no longer added to the computed RTO, RTO damping > and overestimation are decreased. > > In RFC 6298 Standard TCP Retransmission Timeout (RTO) calculation the > calculated RTO is rounded up to the Minimum RTO (MinRTO), if it is > less. The Linux implementation as a discrepancy to the Standard > basically adds the defined MinRTO to the calculated RTO. When > comparing both approaches, the Linux calculation seems to perform > worse for sender limited TCP flows like Telnet, SSH or constant bit > rate encoded transmissions, especially for Round Trip Times (RTT) of > 50ms to 800ms. > > Compared to the Linux implementation the RFC 6298 proposed RTO > calculation performs better and more precise in adapting to current > network characteristics. Extensive measurements for bulk data did not > show a negative impact of the adjusted calculation. > > Exemplary performance comparison for sender-limited-flows: > > - Rate: 10Mbit/s > - Delay: 200ms, Delay Variation: 10ms > - Time between each scheduled segment: 1s > - Amount Data Segments: 300 > - Mean of 11 runs > > Mean Response Waiting Time [milliseconds] > > PER [%] | 0.5 1 1.5 2 3 5 7 10 > --------+------------------------------------------------------- > old | 206.4 208.6 218.0 218.6 227.2 249.3 274.7 308.2 > new | 203.9 206.0 207.0 209.9 217.3 225.6 238.7 259.1 > > > Detailed analysis: > https://docs.google.com/document/d/1pKmPfnQb6fDK4qpiNVkN8cQyGE4wYDZukcuZfR-BnnM/ > > Reasoning for historic design: > Sarolahti, P.; Kuznetsov, A. (2002). Congestion Control in Linux TCP. > Conference Paper. Proceedings of the FREENIX Track. 
2002 USENIX Annual > https://www.cs.helsinki.fi/research/iwtcp/papers/linuxtcp.pdf > > > Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> > Signed-off-by: Daniel Metz <dmetz@mytum.de> > Cc: Eric Dumazet <edumazet@google.com> > Cc: Yuchung Cheng <ycheng@google.com> > --- > > v2: > - Using the RFC 6298 compliant implementation, the tcp_sock struct variable > u32 mdev_max_us becomes obsolete and consequently is being removed. > - Add reference to Kuznetsov paper > > > include/linux/tcp.h | 1 - > net/ipv4/tcp_input.c | 74 ++++++++++++-------------------------------------- > net/ipv4/tcp_metrics.c | 2 +- > 3 files changed, 18 insertions(+), 59 deletions(-) > > diff --git a/include/linux/tcp.h b/include/linux/tcp.h > index 7be9b12..d1790c5 100644 > --- a/include/linux/tcp.h > +++ b/include/linux/tcp.h > @@ -231,7 +231,6 @@ struct tcp_sock { > /* RTT measurement */ > u32 srtt_us; /* smoothed round trip time << 3 in usecs */ > u32 mdev_us; /* medium deviation */ > - u32 mdev_max_us; /* maximal mdev for the last rtt period */ > u32 rttvar_us; /* smoothed mdev_max */ > u32 rtt_seq; /* sequence number to update rttvar */ > struct rtt_meas { > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index 94d4aff..0d53537 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -680,8 +680,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) > /* Called to compute a smoothed rtt estimate. The data fed to this > * routine either comes from timestamps, or from segments that were > * known _not_ to have been retransmitted [see Karn/Partridge > - * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 > - * piece by Van Jacobson. > + * Proceedings SIGCOMM 87]. > * NOTE: the next three routines used to be one big routine. > * To save cycles in the RFC 1323 implementation it was better to break > * it up into three procedures. 
-- erics > @@ -692,59 +691,21 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) > long m = mrtt_us; /* RTT */ > u32 srtt = tp->srtt_us; > > - /* The following amusing code comes from Jacobson's > - * article in SIGCOMM '88. Note that rtt and mdev > - * are scaled versions of rtt and mean deviation. > - * This is designed to be as fast as possible > - * m stands for "measurement". > - * > - * On a 1990 paper the rto value is changed to: > - * RTO = rtt + 4 * mdev > - * > - * Funny. This algorithm seems to be very broken. > - * These formulae increase RTO, when it should be decreased, increase > - * too slowly, when it should be increased quickly, decrease too quickly > - * etc. I guess in BSD RTO takes ONE value, so that it is absolutely > - * does not matter how to _calculate_ it. Seems, it was trap > - * that VJ failed to avoid. 8) > - */ > if (srtt != 0) { > - m -= (srtt >> 3); /* m is now error in rtt est */ > - srtt += m; /* rtt = 7/8 rtt + 1/8 new */ > - if (m < 0) { > - m = -m; /* m is now abs(error) */ > - m -= (tp->mdev_us >> 2); /* similar update on mdev */ > - /* This is similar to one of Eifel findings. > - * Eifel blocks mdev updates when rtt decreases. > - * This solution is a bit different: we use finer gain > - * for mdev in this case (alpha*beta). > - * Like Eifel it also prevents growth of rto, > - * but also it limits too fast rto decreases, > - * happening in pure Eifel. 
> - */ > - if (m > 0) > - m >>= 3; > - } else { > - m -= (tp->mdev_us >> 2); /* similar update on mdev */ > - } > - tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ > - if (tp->mdev_us > tp->mdev_max_us) { > - tp->mdev_max_us = tp->mdev_us; > - if (tp->mdev_max_us > tp->rttvar_us) > - tp->rttvar_us = tp->mdev_max_us; > - } > - if (after(tp->snd_una, tp->rtt_seq)) { > - if (tp->mdev_max_us < tp->rttvar_us) > - tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; > + m -= (srtt >> 3); /* m' = m - srtt / 8 = (R' - SRTT) */ > + srtt += m; /* srtt = srtt + m’ = srtt + m - srtt / 8 */ > + if (m < 0) > + m = -m; > + m -= (tp->mdev_us >> 2); /* m'' = |m'| - mdev / 4 */ > + tp->mdev_us += m; > + tp->rttvar_us = tp->mdev_us; > + if (after(tp->snd_una, tp->rtt_seq)) > tp->rtt_seq = tp->snd_nxt; > - tp->mdev_max_us = tcp_rto_min_us(sk); > - } > } else { > /* no previous measure. */ > - srtt = m << 3; /* take the measured time to be rtt */ > - tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ > - tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); > - tp->mdev_max_us = tp->rttvar_us; > + srtt = m << 3; > + tp->mdev_us = m << 1; > + tp->rttvar_us = tp->mdev_us; AFAICT we can update rttvar_us directly and don't need mdev_us anymore? > tp->rtt_seq = tp->snd_nxt; > } > tp->srtt_us = max(1U, srtt); > @@ -798,6 +759,7 @@ static void tcp_update_pacing_rate(struct sock *sk) > */ > static void tcp_set_rto(struct sock *sk) > { > + const u32 min_rto = tcp_rto_min_us(sk); > const struct tcp_sock *tp = tcp_sk(sk); > /* Old crap is replaced with new one. 8) > * > @@ -809,17 +771,15 @@ static void tcp_set_rto(struct sock *sk) > * is invisible. Actually, Linux-2.4 also generates erratic > * ACKs in some circumstances. 
> */ > - inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); > - > + if (((tp->srtt_us >> 3) + tp->rttvar_us) < min_rto) > + inet_csk(sk)->icsk_rto = usecs_to_jiffies(min_rto); > + else > + inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); This is more aggressive than RFC6298 that RTO <- SRTT + max (G, K*RTTVAR) where G = MIN_RTO = 200ms based on our discussion, in the spirit of keeping RTO more conservative, I recommend we implement RFC formula. Acks being delayed over 200ms is not uncommon (unfortunately due to bloat or other issues). Also I think we should change __tcp_set_rto so that the formula applies to backoffs or ICMP timeouts calculations too. > /* 2. Fixups made earlier cannot be right. > * If we do not estimate RTO correctly without them, > * all the algo is pure shit and should be replaced > * with correct one. It is exactly, which we pretend to do. > */ > - > - /* NOTE: clamping at TCP_RTO_MIN is not required, current algo > - * guarantees that rto is higher. > - */ > tcp_bound_rto(sk); > } > > diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c > index b617826..7f59f5b 100644 > --- a/net/ipv4/tcp_metrics.c > +++ b/net/ipv4/tcp_metrics.c > @@ -561,7 +561,7 @@ reset: > * retransmission. > */ > tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); > - tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; > + tp->mdev_us = tp->rttvar_us; > > inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; > } > -- > 2.8.3 >
* Yuchung Cheng | 2016-06-14 14:33:18 [-0700]: >> + tp->rttvar_us = tp->mdev_us; >AFAICT we can update rttvar_us directly and don't need mdev_us anymore? Yes, v3 will remove mdev_us. >This is more aggressive than RFC6298 that RTO <- SRTT + max (G, >K*RTTVAR) where G = MIN_RTO = 200ms > >based on our discussion, in the spirit of keeping RTO more >conservative, I recommend we implement RFC formula. Acks being delayed >over 200ms is not uncommon (unfortunately due to bloat or other >issues). > >Also I think we should change __tcp_set_rto so that the formula >applies to backoffs or ICMP timeouts calculations too. We are unsure what you mean, Yuchung. We believe this patch not to be more aggressive than RFC 6298. In fact, we believe it to be RFC 6298 compliant, as in RFC 6298, G is the clock granularity and we don’t see where it deviates from the RFC. However, it is more aggressive than “RTO <- SRTT + max (G, K*RTTVAR) where G = MIN_RTO = 200ms”. Which formula do you want to implement? Hagen
On Wed, Jun 15, 2016 at 10:41 AM, Hagen Paul Pfeifer <hagen@jauu.net> wrote: > > * Yuchung Cheng | 2016-06-14 14:33:18 [-0700]: > > >> + tp->rttvar_us = tp->mdev_us; > >AFAICT we can update rttvar_us directly and don't need mdev_us anymore? > > Yes, v3 will remove mdev_us. > > >This is more aggressive than RFC6298 that RTO <- SRTT + max (G, > >K*RTTVAR) where G = MIN_RTO = 200ms > > > >based on our discussion, in the spirit of keeping RTO more > >conservative, I recommend we implement RFC formula. Acks being delayed > >over 200ms is not uncommon (unfortunately due to bloat or other > >issues). > > > >Also I think we should change __tcp_set_rto so that the formula > >applies to backoffs or ICMP timeouts calculations too. > > We are a unsure what you mean Yuchung. We believe this patch not to be more > aggressive than RFC 6298. In fact, we believe it to be RFC 6298 compliant, as > in RFC 6298, G is the clock granularity and we don’t see where it deviates > from the RFC. However, it is more aggressive than “RTO <- SRTT + max (G, > K*RTTVAR) where G = MIN_RTO = 200ms”. Which formula do you want to implement? Let me explain in a different way: * RFC6298 applies a lower bound of 1 second to RTO (section 2.4) * Linux currently applies a lower bound of 200ms (min_rto) to K*RTTVAR, but /not/ RTO itself. * This patch applies the lower bound of 200ms to RTO, similar to RFC6298 Let's say the SRTT is 100ms and RTT variations is 10ms. The variation is low because we've been sending large chunks, and RTT is fairly stable, and we sample on every ACK. The RTOs produced are RFC6298: RTO=1s Linux: RTO=300ms This patch: RTO=200ms Then we send 1 packet out. The receiver delays the ACK up to 200ms. The actual RTT can be longer because other network components further delay the data or the ACK. This patch would surely fire the RTO spuriously. so we can either implement RFC6298 faithfully, or apply the lower-bound as-is, or something in between. 
But the current patch as-is is more aggressive. Did I miss something? > > > Hagen
Yuchung Cheng | 2016-06-15 20:02: > Let me explain in a different way: > > * RFC6298 applies a lower bound of 1 second to RTO (section 2.4) > > * Linux currently applies a lower bound of 200ms (min_rto) to > K*RTTVAR, but /not/ RTO itself. > > * This patch applies the lower bound of 200ms to RTO, similar to RFC6298 > > > Let's say the SRTT is 100ms and RTT variations is 10ms. The variation > is low because we've been sending large chunks, and RTT is fairly > stable, and we sample on every ACK. The RTOs produced are > > RFC6298: RTO=1s > Linux: RTO=300ms > This patch: RTO=200ms > > Then we send 1 packet out. The receiver delays the ACK up to 200ms. > The actual RTT can be longer because other network components further > delay the data or the ACK. This patch would surely fire the RTO > spuriously. > > so we can either implement RFC6298 faithfully, or apply the > lower-bound as-is, or something in between. But the current patch > as-is is more aggressive. Did I miss something? Thank you for the clarification. The fundamental thought of this patch was to decrease Linux RTO overestimation. This also involved not clinging to the RFC 6298 MinRTO of 1 second ((2.4) "[...] at the same time acknowledging that at some future point, research may show that a smaller minimum RTO is acceptable or superior."). A more aggressive RTO will of course increase the amount of Spurious Retransmission. The question is, if the benefit is higher than the sacrifice. The tests we conducted did not show significant negative impact so far. However, for sender-limited TCP flows the results were promising. Daniel
On Wed, Jun 15, 2016 at 1:34 PM, Daniel Metz <dmetz@mytum.de> wrote: > Yuchung Cheng | 2016-06-15 20:02: >> Let me explain in a different way: >> >> * RFC6298 applies a lower bound of 1 second to RTO (section 2.4) >> >> * Linux currently applies a lower bound of 200ms (min_rto) to >> K*RTTVAR, but /not/ RTO itself. >> >> * This patch applies the lower bound of 200ms to RTO, similar to RFC6298 >> >> >> Let's say the SRTT is 100ms and RTT variations is 10ms. The variation >> is low because we've been sending large chunks, and RTT is fairly >> stable, and we sample on every ACK. The RTOs produced are >> >> RFC6298: RTO=1s >> Linux: RTO=300ms >> This patch: RTO=200ms >> >> Then we send 1 packet out. The receiver delays the ACK up to 200ms. >> The actual RTT can be longer because other network components further >> delay the data or the ACK. This patch would surely fire the RTO >> spuriously. >> >> so we can either implement RFC6298 faithfully, or apply the >> lower-bound as-is, or something in between. But the current patch >> as-is is more aggressive. Did I miss something? > > Thank you for the clarification. The fundamental thought of this patch was > to decrease Linux RTO overestimation. This also involved not clinging to the > RFC 6298 MinRTO of 1 second ((2.4) "[...] at the same time acknowledging > that at some future point, research may show that a smaller minimum RTO is > acceptable or superior."). A more aggressive RTO will of course increase the > amount of Spurious Retransmission. The question is, if the benefit is higher > than the sacrifice. The tests we conducted did not show significant negative > impact so far. However, for sender-limited TCP flows the results were > promising. > I guess the problem is that some folks use smaller rto than RTAX_RTO_MIN , look at tcp_rto_min()
On Wed, Jun 15, 2016 at 1:38 PM, Eric Dumazet <edumazet@google.com> wrote: > > On Wed, Jun 15, 2016 at 1:34 PM, Daniel Metz <dmetz@mytum.de> wrote: > > Yuchung Cheng | 2016-06-15 20:02: > >> Let me explain in a different way: > >> > >> * RFC6298 applies a lower bound of 1 second to RTO (section 2.4) > >> > >> * Linux currently applies a lower bound of 200ms (min_rto) to > >> K*RTTVAR, but /not/ RTO itself. > >> > >> * This patch applies the lower bound of 200ms to RTO, similar to RFC6298 > >> > >> > >> Let's say the SRTT is 100ms and RTT variations is 10ms. The variation > >> is low because we've been sending large chunks, and RTT is fairly > >> stable, and we sample on every ACK. The RTOs produced are > >> > >> RFC6298: RTO=1s > >> Linux: RTO=300ms > >> This patch: RTO=200ms > >> > >> Then we send 1 packet out. The receiver delays the ACK up to 200ms. > >> The actual RTT can be longer because other network components further > >> delay the data or the ACK. This patch would surely fire the RTO > >> spuriously. > >> > >> so we can either implement RFC6298 faithfully, or apply the > >> lower-bound as-is, or something in between. But the current patch > >> as-is is more aggressive. Did I miss something? > > > > Thank you for the clarification. The fundamental thought of this patch was > > to decrease Linux RTO overestimation. This also involved not clinging to the > > RFC 6298 MinRTO of 1 second ((2.4) "[...] at the same time acknowledging > > that at some future point, research may show that a smaller minimum RTO is > > acceptable or superior."). A more aggressive RTO will of course increase the > > amount of Spurious Retransmission. The question is, if the benefit is higher > > than the sacrifice. The tests we conducted did not show significant negative > > impact so far. However, for sender-limited TCP flows the results were > > promising. 
> > > > I guess the problem is that some folks use smaller rto than > RTAX_RTO_MIN , look at tcp_rto_min() Also many other stacks (e.g., Windows until very recently) do not have 40ms delayed ACKs like Linux. One thing we at least know is that the current 200ms lower-bound on RTTVAR works for a long time. That's why I propose to do so. In other words, change the RTT variation averaging, but not the lower-bound. Will try to get the experiment going to test different min_rto values so we have more data.
> On June 15, 2016 at 8:02 PM Yuchung Cheng <ycheng@google.com> wrote: > > Let's say the SRTT is 100ms and RTT variations is 10ms. The variation > is low because we've been sending large chunks, and RTT is fairly > stable, and we sample on every ACK. The RTOs produced are > > RFC6298: RTO=1s > Linux: RTO=300ms > This patch: RTO=200ms > > Then we send 1 packet out. The receiver delays the ACK up to 200ms. > The actual RTT can be longer because other network components further > delay the data or the ACK. This patch would surely fire the RTO > spuriously. > > so we can either implement RFC6298 faithfully, or apply the > lower-bound as-is, or something in between. But the current patch > as-is is more aggressive. Did I miss something? We analyzed the impact for a wide variety of network characteristics, ranging from bulk data to chatty, sender-limited transmissions, from low RTTs to high RTTs, small and large variances, as well as different queue characteristics. For a group of tests we measured advantages of a RFC 6298 compliant implementation: sender-limited flows. For bulk data we did not measure any difference compared to standard Linux. As a result we concluded that the RFC-conformant implementation - mapped to real world protocols - is beneficial. For the mentioned use case, yes, the new implementation is a little bit more aggressive: when delayed ack kicks in, a spurious retransmission can be triggered, yes. We asked ourselves if this is a real world scenario or more a theoretical issue. Furthermore, if it is a real world problem, is the retransmission negligible compared to the advantages? Yuchung, can you test the patch and see if the patch has any downsides? And thank you for the comments! Hagen
> On June 15, 2016 at 10:38 PM Eric Dumazet <edumazet@google.com> wrote: > > I guess the problem is that some folks use smaller rto than > RTAX_RTO_MIN , look at tcp_rto_min() Due to the nature of the Linux calculation, this is probably more of a reason to use the RFC 6298 calculation. When a smaller MinRTO than 200ms is used, the Linux “advantage” to account for Delayed ACKs up to 200ms is decreased. Assuming a MinRTO of 0ms, the Linux ability and the RFC ability to account for sudden Delayed ACKs is pretty equal: zero. To illustrate this: RTT: 50ms, RTTVAR: 0ms, MinRTO: 50ms, Delayed ACKs: 200ms. Before any ACK is delayed: Linux RTO ~ 100+ms (tested) RFC 6298 RTO ~ 50+ms (tested) RTT of first delayed ACK if it is not shortened due to another data packet: ~250ms This is not tied to the RTT: RTT 1000ms, RTTVAR: 0ms, MinRTO: 50ms, Delayed ACKs: 200ms Before any ACK is delayed: Linux RTO ~ 1050+ms (tested) RFC 6298 RTO ~ 1000+ms (tested) RTT of first delayed ACK if it is not shortened due to another data packet: ~1200ms An RFC 6298 problem we ran into so far was with extremely steady RTTs and sender limited data. A Spurious Retransmission occurred from time to time in this case. Hagen
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7be9b12..d1790c5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -231,7 +231,6 @@ struct tcp_sock { /* RTT measurement */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ - u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ struct rtt_meas { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 94d4aff..0d53537 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -680,8 +680,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge - * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 - * piece by Van Jacobson. + * Proceedings SIGCOMM 87]. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics @@ -692,59 +691,21 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) long m = mrtt_us; /* RTT */ u32 srtt = tp->srtt_us; - /* The following amusing code comes from Jacobson's - * article in SIGCOMM '88. Note that rtt and mdev - * are scaled versions of rtt and mean deviation. - * This is designed to be as fast as possible - * m stands for "measurement". - * - * On a 1990 paper the rto value is changed to: - * RTO = rtt + 4 * mdev - * - * Funny. This algorithm seems to be very broken. - * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be increased quickly, decrease too quickly - * etc. I guess in BSD RTO takes ONE value, so that it is absolutely - * does not matter how to _calculate_ it. Seems, it was trap - * that VJ failed to avoid. 
8) - */ if (srtt != 0) { - m -= (srtt >> 3); /* m is now error in rtt est */ - srtt += m; /* rtt = 7/8 rtt + 1/8 new */ - if (m < 0) { - m = -m; /* m is now abs(error) */ - m -= (tp->mdev_us >> 2); /* similar update on mdev */ - /* This is similar to one of Eifel findings. - * Eifel blocks mdev updates when rtt decreases. - * This solution is a bit different: we use finer gain - * for mdev in this case (alpha*beta). - * Like Eifel it also prevents growth of rto, - * but also it limits too fast rto decreases, - * happening in pure Eifel. - */ - if (m > 0) - m >>= 3; - } else { - m -= (tp->mdev_us >> 2); /* similar update on mdev */ - } - tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ - if (tp->mdev_us > tp->mdev_max_us) { - tp->mdev_max_us = tp->mdev_us; - if (tp->mdev_max_us > tp->rttvar_us) - tp->rttvar_us = tp->mdev_max_us; - } - if (after(tp->snd_una, tp->rtt_seq)) { - if (tp->mdev_max_us < tp->rttvar_us) - tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; + m -= (srtt >> 3); /* m' = m - srtt / 8 = (R' - SRTT) */ + srtt += m; /* srtt = srtt + m’ = srtt + m - srtt / 8 */ + if (m < 0) + m = -m; + m -= (tp->mdev_us >> 2); /* m'' = |m'| - mdev / 4 */ + tp->mdev_us += m; + tp->rttvar_us = tp->mdev_us; + if (after(tp->snd_una, tp->rtt_seq)) tp->rtt_seq = tp->snd_nxt; - tp->mdev_max_us = tcp_rto_min_us(sk); - } } else { /* no previous measure. */ - srtt = m << 3; /* take the measured time to be rtt */ - tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ - tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); - tp->mdev_max_us = tp->rttvar_us; + srtt = m << 3; + tp->mdev_us = m << 1; + tp->rttvar_us = tp->mdev_us; tp->rtt_seq = tp->snd_nxt; } tp->srtt_us = max(1U, srtt); @@ -798,6 +759,7 @@ static void tcp_update_pacing_rate(struct sock *sk) */ static void tcp_set_rto(struct sock *sk) { + const u32 min_rto = tcp_rto_min_us(sk); const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 
8) * @@ -809,17 +771,15 @@ static void tcp_set_rto(struct sock *sk) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ - inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); - + if (((tp->srtt_us >> 3) + tp->rttvar_us) < min_rto) + inet_csk(sk)->icsk_rto = usecs_to_jiffies(min_rto); + else + inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ - - /* NOTE: clamping at TCP_RTO_MIN is not required, current algo - * guarantees that rto is higher. - */ tcp_bound_rto(sk); } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b617826..7f59f5b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -561,7 +561,7 @@ reset: * retransmission. */ tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); - tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; + tp->mdev_us = tp->rttvar_us; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; }