Message ID | 4B701EE2.1000006@simula.no |
---|---|
State | Superseded, archived |
Delegated to: | David Miller |
Headers | show |
Andreas Petlund schrieb: > Major change: Limit number of thin linear timeout tries to TCP_THIN_LT_RETRIES (currently 6). > >>From ec71404702149bc9197c749e5d1d68656c87f98f Mon Sep 17 00:00:00 2001 > From: Andreas Petlund <apetlund@simula.no> > Date: Mon, 8 Feb 2010 14:05:53 +0100 > Subject: [PATCH 2/3] net: TCP thin linear timeouts > > > Signed-off-by: Andreas Petlund <apetlund@simula.no> > --- > include/linux/sysctl.h | 1 + > include/linux/tcp.h | 3 +++ > include/net/tcp.h | 4 ++++ > net/ipv4/sysctl_net_ipv4.c | 7 +++++++ > net/ipv4/tcp.c | 5 +++++ > net/ipv4/tcp_timer.c | 19 ++++++++++++++++++- > 6 files changed, 38 insertions(+), 1 deletions(-) > > diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h > index 9f236cd..d840d75 100644 > --- a/include/linux/sysctl.h > +++ b/include/linux/sysctl.h > @@ -425,6 +425,7 @@ enum > NET_TCP_ALLOWED_CONG_CONTROL=123, > NET_TCP_MAX_SSTHRESH=124, > NET_TCP_FRTO_RESPONSE=125, > + NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS=126, > }; > > enum { > diff --git a/include/linux/tcp.h b/include/linux/tcp.h > index 7fee8a4..67da706 100644 > --- a/include/linux/tcp.h > +++ b/include/linux/tcp.h > @@ -103,6 +103,7 @@ enum { > #define TCP_CONGESTION 13 /* Congestion control algorithm */ > #define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ > #define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */ > +#define TCP_THIN_LT 16 /* Use linear timeouts for thin streams*/ > > /* for TCP_INFO socket option */ > #define TCPI_OPT_TIMESTAMPS 1 > @@ -341,6 +342,8 @@ struct tcp_sock { > u16 advmss; /* Advertised MSS */ > u8 frto_counter; /* Number of new acks after RTO */ > u8 nonagle; /* Disable Nagle algorithm? */ > + u8 thin_lt : 1,/* Use linear timeouts for thin streams */ > + thin_undef : 7; > > /* RTT measurement */ > u32 srtt; /* smoothed round trip time << 3 */ > diff --git a/include/net/tcp.h b/include/net/tcp.h > index e5e2056..bc5856a 100644 > --- a/include/net/tcp.h > +++ b/include/net/tcp.h > @@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); > #define TCP_NAGLE_CORK 2 /* Socket is corked */ > #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ > > +/* TCP thin-stream limits */ > +#define TCP_THIN_LT_RETRIES 6 /* After 6 linear retries, do exp. backoff */ > + > extern struct inet_timewait_death_row tcp_death_row; > > /* sysctl variables for tcp */ > @@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows; > extern int sysctl_tcp_slow_start_after_idle; > extern int sysctl_tcp_max_ssthresh; > extern int sysctl_tcp_cookie_size; > +extern int sysctl_tcp_force_thin_linear_timeouts; > > extern atomic_t tcp_memory_allocated; > extern struct percpu_counter tcp_sockets_allocated; > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c > index 7e3712c..cb2ed35 100644 > --- a/net/ipv4/sysctl_net_ipv4.c > +++ b/net/ipv4/sysctl_net_ipv4.c > @@ -576,6 +576,13 @@ static struct ctl_table ipv4_table[] = { > .proc_handler = proc_dointvec > }, > { > + .procname = "tcp_force_thin_linear_timeouts", > + .data = &sysctl_tcp_force_thin_linear_timeouts, > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = proc_dointvec > + }, > + { > .procname = "udp_mem", > .data = &sysctl_udp_mem, > .maxlen = sizeof(sysctl_udp_mem), > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index d5d69ea..cbc1ee3 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -2229,6 +2229,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, > } > break; > > + case TCP_THIN_LT: > + if (val) > + tp->thin_lt = 1; > + break; > + > case TCP_CORK: > /* When set indicates to always queue non-full frames. > * Later the user clears this option and we transmit > diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c > index de7d1bf..f01a585 100644 > --- a/net/ipv4/tcp_timer.c > +++ b/net/ipv4/tcp_timer.c > @@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL; > int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; > int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; > int sysctl_tcp_orphan_retries __read_mostly; > +int sysctl_tcp_force_thin_linear_timeouts __read_mostly; > > static void tcp_write_timer(unsigned long); > static void tcp_delack_timer(unsigned long); > @@ -415,7 +416,23 @@ void tcp_retransmit_timer(struct sock *sk) > icsk->icsk_retransmits++; > > out_reset_timer: > - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); > + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is > + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this > + * might be increased if the stream oscillates between thin and thick, > + * thus the old value might already be too high compared to the value > + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without > + * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential > + * backoff behaviour to avoid continue hammering linear-timeout > + * retransmissions into a black hole*/ > + if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) && > + tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED && > + icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) { > + icsk->icsk_backoff = 0; Hi, I think, this value should be at least 1, as icsk_backoff might be decreased to -1 and used for bit-shifting in tcp_v4_err(). A lower boundary check might be even better. Regards Damian > + icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); > + } else { > + /* Use normal (exponential) backoff */ > + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); > + } > inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); > if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) > __sk_dst_reset(sk); -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le lundi 08 février 2010 à 15:25 +0100, Andreas Petlund a écrit : > > + case TCP_THIN_LT: > + if (val) > + tp->thin_lt = 1; > + break; > + Why not allowing user to clear thin_lt ? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 02/08/2010 09:50 PM, Damian Lukowski wrote: >> out_reset_timer: >> - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); >> + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is >> + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this >> + * might be increased if the stream oscillates between thin and thick, >> + * thus the old value might already be too high compared to the value >> + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without >> + * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential >> + * backoff behaviour to avoid continue hammering linear-timeout >> + * retransmissions into a black hole*/ >> + if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) && >> + tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED && >> + icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) { >> + icsk->icsk_backoff = 0; > > Hi, > I think, this value should be at least 1, as icsk_backoff > might be decreased to -1 and used for bit-shifting in tcp_v4_err(). > A lower boundary check might be even better. Hi Thanks for the feedback. As far as I can see, the check a couple of lines above the decrementation stops the icsk->icsk_backoff from being decremented if already zero. Beyond that I cannot find any more places where this situation may arise. Please correct me if I'm wrong and a boundary check is indeed warranted. Excerpt from tcp_ipv4.c ------------------ if (seq != tp->snd_una || !icsk->icsk_retransmits || !icsk->icsk_backoff) break; icsk->icsk_backoff--; inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << icsk->icsk_backoff; ------------------ Best regards, Andreas -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 02/09/2010 07:31 AM, Eric Dumazet wrote: > Le lundi 08 février 2010 à 15:25 +0100, Andreas Petlund a écrit : >> >> + case TCP_THIN_LT: >> + if (val) >> + tp->thin_lt = 1; >> + break; >> + > > Why not allowing user to clear thin_lt ? > > That is a very good point. I will fix that in the next iteration, as well as error handling for out of bounds values. Best regards, Andreas -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Am 09.02.2010, 17:40 Uhr, schrieb Andreas Petlund <apetlund@simula.no>: > On 02/08/2010 09:50 PM, Damian Lukowski wrote: >>> out_reset_timer: >>> - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); >>> + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is >>> + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this >>> + * might be increased if the stream oscillates between thin and >>> thick, >>> + * thus the old value might already be too high compared to the value >>> + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without >>> + * backoff. Limit to TCP_THIN_LT_RETRIES before initiating >>> exponential >>> + * backoff behaviour to avoid continue hammering linear-timeout >>> + * retransmissions into a black hole*/ >>> + if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) && >>> + tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED && >>> + icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) { >>> + icsk->icsk_backoff = 0; >> >> Hi, >> I think, this value should be at least 1, as icsk_backoff >> might be decreased to -1 and used for bit-shifting in tcp_v4_err(). >> A lower boundary check might be even better. > > Hi > > Thanks for the feedback. > > As far as I can see, the check a couple of lines above the decrementation > stops the icsk->icsk_backoff from being decremented if already zero. > Beyond that I cannot find any more places where this situation may arise. > Please correct me if I'm wrong and a boundary check is indeed warranted. Oops, you are right, of course ... I just had in mind, that a thin stream might also be a candidate for backoff reversion when connectivity breaks down, that's why I said "at least 1". And I really have forgotten the already existing check, sorry. So, setting icsk_backoff = 0 will prevent a backoff reversion, but that's ok, as the RTO is not doubled in the first place. It might have been an issue, if you had not used __tcp_set_rto() but left the value unchanged *and* a non-thin stream became thin at some point in the RTO retransmission phase (if that is even possible). Damian -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 9f236cd..d840d75 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -425,6 +425,7 @@ enum NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, NET_TCP_FRTO_RESPONSE=125, + NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS=126, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7fee8a4..67da706 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -103,6 +103,7 @@ enum { #define TCP_CONGESTION 13 /* Congestion control algorithm */ #define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ #define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */ +#define TCP_THIN_LT 16 /* Use linear timeouts for thin streams*/ /* for TCP_INFO socket option */ #define TCPI_OPT_TIMESTAMPS 1 @@ -341,6 +342,8 @@ struct tcp_sock { u16 advmss; /* Advertised MSS */ u8 frto_counter; /* Number of new acks after RTO */ u8 nonagle; /* Disable Nagle algorithm? */ + u8 thin_lt : 1,/* Use linear timeouts for thin streams */ + thin_undef : 7; /* RTT measurement */ u32 srtt; /* smoothed round trip time << 3 */ diff --git a/include/net/tcp.h b/include/net/tcp.h index e5e2056..bc5856a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ +/* TCP thin-stream limits */ +#define TCP_THIN_LT_RETRIES 6 /* After 6 linear retries, do exp. backoff */ + extern struct inet_timewait_death_row tcp_death_row; /* sysctl variables for tcp */ @@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_max_ssthresh; extern int sysctl_tcp_cookie_size; +extern int sysctl_tcp_force_thin_linear_timeouts; extern atomic_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7e3712c..cb2ed35 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -576,6 +576,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_force_thin_linear_timeouts", + .data = &sysctl_tcp_force_thin_linear_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d5d69ea..cbc1ee3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2229,6 +2229,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, } break; + case TCP_THIN_LT: + if (val) + tp->thin_lt = 1; + break; + case TCP_CORK: /* When set indicates to always queue non-full frames. * Later the user clears this option and we transmit diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index de7d1bf..f01a585 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; int sysctl_tcp_orphan_retries __read_mostly; +int sysctl_tcp_force_thin_linear_timeouts __read_mostly; static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); @@ -415,7 +416,23 @@ void tcp_retransmit_timer(struct sock *sk) icsk->icsk_retransmits++; out_reset_timer: - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this + * might be increased if the stream oscillates between thin and thick, + * thus the old value might already be too high compared to the value + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without + * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential + * backoff behaviour to avoid continue hammering linear-timeout + * retransmissions into a black hole*/ + if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) && + tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED && + icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) { + icsk->icsk_backoff = 0; + icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); + } else { + /* Use normal (exponential) backoff */ + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) __sk_dst_reset(sk);