diff mbox

[1/1] net: tcp: RTO restart

Message ID 20140218181205.GA7780@kau.se
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Per Hurtig Feb. 18, 2014, 6:12 p.m. UTC
This patch implements the RTO restart modification described in
http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02

RTO Restart's goal is to provide quicker loss recovery for segments lost in the
end of a burst/connection. To accomplish this the algorithm adjusts the RTO
value on each rearm of the retransmission timer to be exactly RTO ms after the
earliest outstanding segment was sent. The offsetting against the earliest
outstanding segment is not done by the regular rearm algorithm, which causes
RTOs to occur, on average, after RTO+RTT ms.

As a faster timeout is only beneficial in scenarios where fast retransmit/early
retransmit cannot be triggered the algorithm will only kick in when there is a
small amount of outstanding segments.

The RTO Restart proposal is accepted as a working group item in the IETF TCP
Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
RFC status.

Signed-off-by: Per Hurtig <per.hurtig@kau.se>
---
 include/net/tcp.h          |    1 +
 net/ipv4/sysctl_net_ipv4.c |    7 +++++++
 net/ipv4/tcp_input.c       |   11 +++++++++++
 3 files changed, 19 insertions(+)

Comments

Yuchung Cheng Feb. 18, 2014, 6:46 p.m. UTC | #1
On Tue, Feb 18, 2014 at 10:12 AM, Per Hurtig <per.hurtig@kau.se> wrote:
> This patch implements the RTO restart modification described in
> http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02
>
> RTO Restart's goal is to provide quicker loss recovery for segments lost in the
> end of a burst/connection. To accomplish this the algorithm adjusts the RTO
> value on each rearm of the retransmission timer to be exactly RTO ms after the
> earliest outstanding segment was sent. The offsetting against the earliest
> outstanding segment is not done by the regular rearm algorithm, which causes
> RTOs to occur, on average, after RTO+RTT ms.
>
> As a faster timeout is only beneficial in scenarios where fast retransmit/early
> retransmit cannot be triggered the algorithm will only kick in when there is a
> small amount of outstanding segments.
(repost in plaine-text, sorry for the duplication)

I am not sure this works well with Linux min-RTO=200ms, and don't feel
comfortable this is default on without some real experiments.

I've implemented (a nearly identical version of) rto-restart on Google
web servers and found #timeouts increased by ~30%. Although the fast
timeout helps really short flows, it has a very negative side-effect:
resetting cwnd to 1. Thus the next HTTP response may start with a
small cwnd. In contrast, TCP loss probe has fast timeout (1.5RTT) to
do fast recovery to avoid the side-effect. In other words, I am
doubtful we need rto-restart with TCP loss probe, but applying
RTO-restart on TLP timer may be useful.

I've voiced this concern multiple times in ietf tcpm discussion when
we discuss this draft: that the idea itself is fine, but we'll need to
change Linux RTO algorithm together, not just the timer itself.

I am happy to post some more detailed data if people are interested.

>
> The RTO Restart proposal is accepted as a working group item in the IETF TCP
> Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
> RFC status.
>
> Signed-off-by: Per Hurtig <per.hurtig@kau.se>
> ---
>  include/net/tcp.h          |    1 +
>  net/ipv4/sysctl_net_ipv4.c |    7 +++++++
>  net/ipv4/tcp_input.c       |   11 +++++++++++
>  3 files changed, 19 insertions(+)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 56fc366..575e82a 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -278,6 +278,7 @@ extern int sysctl_tcp_slow_start_after_idle;
>  extern int sysctl_tcp_thin_linear_timeouts;
>  extern int sysctl_tcp_thin_dupack;
>  extern int sysctl_tcp_early_retrans;
> +extern int sysctl_tcp_rto_restart;
>  extern int sysctl_tcp_limit_output_bytes;
>  extern int sysctl_tcp_challenge_ack_limit;
>  extern unsigned int sysctl_tcp_notsent_lowat;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 44eba05..c605f4f 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -717,6 +717,13 @@ static struct ctl_table ipv4_table[] = {
>                 .extra2         = &four,
>         },
>         {
> +               .procname       = "tcp_rto_restart",
> +               .data           = &sysctl_tcp_rto_restart,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = proc_dointvec,
> +       },
> +       {
>                 .procname       = "tcp_min_tso_segs",
>                 .data           = &sysctl_tcp_min_tso_segs,
>                 .maxlen         = sizeof(int),
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 227cba7..450ee30 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -98,6 +98,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
>
>  int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
>  int sysctl_tcp_early_retrans __read_mostly = 3;
> +int sysctl_tcp_rto_restart __read_mostly = 1;
>
>  #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
>  #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
> @@ -2972,6 +2973,16 @@ void tcp_rearm_rto(struct sock *sk)
>                          */
>                         if (delta > 0)
>                                 rto = delta;
> +               } else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
> +                          sysctl_tcp_rto_restart &&
> +                          sk->sk_send_head == NULL &&
> +                          tp->packets_out < 4) {
> +                       struct sk_buff *skb = tcp_write_queue_head(sk);
> +                       const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
> +                       s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
> +
> +                       if (delta > 0)
> +                               rto -= delta;
>                 }
>                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
>                                           TCP_RTO_MAX);
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Per Hurtig Feb. 19, 2014, 10:50 a.m. UTC | #2
Hi Yuchung,

see inline

On Tue, Feb 18, 2014 at 10:46:18AM -0800, Yuchung Cheng wrote:
> On Tue, Feb 18, 2014 at 10:12 AM, Per Hurtig <per.hurtig@kau.se> wrote:
> > This patch implements the RTO restart modification described in
> > http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02
> >
> > RTO Restart's goal is to provide quicker loss recovery for segments lost in the
> > end of a burst/connection. To accomplish this the algorithm adjusts the RTO
> > value on each rearm of the retransmission timer to be exactly RTO ms after the
> > earliest outstanding segment was sent. The offsetting against the earliest
> > outstanding segment is not done by the regular rearm algorithm, which causes
> > RTOs to occur, on average, after RTO+RTT ms.
> >
> > As a faster timeout is only beneficial in scenarios where fast retransmit/early
> > retransmit cannot be triggered the algorithm will only kick in when there is a
> > small amount of outstanding segments.
> (repost in plaine-text, sorry for the duplication)
> 
> I am not sure this works well with Linux min-RTO=200ms, and don't feel
> comfortable this is default on without some real experiments.
> 
> I've implemented (a nearly identical version of) rto-restart on Google
> web servers and found #timeouts increased by ~30%. Although the fast
> timeout helps really short flows, it has a very negative side-effect:
> resetting cwnd to 1. Thus the next HTTP response may start with a
> small cwnd. In contrast, TCP loss probe has fast timeout (1.5RTT) to
> do fast recovery to avoid the side-effect. In other words, I am
> doubtful we need rto-restart with TCP loss probe, but applying
> RTO-restart on TLP timer may be useful.

As we have discussed, I agree that RTO restart can also be applied to TLP and
this is also mentioned in the ietf draft. However, I think they are also
complementary in reducing loss recovery delay.

> 
> I've voiced this concern multiple times in ietf tcpm discussion when
> we discuss this draft: that the idea itself is fine, but we'll need to
> change Linux RTO algorithm together, not just the timer itself.
> 
> I am happy to post some more detailed data if people are interested.

We would love to see the data. The last time we discussed this, you were not
able to find it. From the latest discussions in the tcpm group I understood you
didn't think it was that big of a problem anymore.


Best Regards,
Per Hurtig
> 
> >
> > The RTO Restart proposal is accepted as a working group item in the IETF TCP
> > Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
> > RFC status.
> >
> > Signed-off-by: Per Hurtig <per.hurtig@kau.se>
> > ---
> >  include/net/tcp.h          |    1 +
> >  net/ipv4/sysctl_net_ipv4.c |    7 +++++++
> >  net/ipv4/tcp_input.c       |   11 +++++++++++
> >  3 files changed, 19 insertions(+)
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index 56fc366..575e82a 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -278,6 +278,7 @@ extern int sysctl_tcp_slow_start_after_idle;
> >  extern int sysctl_tcp_thin_linear_timeouts;
> >  extern int sysctl_tcp_thin_dupack;
> >  extern int sysctl_tcp_early_retrans;
> > +extern int sysctl_tcp_rto_restart;
> >  extern int sysctl_tcp_limit_output_bytes;
> >  extern int sysctl_tcp_challenge_ack_limit;
> >  extern unsigned int sysctl_tcp_notsent_lowat;
> > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> > index 44eba05..c605f4f 100644
> > --- a/net/ipv4/sysctl_net_ipv4.c
> > +++ b/net/ipv4/sysctl_net_ipv4.c
> > @@ -717,6 +717,13 @@ static struct ctl_table ipv4_table[] = {
> >                 .extra2         = &four,
> >         },
> >         {
> > +               .procname       = "tcp_rto_restart",
> > +               .data           = &sysctl_tcp_rto_restart,
> > +               .maxlen         = sizeof(int),
> > +               .mode           = 0644,
> > +               .proc_handler   = proc_dointvec,
> > +       },
> > +       {
> >                 .procname       = "tcp_min_tso_segs",
> >                 .data           = &sysctl_tcp_min_tso_segs,
> >                 .maxlen         = sizeof(int),
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 227cba7..450ee30 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -98,6 +98,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
> >
> >  int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
> >  int sysctl_tcp_early_retrans __read_mostly = 3;
> > +int sysctl_tcp_rto_restart __read_mostly = 1;
> >
> >  #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
> >  #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
> > @@ -2972,6 +2973,16 @@ void tcp_rearm_rto(struct sock *sk)
> >                          */
> >                         if (delta > 0)
> >                                 rto = delta;
> > +               } else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
> > +                          sysctl_tcp_rto_restart &&
> > +                          sk->sk_send_head == NULL &&
> > +                          tp->packets_out < 4) {
> > +                       struct sk_buff *skb = tcp_write_queue_head(sk);
> > +                       const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
> > +                       s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
> > +
> > +                       if (delta > 0)
> > +                               rto -= delta;
> >                 }
> >                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
> >                                           TCP_RTO_MAX);
> > --
> > 1.7.9.5
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Petlund Feb. 19, 2014, 5:17 p.m. UTC | #3
I’ve applied and tested the patch on VMs with both 32 bit and 64 bit kernel.

Acked-by: Andreas Petlund <apetlund@simula.no>
Tested-by: Andreas Petlund <apetlund@simula.no>

Best regards,
Andreas Petlund

On 18 Feb 2014, at 19:12, Per Hurtig <per.hurtig@kau.se> wrote:

> This patch implements the RTO restart modification described in
> http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02
> 
> RTO Restart's goal is to provide quicker loss recovery for segments lost in the
> end of a burst/connection. To accomplish this the algorithm adjusts the RTO
> value on each rearm of the retransmission timer to be exactly RTO ms after the
> earliest outstanding segment was sent. The offsetting against the earliest
> outstanding segment is not done by the regular rearm algorithm, which causes
> RTOs to occur, on average, after RTO+RTT ms.
> 
> As a faster timeout is only beneficial in scenarios where fast retransmit/early
> retransmit cannot be triggered the algorithm will only kick in when there is a
> small amount of outstanding segments.
> 
> The RTO Restart proposal is accepted as a working group item in the IETF TCP
> Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
> RFC status.
> 
> Signed-off-by: Per Hurtig <per.hurtig@kau.se>
> ---
> include/net/tcp.h          |    1 +
> net/ipv4/sysctl_net_ipv4.c |    7 +++++++
> net/ipv4/tcp_input.c       |   11 +++++++++++
> 3 files changed, 19 insertions(+)
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 56fc366..575e82a 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -278,6 +278,7 @@ extern int sysctl_tcp_slow_start_after_idle;
> extern int sysctl_tcp_thin_linear_timeouts;
> extern int sysctl_tcp_thin_dupack;
> extern int sysctl_tcp_early_retrans;
> +extern int sysctl_tcp_rto_restart;
> extern int sysctl_tcp_limit_output_bytes;
> extern int sysctl_tcp_challenge_ack_limit;
> extern unsigned int sysctl_tcp_notsent_lowat;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 44eba05..c605f4f 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -717,6 +717,13 @@ static struct ctl_table ipv4_table[] = {
> 		.extra2		= &four,
> 	},
> 	{
> +		.procname	= "tcp_rto_restart",
> +		.data		= &sysctl_tcp_rto_restart,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
> 		.procname	= "tcp_min_tso_segs",
> 		.data		= &sysctl_tcp_min_tso_segs,
> 		.maxlen		= sizeof(int),
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 227cba7..450ee30 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -98,6 +98,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
> 
> int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
> int sysctl_tcp_early_retrans __read_mostly = 3;
> +int sysctl_tcp_rto_restart __read_mostly = 1;
> 
> #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
> #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
> @@ -2972,6 +2973,16 @@ void tcp_rearm_rto(struct sock *sk)
> 			 */
> 			if (delta > 0)
> 				rto = delta;
> +		} else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
> +			   sysctl_tcp_rto_restart &&
> +			   sk->sk_send_head == NULL &&
> +			   tp->packets_out < 4) {
> +			struct sk_buff *skb = tcp_write_queue_head(sk);
> +			const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
> +			s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
> +
> +			if (delta > 0)
> +				rto -= delta;
> 		}
> 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
> 					  TCP_RTO_MAX);
> -- 
> 1.7.9.5
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuchung Cheng Feb. 19, 2014, 5:51 p.m. UTC | #4
On Wed, Feb 19, 2014 at 2:50 AM, Per Hurtig <per.hurtig@kau.se> wrote:
> Hi Yuchung,
>
> see inline
>
> On Tue, Feb 18, 2014 at 10:46:18AM -0800, Yuchung Cheng wrote:
>> On Tue, Feb 18, 2014 at 10:12 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>> > This patch implements the RTO restart modification described in
>> > http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02
>> >
>> > RTO Restart's goal is to provide quicker loss recovery for segments lost in the
>> > end of a burst/connection. To accomplish this the algorithm adjusts the RTO
>> > value on each rearm of the retransmission timer to be exactly RTO ms after the
>> > earliest outstanding segment was sent. The offsetting against the earliest
>> > outstanding segment is not done by the regular rearm algorithm, which causes
>> > RTOs to occur, on average, after RTO+RTT ms.
>> >
>> > As a faster timeout is only beneficial in scenarios where fast retransmit/early
>> > retransmit cannot be triggered the algorithm will only kick in when there is a
>> > small amount of outstanding segments.
>> (repost in plaine-text, sorry for the duplication)
>>
>> I am not sure this works well with Linux min-RTO=200ms, and don't feel
>> comfortable this is default on without some real experiments.
>>
>> I've implemented (a nearly identical version of) rto-restart on Google
>> web servers and found #timeouts increased by ~30%. Although the fast
>> timeout helps really short flows, it has a very negative side-effect:
>> resetting cwnd to 1. Thus the next HTTP response may start with a
>> small cwnd. In contrast, TCP loss probe has fast timeout (1.5RTT) to
>> do fast recovery to avoid the side-effect. In other words, I am
>> doubtful we need rto-restart with TCP loss probe, but applying
>> RTO-restart on TLP timer may be useful.
>
> As we have discussed, I agree that RTO restart can also be applied to TLP and
> this is also mentioned in the ietf draft. However, I think they are also
> complementary in reducing loss recovery delay.
>
>>
>> I've voiced this concern multiple times in ietf tcpm discussion when
>> we discuss this draft: that the idea itself is fine, but we'll need to
>> change Linux RTO algorithm together, not just the timer itself.
>>
>> I am happy to post some more detailed data if people are interested.
>
> We would love to see the data. The last time we discussed this, you were not
Sure I can gather some data this week for rto-restart.

> able to find it. From the latest discussions in the tcpm group I understood you
> didn't think it was that big of a problem anymore
rto-restart draft is OK with the context of hypothetical stack
implementing only RFC standards. But Linux is (much) more complex:
min-RTO=200ms, TLP, FACK, etc. Changing how the default timer is armed
needs to be carefully tested in real world before making rto-restart
default. Shortening timer (retry faster) always help short connections
but the long connections may suffer the cwnd reset due to spurious
timeout, *especially* Linux is already running a very aggressive
min-rto.

btw, the rto -= delta may overflow if the socket was locked when the
timer fired previously?

>
>
> Best Regards,
> Per Hurtig
>>
>> >
>> > The RTO Restart proposal is accepted as a working group item in the IETF TCP
>> > Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
>> > RFC status.
>> >
>> > Signed-off-by: Per Hurtig <per.hurtig@kau.se>
>> > ---
>> >  include/net/tcp.h          |    1 +
>> >  net/ipv4/sysctl_net_ipv4.c |    7 +++++++
>> >  net/ipv4/tcp_input.c       |   11 +++++++++++
>> >  3 files changed, 19 insertions(+)
>> >
>> > diff --git a/include/net/tcp.h b/include/net/tcp.h
>> > index 56fc366..575e82a 100644
>> > --- a/include/net/tcp.h
>> > +++ b/include/net/tcp.h
>> > @@ -278,6 +278,7 @@ extern int sysctl_tcp_slow_start_after_idle;
>> >  extern int sysctl_tcp_thin_linear_timeouts;
>> >  extern int sysctl_tcp_thin_dupack;
>> >  extern int sysctl_tcp_early_retrans;
>> > +extern int sysctl_tcp_rto_restart;
>> >  extern int sysctl_tcp_limit_output_bytes;
>> >  extern int sysctl_tcp_challenge_ack_limit;
>> >  extern unsigned int sysctl_tcp_notsent_lowat;
>> > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> > index 44eba05..c605f4f 100644
>> > --- a/net/ipv4/sysctl_net_ipv4.c
>> > +++ b/net/ipv4/sysctl_net_ipv4.c
>> > @@ -717,6 +717,13 @@ static struct ctl_table ipv4_table[] = {
>> >                 .extra2         = &four,
>> >         },
>> >         {
>> > +               .procname       = "tcp_rto_restart",
>> > +               .data           = &sysctl_tcp_rto_restart,
>> > +               .maxlen         = sizeof(int),
>> > +               .mode           = 0644,
>> > +               .proc_handler   = proc_dointvec,
>> > +       },
>> > +       {
>> >                 .procname       = "tcp_min_tso_segs",
>> >                 .data           = &sysctl_tcp_min_tso_segs,
>> >                 .maxlen         = sizeof(int),
>> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> > index 227cba7..450ee30 100644
>> > --- a/net/ipv4/tcp_input.c
>> > +++ b/net/ipv4/tcp_input.c
>> > @@ -98,6 +98,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
>> >
>> >  int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
>> >  int sysctl_tcp_early_retrans __read_mostly = 3;
>> > +int sysctl_tcp_rto_restart __read_mostly = 1;
>> >
>> >  #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
>> >  #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
>> > @@ -2972,6 +2973,16 @@ void tcp_rearm_rto(struct sock *sk)
>> >                          */
>> >                         if (delta > 0)
>> >                                 rto = delta;
>> > +               } else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
>> > +                          sysctl_tcp_rto_restart &&
>> > +                          sk->sk_send_head == NULL &&
>> > +                          tp->packets_out < 4) {
>> > +                       struct sk_buff *skb = tcp_write_queue_head(sk);
>> > +                       const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
>> > +                       s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
>> > +
>> > +                       if (delta > 0)
>> > +                               rto -= delta;
>> >                 }
>> >                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
>> >                                           TCP_RTO_MAX);
>> > --
>> > 1.7.9.5
>> >
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe netdev" in
>> > the body of a message to majordomo@vger.kernel.org
>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Feb. 19, 2014, 6:01 p.m. UTC | #5
On Wed, 2014-02-19 at 18:17 +0100, Andreas Petlund wrote:
> I’ve applied and tested the patch on VMs with both 32 bit and 64 bit kernel.
> 
> Acked-by: Andreas Petlund <apetlund@simula.no>
> Tested-by: Andreas Petlund <apetlund@simula.no>

Sure the patch applies, but what experimental results did you get ?

My concern is that TCP_SKB_CB(skb)->when doesn't account for sojourn
time in host queues (qdisc and device). 

Note : Missing documentation for this new sysctl.

This patch certainly cannot be applied as is.

Thanks !


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bill Fink Feb. 19, 2014, 6:20 p.m. UTC | #6
On Tue, 18 Feb 2014, Per Hurtig wrote:

> This patch implements the RTO restart modification described in
> http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02
> 
> RTO Restart's goal is to provide quicker loss recovery for segments lost in the
> end of a burst/connection. To accomplish this the algorithm adjusts the RTO
> value on each rearm of the retransmission timer to be exactly RTO ms after the
> earliest outstanding segment was sent. The offsetting against the earliest
> outstanding segment is not done by the regular rearm algorithm, which causes
> RTOs to occur, on average, after RTO+RTT ms.
> 
> As a faster timeout is only beneficial in scenarios where fast retransmit/early
> retransmit cannot be triggered the algorithm will only kick in when there is a
> small amount of outstanding segments.
> 
> The RTO Restart proposal is accepted as a working group item in the IETF TCP
> Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
> RFC status.
> 
> Signed-off-by: Per Hurtig <per.hurtig@kau.se>
> ---
>  include/net/tcp.h          |    1 +
>  net/ipv4/sysctl_net_ipv4.c |    7 +++++++
>  net/ipv4/tcp_input.c       |   11 +++++++++++
>  3 files changed, 19 insertions(+)

This should also have a documentation update to ip-sysctl.txt.
A separate patch for the TCP(7) man page would also be useful
if the kernel patch is approved.

					-Bill



> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 56fc366..575e82a 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -278,6 +278,7 @@ extern int sysctl_tcp_slow_start_after_idle;
>  extern int sysctl_tcp_thin_linear_timeouts;
>  extern int sysctl_tcp_thin_dupack;
>  extern int sysctl_tcp_early_retrans;
> +extern int sysctl_tcp_rto_restart;
>  extern int sysctl_tcp_limit_output_bytes;
>  extern int sysctl_tcp_challenge_ack_limit;
>  extern unsigned int sysctl_tcp_notsent_lowat;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 44eba05..c605f4f 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -717,6 +717,13 @@ static struct ctl_table ipv4_table[] = {
>  		.extra2		= &four,
>  	},
>  	{
> +		.procname	= "tcp_rto_restart",
> +		.data		= &sysctl_tcp_rto_restart,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
>  		.procname	= "tcp_min_tso_segs",
>  		.data		= &sysctl_tcp_min_tso_segs,
>  		.maxlen		= sizeof(int),
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 227cba7..450ee30 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -98,6 +98,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
>  
>  int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
>  int sysctl_tcp_early_retrans __read_mostly = 3;
> +int sysctl_tcp_rto_restart __read_mostly = 1;
>  
>  #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
>  #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
> @@ -2972,6 +2973,16 @@ void tcp_rearm_rto(struct sock *sk)
>  			 */
>  			if (delta > 0)
>  				rto = delta;
> +		} else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
> +			   sysctl_tcp_rto_restart &&
> +			   sk->sk_send_head == NULL &&
> +			   tp->packets_out < 4) {
> +			struct sk_buff *skb = tcp_write_queue_head(sk);
> +			const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
> +			s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
> +
> +			if (delta > 0)
> +				rto -= delta;
>  		}
>  		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
>  					  TCP_RTO_MAX);
> -- 
> 1.7.9.5
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Petlund Feb. 19, 2014, 8:09 p.m. UTC | #7
On 19 Feb 2014, at 19:01, Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Wed, 2014-02-19 at 18:17 +0100, Andreas Petlund wrote:
>> I’ve applied and tested the patch on VMs with both 32 bit and 64 bit kernel.
>> 
>> Acked-by: Andreas Petlund <apetlund@simula.no>
>> Tested-by: Andreas Petlund <apetlund@simula.no>
> 
> Sure the patch applies, but what experimental results did you get ?
> 
> My concern is that TCP_SKB_CB(skb)->when doesn't account for sojourn
> time in host queues (qdisc and device). 
> 

Thank’s for the input. I didn’t run any specific tests with that in mind. I’ll look into it.

Best regards,
Andreas

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Per Hurtig Feb. 21, 2014, 10:48 a.m. UTC | #8
On 19 Feb 2014, at 18:51, Yuchung Cheng <ycheng@google.com> wrote:

> On Wed, Feb 19, 2014 at 2:50 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>> Hi Yuchung,
>> 
>> see inline
>> 
>> On Tue, Feb 18, 2014 at 10:46:18AM -0800, Yuchung Cheng wrote:
>>> On Tue, Feb 18, 2014 at 10:12 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>>>> This patch implements the RTO restart modification described in
>>>> http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02
>>>> 
>>>> RTO Restart's goal is to provide quicker loss recovery for segments lost in the
>>>> end of a burst/connection. To accomplish this the algorithm adjusts the RTO
>>>> value on each rearm of the retransmission timer to be exactly RTO ms after the
>>>> earliest outstanding segment was sent. The offsetting against the earliest
>>>> outstanding segment is not done by the regular rearm algorithm, which causes
>>>> RTOs to occur, on average, after RTO+RTT ms.
>>>> 
>>>> As a faster timeout is only beneficial in scenarios where fast retransmit/early
>>>> retransmit cannot be triggered the algorithm will only kick in when there is a
>>>> small amount of outstanding segments.
>>> (repost in plaine-text, sorry for the duplication)
>>> 
>>> I am not sure this works well with Linux min-RTO=200ms, and don't feel
>>> comfortable this is default on without some real experiments.
>>> 
>>> I've implemented (a nearly identical version of) rto-restart on Google
>>> web servers and found #timeouts increased by ~30%.

How did your version work, did it apply the modified restart for any amount of
outstanding segments? If so, I’m not surprised if the #timeouts increased
dramatically.


>>> Although the fast
>>> timeout helps really short flows, it has a very negative side-effect:
>>> resetting cwnd to 1. Thus the next HTTP response may start with a
>>> small cwnd. In contrast, TCP loss probe has fast timeout (1.5RTT) to
>>> do fast recovery to avoid the side-effect. In other words, I am
>>> doubtful we need rto-restart with TCP loss probe, but applying
>>> RTO-restart on TLP timer may be useful.
>> 
>> As we have discussed, I agree that RTO restart can also be applied to TLP and
>> this is also mentioned in the ietf draft. However, I think they are also
>> complementary in reducing loss recovery delay.
>> 
>>> 
>>> I've voiced this concern multiple times in ietf tcpm discussion when
>>> we discuss this draft: that the idea itself is fine, but we'll need to
>>> change Linux RTO algorithm together, not just the timer itself.
>>> 
>>> I am happy to post some more detailed data if people are interested.
>> 
>> We would love to see the data. The last time we discussed this, you were not
> Sure I can gather some data this week for rto-restart.

That would be great! If you will be in London for the IETF perhaps we could also
schedule a time to discuss it?

> 
>> able to find it. From the latest discussions in the tcpm group I understood you
>> didn't think it was that big of a problem anymore
> rto-restart draft is OK with the context of hypothetical stack
> implementing only RFC standards. But Linux is (much) more complex:
> min-RTO=200ms, TLP, FACK, etc. Changing how the default timer is armed
> needs to be carefully tested in real world before making rto-restart
> default. Shortening timer (retry faster) always help short connections
> but the long connections may suffer the cwnd reset due to spurious
> timeout, *especially* Linux is already running a very aggressive
> min-rto.
> 
> btw, the rto -= delta may overflow if the socket was locked when the
> timer fired previously?

Thanks for pointing that out, I think you might be right.


Best Regards,
Per Hurtig
> 
>> 
>> 
>> Best Regards,
>> Per Hurtig
>>> 
>>>> 
>>>> The RTO Restart proposal is accepted as a working group item in the IETF TCP
>>>> Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
>>>> RFC status.
>>>> 
>>>> Signed-off-by: Per Hurtig <per.hurtig@kau.se>
>>>> ---
>>>> include/net/tcp.h          |    1 +
>>>> net/ipv4/sysctl_net_ipv4.c |    7 +++++++
>>>> net/ipv4/tcp_input.c       |   11 +++++++++++
>>>> 3 files changed, 19 insertions(+)
>>>> 
>>>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>>>> index 56fc366..575e82a 100644
>>>> --- a/include/net/tcp.h
>>>> +++ b/include/net/tcp.h
>>>> @@ -278,6 +278,7 @@ extern int sysctl_tcp_slow_start_after_idle;
>>>> extern int sysctl_tcp_thin_linear_timeouts;
>>>> extern int sysctl_tcp_thin_dupack;
>>>> extern int sysctl_tcp_early_retrans;
>>>> +extern int sysctl_tcp_rto_restart;
>>>> extern int sysctl_tcp_limit_output_bytes;
>>>> extern int sysctl_tcp_challenge_ack_limit;
>>>> extern unsigned int sysctl_tcp_notsent_lowat;
>>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>>>> index 44eba05..c605f4f 100644
>>>> --- a/net/ipv4/sysctl_net_ipv4.c
>>>> +++ b/net/ipv4/sysctl_net_ipv4.c
>>>> @@ -717,6 +717,13 @@ static struct ctl_table ipv4_table[] = {
>>>>                .extra2         = &four,
>>>>        },
>>>>        {
>>>> +               .procname       = "tcp_rto_restart",
>>>> +               .data           = &sysctl_tcp_rto_restart,
>>>> +               .maxlen         = sizeof(int),
>>>> +               .mode           = 0644,
>>>> +               .proc_handler   = proc_dointvec,
>>>> +       },
>>>> +       {
>>>>                .procname       = "tcp_min_tso_segs",
>>>>                .data           = &sysctl_tcp_min_tso_segs,
>>>>                .maxlen         = sizeof(int),
>>>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>>>> index 227cba7..450ee30 100644
>>>> --- a/net/ipv4/tcp_input.c
>>>> +++ b/net/ipv4/tcp_input.c
>>>> @@ -98,6 +98,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
>>>> 
>>>> int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
>>>> int sysctl_tcp_early_retrans __read_mostly = 3;
>>>> +int sysctl_tcp_rto_restart __read_mostly = 1;
>>>> 
>>>> #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
>>>> #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
>>>> @@ -2972,6 +2973,16 @@ void tcp_rearm_rto(struct sock *sk)
>>>>                         */
>>>>                        if (delta > 0)
>>>>                                rto = delta;
>>>> +               } else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
>>>> +                          sysctl_tcp_rto_restart &&
>>>> +                          sk->sk_send_head == NULL &&
>>>> +                          tp->packets_out < 4) {
>>>> +                       struct sk_buff *skb = tcp_write_queue_head(sk);
>>>> +                       const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
>>>> +                       s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
>>>> +
>>>> +                       if (delta > 0)
>>>> +                               rto -= delta;
>>>>                }
>>>>                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
>>>>                                          TCP_RTO_MAX);
>>>> --
>>>> 1.7.9.5
>>>> 
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Per Hurtig Feb. 21, 2014, 10:48 a.m. UTC | #9
On 19 Feb 2014, at 19:20, Bill Fink <billfink@mindspring.com> wrote:

> On Tue, 18 Feb 2014, Per Hurtig wrote:
> 
>> This patch implements the RTO restart modification described in
>> http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02
>> 
>> RTO Restart's goal is to provide quicker loss recovery for segments lost in the
>> end of a burst/connection. To accomplish this the algorithm adjusts the RTO
>> value on each rearm of the retransmission timer to be exactly RTO ms after the
>> earliest outstanding segment was sent. The offsetting against the earliest
>> outstanding segment is not done by the regular rearm algorithm, which causes
>> RTOs to occur, on average, after RTO+RTT ms.
>> 
>> As a faster timeout is only beneficial in scenarios where fast retransmit/early
>> retransmit cannot be triggered the algorithm will only kick in when there is a
>> small amount of outstanding segments.
>> 
>> The RTO Restart proposal is accepted as a working group item in the IETF TCP
>> Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
>> RFC status.
>> 
>> Signed-off-by: Per Hurtig <per.hurtig@kau.se>
>> ---
>> include/net/tcp.h          |    1 +
>> net/ipv4/sysctl_net_ipv4.c |    7 +++++++
>> net/ipv4/tcp_input.c       |   11 +++++++++++
>> 3 files changed, 19 insertions(+)
> 
> This should also have a documentation update to ip-sysctl.txt.
> A separate patch for the TCP(7) man page would also be useful
> if the kernel patch is approved.
> 
> 					-Bill
> 

Thanks for pointing that out, we completely forgot it!


Best Regards,
Per Hurtig
> 
> 
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 56fc366..575e82a 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -278,6 +278,7 @@ extern int sysctl_tcp_slow_start_after_idle;
>> extern int sysctl_tcp_thin_linear_timeouts;
>> extern int sysctl_tcp_thin_dupack;
>> extern int sysctl_tcp_early_retrans;
>> +extern int sysctl_tcp_rto_restart;
>> extern int sysctl_tcp_limit_output_bytes;
>> extern int sysctl_tcp_challenge_ack_limit;
>> extern unsigned int sysctl_tcp_notsent_lowat;
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index 44eba05..c605f4f 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -717,6 +717,13 @@ static struct ctl_table ipv4_table[] = {
>> 		.extra2		= &four,
>> 	},
>> 	{
>> +		.procname	= "tcp_rto_restart",
>> +		.data		= &sysctl_tcp_rto_restart,
>> +		.maxlen		= sizeof(int),
>> +		.mode		= 0644,
>> +		.proc_handler	= proc_dointvec,
>> +	},
>> +	{
>> 		.procname	= "tcp_min_tso_segs",
>> 		.data		= &sysctl_tcp_min_tso_segs,
>> 		.maxlen		= sizeof(int),
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index 227cba7..450ee30 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -98,6 +98,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
>> 
>> int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
>> int sysctl_tcp_early_retrans __read_mostly = 3;
>> +int sysctl_tcp_rto_restart __read_mostly = 1;
>> 
>> #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
>> #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
>> @@ -2972,6 +2973,16 @@ void tcp_rearm_rto(struct sock *sk)
>> 			 */
>> 			if (delta > 0)
>> 				rto = delta;
>> +		} else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
>> +			   sysctl_tcp_rto_restart &&
>> +			   sk->sk_send_head == NULL &&
>> +			   tp->packets_out < 4) {
>> +			struct sk_buff *skb = tcp_write_queue_head(sk);
>> +			const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
>> +			s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
>> +
>> +			if (delta > 0)
>> +				rto -= delta;
>> 		}
>> 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
>> 					  TCP_RTO_MAX);
>> -- 
>> 1.7.9.5


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Per Hurtig Feb. 21, 2014, 10:49 a.m. UTC | #10
On 19 Feb 2014, at 19:01, Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Wed, 2014-02-19 at 18:17 +0100, Andreas Petlund wrote:
>> I’ve applied and tested the patch on VMs with both 32 bit and 64 bit kernel.
>> 
>> Acked-by: Andreas Petlund <apetlund@simula.no>
>> Tested-by: Andreas Petlund <apetlund@simula.no>
> 
> Sure the patch applies, but what experimental results did you get ?
> 
> My concern is that TCP_SKB_CB(skb)->when doesn't account for sojourn
> time in host queues (qdisc and device). 
> 

Correct me if I’m wrong, but I was under the impression that the sojourn was
accounted for in the RTT calculations, and therefore rather inappropriate to
be accounted for again? If this assumption is wrong, how would you propose that
we account for the sojourn time?

> Note : Missing documentation for this new sysctl.
> 

Yes, this we completely missed. Will fix.

> This patch certainly cannot be applied as is.
> 
> Thanks !
> 
> 

Thank you,
Per Hurtig


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Feb. 21, 2014, 1:14 p.m. UTC | #11
On Fri, 2014-02-21 at 11:49 +0100, Per Hurtig wrote:
> On 19 Feb 2014, at 19:01, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > On Wed, 2014-02-19 at 18:17 +0100, Andreas Petlund wrote:
> >> I’ve applied and tested the patch on VMs with both 32 bit and 64 bit kernel.
> >> 
> >> Acked-by: Andreas Petlund <apetlund@simula.no>
> >> Tested-by: Andreas Petlund <apetlund@simula.no>
> > 
> > Sure the patch applies, but what experimental results did you get ?
> > 
> > My concern is that TCP_SKB_CB(skb)->when doesn't account for sojourn
> > time in host queues (qdisc and device). 
> > 
> 
> Correct me if I’m wrong, but I was under the impression that the sojourn was
> accounted for in the RTT calculations, and therefore rather inappropriate to
> be accounted for again? If this assumption is wrong, how would you propose that
> we account for the sojourn time?

This is a generic problem. 

Thats why I added LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES snmp counter,
so that we can experiment with the idea of not firing RTO while original
packet is still in the host qdisc.

       struct sk_buff *skb = tcp_write_queue_head(sk);
       const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
       s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);

       if (delta > 0)
            rto -= delta;

Can trivially gives rto < 0, resulting in a timer of 120 seconds
(TCP_RTO_MAX)




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuchung Cheng Feb. 21, 2014, 4:53 p.m. UTC | #12
On Fri, Feb 21, 2014 at 2:48 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>
> On 19 Feb 2014, at 18:51, Yuchung Cheng <ycheng@google.com> wrote:
>
>> On Wed, Feb 19, 2014 at 2:50 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>>> Hi Yuchung,
>>>
>>> see inline
>>>
>>> On Tue, Feb 18, 2014 at 10:46:18AM -0800, Yuchung Cheng wrote:
>>>> On Tue, Feb 18, 2014 at 10:12 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>>>>> This patch implements the RTO restart modification described in
>>>>> http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02
>>>>>
>>>>> RTO Restart's goal is to provide quicker loss recovery for segments lost in the
>>>>> end of a burst/connection. To accomplish this the algorithm adjusts the RTO
>>>>> value on each rearm of the retransmission timer to be exactly RTO ms after the
>>>>> earliest outstanding segment was sent. The offsetting against the earliest
>>>>> outstanding segment is not done by the regular rearm algorithm, which causes
>>>>> RTOs to occur, on average, after RTO+RTT ms.
>>>>>
>>>>> As a faster timeout is only beneficial in scenarios where fast retransmit/early
>>>>> retransmit cannot be triggered the algorithm will only kick in when there is a
>>>>> small amount of outstanding segments.
>>>> (repost in plaine-text, sorry for the duplication)
>>>>
>>>> I am not sure this works well with Linux min-RTO=200ms, and don't feel
>>>> comfortable this is default on without some real experiments.
>>>>
>>>> I've implemented (a nearly identical version of) rto-restart on Google
>>>> web servers and found #timeouts increased by ~30%.
>
> How did your version work, did it apply the modified restart for any amount of
> outstanding segments? If so, I’m not surprised if the #timeouts increased
> dramatically.
>
>
>>>> Although the fast
>>>> timeout helps really short flows, it has a very negative side-effect:
>>>> resetting cwnd to 1. Thus the next HTTP response may start with a
>>>> small cwnd. In contrast, TCP loss probe has fast timeout (1.5RTT) to
>>>> do fast recovery to avoid the side-effect. In other words, I am
>>>> doubtful we need rto-restart with TCP loss probe, but applying
>>>> RTO-restart on TLP timer may be useful.
>>>
>>> As we have discussed, I agree that RTO restart can also be applied to TLP and
>>> this is also mentioned in the ietf draft. However, I think they are also
>>> complementary in reducing loss recovery delay.
>>>
>>>>
>>>> I've voiced this concern multiple times in ietf tcpm discussion when
>>>> we discuss this draft: that the idea itself is fine, but we'll need to
>>>> change Linux RTO algorithm together, not just the timer itself.
>>>>
>>>> I am happy to post some more detailed data if people are interested.
>>>
>>> We would love to see the data. The last time we discussed this, you were not
>> Sure I can gather some data this week for rto-restart.
>
> That would be great! If you will be in London for the IETF perhaps we could also
> schedule a time to discuss it?
Sure but I should have the data by this weekend. I tweaked your patch
about the rto overflow issue.

I think it's worth the effort to evaluate RTO more thoroughly and
preferably use the same re-arming logic for retransmission. In other
words, always offset the time elapsed since last sent when re-arming a
retransmit timer, and relax the RTO calculation formula but not
depending on the inflight. For example, we already use similar logic
when handling ICMP dest-unreach. in tcp_ipv{4|6}.c.

>
>>
>>> able to find it. From the latest discussions in the tcpm group I understood you
>>> didn't think it was that big of a problem anymore
>> rto-restart draft is OK with the context of hypothetical stack
>> implementing only RFC standards. But Linux is (much) more complex:
>> min-RTO=200ms, TLP, FACK, etc. Changing how the default timer is armed
>> needs to be carefully tested in real world before making rto-restart
>> default. Shortening timer (retry faster) always help short connections
>> but the long connections may suffer the cwnd reset due to spurious
>> timeout, *especially* Linux is already running a very aggressive
>> min-rto.
>>
>> btw, the rto -= delta may overflow if the socket was locked when the
>> timer fired previously?
>
> Thanks for pointing that out, I think you might be right.
>
>
> Best Regards,
> Per Hurtig
>>
>>>
>>>
>>> Best Regards,
>>> Per Hurtig
>>>>
>>>>>
>>>>> The RTO Restart proposal is accepted as a working group item in the IETF TCP
>>>>> Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
>>>>> RFC status.
>>>>>
>>>>> Signed-off-by: Per Hurtig <per.hurtig@kau.se>
>>>>> ---
>>>>> include/net/tcp.h          |    1 +
>>>>> net/ipv4/sysctl_net_ipv4.c |    7 +++++++
>>>>> net/ipv4/tcp_input.c       |   11 +++++++++++
>>>>> 3 files changed, 19 insertions(+)
>>>>>
>>>>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>>>>> index 56fc366..575e82a 100644
>>>>> --- a/include/net/tcp.h
>>>>> +++ b/include/net/tcp.h
>>>>> @@ -278,6 +278,7 @@ extern int sysctl_tcp_slow_start_after_idle;
>>>>> extern int sysctl_tcp_thin_linear_timeouts;
>>>>> extern int sysctl_tcp_thin_dupack;
>>>>> extern int sysctl_tcp_early_retrans;
>>>>> +extern int sysctl_tcp_rto_restart;
>>>>> extern int sysctl_tcp_limit_output_bytes;
>>>>> extern int sysctl_tcp_challenge_ack_limit;
>>>>> extern unsigned int sysctl_tcp_notsent_lowat;
>>>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>>>>> index 44eba05..c605f4f 100644
>>>>> --- a/net/ipv4/sysctl_net_ipv4.c
>>>>> +++ b/net/ipv4/sysctl_net_ipv4.c
>>>>> @@ -717,6 +717,13 @@ static struct ctl_table ipv4_table[] = {
>>>>>                .extra2         = &four,
>>>>>        },
>>>>>        {
>>>>> +               .procname       = "tcp_rto_restart",
>>>>> +               .data           = &sysctl_tcp_rto_restart,
>>>>> +               .maxlen         = sizeof(int),
>>>>> +               .mode           = 0644,
>>>>> +               .proc_handler   = proc_dointvec,
>>>>> +       },
>>>>> +       {
>>>>>                .procname       = "tcp_min_tso_segs",
>>>>>                .data           = &sysctl_tcp_min_tso_segs,
>>>>>                .maxlen         = sizeof(int),
>>>>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>>>>> index 227cba7..450ee30 100644
>>>>> --- a/net/ipv4/tcp_input.c
>>>>> +++ b/net/ipv4/tcp_input.c
>>>>> @@ -98,6 +98,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
>>>>>
>>>>> int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
>>>>> int sysctl_tcp_early_retrans __read_mostly = 3;
>>>>> +int sysctl_tcp_rto_restart __read_mostly = 1;
>>>>>
>>>>> #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
>>>>> #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
>>>>> @@ -2972,6 +2973,16 @@ void tcp_rearm_rto(struct sock *sk)
>>>>>                         */
>>>>>                        if (delta > 0)
>>>>>                                rto = delta;
>>>>> +               } else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
>>>>> +                          sysctl_tcp_rto_restart &&
>>>>> +                          sk->sk_send_head == NULL &&
>>>>> +                          tp->packets_out < 4) {
>>>>> +                       struct sk_buff *skb = tcp_write_queue_head(sk);
>>>>> +                       const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
>>>>> +                       s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
>>>>> +
>>>>> +                       if (delta > 0)
>>>>> +                               rto -= delta;
>>>>>                }
>>>>>                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
>>>>>                                          TCP_RTO_MAX);
>>>>> --
>>>>> 1.7.9.5
>>>>>
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuchung Cheng Feb. 25, 2014, 12:03 a.m. UTC | #13
On Fri, Feb 21, 2014 at 8:53 AM, Yuchung Cheng <ycheng@google.com> wrote:
> On Fri, Feb 21, 2014 at 2:48 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>>
>> On 19 Feb 2014, at 18:51, Yuchung Cheng <ycheng@google.com> wrote:
>>
>>> On Wed, Feb 19, 2014 at 2:50 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>>>> Hi Yuchung,
>>>>
>>>> see inline
>>>>
>>>> On Tue, Feb 18, 2014 at 10:46:18AM -0800, Yuchung Cheng wrote:
>>>>> On Tue, Feb 18, 2014 at 10:12 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>>>>>> This patch implements the RTO restart modification described in
>>>>>> http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02
>>>>>>
>>>>>> RTO Restart's goal is to provide quicker loss recovery for segments lost in the
>>>>>> end of a burst/connection. To accomplish this the algorithm adjusts the RTO
>>>>>> value on each rearm of the retransmission timer to be exactly RTO ms after the
>>>>>> earliest outstanding segment was sent. The offsetting against the earliest
>>>>>> outstanding segment is not done by the regular rearm algorithm, which causes
>>>>>> RTOs to occur, on average, after RTO+RTT ms.
>>>>>>
>>>>>> As a faster timeout is only beneficial in scenarios where fast retransmit/early
>>>>>> retransmit cannot be triggered the algorithm will only kick in when there is a
>>>>>> small amount of outstanding segments.
>>>>> (repost in plaine-text, sorry for the duplication)
>>>>>
>>>>> I am not sure this works well with Linux min-RTO=200ms, and don't feel
>>>>> comfortable this is default on without some real experiments.
>>>>>
>>>>> I've implemented (a nearly identical version of) rto-restart on Google
>>>>> web servers and found #timeouts increased by ~30%.
>>
>> How did your version work, did it apply the modified restart for any amount of
>> outstanding segments? If so, I’m not surprised if the #timeouts increased
>> dramatically.
>>
>>
>>>>> Although the fast
>>>>> timeout helps really short flows, it has a very negative side-effect:
>>>>> resetting cwnd to 1. Thus the next HTTP response may start with a
>>>>> small cwnd. In contrast, TCP loss probe has fast timeout (1.5RTT) to
>>>>> do fast recovery to avoid the side-effect. In other words, I am
>>>>> doubtful we need rto-restart with TCP loss probe, but applying
>>>>> RTO-restart on TLP timer may be useful.
>>>>
>>>> As we have discussed, I agree that RTO restart can also be applied to TLP and
>>>> this is also mentioned in the ietf draft. However, I think they are also
>>>> complementary in reducing loss recovery delay.
>>>>
>>>>>
>>>>> I've voiced this concern multiple times in ietf tcpm discussion when
>>>>> we discuss this draft: that the idea itself is fine, but we'll need to
>>>>> change Linux RTO algorithm together, not just the timer itself.
>>>>>
>>>>> I am happy to post some more detailed data if people are interested.
>>>>
>>>> We would love to see the data. The last time we discussed this, you were not
>>> Sure I can gather some data this week for rto-restart.
>>
>> That would be great! If you will be in London for the IETF perhaps we could also
>> schedule a time to discuss it?
> Sure but I should have the data by this weekend. I tweaked your patch
> about the rto overflow issue.
>
> I think it's worth the effort to evaluate RTO more thoroughly and
> preferably use the same re-arming logic for retransmission. In other
> words, always offset the time elapsed since last sent when re-arming a
> retransmit timer, and relax the RTO calculation formula but not
> depending on the inflight. For example, we already use similar logic
> when handling ICMP dest-unreach. in tcp_ipv{4|6}.c.

I tested your patch on two co-located Google servers in North America.
I use all the default upstream TCP features (sack, FACK, tlp, frto,
IW10, etc) but vary the rto-restart knob. The latency didn't really
change. If anything it's slightly worse but that's within the 1%
noise. The retransmission rate are nearly identical. The breakdown of
various retranmission stats are the same too and SpuriousRtxHostQueues
are negligible. Each experiment has x+-1% millions of samples over a
course of 4 days.

*HTTP response latency.
baseline: 673ms, rto-restart:678ms, retrans-rate: 3.6%

HTTP response latency of size <= 3 MSS packets
baseline: 166ms, rto-restart: 171ms, retrans-rate: 3.0%

HTTP response latency is the interval from the first byte of the
response is sent out to the last byte is acked.

So it does not hurt nor does it really help in my test.

>
>>
>>>
>>>> able to find it. From the latest discussions in the tcpm group I understood you
>>>> didn't think it was that big of a problem anymore
>>> rto-restart draft is OK with the context of hypothetical stack
>>> implementing only RFC standards. But Linux is (much) more complex:
>>> min-RTO=200ms, TLP, FACK, etc. Changing how the default timer is armed
>>> needs to be carefully tested in real world before making rto-restart
>>> default. Shortening timer (retry faster) always help short connections
>>> but the long connections may suffer the cwnd reset due to spurious
>>> timeout, *especially* Linux is already running a very aggressive
>>> min-rto.
>>>
>>> btw, the rto -= delta may overflow if the socket was locked when the
>>> timer fired previously?
>>
>> Thanks for pointing that out, I think you might be right.
>>
>>
>> Best Regards,
>> Per Hurtig
>>>
>>>>
>>>>
>>>> Best Regards,
>>>> Per Hurtig
>>>>>
>>>>>>
>>>>>> The RTO Restart proposal is accepted as a working group item in the IETF TCP
>>>>>> Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
>>>>>> RFC status.
>>>>>>
>>>>>> Signed-off-by: Per Hurtig <per.hurtig@kau.se>
>>>>>> ---
>>>>>> include/net/tcp.h          |    1 +
>>>>>> net/ipv4/sysctl_net_ipv4.c |    7 +++++++
>>>>>> net/ipv4/tcp_input.c       |   11 +++++++++++
>>>>>> 3 files changed, 19 insertions(+)
>>>>>>
>>>>>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>>>>>> index 56fc366..575e82a 100644
>>>>>> --- a/include/net/tcp.h
>>>>>> +++ b/include/net/tcp.h
>>>>>> @@ -278,6 +278,7 @@ extern int sysctl_tcp_slow_start_after_idle;
>>>>>> extern int sysctl_tcp_thin_linear_timeouts;
>>>>>> extern int sysctl_tcp_thin_dupack;
>>>>>> extern int sysctl_tcp_early_retrans;
>>>>>> +extern int sysctl_tcp_rto_restart;
>>>>>> extern int sysctl_tcp_limit_output_bytes;
>>>>>> extern int sysctl_tcp_challenge_ack_limit;
>>>>>> extern unsigned int sysctl_tcp_notsent_lowat;
>>>>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>>>>>> index 44eba05..c605f4f 100644
>>>>>> --- a/net/ipv4/sysctl_net_ipv4.c
>>>>>> +++ b/net/ipv4/sysctl_net_ipv4.c
>>>>>> @@ -717,6 +717,13 @@ static struct ctl_table ipv4_table[] = {
>>>>>>                .extra2         = &four,
>>>>>>        },
>>>>>>        {
>>>>>> +               .procname       = "tcp_rto_restart",
>>>>>> +               .data           = &sysctl_tcp_rto_restart,
>>>>>> +               .maxlen         = sizeof(int),
>>>>>> +               .mode           = 0644,
>>>>>> +               .proc_handler   = proc_dointvec,
>>>>>> +       },
>>>>>> +       {
>>>>>>                .procname       = "tcp_min_tso_segs",
>>>>>>                .data           = &sysctl_tcp_min_tso_segs,
>>>>>>                .maxlen         = sizeof(int),
>>>>>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>>>>>> index 227cba7..450ee30 100644
>>>>>> --- a/net/ipv4/tcp_input.c
>>>>>> +++ b/net/ipv4/tcp_input.c
>>>>>> @@ -98,6 +98,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
>>>>>>
>>>>>> int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
>>>>>> int sysctl_tcp_early_retrans __read_mostly = 3;
>>>>>> +int sysctl_tcp_rto_restart __read_mostly = 1;
>>>>>>
>>>>>> #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
>>>>>> #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
>>>>>> @@ -2972,6 +2973,16 @@ void tcp_rearm_rto(struct sock *sk)
>>>>>>                         */
>>>>>>                        if (delta > 0)
>>>>>>                                rto = delta;
>>>>>> +               } else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
>>>>>> +                          sysctl_tcp_rto_restart &&
>>>>>> +                          sk->sk_send_head == NULL &&
>>>>>> +                          tp->packets_out < 4) {
>>>>>> +                       struct sk_buff *skb = tcp_write_queue_head(sk);
>>>>>> +                       const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
>>>>>> +                       s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
>>>>>> +
>>>>>> +                       if (delta > 0)
>>>>>> +                               rto -= delta;
>>>>>>                }
>>>>>>                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
>>>>>>                                          TCP_RTO_MAX);
>>>>>> --
>>>>>> 1.7.9.5
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>
>>
>>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Per Hurtig Feb. 26, 2014, 9:50 a.m. UTC | #14
On 25 Feb 2014, at 01:03, Yuchung Cheng <ycheng@google.com> wrote:

> On Fri, Feb 21, 2014 at 8:53 AM, Yuchung Cheng <ycheng@google.com> wrote:
>> On Fri, Feb 21, 2014 at 2:48 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>>> 
>>> On 19 Feb 2014, at 18:51, Yuchung Cheng <ycheng@google.com> wrote:
>>> 
>>>> On Wed, Feb 19, 2014 at 2:50 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>>>>> Hi Yuchung,
>>>>> 
>>>>> see inline
>>>>> 
>>>>> On Tue, Feb 18, 2014 at 10:46:18AM -0800, Yuchung Cheng wrote:
>>>>>> On Tue, Feb 18, 2014 at 10:12 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>>>>>>> This patch implements the RTO restart modification described in
>>>>>>> http://tools.ietf.org/html/draft-ietf-tcpm-rtorestart-02
>>>>>>> 
>>>>>>> RTO Restart's goal is to provide quicker loss recovery for segments lost in the
>>>>>>> end of a burst/connection. To accomplish this the algorithm adjusts the RTO
>>>>>>> value on each rearm of the retransmission timer to be exactly RTO ms after the
>>>>>>> earliest outstanding segment was sent. The offsetting against the earliest
>>>>>>> outstanding segment is not done by the regular rearm algorithm, which causes
>>>>>>> RTOs to occur, on average, after RTO+RTT ms.
>>>>>>> 
>>>>>>> As a faster timeout is only beneficial in scenarios where fast retransmit/early
>>>>>>> retransmit cannot be triggered the algorithm will only kick in when there is a
>>>>>>> small amount of outstanding segments.
>>>>>> (repost in plaine-text, sorry for the duplication)
>>>>>> 
>>>>>> I am not sure this works well with Linux min-RTO=200ms, and don't feel
>>>>>> comfortable this is default on without some real experiments.
>>>>>> 
>>>>>> I've implemented (a nearly identical version of) rto-restart on Google
>>>>>> web servers and found #timeouts increased by ~30%.
>>> 
>>> How did your version work, did it apply the modified restart for any amount of
>>> outstanding segments? If so, I’m not surprised if the #timeouts increased
>>> dramatically.
>>> 
>>> 
>>>>>> Although the fast
>>>>>> timeout helps really short flows, it has a very negative side-effect:
>>>>>> resetting cwnd to 1. Thus the next HTTP response may start with a
>>>>>> small cwnd. In contrast, TCP loss probe has fast timeout (1.5RTT) to
>>>>>> do fast recovery to avoid the side-effect. In other words, I am
>>>>>> doubtful we need rto-restart with TCP loss probe, but applying
>>>>>> RTO-restart on TLP timer may be useful.
>>>>> 
>>>>> As we have discussed, I agree that RTO restart can also be applied to TLP and
>>>>> this is also mentioned in the ietf draft. However, I think they are also
>>>>> complementary in reducing loss recovery delay.
>>>>> 
>>>>>> 
>>>>>> I've voiced this concern multiple times in ietf tcpm discussion when
>>>>>> we discuss this draft: that the idea itself is fine, but we'll need to
>>>>>> change Linux RTO algorithm together, not just the timer itself.
>>>>>> 
>>>>>> I am happy to post some more detailed data if people are interested.
>>>>> 
>>>>> We would love to see the data. The last time we discussed this, you were not
>>>> Sure I can gather some data this week for rto-restart.
>>> 
>>> That would be great! If you will be in London for the IETF perhaps we could also
>>> schedule a time to discuss it?
>> Sure but I should have the data by this weekend. I tweaked your patch
>> about the rto overflow issue.
>> 
>> I think it's worth the effort to evaluate RTO more thoroughly and
>> preferably use the same re-arming logic for retransmission. In other
>> words, always offset the time elapsed since last sent when re-arming a
>> retransmit timer, and relax the RTO calculation formula but not
>> depending on the inflight. For example, we already use similar logic
>> when handling ICMP dest-unreach. in tcp_ipv{4|6}.c.
> 
> I tested your patch on two co-located Google servers in North America.
> I use all the default upstream TCP features (sack, FACK, tlp, frto,
> IW10, etc) but vary the rto-restart knob. The latency didn't really
> change. If anything it's slightly worse but that's within the 1%
> noise. The retransmission rate are nearly identical. The breakdown of
> various retranmission stats are the same too and SpuriousRtxHostQueues
> are negligible. Each experiment has x+-1% millions of samples over a
> course of 4 days.
> 
> *HTTP response latency.
> baseline: 673ms, rto-restart:678ms, retrans-rate: 3.6%
> 
> HTTP response latency of size <= 3 MSS packets
> baseline: 166ms, rto-restart: 171ms, retrans-rate: 3.0%
> 
> HTTP response latency is the interval from the first byte of the
> response is sent out to the last byte is acked.
> 
> So it does not hurt nor does it really help in my test.

Thanks for sharing the data,

Do you know, or is it possible to determine, how the flows that saw
loss were affected by the modified restart, in terms of latency? From
measurements we’re currently conducting its rather common that the average
performance benefit for all flows, including those without loss, is low.
However, for flows that do experience loss, there can be significant improvements.

Also, as your setup includes TLP we only have the RTO restart over TLP gain to start
with.

Cheers,
Per
> 
>> 
>>> 
>>>> 
>>>>> able to find it. From the latest discussions in the tcpm group I understood you
>>>>> didn't think it was that big of a problem anymore
>>>> rto-restart draft is OK with the context of hypothetical stack
>>>> implementing only RFC standards. But Linux is (much) more complex:
>>>> min-RTO=200ms, TLP, FACK, etc. Changing how the default timer is armed
>>>> needs to be carefully tested in real world before making rto-restart
>>>> default. Shortening timer (retry faster) always help short connections
>>>> but the long connections may suffer the cwnd reset due to spurious
>>>> timeout, *especially* Linux is already running a very aggressive
>>>> min-rto.
>>>> 
>>>> btw, the rto -= delta may overflow if the socket was locked when the
>>>> timer fired previously?
>>> 
>>> Thanks for pointing that out, I think you might be right.
>>> 
>>> 
>>> Best Regards,
>>> Per Hurtig
>>>> 
>>>>> 
>>>>> 
>>>>> Best Regards,
>>>>> Per Hurtig
>>>>>> 
>>>>>>> 
>>>>>>> The RTO Restart proposal is accepted as a working group item in the IETF TCP
>>>>>>> Maintenance and Minor Extensions (tcpm) TCP wg and is intended for experimental
>>>>>>> RFC status.
>>>>>>> 
>>>>>>> Signed-off-by: Per Hurtig <per.hurtig@kau.se>
>>>>>>> ---
>>>>>>> include/net/tcp.h          |    1 +
>>>>>>> net/ipv4/sysctl_net_ipv4.c |    7 +++++++
>>>>>>> net/ipv4/tcp_input.c       |   11 +++++++++++
>>>>>>> 3 files changed, 19 insertions(+)
>>>>>>> 
>>>>>>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>>>>>>> index 56fc366..575e82a 100644
>>>>>>> --- a/include/net/tcp.h
>>>>>>> +++ b/include/net/tcp.h
>>>>>>> @@ -278,6 +278,7 @@ extern int sysctl_tcp_slow_start_after_idle;
>>>>>>> extern int sysctl_tcp_thin_linear_timeouts;
>>>>>>> extern int sysctl_tcp_thin_dupack;
>>>>>>> extern int sysctl_tcp_early_retrans;
>>>>>>> +extern int sysctl_tcp_rto_restart;
>>>>>>> extern int sysctl_tcp_limit_output_bytes;
>>>>>>> extern int sysctl_tcp_challenge_ack_limit;
>>>>>>> extern unsigned int sysctl_tcp_notsent_lowat;
>>>>>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>>>>>>> index 44eba05..c605f4f 100644
>>>>>>> --- a/net/ipv4/sysctl_net_ipv4.c
>>>>>>> +++ b/net/ipv4/sysctl_net_ipv4.c
>>>>>>> @@ -717,6 +717,13 @@ static struct ctl_table ipv4_table[] = {
>>>>>>>               .extra2         = &four,
>>>>>>>       },
>>>>>>>       {
>>>>>>> +               .procname       = "tcp_rto_restart",
>>>>>>> +               .data           = &sysctl_tcp_rto_restart,
>>>>>>> +               .maxlen         = sizeof(int),
>>>>>>> +               .mode           = 0644,
>>>>>>> +               .proc_handler   = proc_dointvec,
>>>>>>> +       },
>>>>>>> +       {
>>>>>>>               .procname       = "tcp_min_tso_segs",
>>>>>>>               .data           = &sysctl_tcp_min_tso_segs,
>>>>>>>               .maxlen         = sizeof(int),
>>>>>>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>>>>>>> index 227cba7..450ee30 100644
>>>>>>> --- a/net/ipv4/tcp_input.c
>>>>>>> +++ b/net/ipv4/tcp_input.c
>>>>>>> @@ -98,6 +98,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
>>>>>>> 
>>>>>>> int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
>>>>>>> int sysctl_tcp_early_retrans __read_mostly = 3;
>>>>>>> +int sysctl_tcp_rto_restart __read_mostly = 1;
>>>>>>> 
>>>>>>> #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
>>>>>>> #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
>>>>>>> @@ -2972,6 +2973,16 @@ void tcp_rearm_rto(struct sock *sk)
>>>>>>>                        */
>>>>>>>                       if (delta > 0)
>>>>>>>                               rto = delta;
>>>>>>> +               } else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
>>>>>>> +                          sysctl_tcp_rto_restart &&
>>>>>>> +                          sk->sk_send_head == NULL &&
>>>>>>> +                          tp->packets_out < 4) {
>>>>>>> +                       struct sk_buff *skb = tcp_write_queue_head(sk);
>>>>>>> +                       const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
>>>>>>> +                       s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
>>>>>>> +
>>>>>>> +                       if (delta > 0)
>>>>>>> +                               rto -= delta;
>>>>>>>               }
>>>>>>>               inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
>>>>>>>                                         TCP_RTO_MAX);
>>>>>>> --
>>>>>>> 1.7.9.5
>>>>>>> 
>>>>>>> --
>>>>>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>> 
>>> 
>>> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neal Cardwell Feb. 26, 2014, 2:57 p.m. UTC | #15
On Wed, Feb 26, 2014 at 4:50 AM, Per Hurtig <per.hurtig@kau.se> wrote:
> Also, as your setup includes TLP we only have the RTO restart over TLP gain to start
> with.

My gut says the next thing to try is applying the "RTO restart" idea
to the scheduling of the TLP probe (scheduling the TLP probe for
write_queue_head_skb->when + tlp_timeout). Doing an RTO earlier can be
a very bad thing, due to reseting the cwnd to 1. But doing a TLP
earlier does not have much downside, and could have significant
latency savings (though maybe Nandita already tried this at some
point)...

neal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nandita Dukkipati Feb. 26, 2014, 7:52 p.m. UTC | #16
On Wed, Feb 26, 2014 at 6:57 AM, Neal Cardwell <ncardwell@google.com> wrote:
> On Wed, Feb 26, 2014 at 4:50 AM, Per Hurtig <per.hurtig@kau.se> wrote:
>> Also, as your setup includes TLP we only have the RTO restart over TLP gain to start
>> with.
>
> My gut says the next thing to try is applying the "RTO restart" idea
> to the scheduling of the TLP probe (scheduling the TLP probe for
> write_queue_head_skb->when + tlp_timeout). Doing an RTO earlier can be
> a very bad thing, due to reseting the cwnd to 1. But doing a TLP
> earlier does not have much downside, and could have significant
> latency savings (though maybe Nandita already tried this at some
> point)...

+1 for applying RTO restart idea to scheduling a TLP. The thinking
with TLP at that time, was to send a probe approximately 1.5RTT after
the last "event" (which could be the transmit of a data packet or
receipt of an ACK). I did not try scheduling at
write_queue_head_skb->when + tlp_timeout, but I think it's worth an
experiment (may increase spurious TLPs by some amount which may be
OK).

Nandita
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 56fc366..575e82a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -278,6 +278,7 @@  extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
+extern int sysctl_tcp_rto_restart;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern unsigned int sysctl_tcp_notsent_lowat;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 44eba05..c605f4f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -717,6 +717,13 @@  static struct ctl_table ipv4_table[] = {
 		.extra2		= &four,
 	},
 	{
+		.procname	= "tcp_rto_restart",
+		.data		= &sysctl_tcp_rto_restart,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "tcp_min_tso_segs",
 		.data		= &sysctl_tcp_min_tso_segs,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 227cba7..450ee30 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -98,6 +98,7 @@  int sysctl_tcp_thin_dupack __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
+int sysctl_tcp_rto_restart __read_mostly = 1;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
@@ -2972,6 +2973,16 @@  void tcp_rearm_rto(struct sock *sk)
 			 */
 			if (delta > 0)
 				rto = delta;
+		} else if (icsk->icsk_pending == ICSK_TIME_RETRANS &&
+			   sysctl_tcp_rto_restart &&
+			   sk->sk_send_head == NULL &&
+			   tp->packets_out < 4) {
+			struct sk_buff *skb = tcp_write_queue_head(sk);
+			const u32 rto_time_stamp = TCP_SKB_CB(skb)->when;
+			s32 delta = (s32)(tcp_time_stamp - rto_time_stamp);
+
+			if (delta > 0)
+				rto -= delta;
 		}
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
 					  TCP_RTO_MAX);