diff mbox series

[net-next,v2] tcp: Improve setsockopt() TCP_USER_TIMEOUT accuracy

Message ID 20180704000608.17360-1-jmaxwell37@gmail.com
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series [net-next,v2] tcp: Improve setsockopt() TCP_USER_TIMEOUT accuracy | expand

Commit Message

Jon Maxwell July 4, 2018, 12:06 a.m. UTC
v2 contains the following suggestions by Neal Cardwell:

1) Fix up units mismatch regarding msec/jiffies.
2) Address possiblility of time_remaining being negative.
3) Add a helper routine tcp_clamp_rto_to_user_timeout() to do the rto 
calculation.
4) Move start_ts logic into helper routine tcp_retrans_stamp() to 
validate tcp_sk(sk)->retrans_stamp.

Every time the TCP retransmission timer fires. It checks to see if there is a 
timeout before scheduling the next retransmit timer. The retransmit interval 
between each retransmission increases exponentially. The issue is that in order 
for the timeout to occur the retransmit timer needs to fire again. If the user 
timeout check happens after the 9th retransmit for example. It needs to wait for 
the 10th retransmit timer to fire in order to evaluate whether a timeout has 
occurred or not. If the interval is large enough then the timeout will be 
inaccurate.

For example with a TCP_USER_TIMEOUT of 10 seconds without patch:

1st retransmit:

22:25:18.973488 IP host1.49310 > host2.search-agent: Flags [.]

Last retransmit:

22:25:26.205499 IP host1.49310 > host2.search-agent: Flags [.]

Timeout:

send: Connection timed out
Sun Jul  1 22:25:34 EDT 2018

We can see that last retransmit took ~7 seconds. Which pushed the total 
timeout to ~15 seconds instead of the expected 10 seconds. This gets more 
inaccurate the larger the TCP_USER_TIMEOUT value. As the interval increases.

Add tcp_clamp_rto_to_user_timeout() to determine if the user rto has expired.
Or whether the rto interval needs to be recalculated. Use the original interval
if user rto is not set. 

Test results with the patch is the expected 10 second timeout:

1st retransmit:

01:37:59.022555 IP host1.49310 > host2.search-agent: Flags [.]

Last retransmit:

01:38:06.486558 IP host1.49310 > host2.search-agent: Flags [.]

Timeout:

send: Connection timed out
Mon Jul  2 01:38:09 EDT 2018

Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
---
 net/ipv4/tcp_timer.c | 48 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 9 deletions(-)

Comments

Neal Cardwell July 4, 2018, 2:13 p.m. UTC | #1
On Tue, Jul 3, 2018 at 8:06 PM Jon Maxwell <jmaxwell37@gmail.com> wrote:
>
> Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
> ---
>  net/ipv4/tcp_timer.c | 48 +++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 39 insertions(+), 9 deletions(-)
>
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index 3b3611729928..d129e670d02a 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -22,6 +22,39 @@
>  #include <linux/gfp.h>
>  #include <net/tcp.h>
>
> +unsigned int tcp_retransmit_stamp(struct sock *sk)
> +{
> +       unsigned int start_ts = tcp_sk(sk)->retrans_stamp;

Since retrans_stamp and tcp_skb_timestamp() are both u32, I'd suggest
using u32 for the local variable start_ts and the return type of the
function.

> +
> +       if (unlikely(!start_ts)) {
> +               struct sk_buff *head = tcp_rtx_queue_head(sk);
> +
> +               if (!head)
> +                       return false;

Looks like a copy-and-paste holdover: since this function is returning
an integer, I would suggest returning 0 rather than false.

> +               start_ts = tcp_skb_timestamp(head);
> +       }
> +       return start_ts;
> +}
> +
> +static __u32 tcp_clamp_rto_to_user_timeout(struct sock *sk)
> +{
> +       struct inet_connection_sock *icsk = inet_csk(sk);
> +       __u32 rto = icsk->icsk_rto;
> +       __u32 elapsed, user_timeout;
> +       unsigned int start_ts;

I'd suggest u32 here for start_ts (per the rationale above).

> +
> +       start_ts = tcp_retransmit_stamp(sk);
> +       if (!icsk->icsk_user_timeout || !start_ts)
> +               return rto;
> +       elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
> +       user_timeout = jiffies_to_msecs(icsk->icsk_user_timeout);
> +       if (elapsed >= user_timeout)
> +               rto = 1;  /* user timeout has passed; fire ASAP */
> +       else
> +               rto = min(rto, (__u32)msecs_to_jiffies(user_timeout - elapsed));

My sense is that min_t would be preferred here, e.g:

  rto = min_t(__u32, rto, msecs_to_jiffies(user_timeout - elapsed));

> +       return rto;
> +}
> +
>  /**
>   *  tcp_write_err() - close socket and save error info
>   *  @sk:  The socket the error has appeared on.
> @@ -166,14 +199,9 @@ static bool retransmits_timed_out(struct sock *sk,
>         if (!inet_csk(sk)->icsk_retransmits)
>                 return false;
>
> -       start_ts = tcp_sk(sk)->retrans_stamp;
> -       if (unlikely(!start_ts)) {
> -               struct sk_buff *head = tcp_rtx_queue_head(sk);
> -
> -               if (!head)
> -                       return false;
> -               start_ts = tcp_skb_timestamp(head);
> -       }
> +       start_ts = tcp_retransmit_stamp(sk);
> +       if (!start_ts)
> +               return false;
>
>         if (likely(timeout == 0)) {
>                 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
> @@ -407,6 +435,7 @@ void tcp_retransmit_timer(struct sock *sk)
>         struct tcp_sock *tp = tcp_sk(sk);
>         struct net *net = sock_net(sk);
>         struct inet_connection_sock *icsk = inet_csk(sk);
> +       __u32 rto;
>
>         if (tp->fastopen_rsk) {
>                 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
> @@ -535,7 +564,8 @@ void tcp_retransmit_timer(struct sock *sk)
>                 /* Use normal (exponential) backoff */
>                 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
>         }
> -       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
> +       rto = tcp_clamp_rto_to_user_timeout(sk);
> +       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX);
>         if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
>                 __sk_dst_reset(sk);


Thanks!
neal
David Laight July 4, 2018, 2:57 p.m. UTC | #2
From: Neal Cardwell
> Sent: 04 July 2018 15:14
> 
> On Tue, Jul 3, 2018 at 8:06 PM Jon Maxwell <jmaxwell37@gmail.com> wrote:
> >
> > Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
> > ---
> >  net/ipv4/tcp_timer.c | 48 +++++++++++++++++++++++++++++++++++++++---------
> >  1 file changed, 39 insertions(+), 9 deletions(-)
> >
> > diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> > index 3b3611729928..d129e670d02a 100644
> > --- a/net/ipv4/tcp_timer.c
> > +++ b/net/ipv4/tcp_timer.c
...
> > +static __u32 tcp_clamp_rto_to_user_timeout(struct sock *sk)
> > +{
> > +       struct inet_connection_sock *icsk = inet_csk(sk);
> > +       __u32 rto = icsk->icsk_rto;

Why cache rto past all the function calls until it is needed?

> > +       __u32 elapsed, user_timeout;
> > +       unsigned int start_ts;
> 
> I'd suggest u32 here for start_ts (per the rationale above).
> 
> > +
> > +       start_ts = tcp_retransmit_stamp(sk);
> > +       if (!icsk->icsk_user_timeout || !start_ts)
> > +               return rto;
> > +       elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
> > +       user_timeout = jiffies_to_msecs(icsk->icsk_user_timeout);
> > +       if (elapsed >= user_timeout)
> > +               rto = 1;  /* user timeout has passed; fire ASAP */
> > +       else
> > +               rto = min(rto, (__u32)msecs_to_jiffies(user_timeout - elapsed));
> 
> My sense is that min_t would be preferred here, e.g:
> 
>   rto = min_t(__u32, rto, msecs_to_jiffies(user_timeout - elapsed));

Think I'd just write the conditional...

> > +       return rto;
> > +}

Looks to me like it doesn't correctly allow for the conversions
between millisecond and jiffies either.
I suspect the msecs_to_jiffies() call can return zero (unless it rounds up).
Coding with a signed 'time until user timeout' might make it simpler.

	David
Jon Maxwell July 4, 2018, 11:34 p.m. UTC | #3
Let's wait for Eric to review. Then I'll put together the next version.

Regards

Jon

On Thu, Jul 5, 2018 at 12:57 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Neal Cardwell
>> Sent: 04 July 2018 15:14
>>
>> On Tue, Jul 3, 2018 at 8:06 PM Jon Maxwell <jmaxwell37@gmail.com> wrote:
>> >
>> > Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
>> > ---
>> >  net/ipv4/tcp_timer.c | 48 +++++++++++++++++++++++++++++++++++++++---------
>> >  1 file changed, 39 insertions(+), 9 deletions(-)
>> >
>> > diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>> > index 3b3611729928..d129e670d02a 100644
>> > --- a/net/ipv4/tcp_timer.c
>> > +++ b/net/ipv4/tcp_timer.c
> ...
>> > +static __u32 tcp_clamp_rto_to_user_timeout(struct sock *sk)
>> > +{
>> > +       struct inet_connection_sock *icsk = inet_csk(sk);
>> > +       __u32 rto = icsk->icsk_rto;
>
> Why cache rto past all the function calls until it is needed?
>
>> > +       __u32 elapsed, user_timeout;
>> > +       unsigned int start_ts;
>>
>> I'd suggest u32 here for start_ts (per the rationale above).
>>
>> > +
>> > +       start_ts = tcp_retransmit_stamp(sk);
>> > +       if (!icsk->icsk_user_timeout || !start_ts)
>> > +               return rto;
>> > +       elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
>> > +       user_timeout = jiffies_to_msecs(icsk->icsk_user_timeout);
>> > +       if (elapsed >= user_timeout)
>> > +               rto = 1;  /* user timeout has passed; fire ASAP */
>> > +       else
>> > +               rto = min(rto, (__u32)msecs_to_jiffies(user_timeout - elapsed));
>>
>> My sense is that min_t would be preferred here, e.g:
>>
>>   rto = min_t(__u32, rto, msecs_to_jiffies(user_timeout - elapsed));
>
> Think I'd just write the conditional...
>
>> > +       return rto;
>> > +}
>
> Looks to me like it doesn't correctly allow for the conversions
> between millisecond and jiffies either.
> I suspect the msecs_to_jiffies() call can return zero (unless it rounds up).
> Coding with a signed 'time until user timeout' might make it simpler.
>
>         David
>
>
Eric Dumazet July 10, 2018, 5:03 a.m. UTC | #4
On 07/04/2018 04:34 PM, Jonathan Maxwell wrote:
> Let's wait for Eric to review. Then I'll put together the next version.

Sorry for the delay (I was travelling last week) , please respin a v3, thanks !
diff mbox series

Patch

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3b3611729928..d129e670d02a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,39 @@ 
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
+unsigned int tcp_retransmit_stamp(struct sock *sk)
+{
+	unsigned int start_ts = tcp_sk(sk)->retrans_stamp;
+
+	if (unlikely(!start_ts)) {
+		struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+		if (!head)
+			return false;
+		start_ts = tcp_skb_timestamp(head);
+	}
+	return start_ts;
+}
+
+static __u32 tcp_clamp_rto_to_user_timeout(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	__u32 rto = icsk->icsk_rto;
+	__u32 elapsed, user_timeout;
+	unsigned int start_ts;
+
+	start_ts = tcp_retransmit_stamp(sk);
+	if (!icsk->icsk_user_timeout || !start_ts)
+		return rto;
+	elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
+	user_timeout = jiffies_to_msecs(icsk->icsk_user_timeout);
+	if (elapsed >= user_timeout)
+		rto = 1;  /* user timeout has passed; fire ASAP */
+	else
+		rto = min(rto, (__u32)msecs_to_jiffies(user_timeout - elapsed));
+	return rto;
+}
+
 /**
  *  tcp_write_err() - close socket and save error info
  *  @sk:  The socket the error has appeared on.
@@ -166,14 +199,9 @@  static bool retransmits_timed_out(struct sock *sk,
 	if (!inet_csk(sk)->icsk_retransmits)
 		return false;
 
-	start_ts = tcp_sk(sk)->retrans_stamp;
-	if (unlikely(!start_ts)) {
-		struct sk_buff *head = tcp_rtx_queue_head(sk);
-
-		if (!head)
-			return false;
-		start_ts = tcp_skb_timestamp(head);
-	}
+	start_ts = tcp_retransmit_stamp(sk);
+	if (!start_ts)
+		return false;
 
 	if (likely(timeout == 0)) {
 		linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -407,6 +435,7 @@  void tcp_retransmit_timer(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	__u32 rto;
 
 	if (tp->fastopen_rsk) {
 		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
@@ -535,7 +564,8 @@  void tcp_retransmit_timer(struct sock *sk)
 		/* Use normal (exponential) backoff */
 		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
 	}
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+	rto = tcp_clamp_rto_to_user_timeout(sk);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX);
 	if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
 		__sk_dst_reset(sk);