diff mbox series

[v2,bpf-next,5/7] bpf: sysctl for probe_on_drop

Message ID 20190404001250.140554-6-brakmo@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: Propagate cn to TCP | expand

Commit Message

Lawrence Brakmo April 4, 2019, 12:12 a.m. UTC
When a packet is dropped when calling queue_xmit in __tcp_transmit_skb
and packets_out is 0, it is beneficial to set a small probe timer.
Otherwise, the throughput for the flow can suffer because it may need to
depend on the probe timer to start sending again. The default value for
the probe timer is at least 200ms, this patch sets it to 20ms when a
packet is dropped and there are no other packets in flight.

This patch introduces a new sysctl, sysctl_tcp_probe_on_drop_ms, that is
used to specify the duration of the probe timer for the case described
earlier. The allowed values are between 0 and TCP_RTO_MIN. A value of 0
disables setting the probe timer with a small value.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_output.c      | 18 +++++++++++++++---
 4 files changed, 27 insertions(+), 3 deletions(-)

Comments

Neal Cardwell April 8, 2019, 4:16 p.m. UTC | #1
On Wed, Apr 3, 2019 at 8:13 PM brakmo <brakmo@fb.com> wrote:
>
> When a packet is dropped when calling queue_xmit in  __tcp_transmit_skb
> and packets_out is 0, it is beneficial to set a small probe timer.
> Otherwise, the throughput for the flow can suffer because it may need to
> depend on the probe timer to start sending again. The default value for
> the probe timer is at least 200ms, this patch sets it to 20ms when a
> packet is dropped and there are no other packets in flight.
>
> This patch introduces a new sysctl, sysctl_tcp_probe_on_drop_ms, that is
> used to specify the duration of the probe timer for the case described
> earlier. The allowed values are between 0 and TCP_RTO_MIN. A value of 0
> disables setting the probe timer with a small value.
>
> Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
> ---
>  include/net/netns/ipv4.h   |  1 +
>  net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++
>  net/ipv4/tcp_ipv4.c        |  1 +
>  net/ipv4/tcp_output.c      | 18 +++++++++++++++---
>  4 files changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index 7698460a3dd1..26c52d294dea 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -166,6 +166,7 @@ struct netns_ipv4 {
>         int sysctl_tcp_wmem[3];
>         int sysctl_tcp_rmem[3];
>         int sysctl_tcp_comp_sack_nr;
> +       int sysctl_tcp_probe_on_drop_ms;
>         unsigned long sysctl_tcp_comp_sack_delay_ns;
>         struct inet_timewait_death_row tcp_death_row;
>         int sysctl_max_syn_backlog;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 2316c08e9591..b1e4420b6d6c 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -49,6 +49,7 @@ static int ip_ping_group_range_min[] = { 0, 0 };
>  static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
>  static int comp_sack_nr_max = 255;
>  static u32 u32_max_div_HZ = UINT_MAX / HZ;
> +static int probe_on_drop_max = TCP_RTO_MIN;
>
>  /* obsolete */
>  static int sysctl_tcp_low_latency __read_mostly;
> @@ -1228,6 +1229,15 @@ static struct ctl_table ipv4_net_table[] = {
>                 .extra1         = &zero,
>                 .extra2         = &comp_sack_nr_max,
>         },
> +       {
> +               .procname       = "tcp_probe_on_drop_ms",
> +               .data           = &init_net.ipv4.sysctl_tcp_probe_on_drop_ms,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = proc_dointvec_minmax,

The tcp_reset_xmit_timer() function expects a time in jiffies, so
this would probably need to be proc_dointvec_ms_jiffies?

> +               .extra1         = &zero,
> +               .extra2         = &probe_on_drop_max,
> +       },
>         {
>                 .procname       = "udp_rmem_min",
>                 .data           = &init_net.ipv4.sysctl_udp_rmem_min,
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 3979939804b7..3df6735f0f07 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2686,6 +2686,7 @@ static int __net_init tcp_sk_init(struct net *net)
>         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
>         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
>         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
> +       net->ipv4.sysctl_tcp_probe_on_drop_ms = 20;

The tcp_reset_xmit_timer () function expects a time in jiffies, so
this would probably need to be msecs_to_jiffies(20)?

>
>         /* Reno is always built in */
>         if (!net_eq(net, &init_net) &&
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index e265d1aeeb66..f101e4d7c1ae 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -1154,9 +1154,21 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
>
>         err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
>
> -       if (unlikely(err > 0)) {
> -               tcp_enter_cwr(sk);
> -               err = net_xmit_eval(err);
> +       if (unlikely(err)) {
> +               if (unlikely(err > 0)) {
> +                       tcp_enter_cwr(sk);
> +                       err = net_xmit_eval(err);
> +               }
> +               /* Packet was dropped. If there are no packets out,
> +                * we may need to depend on probe timer to start sending
> +                * again. Hence, use a smaller value.
> +                */
> +               if (!tp->packets_out && !inet_csk(sk)->icsk_pending &&
> +                   sock_net(sk)->ipv4.sysctl_tcp_probe_on_drop_ms > 0) {
> +                       tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
> +                                            sock_net(sk)->ipv4.sysctl_tcp_probe_on_drop_ms,
> +                                            TCP_RTO_MAX, NULL);
> +               }

My reading of the code is that any timer that was set here would
typically be quickly overridden by the tcp_check_probe_timer() that
usually happens after most tcp_write_xmit() calls. Am I reading that
correctly?

There seems to be a philosophical difference between this patch and
the existing logic in tcp_check_probe_timer()/tcp_probe0_base():

/* Something is really bad, we could not queue an additional packet,
 * because qdisc is full or receiver sent a 0 window, or we are paced.
 * We do not want to add fuel to the fire, or abort too early,
 * so make sure the timer we arm now is at least 200ms in the future,
 * regardless of current icsk_rto value (as it could be ~2ms)
 */
static inline unsigned long tcp_probe0_base(const struct sock *sk)
{
  return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
}
...
static inline void tcp_check_probe_timer(struct sock *sk)
{
        if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
                tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
                                     tcp_probe0_base(sk), TCP_RTO_MAX,
                                     NULL);
}

If we apply some version of this patch then we should probably at
least update this comment above tcp_probe0_base() to clarify the new
strategy.

cheers,
neal
Eric Dumazet April 8, 2019, 5:06 p.m. UTC | #2
On 04/08/2019 09:16 AM, Neal Cardwell wrote:
> On Wed, Apr 3, 2019 at 8:13 PM brakmo <brakmo@fb.com> wrote:
>>
>> When a packet is dropped when calling queue_xmit in  __tcp_transmit_skb
>> and packets_out is 0, it is beneficial to set a small probe timer.
>> Otherwise, the throughput for the flow can suffer because it may need to
>> depend on the probe timer to start sending again. The default value for
>> the probe timer is at least 200ms, this patch sets it to 20ms when a
>> packet is dropped and there are no other packets in flight.
>>
>> This patch introduces a new sysctl, sysctl_tcp_probe_on_drop_ms, that is
>> used to specify the duration of the probe timer for the case described
>> earlier. The allowed values are between 0 and TCP_RTO_MIN. A value of 0
>> disables setting the probe timer with a small value.

This seems to contradict our recent work ?

See recent Yuchung patch series :

c1d5674f8313b9f8e683c265f1c00a2582cf5fc5 tcp: less aggressive window probing on local congestion
590d2026d62418bb27de9ca87526e9131c1f48af tcp: retry more conservatively on local congestion
9721e709fa68ef9b860c322b474cfbd1f8285b0f tcp: simplify window probe aborting on USER_TIMEOUT
01a523b071618abbc634d1958229fe3bd2dfa5fa tcp: create a helper to model exponential backoff
c7d13c8faa74f4e8ef191f88a252cefab6805b38 tcp: properly track retry time on passive Fast Open
7ae189759cc48cf8b54beebff566e9fd2d4e7d7c tcp: always set retrans_stamp on recovery
7f12422c4873e9b274bc151ea59cb0cdf9415cf1 tcp: always timestamp on every skb transmission
Yuchung Cheng April 8, 2019, 5:38 p.m. UTC | #3
On Mon, Apr 8, 2019 at 10:07 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 04/08/2019 09:16 AM, Neal Cardwell wrote:
> > On Wed, Apr 3, 2019 at 8:13 PM brakmo <brakmo@fb.com> wrote:
> >>
> >> When a packet is dropped when calling queue_xmit in  __tcp_transmit_skb
> >> and packets_out is 0, it is beneficial to set a small probe timer.
> >> Otherwise, the throughput for the flow can suffer because it may need to
> >> depend on the probe timer to start sending again. The default value for
> >> the probe timer is at least 200ms, this patch sets it to 20ms when a
> >> packet is dropped and there are no other packets in flight.
> >>
> >> This patch introduces a new sysctl, sysctl_tcp_probe_on_drop_ms, that is
> >> used to specify the duration of the probe timer for the case described
> >> earlier. The allowed values are between 0 and TCP_RTO_MIN. A value of 0
> >> disables setting the probe timer with a small value.
>
> This seems to contradict our recent work ?
>
> See recent Yuchung patch series :
>
> c1d5674f8313b9f8e683c265f1c00a2582cf5fc5 tcp: less aggressive window probing on local congestion
> 590d2026d62418bb27de9ca87526e9131c1f48af tcp: retry more conservatively on local congestion

I would appreciate it if a direct change to the TCP stack started with a
tcp: subject instead of the bpf one, which is confusing for TCP developers.

A packet being dropped at the local layer is a sign of severe congestion
-- it's caused by application bursting on many (idle or new)
connections. With this patch, the (many) connections that fail on the
first try (including SYN and pure ACKs) will all come back at 20ms,
instead of the RTTs-adjusted RTOs. So the end effect is the
application repetitively pounding the local qdisc through to squeeze
out some performance.

This patch seems to apply for a special case where local congestion
only lives for a very short period. I don't think it applies well as a
general principle for congestion control.







> 9721e709fa68ef9b860c322b474cfbd1f8285b0f tcp: simplify window probe aborting on USER_TIMEOUT
> 01a523b071618abbc634d1958229fe3bd2dfa5fa tcp: create a helper to model exponential backoff
> c7d13c8faa74f4e8ef191f88a252cefab6805b38 tcp: properly track retry time on passive Fast Open
> 7ae189759cc48cf8b54beebff566e9fd2d4e7d7c tcp: always set retrans_stamp on recovery
> 7f12422c4873e9b274bc151ea59cb0cdf9415cf1 tcp: always timestamp on every skb transmission
>
Lawrence Brakmo May 4, 2019, 2:33 a.m. UTC | #4
Sorry for the delay, due to a combination of changing teams and getting pneumonia. Answers inline, but note that I will be sending a patchset for review that does not include these changes to the probe timer.

On 4/8/19, 9:17 AM, "Neal Cardwell" <ncardwell@google.com> wrote:

    On Wed, Apr 3, 2019 at 8:13 PM brakmo <brakmo@fb.com> wrote:
    >
    > When a packet is dropped when calling queue_xmit in  __tcp_transmit_skb
    > and packets_out is 0, it is beneficial to set a small probe timer.
    > Otherwise, the throughput for the flow can suffer because it may need to
    > depend on the probe timer to start sending again. The default value for
    > the probe timer is at least 200ms, this patch sets it to 20ms when a
    > packet is dropped and there are no other packets in flight.
    >
    > This patch introduces a new sysctl, sysctl_tcp_probe_on_drop_ms, that is
    > used to specify the duration of the probe timer for the case described
    > earlier. The allowed values are between 0 and TCP_RTO_MIN. A value of 0
    > disables setting the probe timer with a small value.
    >
    > Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
    > ---
    >  include/net/netns/ipv4.h   |  1 +
    >  net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++
    >  net/ipv4/tcp_ipv4.c        |  1 +
    >  net/ipv4/tcp_output.c      | 18 +++++++++++++++---
    >  4 files changed, 27 insertions(+), 3 deletions(-)
    >
    > diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
    > index 7698460a3dd1..26c52d294dea 100644
    > --- a/include/net/netns/ipv4.h
    > +++ b/include/net/netns/ipv4.h
    > @@ -166,6 +166,7 @@ struct netns_ipv4 {
    >         int sysctl_tcp_wmem[3];
    >         int sysctl_tcp_rmem[3];
    >         int sysctl_tcp_comp_sack_nr;
    > +       int sysctl_tcp_probe_on_drop_ms;
    >         unsigned long sysctl_tcp_comp_sack_delay_ns;
    >         struct inet_timewait_death_row tcp_death_row;
    >         int sysctl_max_syn_backlog;
    > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
    > index 2316c08e9591..b1e4420b6d6c 100644
    > --- a/net/ipv4/sysctl_net_ipv4.c
    > +++ b/net/ipv4/sysctl_net_ipv4.c
    > @@ -49,6 +49,7 @@ static int ip_ping_group_range_min[] = { 0, 0 };
    >  static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
    >  static int comp_sack_nr_max = 255;
    >  static u32 u32_max_div_HZ = UINT_MAX / HZ;
    > +static int probe_on_drop_max = TCP_RTO_MIN;
    >
    >  /* obsolete */
    >  static int sysctl_tcp_low_latency __read_mostly;
    > @@ -1228,6 +1229,15 @@ static struct ctl_table ipv4_net_table[] = {
    >                 .extra1         = &zero,
    >                 .extra2         = &comp_sack_nr_max,
    >         },
    > +       {
    > +               .procname       = "tcp_probe_on_drop_ms",
    > +               .data           = &init_net.ipv4.sysctl_tcp_probe_on_drop_ms,
    > +               .maxlen         = sizeof(int),
    > +               .mode           = 0644,
    > +               .proc_handler   = proc_dointvec_minmax,
    
    The tcp_reset_xmit_timer () function expects a time in jiffies, so
    this would probably need to be proc_dointvec_ms_jiffies?

Yes, good catch.
    
    > +               .extra1         = &zero,
    > +               .extra2         = &probe_on_drop_max,
    > +       },
    >         {
    >                 .procname       = "udp_rmem_min",
    >                 .data           = &init_net.ipv4.sysctl_udp_rmem_min,
    > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
    > index 3979939804b7..3df6735f0f07 100644
    > --- a/net/ipv4/tcp_ipv4.c
    > +++ b/net/ipv4/tcp_ipv4.c
    > @@ -2686,6 +2686,7 @@ static int __net_init tcp_sk_init(struct net *net)
    >         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
    >         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
    >         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
    > +       net->ipv4.sysctl_tcp_probe_on_drop_ms = 20;
    
    The tcp_reset_xmit_timer () function expects a time in jiffies, so
    this would probably need to be msecs_to_jiffies(20)?

Correct, thanks.
    
    >
    >         /* Reno is always built in */
    >         if (!net_eq(net, &init_net) &&
    > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
    > index e265d1aeeb66..f101e4d7c1ae 100644
    > --- a/net/ipv4/tcp_output.c
    > +++ b/net/ipv4/tcp_output.c
    > @@ -1154,9 +1154,21 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
    >
    >         err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
    >
    > -       if (unlikely(err > 0)) {
    > -               tcp_enter_cwr(sk);
    > -               err = net_xmit_eval(err);
    > +       if (unlikely(err)) {
    > +               if (unlikely(err > 0)) {
    > +                       tcp_enter_cwr(sk);
    > +                       err = net_xmit_eval(err);
    > +               }
    > +               /* Packet was dropped. If there are no packets out,
    > +                * we may need to depend on probe timer to start sending
    > +                * again. Hence, use a smaller value.
    > +                */
    > +               if (!tp->packets_out && !inet_csk(sk)->icsk_pending &&
    > +                   sock_net(sk)->ipv4.sysctl_tcp_probe_on_drop_ms > 0) {
    > +                       tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
    > +                                            sock_net(sk)->ipv4.sysctl_tcp_probe_on_drop_ms,
    > +                                            TCP_RTO_MAX, NULL);
    > +               }
    
    My reading of the code is that any timer that was set here would
    typically be quickly overridden by the tcp_check_probe_timer() that
    usually happens after most tcp_write_xmit() calls. Am I reading that
    correctly?

No. tcp_check_probe_timer() first checks if there is a pending timer and
will not overwrite it.
    
    There seems to be a philosophical difference between this patch and
    the existing logic in tcp_check_probe_timer()/tcp_probe0_base():
    
    /* Something is really bad, we could not queue an additional packet,
     * because qdisc is full or receiver sent a 0 window, or we are paced.
     * We do not want to add fuel to the fire, or abort too early,
     * so make sure the timer we arm now is at least 200ms in the future,
     * regardless of current icsk_rto value (as it could be ~2ms)
     */
    static inline unsigned long tcp_probe0_base(const struct sock *sk)
    {
      return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
    }
    ...
    static inline void tcp_check_probe_timer(struct sock *sk)
    {
            if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
                    tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
                                         tcp_probe0_base(sk), TCP_RTO_MAX,
                                         NULL);
    }
    
    If we apply some version of this patch then we should probably at
    least update this comment above tcp_probe0_base() to clarify the new
    strategy.
    
Thanks for pointing this out. In the use case we consider packets can be
dropped as a result of applying a rate limit, so in this case drops are not a
sign that things are really bad (and in my measurements do not occur very
often). In retrospect I should have set the default sysctl value to 0 so the
behavior would not change unless one desires it by setting the sysctl to a
non-zero value.

    cheers,
    neal
Lawrence Brakmo May 4, 2019, 2:35 a.m. UTC | #5
On 4/8/19, 10:07 AM, "Eric Dumazet" <eric.dumazet@gmail.com> wrote:

    
    
    On 04/08/2019 09:16 AM, Neal Cardwell wrote:
    > On Wed, Apr 3, 2019 at 8:13 PM brakmo <brakmo@fb.com> wrote:
    >>
    >> When a packet is dropped when calling queue_xmit in  __tcp_transmit_skb
    >> and packets_out is 0, it is beneficial to set a small probe timer.
    >> Otherwise, the throughput for the flow can suffer because it may need to
    >> depend on the probe timer to start sending again. The default value for
    >> the probe timer is at least 200ms, this patch sets it to 20ms when a
    >> packet is dropped and there are no other packets in flight.
    >>
    >> This patch introduces a new sysctl, sysctl_tcp_probe_on_drop_ms, that is
    >> used to specify the duration of the probe timer for the case described
    >> earlier. The allowed values are between 0 and TCP_RTO_MIN. A value of 0
    >> disables setting the probe timer with a small value.
    
    This seems to contradict our recent work ?
    
    See recent Yuchung patch series :
    
    c1d5674f8313b9f8e683c265f1c00a2582cf5fc5 tcp: less aggressive window probing on local congestion
    590d2026d62418bb27de9ca87526e9131c1f48af tcp: retry more conservatively on local congestion
    9721e709fa68ef9b860c322b474cfbd1f8285b0f tcp: simplify window probe aborting on USER_TIMEOUT
    01a523b071618abbc634d1958229fe3bd2dfa5fa tcp: create a helper to model exponential backoff
    c7d13c8faa74f4e8ef191f88a252cefab6805b38 tcp: properly track retry time on passive Fast Open
    7ae189759cc48cf8b54beebff566e9fd2d4e7d7c tcp: always set retrans_stamp on recovery
    7f12422c4873e9b274bc151ea59cb0cdf9415cf1 tcp: always timestamp on every skb transmission
    
Thank you for pointing this out. I'm taking this patch out of the patchset and will reconsider it.
Lawrence Brakmo May 4, 2019, 2:38 a.m. UTC | #6
On 4/8/19, 10:39 AM, "Yuchung Cheng" <ycheng@google.com> wrote:

    On Mon, Apr 8, 2019 at 10:07 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
    >
    >
    >
    > On 04/08/2019 09:16 AM, Neal Cardwell wrote:
    > > On Wed, Apr 3, 2019 at 8:13 PM brakmo <brakmo@fb.com> wrote:
    > >>
    > >> When a packet is dropped when calling queue_xmit in  __tcp_transmit_skb
    > >> and packets_out is 0, it is beneficial to set a small probe timer.
    > >> Otherwise, the throughput for the flow can suffer because it may need to
    > >> depend on the probe timer to start sending again. The default value for
    > >> the probe timer is at least 200ms, this patch sets it to 20ms when a
    > >> packet is dropped and there are no other packets in flight.
    > >>
    > >> This patch introduces a new sysctl, sysctl_tcp_probe_on_drop_ms, that is
    > >> used to specify the duration of the probe timer for the case described
    > >> earlier. The allowed values are between 0 and TCP_RTO_MIN. A value of 0
    > >> disables setting the probe timer with a small value.
    >
    > This seems to contradict our recent work ?
    >
    > See recent Yuchung patch series :
    >
    > c1d5674f8313b9f8e683c265f1c00a2582cf5fc5 tcp: less aggressive window probing on local congestion
    > 590d2026d62418bb27de9ca87526e9131c1f48af tcp: retry more conservatively on local congestion
    
    I would appreciate a direct change to TCP stack starts with tcp:
    subject instead of the confusing bpf for TCP developers.

You are right, thank you for pointing this out.
    
    A packet being dropped at the local layer is a sign of severe congestion
    -- it's caused by application bursting on many (idle or new)
    connections. With this patch, the (many) connections that fail on the
    first try (including SYN and pure ACKs) will all come back at 20ms,
    instead of the RTTs-adjusted RTOs. So the end effect is the
    application repetitively pounding the local qdisc through to squeeze
    out some performance.
    
    This patch seems to apply for a special case where local congestion
    only lives for a very short period. I don't think it applies well as a
    general principle for congestion control.
    
I am taking this out of the patchset and will revisit it later.
    
    
    > 9721e709fa68ef9b860c322b474cfbd1f8285b0f tcp: simplify window probe aborting on USER_TIMEOUT
    > 01a523b071618abbc634d1958229fe3bd2dfa5fa tcp: create a helper to model exponential backoff
    > c7d13c8faa74f4e8ef191f88a252cefab6805b38 tcp: properly track retry time on passive Fast Open
    > 7ae189759cc48cf8b54beebff566e9fd2d4e7d7c tcp: always set retrans_stamp on recovery
    > 7f12422c4873e9b274bc151ea59cb0cdf9415cf1 tcp: always timestamp on every skb transmission
    >
diff mbox series

Patch

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 7698460a3dd1..26c52d294dea 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -166,6 +166,7 @@  struct netns_ipv4 {
 	int sysctl_tcp_wmem[3];
 	int sysctl_tcp_rmem[3];
 	int sysctl_tcp_comp_sack_nr;
+	int sysctl_tcp_probe_on_drop_ms;
 	unsigned long sysctl_tcp_comp_sack_delay_ns;
 	struct inet_timewait_death_row tcp_death_row;
 	int sysctl_max_syn_backlog;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 2316c08e9591..b1e4420b6d6c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -49,6 +49,7 @@  static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 static int comp_sack_nr_max = 255;
 static u32 u32_max_div_HZ = UINT_MAX / HZ;
+static int probe_on_drop_max = TCP_RTO_MIN;
 
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
@@ -1228,6 +1229,15 @@  static struct ctl_table ipv4_net_table[] = {
 		.extra1		= &zero,
 		.extra2		= &comp_sack_nr_max,
 	},
+	{
+		.procname	= "tcp_probe_on_drop_ms",
+		.data		= &init_net.ipv4.sysctl_tcp_probe_on_drop_ms,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &probe_on_drop_max,
+	},
 	{
 		.procname	= "udp_rmem_min",
 		.data		= &init_net.ipv4.sysctl_udp_rmem_min,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3979939804b7..3df6735f0f07 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2686,6 +2686,7 @@  static int __net_init tcp_sk_init(struct net *net)
 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+	net->ipv4.sysctl_tcp_probe_on_drop_ms = 20;
 
 	/* Reno is always built in */
 	if (!net_eq(net, &init_net) &&
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e265d1aeeb66..f101e4d7c1ae 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1154,9 +1154,21 @@  static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 
 	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
 
-	if (unlikely(err > 0)) {
-		tcp_enter_cwr(sk);
-		err = net_xmit_eval(err);
+	if (unlikely(err)) {
+		if (unlikely(err > 0)) {
+			tcp_enter_cwr(sk);
+			err = net_xmit_eval(err);
+		}
+		/* Packet was dropped. If there are no packets out,
+		 * we may need to depend on probe timer to start sending
+		 * again. Hence, use a smaller value.
+		 */
+		if (!tp->packets_out && !inet_csk(sk)->icsk_pending &&
+		    sock_net(sk)->ipv4.sysctl_tcp_probe_on_drop_ms > 0) {
+			tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+					     sock_net(sk)->ipv4.sysctl_tcp_probe_on_drop_ms,
+					     TCP_RTO_MAX, NULL);
+		}
 	}
 	if (!err && oskb) {
 		tcp_update_skb_after_send(sk, oskb, prior_wstamp);