[net-next,3/3] tcp: decouple TLP timer from RTO timer

Message ID: 20191022231051.30770-4-xiyou.wangcong@gmail.com
State: Deferred
Delegated to: David Miller
Series: tcp: decouple TLP timer from RTO timer

Commit Message

Cong Wang Oct. 22, 2019, 11:10 p.m. UTC
Currently RTO, TLP and PROBE0 all share the same timer instance
in the kernel and use icsk->icsk_pending to dispatch the work.
This causes spinlock contention when the timer is reset too
frequently, as clearly shown in the perf report:

   61.72%    61.71%  swapper          [kernel.kallsyms]                        [k] queued_spin_lock_slowpath
   ...
    - 58.83% tcp_v4_rcv
      - 58.80% tcp_v4_do_rcv
         - 58.80% tcp_rcv_established
            - 52.88% __tcp_push_pending_frames
               - 52.88% tcp_write_xmit
                  - 28.16% tcp_event_new_data_sent
                     - 28.15% sk_reset_timer
                        + mod_timer
                  - 24.68% tcp_schedule_loss_probe
                     - 24.68% sk_reset_timer
                        + 24.68% mod_timer
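
For reference, the pre-patch arming path in inet_csk_reset_xmit_timer()
looks roughly like this (simplified; see the full patch below): RTO,
TLP and PROBE0 all funnel into the single icsk_retransmit_timer, so
every rearm takes the same timer-base lock via mod_timer():

	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
	    what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) {
		/* all three events share icsk_retransmit_timer */
		icsk->icsk_pending = what;
		icsk->icsk_timeout = jiffies + when;
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
			       icsk->icsk_timeout);
	}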

This patch decouples the TLP timer from the RTO timer by adding a
new timer instance, but still uses icsk->icsk_pending for dispatch
in order to minimize the risk of the change.

After this patch, the CPU time spent in tcp_write_xmit() is reduced
to 10.92%.

Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 include/net/inet_connection_sock.h |  9 +++++--
 include/net/tcp.h                  |  1 +
 net/dccp/timer.c                   |  2 +-
 net/ipv4/inet_connection_sock.c    |  5 +++-
 net/ipv4/inet_diag.c               |  8 ++++--
 net/ipv4/tcp_input.c               |  8 ++++--
 net/ipv4/tcp_ipv4.c                |  6 +++--
 net/ipv4/tcp_output.c              |  1 +
 net/ipv4/tcp_timer.c               | 43 +++++++++++++++++++++++++++---
 net/ipv6/tcp_ipv6.c                |  6 +++--
 10 files changed, 73 insertions(+), 16 deletions(-)

Comments

Eric Dumazet Oct. 22, 2019, 11:24 p.m. UTC | #1
On Tue, Oct 22, 2019 at 4:11 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> Currently RTO, TLP and PROBE0 all share the same timer instance
> in the kernel and use icsk->icsk_pending to dispatch the work.
> This causes spinlock contention when the timer is reset too
> frequently, as clearly shown in the perf report:
>
>    61.72%    61.71%  swapper          [kernel.kallsyms]                        [k] queued_spin_lock_slowpath
>    ...
>     - 58.83% tcp_v4_rcv
>       - 58.80% tcp_v4_do_rcv
>          - 58.80% tcp_rcv_established
>             - 52.88% __tcp_push_pending_frames
>                - 52.88% tcp_write_xmit
>                   - 28.16% tcp_event_new_data_sent
>                      - 28.15% sk_reset_timer
>                         + mod_timer
>                   - 24.68% tcp_schedule_loss_probe
>                      - 24.68% sk_reset_timer
>                         + 24.68% mod_timer
>
> This patch decouples the TLP timer from the RTO timer by adding a
> new timer instance, but still uses icsk->icsk_pending for dispatch
> in order to minimize the risk of the change.
>
> After this patch, the CPU time spent in tcp_write_xmit() is reduced
> to 10.92%.

What is the exact benchmark you are running?

We never saw any contention like that, so let's make sure you are not
working around another issue.
Cong Wang Oct. 23, 2019, 1:10 a.m. UTC | #2
On Tue, Oct 22, 2019 at 4:24 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Tue, Oct 22, 2019 at 4:11 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > Currently RTO, TLP and PROBE0 all share the same timer instance
> > in the kernel and use icsk->icsk_pending to dispatch the work.
> > This causes spinlock contention when the timer is reset too
> > frequently, as clearly shown in the perf report:
> >
> >    61.72%    61.71%  swapper          [kernel.kallsyms]                        [k] queued_spin_lock_slowpath
> >    ...
> >     - 58.83% tcp_v4_rcv
> >       - 58.80% tcp_v4_do_rcv
> >          - 58.80% tcp_rcv_established
> >             - 52.88% __tcp_push_pending_frames
> >                - 52.88% tcp_write_xmit
> >                   - 28.16% tcp_event_new_data_sent
> >                      - 28.15% sk_reset_timer
> >                         + mod_timer
> >                   - 24.68% tcp_schedule_loss_probe
> >                      - 24.68% sk_reset_timer
> >                         + 24.68% mod_timer
> >
> > This patch decouples the TLP timer from the RTO timer by adding a
> > new timer instance, but still uses icsk->icsk_pending for dispatch
> > in order to minimize the risk of the change.
> >
> > After this patch, the CPU time spent in tcp_write_xmit() is reduced
> > to 10.92%.
>
> What is the exact benchmark you are running?
>
> We never saw any contention like that, so let's make sure you are not
> working around another issue.

I simply ran 256 parallel netperf instances on 128 CPUs to trigger
this spinlock contention; it is 100% reproducible here.
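
Roughly, the load looks like this (a minimal sketch; the exact options
I used may differ, and XXX is the receiver as elsewhere in this thread):

$ for i in $(seq 256); do netperf -H XXX -t TCP_STREAM -l 20 & done; wait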

A single netperf TCP_RR could _also_ confirm the improvement:

Before patch:

$ netperf -H XXX -t TCP_RR -l 20
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0
AF_INET to XXX () port 0 AF_INET : first burst 0
Local /Remote
Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate
bytes  Bytes  bytes    bytes   secs.    per sec

655360 873800 1        1       20.00    17665.59
655360 873800


After patch:

$ netperf -H XXX -t TCP_RR -l 20
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0
AF_INET to XXX () port 0 AF_INET : first burst 0
Local /Remote
Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate
bytes  Bytes  bytes    bytes   secs.    per sec

655360 873800 1        1       20.00    18829.31
655360 873800

(I have run it multiple times; this is a median result.)

The difference can also be observed by turning TLP on/off on an unpatched kernel.
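
TLP is toggled via the tcp_early_retrans sysctl:

  echo 0 >/proc/sys/net/ipv4/tcp_early_retrans    (TLP off)
  echo 3 >/proc/sys/net/ipv4/tcp_early_retrans    (back to the default, TLP on)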

Thanks.
Eric Dumazet Oct. 23, 2019, 2:15 a.m. UTC | #3
On Tue, Oct 22, 2019 at 6:10 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Tue, Oct 22, 2019 at 4:24 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Tue, Oct 22, 2019 at 4:11 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > Currently RTO, TLP and PROBE0 all share a same timer instance
> > > in kernel and use icsk->icsk_pending to dispatch the work.
> > > This causes spinlock contention when resetting the timer is
> > > too frequent, as clearly shown in the perf report:
> > >
> > >    61.72%    61.71%  swapper          [kernel.kallsyms]                        [k] queued_spin_lock_slowpath
> > >    ...
> > >     - 58.83% tcp_v4_rcv
> > >       - 58.80% tcp_v4_do_rcv
> > >          - 58.80% tcp_rcv_established
> > >             - 52.88% __tcp_push_pending_frames
> > >                - 52.88% tcp_write_xmit
> > >                   - 28.16% tcp_event_new_data_sent
> > >                      - 28.15% sk_reset_timer
> > >                         + mod_timer
> > >                   - 24.68% tcp_schedule_loss_probe
> > >                      - 24.68% sk_reset_timer
> > >                         + 24.68% mod_timer
> > >
> > > This patch decouples TLP timer from RTO timer by adding a new
> > > timer instance but still uses icsk->icsk_pending to dispatch,
> > > in order to minimize the risk of this patch.
> > >
> > > After this patch, the CPU time spent in tcp_write_xmit() reduced
> > > down to 10.92%.
> >
> > What is the exact benchmark you are running ?
> >
> > We never saw any contention like that, so lets make sure you are not
> > working around another issue.
>
> I simply ran 256 parallel netperf instances on 128 CPUs to trigger
> this spinlock contention; it is 100% reproducible here.

How many TX/RX queues on the NIC?
What is the qdisc setup?

>
> A single netperf TCP_RR could _also_ confirm the improvement:
>
> Before patch:
>
> $ netperf -H XXX -t TCP_RR -l 20
> MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0
> AF_INET to XXX () port 0 AF_INET : first burst 0
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 655360 873800 1        1       20.00    17665.59
> 655360 873800
>
>
> After patch:
>
> $ netperf -H XXX -t TCP_RR -l 20
> MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0
> AF_INET to XXX () port 0 AF_INET : first burst 0
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 655360 873800 1        1       20.00    18829.31
> 655360 873800
>
> (I have run it multiple times; this is a median result.)
>
> The difference can also be observed by turning TLP on/off on an unpatched kernel.

OK thanks for using something I can repro easily :)

I ran the experiment ten times:

lpaa23:/export/hda3/google/edumazet# echo 3 >/proc/sys/net/ipv4/tcp_early_retrans
lpaa23:/export/hda3/google/edumazet# for f in {1..10}; do ./super_netperf 1 -H lpaa24 -t TCP_RR -l 20; done
  26797
  26850
  25266
  27605
  26586
  26341
  27255
  27532
  26657
  27253


Then I disabled TLP, and got no obvious difference:

lpaa23:/export/hda3/google/edumazet# echo 0 >/proc/sys/net/ipv4/tcp_early_retrans
lpaa23:/export/hda3/google/edumazet# for f in {1..10}; do ./super_netperf 1 -H lpaa24 -t TCP_RR -l 20; done
  25311
  24658
  27105
  27421
  27604
  24649
  26259
  27615
  27543
  26217

I tried with 256 concurrent flows, with the same overall observation:
TLP does not change the numbers.
(In fact I am not even sure we arm the RTO at all while doing a TCP_RR.)
lpaa23:/export/hda3/google/edumazet# echo 3 >/proc/sys/net/ipv4/tcp_early_retrans
lpaa23:/export/hda3/google/edumazet# for f in {1..10}; do ./super_netperf 256 -H lpaa24 -t TCP_RR -l 20; done
1578682
1572444
1573490
1536378
1514905
1580854
1575949
1578925
1511164
1568213
lpaa23:/export/hda3/google/edumazet# echo 0 >/proc/sys/net/ipv4/tcp_early_retrans
lpaa23:/export/hda3/google/edumazet# for f in {1..10}; do ./super_netperf 256 -H lpaa24 -t TCP_RR -l 20; done
1576228
1578401
1577654
1579506
1570682
1582267
1550069
1530599
1583269
1578830


I wonder if you maybe have some IRQ smp_affinity problem, or some
scheduler strategy constantly migrating your user threads?

TLP is quite subtle; having two timers instead of one is probably
going to trigger various bugs.
Cong Wang Oct. 23, 2019, 5:40 p.m. UTC | #4
On Tue, Oct 22, 2019 at 7:15 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Tue, Oct 22, 2019 at 6:10 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Tue, Oct 22, 2019 at 4:24 PM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Tue, Oct 22, 2019 at 4:11 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > Currently RTO, TLP and PROBE0 all share the same timer instance
> > > > in the kernel and use icsk->icsk_pending to dispatch the work.
> > > > This causes spinlock contention when the timer is reset too
> > > > frequently, as clearly shown in the perf report:
> > > >
> > > >    61.72%    61.71%  swapper          [kernel.kallsyms]                        [k] queued_spin_lock_slowpath
> > > >    ...
> > > >     - 58.83% tcp_v4_rcv
> > > >       - 58.80% tcp_v4_do_rcv
> > > >          - 58.80% tcp_rcv_established
> > > >             - 52.88% __tcp_push_pending_frames
> > > >                - 52.88% tcp_write_xmit
> > > >                   - 28.16% tcp_event_new_data_sent
> > > >                      - 28.15% sk_reset_timer
> > > >                         + mod_timer
> > > >                   - 24.68% tcp_schedule_loss_probe
> > > >                      - 24.68% sk_reset_timer
> > > >                         + 24.68% mod_timer
> > > >
> > > > This patch decouples the TLP timer from the RTO timer by adding a
> > > > new timer instance, but still uses icsk->icsk_pending for dispatch
> > > > in order to minimize the risk of the change.
> > > >
> > > > After this patch, the CPU time spent in tcp_write_xmit() is reduced
> > > > to 10.92%.
> > >
> > > What is the exact benchmark you are running?
> > >
> > > We never saw any contention like that, so let's make sure you are not
> > > working around another issue.
> >
> > I simply ran 256 parallel netperf instances on 128 CPUs to trigger
> > this spinlock contention; it is 100% reproducible here.
>
> How many TX/RX queues on the NIC?

60 queues (default), 25Gbps NIC, mlx5.

> What is the qdisc setup?

fq_codel, which is the default here; its parameters are the defaults too.

>
> >
> > A single netperf TCP_RR could _also_ confirm the improvement:
> >
> > Before patch:
> >
> > $ netperf -H XXX -t TCP_RR -l 20
> > MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0
> > AF_INET to XXX () port 0 AF_INET : first burst 0
> > Local /Remote
> > Socket Size   Request  Resp.   Elapsed  Trans.
> > Send   Recv   Size     Size    Time     Rate
> > bytes  Bytes  bytes    bytes   secs.    per sec
> >
> > 655360 873800 1        1       20.00    17665.59
> > 655360 873800
> >
> >
> > After patch:
> >
> > $ netperf -H XXX -t TCP_RR -l 20
> > MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0
> > AF_INET to XXX () port 0 AF_INET : first burst 0
> > Local /Remote
> > Socket Size   Request  Resp.   Elapsed  Trans.
> > Send   Recv   Size     Size    Time     Rate
> > bytes  Bytes  bytes    bytes   secs.    per sec
> >
> > 655360 873800 1        1       20.00    18829.31
> > 655360 873800
> >
> > (I have run it multiple times; this is a median result.)
> >
> > The difference can also be observed by turning TLP on/off on an unpatched kernel.
>
> OK thanks for using something I can repro easily :)
>
> I ran the experiment ten times:

How many CPUs do you have?


>
> lpaa23:/export/hda3/google/edumazet# echo 3 >/proc/sys/net/ipv4/tcp_early_retrans
> lpaa23:/export/hda3/google/edumazet# for f in {1..10}; do ./super_netperf 1 -H lpaa24 -t TCP_RR -l 20; done
>   26797
>   26850
>   25266
>   27605
>   26586
>   26341
>   27255
>   27532
>   26657
>   27253
>
>
> Then I disabled TLP, and got no obvious difference:
>
> lpaa23:/export/hda3/google/edumazet# echo 0 >/proc/sys/net/ipv4/tcp_early_retrans
> lpaa23:/export/hda3/google/edumazet# for f in {1..10}; do ./super_netperf 1 -H lpaa24 -t TCP_RR -l 20; done
>   25311
>   24658
>   27105
>   27421
>   27604
>   24649
>   26259
>   27615
>   27543
>   26217
>
> I tried with 256 concurrent flows, with the same overall observation:
> TLP does not change the numbers.
> (In fact I am not even sure we arm the RTO at all while doing a TCP_RR.)

In case there is a misunderstanding: the CPU profile above was
captured during 256 parallel TCP_STREAM tests.


> lpaa23:/export/hda3/google/edumazet# echo 3 >/proc/sys/net/ipv4/tcp_early_retrans
> lpaa23:/export/hda3/google/edumazet# for f in {1..10}; do ./super_netperf 256 -H lpaa24 -t TCP_RR -l 20; done
> 1578682
> 1572444
> 1573490
> 1536378
> 1514905
> 1580854
> 1575949
> 1578925
> 1511164
> 1568213
> lpaa23:/export/hda3/google/edumazet# echo 0 >/proc/sys/net/ipv4/tcp_early_retrans
> lpaa23:/export/hda3/google/edumazet# for f in {1..10}; do ./super_netperf 256 -H lpaa24 -t TCP_RR -l 20; done
> 1576228
> 1578401
> 1577654
> 1579506
> 1570682
> 1582267
> 1550069
> 1530599
> 1583269
> 1578830
>
>
> I wonder if you maybe have some IRQ smp_affinity problem, or some
> scheduler strategy constantly migrating your user threads?

The scheduler settings are defaults too. IRQ smp_affinity is statically
distributed across the first 60 CPUs.

>
> TLP is quite subtle; having two timers instead of one is probably
> going to trigger various bugs.

This is exactly why I keep icsk_pending. ;)

Thanks.
Eric Dumazet Oct. 23, 2019, 6:14 p.m. UTC | #5
On Wed, Oct 23, 2019 at 10:40 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Tue, Oct 22, 2019 at 7:15 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Tue, Oct 22, 2019 at 6:10 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Tue, Oct 22, 2019 at 4:24 PM Eric Dumazet <edumazet@google.com> wrote:
> > > >
> > > > On Tue, Oct 22, 2019 at 4:11 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > >
> > > > > Currently RTO, TLP and PROBE0 all share the same timer instance
> > > > > in the kernel and use icsk->icsk_pending to dispatch the work.
> > > > > This causes spinlock contention when the timer is reset too
> > > > > frequently, as clearly shown in the perf report:
> > > > >
> > > > >    61.72%    61.71%  swapper          [kernel.kallsyms]                        [k] queued_spin_lock_slowpath
> > > > >    ...
> > > > >     - 58.83% tcp_v4_rcv
> > > > >       - 58.80% tcp_v4_do_rcv
> > > > >          - 58.80% tcp_rcv_established
> > > > >             - 52.88% __tcp_push_pending_frames
> > > > >                - 52.88% tcp_write_xmit
> > > > >                   - 28.16% tcp_event_new_data_sent
> > > > >                      - 28.15% sk_reset_timer
> > > > >                         + mod_timer
> > > > >                   - 24.68% tcp_schedule_loss_probe
> > > > >                      - 24.68% sk_reset_timer
> > > > >                         + 24.68% mod_timer
> > > > >
> > > > > This patch decouples the TLP timer from the RTO timer by adding a
> > > > > new timer instance, but still uses icsk->icsk_pending for dispatch
> > > > > in order to minimize the risk of the change.
> > > > >
> > > > > After this patch, the CPU time spent in tcp_write_xmit() is reduced
> > > > > to 10.92%.
> > > >
> > > > What is the exact benchmark you are running?
> > > >
> > > > We never saw any contention like that, so let's make sure you are not
> > > > working around another issue.
> > >
> > > I simply ran 256 parallel netperf instances on 128 CPUs to trigger
> > > this spinlock contention; it is 100% reproducible here.
> >
> > How many TX/RX queues on the NIC?
>
> 60 queues (default), 25Gbps NIC, mlx5.
>
> > What is the qdisc setup?
>
> fq_codel, which is the default here; its parameters are the defaults too.
>
> >
> > >
> > > A single netperf TCP_RR could _also_ confirm the improvement:
> > >
> > > Before patch:
> > >
> > > $ netperf -H XXX -t TCP_RR -l 20
> > > MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0
> > > AF_INET to XXX () port 0 AF_INET : first burst 0
> > > Local /Remote
> > > Socket Size   Request  Resp.   Elapsed  Trans.
> > > Send   Recv   Size     Size    Time     Rate
> > > bytes  Bytes  bytes    bytes   secs.    per sec
> > >
> > > 655360 873800 1        1       20.00    17665.59
> > > 655360 873800
> > >
> > >
> > > After patch:
> > >
> > > $ netperf -H XXX -t TCP_RR -l 20
> > > MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0
> > > AF_INET to XXX () port 0 AF_INET : first burst 0
> > > Local /Remote
> > > Socket Size   Request  Resp.   Elapsed  Trans.
> > > Send   Recv   Size     Size    Time     Rate
> > > bytes  Bytes  bytes    bytes   secs.    per sec
> > >
> > > 655360 873800 1        1       20.00    18829.31
> > > 655360 873800
> > >
> > > (I have run it multiple times; this is a median result.)
> > >
> > > The difference can also be observed by turning TLP on/off on an unpatched kernel.
> >
> > OK thanks for using something I can repro easily :)
> >
> > I ran the experiment ten times:
>
> How many CPUs do you have?
>
>
> >
> > lpaa23:/export/hda3/google/edumazet# echo 3 >/proc/sys/net/ipv4/tcp_early_retrans
> > lpaa23:/export/hda3/google/edumazet# for f in {1..10}; do ./super_netperf 1 -H lpaa24 -t TCP_RR -l 20; done
> >   26797
> >   26850
> >   25266
> >   27605
> >   26586
> >   26341
> >   27255
> >   27532
> >   26657
> >   27253
> >
> >
> > Then I disabled TLP, and got no obvious difference:
> >
> > lpaa23:/export/hda3/google/edumazet# echo 0 >/proc/sys/net/ipv4/tcp_early_retrans
> > lpaa23:/export/hda3/google/edumazet# for f in {1..10}; do ./super_netperf 1 -H lpaa24 -t TCP_RR -l 20; done
> >   25311
> >   24658
> >   27105
> >   27421
> >   27604
> >   24649
> >   26259
> >   27615
> >   27543
> >   26217
> >
> > I tried with 256 concurrent flows, with the same overall observation:
> > TLP does not change the numbers.
> > (In fact I am not even sure we arm the RTO at all while doing a TCP_RR.)
>
> In case there is a misunderstanding: the CPU profile above was
> captured during 256 parallel TCP_STREAM tests.

When I asked you about the workload, you gave me TCP_RR output, not TCP_STREAM :/

"A single netperf TCP_RR could _also_ confirm the improvement:"
Cong Wang Oct. 23, 2019, 6:30 p.m. UTC | #6
On Wed, Oct 23, 2019 at 11:14 AM Eric Dumazet <edumazet@google.com> wrote:
> > In case there is a misunderstanding: the CPU profile above was
> > captured during 256 parallel TCP_STREAM tests.
>
> When I asked you about the workload, you gave me TCP_RR output, not TCP_STREAM :/
>
> "A single netperf TCP_RR could _also_ confirm the improvement:"

I guess you didn't understand what "also" means? The improvement
can be measured with both TCP_STREAM and TCP_RR; only the
CPU profiling is done with TCP_STREAM.

BTW, I just tested an unpatched kernel on a machine with 64 CPUs;
turning TLP on/off makes no difference there, so this is likely related
to the number of CPUs or the hardware configuration. This explains
why you can't reproduce it on your side; so far I can only reproduce
it on one particular hardware platform, but it is real.

If you need any other information, please let me know.

Thanks.
Eric Dumazet Oct. 23, 2019, 6:55 p.m. UTC | #7
On Wed, Oct 23, 2019 at 11:30 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Wed, Oct 23, 2019 at 11:14 AM Eric Dumazet <edumazet@google.com> wrote:
> > > In case there is a misunderstanding: the CPU profile above was
> > > captured during 256 parallel TCP_STREAM tests.
> >
> > When I asked you about the workload, you gave me TCP_RR output, not TCP_STREAM :/
> >
> > "A single netperf TCP_RR could _also_ confirm the improvement:"
>
> I guess you didn't understand what "also" means? The improvement
> can be measured with both TCP_STREAM and TCP_RR; only the
> CPU profiling is done with TCP_STREAM.
>

Except that I could not measure any gain with TCP_RR, which is expected,
since TCP_RR will not use RTO and TLP at the same time.

If you found that we were setting both RTO and TLP when sending a
1-byte message, we need to fix the stack instead of working around it.

> BTW, I just tested an unpatched kernel on a machine with 64 CPUs;
> turning TLP on/off makes no difference there, so this is likely related
> to the number of CPUs or the hardware configuration. This explains
> why you can't reproduce it on your side; so far I can only reproduce
> it on one particular hardware platform, but it is real.
>

I have hosts with 112 CPUs; I can try on them, but it will take some time.

Patch

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index e46958460739..2a129fc6b522 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -121,6 +121,7 @@  struct inet_connection_sock {
 		__u16		  last_seg_size; /* Size of last incoming segment	   */
 		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions	   */
 	} icsk_ack;
+	struct timer_list	  icsk_tlp_timer;
 	struct {
 		int		  enabled;
 
@@ -170,7 +171,8 @@  enum inet_csk_ack_state_t {
 void inet_csk_init_xmit_timers(struct sock *sk,
 			       void (*retransmit_handler)(struct timer_list *),
 			       void (*delack_handler)(struct timer_list *),
-			       void (*keepalive_handler)(struct timer_list *));
+			       void (*keepalive_handler)(struct timer_list *),
+			       void (*tlp_handler)(struct timer_list *));
 void inet_csk_clear_xmit_timers(struct sock *sk);
 
 static inline void inet_csk_schedule_ack(struct sock *sk)
@@ -226,7 +228,7 @@  static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
 	}
 
 	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
-	    what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) {
+	    what == ICSK_TIME_REO_TIMEOUT) {
 		icsk->icsk_pending = what;
 		icsk->icsk_timeout = jiffies + when;
 		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
@@ -234,6 +236,9 @@  static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
 		icsk->icsk_ack.pending |= ICSK_ACK_TIMER;
 		icsk->icsk_ack.timeout = jiffies + when;
 		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
+	} else if (what == ICSK_TIME_LOSS_PROBE) {
+		icsk->icsk_pending = what;
+		sk_reset_timer(sk, &icsk->icsk_tlp_timer, jiffies + when);
 	} else {
 		pr_debug("inet_csk BUG: unknown timer value\n");
 	}
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0ee5400e751c..3319d2b6b1c4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -331,6 +331,7 @@  ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 void tcp_release_cb(struct sock *sk);
 void tcp_wfree(struct sk_buff *skb);
 void tcp_write_timer_handler(struct sock *sk);
+void tcp_tail_loss_probe_handler(struct sock *sk);
 void tcp_delack_timer_handler(struct sock *sk);
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index c0b3672637c4..897a0469e8f1 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -246,7 +246,7 @@  void dccp_init_xmit_timers(struct sock *sk)
 	tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
 	timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0);
 	inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
-				  &dccp_keepalive_timer);
+				  &dccp_keepalive_timer, NULL);
 }
 
 static ktime_t dccp_timestamp_seed;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index eb30fc1770de..4b279a86308e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -503,12 +503,14 @@  EXPORT_SYMBOL(inet_csk_accept);
 void inet_csk_init_xmit_timers(struct sock *sk,
 			       void (*retransmit_handler)(struct timer_list *t),
 			       void (*delack_handler)(struct timer_list *t),
-			       void (*keepalive_handler)(struct timer_list *t))
+			       void (*keepalive_handler)(struct timer_list *t),
+			       void (*tlp_handler)(struct timer_list *t))
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
 	timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
+	timer_setup(&icsk->icsk_tlp_timer, tlp_handler, 0);
 	timer_setup(&sk->sk_timer, keepalive_handler, 0);
 	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
 }
@@ -522,6 +524,7 @@  void inet_csk_clear_xmit_timers(struct sock *sk)
 
 	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
 	sk_stop_timer(sk, &icsk->icsk_delack_timer);
+	sk_stop_timer(sk, &icsk->icsk_tlp_timer);
 	sk_stop_timer(sk, &sk->sk_timer);
 }
 EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7dc79b973e6e..e87fe87571a1 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -221,8 +221,7 @@  int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT) {
 		r->idiag_timer = 1;
 		r->idiag_retrans = icsk->icsk_retransmits;
 		r->idiag_expires =
@@ -232,6 +231,11 @@  int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		r->idiag_retrans = icsk->icsk_probes_out;
 		r->idiag_expires =
 			jiffies_to_msecs(icsk->icsk_timeout - jiffies);
+	} else if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+		r->idiag_timer = 1;
+		r->idiag_retrans = icsk->icsk_retransmits;
+		r->idiag_expires =
+			jiffies_to_msecs(icsk->icsk_tlp_timer.expires - jiffies);
 	} else if (timer_pending(&sk->sk_timer)) {
 		r->idiag_timer = 2;
 		r->idiag_retrans = icsk->icsk_probes_out;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a2e52ad7cdab..71cbb486ef85 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3008,8 +3008,12 @@  void tcp_rearm_rto(struct sock *sk)
 			 */
 			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
 		}
-		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
-				     TCP_RTO_MAX, tcp_rtx_queue_head(sk));
+		if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+			tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, rto,
+					     TCP_RTO_MAX, tcp_rtx_queue_head(sk));
+		else
+			tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+					     TCP_RTO_MAX, tcp_rtx_queue_head(sk));
 	}
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c616f0ad1fa0..f5e34fe7b2e6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2434,13 +2434,15 @@  static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 	int state;
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT) {
 		timer_active	= 1;
 		timer_expires	= icsk->icsk_timeout;
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
 		timer_expires	= icsk->icsk_timeout;
+	} else if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+		timer_active	= 1;
+		timer_expires	= icsk->icsk_tlp_timer.expires;
 	} else if (timer_pending(&sk->sk_timer)) {
 		timer_active	= 2;
 		timer_expires	= sk->sk_timer.expires;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9822820edca4..9038d7d61d0f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -882,6 +882,7 @@  void tcp_release_cb(struct sock *sk)
 
 	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
 		tcp_write_timer_handler(sk);
+		tcp_tail_loss_probe_handler(sk);
 		__sock_put(sk);
 	}
 	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index dd5a6317a801..f112aa979e8c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -591,9 +591,6 @@  void tcp_write_timer_handler(struct sock *sk)
 	case ICSK_TIME_REO_TIMEOUT:
 		tcp_rack_reo_timeout(sk);
 		break;
-	case ICSK_TIME_LOSS_PROBE:
-		tcp_send_loss_probe(sk);
-		break;
 	case ICSK_TIME_RETRANS:
 		icsk->icsk_pending = 0;
 		tcp_retransmit_timer(sk);
@@ -626,6 +623,42 @@  static void tcp_write_timer(struct timer_list *t)
 	sock_put(sk);
 }
 
+void tcp_tail_loss_probe_handler(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN) ||
+	    icsk->icsk_pending != ICSK_TIME_LOSS_PROBE)
+		goto out;
+
+	if (timer_pending(&icsk->icsk_tlp_timer))
+		goto out;
+
+	tcp_mstamp_refresh(tcp_sk(sk));
+	if (tcp_send_loss_probe(sk))
+		icsk->icsk_retransmits++;
+out:
+	sk_mem_reclaim(sk);
+}
+
+static void tcp_tail_loss_probe_timer(struct timer_list *t)
+{
+	struct inet_connection_sock *icsk =
+			from_timer(icsk, t, icsk_tlp_timer);
+	struct sock *sk = &icsk->icsk_inet.sk;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		tcp_tail_loss_probe_handler(sk);
+	} else {
+		/* delegate our work to tcp_release_cb() */
+		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
+			sock_hold(sk);
+	}
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
 void tcp_syn_ack_timeout(const struct request_sock *req)
 {
 	struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
@@ -758,7 +791,9 @@  static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
 void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
-				  &tcp_keepalive_timer);
+				  &tcp_keepalive_timer,
+				  &tcp_tail_loss_probe_timer);
+
 	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
 		     HRTIMER_MODE_ABS_PINNED_SOFT);
 	tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4804b6dc5e65..7cc8dbe412af 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1874,13 +1874,15 @@  static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	srcp  = ntohs(inet->inet_sport);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT) {
 		timer_active	= 1;
 		timer_expires	= icsk->icsk_timeout;
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
 		timer_expires	= icsk->icsk_timeout;
+	} else if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+		timer_active	= 1;
+		timer_expires	= icsk->icsk_tlp_timer.expires;
 	} else if (timer_pending(&sp->sk_timer)) {
 		timer_active	= 2;
 		timer_expires	= sp->sk_timer.expires;