diff mbox series

net/tcp: introduce TRACE_EVENT for TCP/IPv4 state transition

Message ID 1510207298-14828-1-git-send-email-laoar.shao@gmail.com
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series net/tcp: introduce TRACE_EVENT for TCP/IPv4 state transition | expand

Commit Message

Yafang Shao Nov. 9, 2017, 6:01 a.m. UTC
With this newly introduced TRACE_EVENT, it will be very easy to minotor
TCP/IPv4 state transition.

A new TRACE_SYSTEM named tcp is added, in which we can trace other TCP
event as well.

Two helpers are added,
static inline void __tcp_set_state(struct sock *sk, int state)
static inline void __sk_state_store(struct sock *sk, int newstate)

When do TCP/IPv4 state transition, we should use these two helpers or
use tcp_set_state() instead of assign a value to sk_state directly.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/net/tcp.h               | 16 ++++++++++++
 include/trace/events/tcp.h      | 58 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/inet_connection_sock.c |  9 ++++---
 net/ipv4/inet_hashtables.c      |  2 +-
 net/ipv4/tcp.c                  |  2 +-
 5 files changed, 82 insertions(+), 5 deletions(-)
 create mode 100644 include/trace/events/tcp.h

Comments

Alexei Starovoitov Nov. 9, 2017, 6:43 a.m. UTC | #1
On Thu, Nov 09, 2017 at 02:01:38PM +0800, Yafang Shao wrote:
> With this newly introduced TRACE_EVENT, it will be very easy to minotor
> TCP/IPv4 state transition.
> 
> A new TRACE_SYSTEM named tcp is added, in which we can trace other TCP
> event as well.
> 
> Two helpers are added,
> static inline void __tcp_set_state(struct sock *sk, int state)
> static inline void __sk_state_store(struct sock *sk, int newstate)
> 
> When do TCP/IPv4 state transition, we should use these two helpers or
> use tcp_set_state() instead of assign a value to sk_state directly.
> 
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>

when you submit a patch pls make it clear which tree this patch is targeting.
In this case it should have been net-next,
but the patch clearly conflicts with it.
Make sure to rebase.

> +/*
> + * To trace TCP state transition.
> + */
> +static inline void __tcp_set_state(struct sock *sk, int state)
> +{
> +	trace_tcp_set_state(sk, sk->sk_state, state);
> +	sk->sk_state = state;
> +}
> +
> +static inline void __sk_state_store(struct sock *sk, int newstate)
> +{
> +	trace_tcp_set_state(sk, sk->sk_state, newstate);
> +	sk_state_store(sk, newstate);
> +}
> +
>  void tcp_done(struct sock *sk);
>  
>  int tcp_abort(struct sock *sk, int err);
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> new file mode 100644
> index 0000000..abf65af
> --- /dev/null
> +++ b/include/trace/events/tcp.h
> @@ -0,0 +1,58 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM tcp
> +
> +#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_TCP_H
> +
> +#include <linux/tracepoint.h>
> +#include <net/sock.h>
> +#include <net/inet_timewait_sock.h>
> +#include <net/request_sock.h>
> +#include <net/inet_sock.h>
> +#include <net/tcp_states.h>
> +
> +TRACE_EVENT(tcp_set_state,
> +	TP_PROTO(struct sock *sk, int oldstate, int newstate),
> +	TP_ARGS(sk, oldstate, newstate),
> +
> +	TP_STRUCT__entry(
> +		__field(__be32, dst)
> +		__field(__be32, src)
> +		__field(__u16, dport)
> +		__field(__u16, sport)
> +		__field(int, oldstate)
> +		__field(int, newstate)
> +	),
> +
> +	TP_fast_assign(
> +		if (oldstate == TCP_TIME_WAIT) {
> +			__entry->dst = inet_twsk(sk)->tw_daddr;
> +			__entry->src = inet_twsk(sk)->tw_rcv_saddr;
> +			__entry->dport = ntohs(inet_twsk(sk)->tw_dport);
> +			__entry->sport = ntohs(inet_twsk(sk)->tw_sport);
> +		} else if (oldstate == TCP_NEW_SYN_RECV) {
> +			__entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr;
> +			__entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr;
> +			__entry->dport =
> +				ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port);
> +			__entry->sport = inet_rsk(inet_reqsk(sk))->ir_num;
> +		} else {
> +			__entry->dst = inet_sk(sk)->inet_daddr;
> +			__entry->src = inet_sk(sk)->inet_rcv_saddr;
> +			__entry->dport = ntohs(inet_sk(sk)->inet_dport);
> +			__entry->sport = ntohs(inet_sk(sk)->inet_sport);
> +		}
> +
> +		__entry->oldstate = oldstate;
> +		__entry->newstate = newstate;
> +	),
> +
> +	TP_printk("%08X:%04X %08X:%04X, %02x %02x",
> +		__entry->src, __entry->sport, __entry->dst, __entry->dport,
> +		__entry->oldstate, __entry->newstate)

direct %x of state is not allowed.
This has to use show_tcp_state_name() like it's done in trace_tcp_set_state

Also I'm missing the reason to introduce another tracepoint
that looks just like trace_tcp_set_state.
Yafang Shao Nov. 9, 2017, 6:50 a.m. UTC | #2
2017-11-09 14:43 GMT+08:00 Alexei Starovoitov <alexei.starovoitov@gmail.com>:
> On Thu, Nov 09, 2017 at 02:01:38PM +0800, Yafang Shao wrote:
>> With this newly introduced TRACE_EVENT, it will be very easy to minotor
>> TCP/IPv4 state transition.
>>
>> A new TRACE_SYSTEM named tcp is added, in which we can trace other TCP
>> event as well.
>>
>> Two helpers are added,
>> static inline void __tcp_set_state(struct sock *sk, int state)
>> static inline void __sk_state_store(struct sock *sk, int newstate)
>>
>> When do TCP/IPv4 state transition, we should use these two helpers or
>> use tcp_set_state() instead of assign a value to sk_state directly.
>>
>> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
>
> when you submit a patch pls make it clear which tree this patch is targeting.
> In this case it should have been net-next,
> but the patch clearly conflicts with it.
> Make sure to rebase.

I do it based on this tree
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
OK I will check the net-next then.

>
>> +/*
>> + * To trace TCP state transition.
>> + */
>> +static inline void __tcp_set_state(struct sock *sk, int state)
>> +{
>> +     trace_tcp_set_state(sk, sk->sk_state, state);
>> +     sk->sk_state = state;
>> +}
>> +
>> +static inline void __sk_state_store(struct sock *sk, int newstate)
>> +{
>> +     trace_tcp_set_state(sk, sk->sk_state, newstate);
>> +     sk_state_store(sk, newstate);
>> +}
>> +
>>  void tcp_done(struct sock *sk);
>>
>>  int tcp_abort(struct sock *sk, int err);
>> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
>> new file mode 100644
>> index 0000000..abf65af
>> --- /dev/null
>> +++ b/include/trace/events/tcp.h
>> @@ -0,0 +1,58 @@
>> +#undef TRACE_SYSTEM
>> +#define TRACE_SYSTEM tcp
>> +
>> +#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
>> +#define _TRACE_TCP_H
>> +
>> +#include <linux/tracepoint.h>
>> +#include <net/sock.h>
>> +#include <net/inet_timewait_sock.h>
>> +#include <net/request_sock.h>
>> +#include <net/inet_sock.h>
>> +#include <net/tcp_states.h>
>> +
>> +TRACE_EVENT(tcp_set_state,
>> +     TP_PROTO(struct sock *sk, int oldstate, int newstate),
>> +     TP_ARGS(sk, oldstate, newstate),
>> +
>> +     TP_STRUCT__entry(
>> +             __field(__be32, dst)
>> +             __field(__be32, src)
>> +             __field(__u16, dport)
>> +             __field(__u16, sport)
>> +             __field(int, oldstate)
>> +             __field(int, newstate)
>> +     ),
>> +
>> +     TP_fast_assign(
>> +             if (oldstate == TCP_TIME_WAIT) {
>> +                     __entry->dst = inet_twsk(sk)->tw_daddr;
>> +                     __entry->src = inet_twsk(sk)->tw_rcv_saddr;
>> +                     __entry->dport = ntohs(inet_twsk(sk)->tw_dport);
>> +                     __entry->sport = ntohs(inet_twsk(sk)->tw_sport);
>> +             } else if (oldstate == TCP_NEW_SYN_RECV) {
>> +                     __entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr;
>> +                     __entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr;
>> +                     __entry->dport =
>> +                             ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port);
>> +                     __entry->sport = inet_rsk(inet_reqsk(sk))->ir_num;
>> +             } else {
>> +                     __entry->dst = inet_sk(sk)->inet_daddr;
>> +                     __entry->src = inet_sk(sk)->inet_rcv_saddr;
>> +                     __entry->dport = ntohs(inet_sk(sk)->inet_dport);
>> +                     __entry->sport = ntohs(inet_sk(sk)->inet_sport);
>> +             }
>> +
>> +             __entry->oldstate = oldstate;
>> +             __entry->newstate = newstate;
>> +     ),
>> +
>> +     TP_printk("%08X:%04X %08X:%04X, %02x %02x",
>> +             __entry->src, __entry->sport, __entry->dst, __entry->dport,
>> +             __entry->oldstate, __entry->newstate)
>
> direct %x of state is not allowed.
> This has to use show_tcp_state_name() like it's done in trace_tcp_set_state
>
> Also I'm missing the reason to introduce another tracepoint
> that looks just like trace_tcp_set_state.
>

I will check net-next

Thanks
Yafang
Steven Rostedt Nov. 9, 2017, 6:18 p.m. UTC | #3
On Thu, 9 Nov 2017 15:43:29 +0900
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> > +TRACE_EVENT(tcp_set_state,
> > +	TP_PROTO(struct sock *sk, int oldstate, int newstate),
> > +	TP_ARGS(sk, oldstate, newstate),
> > +
> > +	TP_STRUCT__entry(
> > +		__field(__be32, dst)
> > +		__field(__be32, src)
> > +		__field(__u16, dport)
> > +		__field(__u16, sport)
> > +		__field(int, oldstate)
> > +		__field(int, newstate)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		if (oldstate == TCP_TIME_WAIT) {
> > +			__entry->dst = inet_twsk(sk)->tw_daddr;
> > +			__entry->src = inet_twsk(sk)->tw_rcv_saddr;
> > +			__entry->dport = ntohs(inet_twsk(sk)->tw_dport);
> > +			__entry->sport = ntohs(inet_twsk(sk)->tw_sport);
> > +		} else if (oldstate == TCP_NEW_SYN_RECV) {
> > +			__entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr;
> > +			__entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr;
> > +			__entry->dport =
> > +				ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port);
> > +			__entry->sport = inet_rsk(inet_reqsk(sk))->ir_num;
> > +		} else {
> > +			__entry->dst = inet_sk(sk)->inet_daddr;
> > +			__entry->src = inet_sk(sk)->inet_rcv_saddr;
> > +			__entry->dport = ntohs(inet_sk(sk)->inet_dport);
> > +			__entry->sport = ntohs(inet_sk(sk)->inet_sport);
> > +		}
> > +
> > +		__entry->oldstate = oldstate;
> > +		__entry->newstate = newstate;
> > +	),
> > +
> > +	TP_printk("%08X:%04X %08X:%04X, %02x %02x",
> > +		__entry->src, __entry->sport, __entry->dst, __entry->dport,
> > +		__entry->oldstate, __entry->newstate)  
> 
> direct %x of state is not allowed.
> This has to use show_tcp_state_name() like it's done in trace_tcp_set_state

Hmm, I need to look at trace_tcp_set_state. I'm guessing it is in
net-next too?

> 
> Also I'm missing the reason to introduce another tracepoint
> that looks just like trace_tcp_set_state.

I want to make sure that perf and trace-cmd can parse the TP_printk(),
if it is having helper functions like that in the TP_printk() part,
then the libtraceevent needs to be aware of that.

-- Steve
Song Liu Nov. 9, 2017, 6:34 p.m. UTC | #4
> On Nov 9, 2017, at 10:18 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Thu, 9 Nov 2017 15:43:29 +0900
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
>>> +TRACE_EVENT(tcp_set_state,
>>> +	TP_PROTO(struct sock *sk, int oldstate, int newstate),
>>> +	TP_ARGS(sk, oldstate, newstate),
>>> +
>>> +	TP_STRUCT__entry(
>>> +		__field(__be32, dst)
>>> +		__field(__be32, src)
>>> +		__field(__u16, dport)
>>> +		__field(__u16, sport)
>>> +		__field(int, oldstate)
>>> +		__field(int, newstate)
>>> +	),
>>> +
>>> +	TP_fast_assign(
>>> +		if (oldstate == TCP_TIME_WAIT) {
>>> +			__entry->dst = inet_twsk(sk)->tw_daddr;
>>> +			__entry->src = inet_twsk(sk)->tw_rcv_saddr;
>>> +			__entry->dport = ntohs(inet_twsk(sk)->tw_dport);
>>> +			__entry->sport = ntohs(inet_twsk(sk)->tw_sport);
>>> +		} else if (oldstate == TCP_NEW_SYN_RECV) {
>>> +			__entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr;
>>> +			__entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr;
>>> +			__entry->dport =
>>> +				ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port);
>>> +			__entry->sport = inet_rsk(inet_reqsk(sk))->ir_num;
>>> +		} else {
>>> +			__entry->dst = inet_sk(sk)->inet_daddr;
>>> +			__entry->src = inet_sk(sk)->inet_rcv_saddr;
>>> +			__entry->dport = ntohs(inet_sk(sk)->inet_dport);
>>> +			__entry->sport = ntohs(inet_sk(sk)->inet_sport);
>>> +		}
>>> +
>>> +		__entry->oldstate = oldstate;
>>> +		__entry->newstate = newstate;
>>> +	),
>>> +
>>> +	TP_printk("%08X:%04X %08X:%04X, %02x %02x",
>>> +		__entry->src, __entry->sport, __entry->dst, __entry->dport,
>>> +		__entry->oldstate, __entry->newstate)  
>> 
>> direct %x of state is not allowed.
>> This has to use show_tcp_state_name() like it's done in trace_tcp_set_state
> 
> Hmm, I need to look at trace_tcp_set_state. I'm guessing it is in
> net-next too?

Yes, in net-next. There are 6 tracepoints under tcp group:

  tcp_destroy_sock  
  tcp_receive_reset  
  tcp_retransmit_skb  
  tcp_retransmit_synack  
  tcp_send_reset  
  tcp_set_state

They are all added recently.
 
> 
>> 
>> Also I'm missing the reason to introduce another tracepoint
>> that looks just like trace_tcp_set_state.
> 
> I want to make sure that perf and trace-cmd can parse the TP_printk(),
> if it is having helper functions like that in the TP_printk() part,
> then the libtraceevent needs to be aware of that.
> 

tcp_set_state uses __print_symbolic to show state in text format. I found
trace-cmd cannot parse that part:

[011] 147338.660560: tcp_set_state:        sport=16262 dport=48346 \
    saddr=127.0.0.6 daddr=127.0.0.6 saddrv6=2401:db00:30:317e:face:0:1f:0 \
    daddrv6=2401:db00:30:31e5:face:0:7f:0 oldstate= newstate=

Other parts of the output looks good to me.

Thanks,
Song
Song Liu Nov. 9, 2017, 11:40 p.m. UTC | #5
> On Nov 9, 2017, at 10:34 AM, Song Liu <songliubraving@fb.com> wrote:
> 
>> 
>> On Nov 9, 2017, at 10:18 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
>> 
>> On Thu, 9 Nov 2017 15:43:29 +0900
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>> 
>>>> +TRACE_EVENT(tcp_set_state,
>>>> +	TP_PROTO(struct sock *sk, int oldstate, int newstate),
>>>> +	TP_ARGS(sk, oldstate, newstate),
>>>> +
>>>> +	TP_STRUCT__entry(
>>>> +		__field(__be32, dst)
>>>> +		__field(__be32, src)
>>>> +		__field(__u16, dport)
>>>> +		__field(__u16, sport)
>>>> +		__field(int, oldstate)
>>>> +		__field(int, newstate)
>>>> +	),
>>>> +
>>>> +	TP_fast_assign(
>>>> +		if (oldstate == TCP_TIME_WAIT) {
>>>> +			__entry->dst = inet_twsk(sk)->tw_daddr;
>>>> +			__entry->src = inet_twsk(sk)->tw_rcv_saddr;
>>>> +			__entry->dport = ntohs(inet_twsk(sk)->tw_dport);
>>>> +			__entry->sport = ntohs(inet_twsk(sk)->tw_sport);
>>>> +		} else if (oldstate == TCP_NEW_SYN_RECV) {
>>>> +			__entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr;
>>>> +			__entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr;
>>>> +			__entry->dport =
>>>> +				ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port);
>>>> +			__entry->sport = inet_rsk(inet_reqsk(sk))->ir_num;
>>>> +		} else {
>>>> +			__entry->dst = inet_sk(sk)->inet_daddr;
>>>> +			__entry->src = inet_sk(sk)->inet_rcv_saddr;
>>>> +			__entry->dport = ntohs(inet_sk(sk)->inet_dport);
>>>> +			__entry->sport = ntohs(inet_sk(sk)->inet_sport);
>>>> +		}
>>>> +
>>>> +		__entry->oldstate = oldstate;
>>>> +		__entry->newstate = newstate;
>>>> +	),
>>>> +
>>>> +	TP_printk("%08X:%04X %08X:%04X, %02x %02x",
>>>> +		__entry->src, __entry->sport, __entry->dst, __entry->dport,
>>>> +		__entry->oldstate, __entry->newstate)  
>>> 
>>> direct %x of state is not allowed.
>>> This has to use show_tcp_state_name() like it's done in trace_tcp_set_state
>> 
>> Hmm, I need to look at trace_tcp_set_state. I'm guessing it is in
>> net-next too?
> 
> Yes, in net-next. There are 6 tracepoints under tcp group:
> 
>  tcp_destroy_sock  
>  tcp_receive_reset  
>  tcp_retransmit_skb  
>  tcp_retransmit_synack  
>  tcp_send_reset  
>  tcp_set_state
> 
> They are all added recently.
> 
>> 
>>> 
>>> Also I'm missing the reason to introduce another tracepoint
>>> that looks just like trace_tcp_set_state.
>> 
>> I want to make sure that perf and trace-cmd can parse the TP_printk(),
>> if it is having helper functions like that in the TP_printk() part,
>> then the libtraceevent needs to be aware of that.
>> 
> 
> tcp_set_state uses __print_symbolic to show state in text format. I found
> trace-cmd cannot parse that part:
> 
> [011] 147338.660560: tcp_set_state:        sport=16262 dport=48346 \
>    saddr=127.0.0.6 daddr=127.0.0.6 saddrv6=2401:db00:30:317e:face:0:1f:0 \
>    daddrv6=2401:db00:30:31e5:face:0:7f:0 oldstate= newstate=
> 
> Other parts of the output looks good to me.
> 
> Thanks,
> Song

I am not sure whether this is the best approach, but the following patch 
fixes the output of perf:

     0.44%  sport=16262 dport=39362 saddr=127.0.0.6 daddr=127.0.0.6 \
saddrv6=2401:db00:30:317e:face:0:1f:0 daddrv6=2401:db00:30:31e5:face:0:7f:0 \
oldstate=TCP_CLOSE_WAIT newstate=TCP_LAST_ACK

Thanks,
Song


From 4b7e27631a4c7df96a38223a95ee3ede2f5f2d19 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 9 Nov 2017 15:30:07 -0800
Subject: [PATCH] libtraceevent: add flags for tcp state names

Names of TCP states are added to flags in event-parse.c.

The names are used to print symbolic names in tracepoint:
tcp/tcp_set_state.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 tools/lib/traceevent/event-parse.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index 7ce724f..4972dc2 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -3790,6 +3790,20 @@ static const struct flag flags[] = {

        { "HRTIMER_NORESTART", 0 },
        { "HRTIMER_RESTART", 1 },
+
+       /* tcp state names, see include/net/tcp_states.h */
+       { "TCP_ESTABLISHED", 1 },
+       { "TCP_SYN_SENT", 2 },
+       { "TCP_SYN_RECV", 3 },
+       { "TCP_FIN_WAIT1", 4 },
+       { "TCP_FIN_WAIT2", 5 },
+       { "TCP_TIME_WAIT", 6 },
+       { "TCP_CLOSE", 7 },
+       { "TCP_CLOSE_WAIT", 8 },
+       { "TCP_LAST_ACK", 9 },
+       { "TCP_LISTEN", 10 },
+       { "TCP_CLOSING", 11 },
+       { "TCP_NEW_SYN_RECV", 12 },
 };

 static long long eval_flag(const char *flag)
--
2.9.5
Steven Rostedt Nov. 10, 2017, 12:42 a.m. UTC | #6
On Thu, 9 Nov 2017 23:40:13 +0000
Song Liu <songliubraving@fb.com> wrote:

> > tcp_set_state uses __print_symbolic to show state in text format. I found
> > trace-cmd cannot parse that part:
> > 
> > [011] 147338.660560: tcp_set_state:        sport=16262 dport=48346 \
> >    saddr=127.0.0.6 daddr=127.0.0.6 saddrv6=2401:db00:30:317e:face:0:1f:0 \
> >    daddrv6=2401:db00:30:31e5:face:0:7f:0 oldstate= newstate=

The latest trace-cmd does show oldstate=0xa newstate=0x7, since I fixed
it so undefined symbols and flags are displayed.

> > 
> > Other parts of the output looks good to me.
> > 
> > Thanks,
> > Song  
> 
> I am not sure whether this is the best approach, but the following patch 
> fixes the output of perf:

No it's not the best approach. But the below patch is ;-)

> 
>      0.44%  sport=16262 dport=39362 saddr=127.0.0.6 daddr=127.0.0.6 \
> saddrv6=2401:db00:30:317e:face:0:1f:0 daddrv6=2401:db00:30:31e5:face:0:7f:0 \
> oldstate=TCP_CLOSE_WAIT newstate=TCP_LAST_ACK
> 

I'll send a formal patch if you all approve.

-- Steve

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 07cccca6cbf1..62e5bad7901f 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -9,21 +9,36 @@
 #include <linux/tracepoint.h>
 #include <net/ipv6.h>
 
+#define tcp_state_names				\
+	EM(TCP_ESTABLISHED)			\
+	EM(TCP_SYN_SENT)			\
+	EM(TCP_SYN_RECV)			\
+	EM(TCP_FIN_WAIT1)			\
+	EM(TCP_FIN_WAIT2)			\
+	EM(TCP_TIME_WAIT)			\
+	EM(TCP_CLOSE)				\
+	EM(TCP_CLOSE_WAIT)			\
+	EM(TCP_LAST_ACK)			\
+	EM(TCP_LISTEN)				\
+	EM(TCP_CLOSING)				\
+	EMe(TCP_NEW_SYN_RECV)
+
+/* enums need to be exported to user space */
+#undef EM
+#undef EMe
+#define EM(a)         TRACE_DEFINE_ENUM(a);
+#define EMe(a)        TRACE_DEFINE_ENUM(a);
+
+tcp_state_names
+
+#undef EM
+#undef EMe
+#define EM(a)         tcp_state_name(a),
+#define EMe(a)        tcp_state_name(a)
+
 #define tcp_state_name(state)	{ state, #state }
 #define show_tcp_state_name(val)			\
-	__print_symbolic(val,				\
-		tcp_state_name(TCP_ESTABLISHED),	\
-		tcp_state_name(TCP_SYN_SENT),		\
-		tcp_state_name(TCP_SYN_RECV),		\
-		tcp_state_name(TCP_FIN_WAIT1),		\
-		tcp_state_name(TCP_FIN_WAIT2),		\
-		tcp_state_name(TCP_TIME_WAIT),		\
-		tcp_state_name(TCP_CLOSE),		\
-		tcp_state_name(TCP_CLOSE_WAIT),		\
-		tcp_state_name(TCP_LAST_ACK),		\
-		tcp_state_name(TCP_LISTEN),		\
-		tcp_state_name(TCP_CLOSING),		\
-		tcp_state_name(TCP_NEW_SYN_RECV))
+	__print_symbolic(val, tcp_state_names)
 
 /*
  * tcp event with arguments sk and skb
diff mbox series

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 89974c5..a8336d3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -49,6 +49,7 @@ 
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/bpf-cgroup.h>
+#include <trace/events/tcp.h>
 
 extern struct inet_hashinfo tcp_hashinfo;
 
@@ -1284,6 +1285,21 @@  static inline bool tcp_checksum_complete(struct sk_buff *skb)
 #endif
 void tcp_set_state(struct sock *sk, int state);
 
+/*
+ * To trace TCP state transition.
+ */
+static inline void __tcp_set_state(struct sock *sk, int state)
+{
+	trace_tcp_set_state(sk, sk->sk_state, state);
+	sk->sk_state = state;
+}
+
+static inline void __sk_state_store(struct sock *sk, int newstate)
+{
+	trace_tcp_set_state(sk, sk->sk_state, newstate);
+	sk_state_store(sk, newstate);
+}
+
 void tcp_done(struct sock *sk);
 
 int tcp_abort(struct sock *sk, int err);
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
new file mode 100644
index 0000000..abf65af
--- /dev/null
+++ b/include/trace/events/tcp.h
@@ -0,0 +1,58 @@ 
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM tcp
+
+#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TCP_H
+
+#include <linux/tracepoint.h>
+#include <net/sock.h>
+#include <net/inet_timewait_sock.h>
+#include <net/request_sock.h>
+#include <net/inet_sock.h>
+#include <net/tcp_states.h>
+
+TRACE_EVENT(tcp_set_state,
+	TP_PROTO(struct sock *sk, int oldstate, int newstate),
+	TP_ARGS(sk, oldstate, newstate),
+
+	TP_STRUCT__entry(
+		__field(__be32, dst)
+		__field(__be32, src)
+		__field(__u16, dport)
+		__field(__u16, sport)
+		__field(int, oldstate)
+		__field(int, newstate)
+	),
+
+	TP_fast_assign(
+		if (oldstate == TCP_TIME_WAIT) {
+			__entry->dst = inet_twsk(sk)->tw_daddr;
+			__entry->src = inet_twsk(sk)->tw_rcv_saddr;
+			__entry->dport = ntohs(inet_twsk(sk)->tw_dport);
+			__entry->sport = ntohs(inet_twsk(sk)->tw_sport);
+		} else if (oldstate == TCP_NEW_SYN_RECV) {
+			__entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr;
+			__entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr;
+			__entry->dport =
+				ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port);
+			__entry->sport = inet_rsk(inet_reqsk(sk))->ir_num;
+		} else {
+			__entry->dst = inet_sk(sk)->inet_daddr;
+			__entry->src = inet_sk(sk)->inet_rcv_saddr;
+			__entry->dport = ntohs(inet_sk(sk)->inet_dport);
+			__entry->sport = ntohs(inet_sk(sk)->inet_sport);
+		}
+
+		__entry->oldstate = oldstate;
+		__entry->newstate = newstate;
+	),
+
+	TP_printk("%08X:%04X %08X:%04X, %02x %02x",
+		__entry->src, __entry->sport, __entry->dst, __entry->dport,
+		__entry->oldstate, __entry->newstate)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c039c93..307a046 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -27,6 +27,9 @@ 
 #include <net/sock_reuseport.h>
 #include <net/addrconf.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/tcp.h>
+
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
 EXPORT_SYMBOL(inet_csk_timer_bug_msg);
@@ -786,7 +789,7 @@  struct sock *inet_csk_clone_lock(const struct sock *sk,
 	if (newsk) {
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 
-		newsk->sk_state = TCP_SYN_RECV;
+		__tcp_set_state(newsk, TCP_SYN_RECV);
 		newicsk->icsk_bind_hash = NULL;
 
 		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
@@ -880,7 +883,7 @@  int inet_csk_listen_start(struct sock *sk, int backlog)
 	 * It is OK, because this socket enters to hash table only
 	 * after validation is complete.
 	 */
-	sk_state_store(sk, TCP_LISTEN);
+	__sk_state_store(sk, TCP_LISTEN);
 	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
 		inet->inet_sport = htons(inet->inet_num);
 
@@ -891,7 +894,7 @@  int inet_csk_listen_start(struct sock *sk, int backlog)
 			return 0;
 	}
 
-	sk->sk_state = TCP_CLOSE;
+	__tcp_set_state(sk, TCP_CLOSE);
 	return err;
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_start);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 597bb4c..0f45d456 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -430,7 +430,7 @@  bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	} else {
 		percpu_counter_inc(sk->sk_prot->orphan_count);
-		sk->sk_state = TCP_CLOSE;
+		__tcp_set_state(sk, TCP_CLOSE);
 		sock_set_flag(sk, SOCK_DEAD);
 		inet_csk_destroy_sock(sk);
 	}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5091402..984dce6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2040,7 +2040,7 @@  void tcp_set_state(struct sock *sk, int state)
 	/* Change state AFTER socket is unhashed to avoid closed
 	 * socket sitting in hash tables.
 	 */
-	sk_state_store(sk, state);
+	__sk_state_store(sk, state);
 
 #ifdef STATE_TRACE
 	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);