Message ID | 1510207298-14828-1-git-send-email-laoar.shao@gmail.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
Series | net/tcp: introduce TRACE_EVENT for TCP/IPv4 state transition | expand |
On Thu, Nov 09, 2017 at 02:01:38PM +0800, Yafang Shao wrote: > With this newly introduced TRACE_EVENT, it will be very easy to minotor > TCP/IPv4 state transition. > > A new TRACE_SYSTEM named tcp is added, in which we can trace other TCP > event as well. > > Two helpers are added, > static inline void __tcp_set_state(struct sock *sk, int state) > static inline void __sk_state_store(struct sock *sk, int newstate) > > When do TCP/IPv4 state transition, we should use these two helpers or > use tcp_set_state() instead of assign a value to sk_state directly. > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com> when you submit a patch pls make it clear which tree this patch is targeting. In this case it should have been net-next, but the patch clearly conflicts with it. Make sure to rebase. > +/* > + * To trace TCP state transition. > + */ > +static inline void __tcp_set_state(struct sock *sk, int state) > +{ > + trace_tcp_set_state(sk, sk->sk_state, state); > + sk->sk_state = state; > +} > + > +static inline void __sk_state_store(struct sock *sk, int newstate) > +{ > + trace_tcp_set_state(sk, sk->sk_state, newstate); > + sk_state_store(sk, newstate); > +} > + > void tcp_done(struct sock *sk); > > int tcp_abort(struct sock *sk, int err); > diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h > new file mode 100644 > index 0000000..abf65af > --- /dev/null > +++ b/include/trace/events/tcp.h > @@ -0,0 +1,58 @@ > +#undef TRACE_SYSTEM > +#define TRACE_SYSTEM tcp > + > +#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) > +#define _TRACE_TCP_H > + > +#include <linux/tracepoint.h> > +#include <net/sock.h> > +#include <net/inet_timewait_sock.h> > +#include <net/request_sock.h> > +#include <net/inet_sock.h> > +#include <net/tcp_states.h> > + > +TRACE_EVENT(tcp_set_state, > + TP_PROTO(struct sock *sk, int oldstate, int newstate), > + TP_ARGS(sk, oldstate, newstate), > + > + TP_STRUCT__entry( > + __field(__be32, dst) > + __field(__be32, src) > + __field(__u16, dport) > + __field(__u16, sport) > + __field(int, oldstate) > + __field(int, newstate) > + ), > + > + TP_fast_assign( > + if (oldstate == TCP_TIME_WAIT) { > + __entry->dst = inet_twsk(sk)->tw_daddr; > + __entry->src = inet_twsk(sk)->tw_rcv_saddr; > + __entry->dport = ntohs(inet_twsk(sk)->tw_dport); > + __entry->sport = ntohs(inet_twsk(sk)->tw_sport); > + } else if (oldstate == TCP_NEW_SYN_RECV) { > + __entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr; > + __entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr; > + __entry->dport = > + ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port); > + __entry->sport = inet_rsk(inet_reqsk(sk))->ir_num; > + } else { > + __entry->dst = inet_sk(sk)->inet_daddr; > + __entry->src = inet_sk(sk)->inet_rcv_saddr; > + __entry->dport = ntohs(inet_sk(sk)->inet_dport); > + __entry->sport = ntohs(inet_sk(sk)->inet_sport); > + } > + > + __entry->oldstate = oldstate; > + __entry->newstate = newstate; > + ), > + > + TP_printk("%08X:%04X %08X:%04X, %02x %02x", > + __entry->src, __entry->sport, __entry->dst, __entry->dport, > + __entry->oldstate, __entry->newstate) direct %x of state is not allowed. This has to use show_tcp_state_name() like it's done in trace_tcp_set_state Also I'm missing the reason to introduce another tracepoint that looks just like trace_tcp_set_state.
2017-11-09 14:43 GMT+08:00 Alexei Starovoitov <alexei.starovoitov@gmail.com>: > On Thu, Nov 09, 2017 at 02:01:38PM +0800, Yafang Shao wrote: >> With this newly introduced TRACE_EVENT, it will be very easy to minotor >> TCP/IPv4 state transition. >> >> A new TRACE_SYSTEM named tcp is added, in which we can trace other TCP >> event as well. >> >> Two helpers are added, >> static inline void __tcp_set_state(struct sock *sk, int state) >> static inline void __sk_state_store(struct sock *sk, int newstate) >> >> When do TCP/IPv4 state transition, we should use these two helpers or >> use tcp_set_state() instead of assign a value to sk_state directly. >> >> Signed-off-by: Yafang Shao <laoar.shao@gmail.com> > > when you submit a patch pls make it clear which tree this patch is targeting. > In this case it should have been net-next, > but the patch clearly conflicts with it. > Make sure to rebase. I do it based on this tree git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git OK I will check the net-next then. > >> +/* >> + * To trace TCP state transition. >> + */ >> +static inline void __tcp_set_state(struct sock *sk, int state) >> +{ >> + trace_tcp_set_state(sk, sk->sk_state, state); >> + sk->sk_state = state; >> +} >> + >> +static inline void __sk_state_store(struct sock *sk, int newstate) >> +{ >> + trace_tcp_set_state(sk, sk->sk_state, newstate); >> + sk_state_store(sk, newstate); >> +} >> + >> void tcp_done(struct sock *sk); >> >> int tcp_abort(struct sock *sk, int err); >> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h >> new file mode 100644 >> index 0000000..abf65af >> --- /dev/null >> +++ b/include/trace/events/tcp.h >> @@ -0,0 +1,58 @@ >> +#undef TRACE_SYSTEM >> +#define TRACE_SYSTEM tcp >> + >> +#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) >> +#define _TRACE_TCP_H >> + >> +#include <linux/tracepoint.h> >> +#include <net/sock.h> >> +#include <net/inet_timewait_sock.h> >> +#include <net/request_sock.h> >> +#include <net/inet_sock.h> >> +#include <net/tcp_states.h> >> + >> +TRACE_EVENT(tcp_set_state, >> + TP_PROTO(struct sock *sk, int oldstate, int newstate), >> + TP_ARGS(sk, oldstate, newstate), >> + >> + TP_STRUCT__entry( >> + __field(__be32, dst) >> + __field(__be32, src) >> + __field(__u16, dport) >> + __field(__u16, sport) >> + __field(int, oldstate) >> + __field(int, newstate) >> + ), >> + >> + TP_fast_assign( >> + if (oldstate == TCP_TIME_WAIT) { >> + __entry->dst = inet_twsk(sk)->tw_daddr; >> + __entry->src = inet_twsk(sk)->tw_rcv_saddr; >> + __entry->dport = ntohs(inet_twsk(sk)->tw_dport); >> + __entry->sport = ntohs(inet_twsk(sk)->tw_sport); >> + } else if (oldstate == TCP_NEW_SYN_RECV) { >> + __entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr; >> + __entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr; >> + __entry->dport = >> + ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port); >> + __entry->sport = inet_rsk(inet_reqsk(sk))->ir_num; >> + } else { >> + __entry->dst = inet_sk(sk)->inet_daddr; >> + __entry->src = inet_sk(sk)->inet_rcv_saddr; >> + __entry->dport = ntohs(inet_sk(sk)->inet_dport); >> + __entry->sport = ntohs(inet_sk(sk)->inet_sport); >> + } >> + >> + __entry->oldstate = oldstate; >> + __entry->newstate = newstate; >> + ), >> + >> + TP_printk("%08X:%04X %08X:%04X, %02x %02x", >> + __entry->src, __entry->sport, __entry->dst, __entry->dport, >> + __entry->oldstate, __entry->newstate) > > direct %x of state is not allowed. > This has to use show_tcp_state_name() like it's done in trace_tcp_set_state > > Also I'm missing the reason to introduce another tracepoint > that looks just like trace_tcp_set_state. > I will check net-next Thanks Yafang
On Thu, 9 Nov 2017 15:43:29 +0900 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > +TRACE_EVENT(tcp_set_state, > > + TP_PROTO(struct sock *sk, int oldstate, int newstate), > > + TP_ARGS(sk, oldstate, newstate), > > + > > + TP_STRUCT__entry( > > + __field(__be32, dst) > > + __field(__be32, src) > > + __field(__u16, dport) > > + __field(__u16, sport) > > + __field(int, oldstate) > > + __field(int, newstate) > > + ), > > + > > + TP_fast_assign( > > + if (oldstate == TCP_TIME_WAIT) { > > + __entry->dst = inet_twsk(sk)->tw_daddr; > > + __entry->src = inet_twsk(sk)->tw_rcv_saddr; > > + __entry->dport = ntohs(inet_twsk(sk)->tw_dport); > > + __entry->sport = ntohs(inet_twsk(sk)->tw_sport); > > + } else if (oldstate == TCP_NEW_SYN_RECV) { > > + __entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr; > > + __entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr; > > + __entry->dport = > > + ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port); > > + __entry->sport = inet_rsk(inet_reqsk(sk))->ir_num; > > + } else { > > + __entry->dst = inet_sk(sk)->inet_daddr; > > + __entry->src = inet_sk(sk)->inet_rcv_saddr; > > + __entry->dport = ntohs(inet_sk(sk)->inet_dport); > > + __entry->sport = ntohs(inet_sk(sk)->inet_sport); > > + } > > + > > + __entry->oldstate = oldstate; > > + __entry->newstate = newstate; > > + ), > > + > > + TP_printk("%08X:%04X %08X:%04X, %02x %02x", > > + __entry->src, __entry->sport, __entry->dst, __entry->dport, > > + __entry->oldstate, __entry->newstate) > > direct %x of state is not allowed. > This has to use show_tcp_state_name() like it's done in trace_tcp_set_state Hmm, I need to look at trace_tcp_set_state. I'm guessing it is in net-next too? > > Also I'm missing the reason to introduce another tracepoint > that looks just like trace_tcp_set_state. I want to make sure that perf and trace-cmd can parse the TP_printk(), if it is having helper functions like that in the TP_printk() part, then the libtraceevent needs to be aware of that. -- Steve
> On Nov 9, 2017, at 10:18 AM, Steven Rostedt <rostedt@goodmis.org> wrote: > > On Thu, 9 Nov 2017 15:43:29 +0900 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > >>> +TRACE_EVENT(tcp_set_state, >>> + TP_PROTO(struct sock *sk, int oldstate, int newstate), >>> + TP_ARGS(sk, oldstate, newstate), >>> + >>> + TP_STRUCT__entry( >>> + __field(__be32, dst) >>> + __field(__be32, src) >>> + __field(__u16, dport) >>> + __field(__u16, sport) >>> + __field(int, oldstate) >>> + __field(int, newstate) >>> + ), >>> + >>> + TP_fast_assign( >>> + if (oldstate == TCP_TIME_WAIT) { >>> + __entry->dst = inet_twsk(sk)->tw_daddr; >>> + __entry->src = inet_twsk(sk)->tw_rcv_saddr; >>> + __entry->dport = ntohs(inet_twsk(sk)->tw_dport); >>> + __entry->sport = ntohs(inet_twsk(sk)->tw_sport); >>> + } else if (oldstate == TCP_NEW_SYN_RECV) { >>> + __entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr; >>> + __entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr; >>> + __entry->dport = >>> + ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port); >>> + __entry->sport = inet_rsk(inet_reqsk(sk))->ir_num; >>> + } else { >>> + __entry->dst = inet_sk(sk)->inet_daddr; >>> + __entry->src = inet_sk(sk)->inet_rcv_saddr; >>> + __entry->dport = ntohs(inet_sk(sk)->inet_dport); >>> + __entry->sport = ntohs(inet_sk(sk)->inet_sport); >>> + } >>> + >>> + __entry->oldstate = oldstate; >>> + __entry->newstate = newstate; >>> + ), >>> + >>> + TP_printk("%08X:%04X %08X:%04X, %02x %02x", >>> + __entry->src, __entry->sport, __entry->dst, __entry->dport, >>> + __entry->oldstate, __entry->newstate) >> >> direct %x of state is not allowed. >> This has to use show_tcp_state_name() like it's done in trace_tcp_set_state > > Hmm, I need to look at trace_tcp_set_state. I'm guessing it is in > net-next too? Yes, in net-next. There are 6 tracepoints under tcp group: tcp_destroy_sock tcp_receive_reset tcp_retransmit_skb tcp_retransmit_synack tcp_send_reset tcp_set_state They are all added recently. > >> >> Also I'm missing the reason to introduce another tracepoint >> that looks just like trace_tcp_set_state. > > I want to make sure that perf and trace-cmd can parse the TP_printk(), > if it is having helper functions like that in the TP_printk() part, > then the libtraceevent needs to be aware of that. > tcp_set_state uses __print_symbolic to show state in text format. I found trace-cmd cannot parse that part: [011] 147338.660560: tcp_set_state: sport=16262 dport=48346 \ saddr=127.0.0.6 daddr=127.0.0.6 saddrv6=2401:db00:30:317e:face:0:1f:0 \ daddrv6=2401:db00:30:31e5:face:0:7f:0 oldstate= newstate= Other parts of the output looks good to me. Thanks, Song
> On Nov 9, 2017, at 10:34 AM, Song Liu <songliubraving@fb.com> wrote: > >> >> On Nov 9, 2017, at 10:18 AM, Steven Rostedt <rostedt@goodmis.org> wrote: >> >> On Thu, 9 Nov 2017 15:43:29 +0900 >> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: >> >>>> +TRACE_EVENT(tcp_set_state, >>>> + TP_PROTO(struct sock *sk, int oldstate, int newstate), >>>> + TP_ARGS(sk, oldstate, newstate), >>>> + >>>> + TP_STRUCT__entry( >>>> + __field(__be32, dst) >>>> + __field(__be32, src) >>>> + __field(__u16, dport) >>>> + __field(__u16, sport) >>>> + __field(int, oldstate) >>>> + __field(int, newstate) >>>> + ), >>>> + >>>> + TP_fast_assign( >>>> + if (oldstate == TCP_TIME_WAIT) { >>>> + __entry->dst = inet_twsk(sk)->tw_daddr; >>>> + __entry->src = inet_twsk(sk)->tw_rcv_saddr; >>>> + __entry->dport = ntohs(inet_twsk(sk)->tw_dport); >>>> + __entry->sport = ntohs(inet_twsk(sk)->tw_sport); >>>> + } else if (oldstate == TCP_NEW_SYN_RECV) { >>>> + __entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr; >>>> + __entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr; >>>> + __entry->dport = >>>> + ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port); >>>> + __entry->sport = inet_rsk(inet_reqsk(sk))->ir_num; >>>> + } else { >>>> + __entry->dst = inet_sk(sk)->inet_daddr; >>>> + __entry->src = inet_sk(sk)->inet_rcv_saddr; >>>> + __entry->dport = ntohs(inet_sk(sk)->inet_dport); >>>> + __entry->sport = ntohs(inet_sk(sk)->inet_sport); >>>> + } >>>> + >>>> + __entry->oldstate = oldstate; >>>> + __entry->newstate = newstate; >>>> + ), >>>> + >>>> + TP_printk("%08X:%04X %08X:%04X, %02x %02x", >>>> + __entry->src, __entry->sport, __entry->dst, __entry->dport, >>>> + __entry->oldstate, __entry->newstate) >>> >>> direct %x of state is not allowed. >>> This has to use show_tcp_state_name() like it's done in trace_tcp_set_state >> >> Hmm, I need to look at trace_tcp_set_state. I'm guessing it is in >> net-next too? > > Yes, in net-next. There are 6 tracepoints under tcp group: > > tcp_destroy_sock > tcp_receive_reset > tcp_retransmit_skb > tcp_retransmit_synack > tcp_send_reset > tcp_set_state > > They are all added recently. > >> >>> >>> Also I'm missing the reason to introduce another tracepoint >>> that looks just like trace_tcp_set_state. >> >> I want to make sure that perf and trace-cmd can parse the TP_printk(), >> if it is having helper functions like that in the TP_printk() part, >> then the libtraceevent needs to be aware of that. >> > > tcp_set_state uses __print_symbolic to show state in text format. I found > trace-cmd cannot parse that part: > > [011] 147338.660560: tcp_set_state: sport=16262 dport=48346 \ > saddr=127.0.0.6 daddr=127.0.0.6 saddrv6=2401:db00:30:317e:face:0:1f:0 \ > daddrv6=2401:db00:30:31e5:face:0:7f:0 oldstate= newstate= > > Other parts of the output looks good to me. > > Thanks, > Song I am not sure whether this is the best approach, but the following patch fixes the output of perf: 0.44% sport=16262 dport=39362 saddr=127.0.0.6 daddr=127.0.0.6 \ saddrv6=2401:db00:30:317e:face:0:1f:0 daddrv6=2401:db00:30:31e5:face:0:7f:0 \ oldstate=TCP_CLOSE_WAIT newstate=TCP_LAST_ACK Thanks, Song From 4b7e27631a4c7df96a38223a95ee3ede2f5f2d19 Mon Sep 17 00:00:00 2001 From: Song Liu <songliubraving@fb.com> Date: Thu, 9 Nov 2017 15:30:07 -0800 Subject: [PATCH] libtraceevent: add flags for tcp state names Names of TCP states are added to flags in event-parse.c. The names are used to print symbolic names in tracepoint: tcp/tcp_set_state. Signed-off-by: Song Liu <songliubraving@fb.com> --- tools/lib/traceevent/event-parse.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 7ce724f..4972dc2 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -3790,6 +3790,20 @@ static const struct flag flags[] = { { "HRTIMER_NORESTART", 0 }, { "HRTIMER_RESTART", 1 }, + + /* tcp state names, see include/net/tcp_states.h */ + { "TCP_ESTABLISHED", 1 }, + { "TCP_SYN_SENT", 2 }, + { "TCP_SYN_RECV", 3 }, + { "TCP_FIN_WAIT1", 4 }, + { "TCP_FIN_WAIT2", 5 }, + { "TCP_TIME_WAIT", 6 }, + { "TCP_CLOSE", 7 }, + { "TCP_CLOSE_WAIT", 8 }, + { "TCP_LAST_ACK", 9 }, + { "TCP_LISTEN", 10 }, + { "TCP_CLOSING", 11 }, + { "TCP_NEW_SYN_RECV", 12 }, }; static long long eval_flag(const char *flag) -- 2.9.5
On Thu, 9 Nov 2017 23:40:13 +0000 Song Liu <songliubraving@fb.com> wrote: > > tcp_set_state uses __print_symbolic to show state in text format. I found > > trace-cmd cannot parse that part: > > > > [011] 147338.660560: tcp_set_state: sport=16262 dport=48346 \ > > saddr=127.0.0.6 daddr=127.0.0.6 saddrv6=2401:db00:30:317e:face:0:1f:0 \ > > daddrv6=2401:db00:30:31e5:face:0:7f:0 oldstate= newstate= The latest trace-cmd does show oldstate=0xa newstate=0x7, since I fixed it so undefined symbols and flags are displayed. > > > > Other parts of the output looks good to me. > > > > Thanks, > > Song > > I am not sure whether this is the best approach, but the following patch > fixes the output of perf: No it's not the best approach. But the below patch is ;-) > > 0.44% sport=16262 dport=39362 saddr=127.0.0.6 daddr=127.0.0.6 \ > saddrv6=2401:db00:30:317e:face:0:1f:0 daddrv6=2401:db00:30:31e5:face:0:7f:0 \ > oldstate=TCP_CLOSE_WAIT newstate=TCP_LAST_ACK > I'll send a formal patch if you all approve. -- Steve diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 07cccca6cbf1..62e5bad7901f 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -9,21 +9,36 @@ #include <linux/tracepoint.h> #include <net/ipv6.h> +#define tcp_state_names \ + EM(TCP_ESTABLISHED) \ + EM(TCP_SYN_SENT) \ + EM(TCP_SYN_RECV) \ + EM(TCP_FIN_WAIT1) \ + EM(TCP_FIN_WAIT2) \ + EM(TCP_TIME_WAIT) \ + EM(TCP_CLOSE) \ + EM(TCP_CLOSE_WAIT) \ + EM(TCP_LAST_ACK) \ + EM(TCP_LISTEN) \ + EM(TCP_CLOSING) \ + EMe(TCP_NEW_SYN_RECV) + +/* enums need to be exported to user space */ +#undef EM +#undef EMe +#define EM(a) TRACE_DEFINE_ENUM(a); +#define EMe(a) TRACE_DEFINE_ENUM(a); + +tcp_state_names + +#undef EM +#undef EMe +#define EM(a) tcp_state_name(a), +#define EMe(a) tcp_state_name(a) + #define tcp_state_name(state) { state, #state } #define show_tcp_state_name(val) \ - __print_symbolic(val, \ - tcp_state_name(TCP_ESTABLISHED), \ - tcp_state_name(TCP_SYN_SENT), \ - tcp_state_name(TCP_SYN_RECV), \ - tcp_state_name(TCP_FIN_WAIT1), \ - tcp_state_name(TCP_FIN_WAIT2), \ - tcp_state_name(TCP_TIME_WAIT), \ - tcp_state_name(TCP_CLOSE), \ - tcp_state_name(TCP_CLOSE_WAIT), \ - tcp_state_name(TCP_LAST_ACK), \ - tcp_state_name(TCP_LISTEN), \ - tcp_state_name(TCP_CLOSING), \ - tcp_state_name(TCP_NEW_SYN_RECV)) + __print_symbolic(val, tcp_state_names) /* * tcp event with arguments sk and skb
diff --git a/include/net/tcp.h b/include/net/tcp.h index 89974c5..a8336d3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -49,6 +49,7 @@ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/bpf-cgroup.h> +#include <trace/events/tcp.h> extern struct inet_hashinfo tcp_hashinfo; @@ -1284,6 +1285,21 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) #endif void tcp_set_state(struct sock *sk, int state); +/* + * To trace TCP state transition. + */ +static inline void __tcp_set_state(struct sock *sk, int state) +{ + trace_tcp_set_state(sk, sk->sk_state, state); + sk->sk_state = state; +} + +static inline void __sk_state_store(struct sock *sk, int newstate) +{ + trace_tcp_set_state(sk, sk->sk_state, newstate); + sk_state_store(sk, newstate); +} + void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h new file mode 100644 index 0000000..abf65af --- /dev/null +++ b/include/trace/events/tcp.h @@ -0,0 +1,58 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM tcp + +#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TCP_H + +#include <linux/tracepoint.h> +#include <net/sock.h> +#include <net/inet_timewait_sock.h> +#include <net/request_sock.h> +#include <net/inet_sock.h> +#include <net/tcp_states.h> + +TRACE_EVENT(tcp_set_state, + TP_PROTO(struct sock *sk, int oldstate, int newstate), + TP_ARGS(sk, oldstate, newstate), + + TP_STRUCT__entry( + __field(__be32, dst) + __field(__be32, src) + __field(__u16, dport) + __field(__u16, sport) + __field(int, oldstate) + __field(int, newstate) + ), + + TP_fast_assign( + if (oldstate == TCP_TIME_WAIT) { + __entry->dst = inet_twsk(sk)->tw_daddr; + __entry->src = inet_twsk(sk)->tw_rcv_saddr; + __entry->dport = ntohs(inet_twsk(sk)->tw_dport); + __entry->sport = ntohs(inet_twsk(sk)->tw_sport); + } else if (oldstate == TCP_NEW_SYN_RECV) { + __entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr; + __entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr; + __entry->dport = + ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port); + __entry->sport = inet_rsk(inet_reqsk(sk))->ir_num; + } else { + __entry->dst = inet_sk(sk)->inet_daddr; + __entry->src = inet_sk(sk)->inet_rcv_saddr; + __entry->dport = ntohs(inet_sk(sk)->inet_dport); + __entry->sport = ntohs(inet_sk(sk)->inet_sport); + } + + __entry->oldstate = oldstate; + __entry->newstate = newstate; + ), + + TP_printk("%08X:%04X %08X:%04X, %02x %02x", + __entry->src, __entry->sport, __entry->dst, __entry->dport, + __entry->oldstate, __entry->newstate) +); + +#endif + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index c039c93..307a046 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -27,6 +27,9 @@ #include <net/sock_reuseport.h> #include <net/addrconf.h> +#define CREATE_TRACE_POINTS +#include <trace/events/tcp.h> + #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); @@ -786,7 +789,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); - newsk->sk_state = TCP_SYN_RECV; + __tcp_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; @@ -880,7 +883,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) * It is OK, because this socket enters to hash table only * after validation is complete. */ - sk_state_store(sk, TCP_LISTEN); + __sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); @@ -891,7 +894,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) return 0; } - sk->sk_state = TCP_CLOSE; + __tcp_set_state(sk, TCP_CLOSE); return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 597bb4c..0f45d456 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -430,7 +430,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { percpu_counter_inc(sk->sk_prot->orphan_count); - sk->sk_state = TCP_CLOSE; + __tcp_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402..984dce6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2040,7 +2040,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk_state_store(sk, state); + __sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
With this newly introduced TRACE_EVENT, it will be very easy to minotor TCP/IPv4 state transition. A new TRACE_SYSTEM named tcp is added, in which we can trace other TCP event as well. Two helpers are added, static inline void __tcp_set_state(struct sock *sk, int state) static inline void __sk_state_store(struct sock *sk, int newstate) When do TCP/IPv4 state transition, we should use these two helpers or use tcp_set_state() instead of assign a value to sk_state directly. Signed-off-by: Yafang Shao <laoar.shao@gmail.com> --- include/net/tcp.h | 16 ++++++++++++ include/trace/events/tcp.h | 58 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/inet_connection_sock.c | 9 ++++--- net/ipv4/inet_hashtables.c | 2 +- net/ipv4/tcp.c | 2 +- 5 files changed, 82 insertions(+), 5 deletions(-) create mode 100644 include/trace/events/tcp.h