diff mbox series

[net-next] tcp: add tracepoint trace_tcp_retransmit_synack()

Message ID 20171024235709.2186047-1-songliubraving@fb.com
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series [net-next] tcp: add tracepoint trace_tcp_retransmit_synack() | expand

Commit Message

Song Liu Oct. 24, 2017, 11:57 p.m. UTC
This tracepoint can be used to trace synack retransmits. It maintains
a pointer to struct request_sock.

We cannot simply reuse trace_tcp_retransmit_skb() here, because the
sk here is the LISTEN socket. The IP addresses and ports should be
extracted from struct request_sock.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/trace/events/tcp.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_output.c      |  1 +
 2 files changed, 57 insertions(+)

Comments

Song Liu Oct. 25, 2017, 4:17 p.m. UTC | #1
CCing the key audience of the patch.

Thanks,
Song

> On Oct 24, 2017, at 4:57 PM, Song Liu <songliubraving@fb.com> wrote:
> 
> This tracepoint can be used to trace synack retransmits. It maintains
> pointer to struct request_sock.
> 
> We cannot simply reuse trace_tcp_retransmit_skb() here, because the
> sk here is the LISTEN socket. The IP addresses and ports should be
> extracted from struct request_sock.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
> include/trace/events/tcp.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++
> net/ipv4/tcp_output.c      |  1 +
> 2 files changed, 57 insertions(+)
> 
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index 03699ba..07cccca 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -237,6 +237,62 @@ TRACE_EVENT(tcp_set_state,
> 		  show_tcp_state_name(__entry->newstate))
> );
> 
> +TRACE_EVENT(tcp_retransmit_synack,
> +
> +	TP_PROTO(const struct sock *sk, const struct request_sock *req),
> +
> +	TP_ARGS(sk, req),
> +
> +	TP_STRUCT__entry(
> +		__field(const void *, skaddr)
> +		__field(const void *, req)
> +		__field(__u16, sport)
> +		__field(__u16, dport)
> +		__array(__u8, saddr, 4)
> +		__array(__u8, daddr, 4)
> +		__array(__u8, saddr_v6, 16)
> +		__array(__u8, daddr_v6, 16)
> +	),
> +
> +	TP_fast_assign(
> +		struct inet_request_sock *ireq = inet_rsk(req);
> +		struct in6_addr *pin6;
> +		__be32 *p32;
> +
> +		__entry->skaddr = sk;
> +		__entry->req = req;
> +
> +		__entry->sport = ireq->ir_num;
> +		__entry->dport = ntohs(ireq->ir_rmt_port);
> +
> +		p32 = (__be32 *) __entry->saddr;
> +		*p32 = ireq->ir_loc_addr;
> +
> +		p32 = (__be32 *) __entry->daddr;
> +		*p32 = ireq->ir_rmt_addr;
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +		if (sk->sk_family == AF_INET6) {
> +			pin6 = (struct in6_addr *)__entry->saddr_v6;
> +			*pin6 = ireq->ir_v6_loc_addr;
> +			pin6 = (struct in6_addr *)__entry->daddr_v6;
> +			*pin6 = ireq->ir_v6_rmt_addr;
> +		} else
> +#endif
> +		{
> +			pin6 = (struct in6_addr *)__entry->saddr_v6;
> +			ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6);
> +			pin6 = (struct in6_addr *)__entry->daddr_v6;
> +			ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6);
> +		}
> +	),
> +
> +	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
> +		  __entry->sport, __entry->dport,
> +		  __entry->saddr, __entry->daddr,
> +		  __entry->saddr_v6, __entry->daddr_v6)
> +);
> +
> #endif /* _TRACE_TCP_H */
> 
> /* This part must be outside protection */
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 1f01f4c..6a728a5 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -3735,6 +3735,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
> 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
> 		if (unlikely(tcp_passive_fastopen(sk)))
> 			tcp_sk(sk)->total_retrans++;
> +		trace_tcp_retransmit_synack(sk, req);
> 	}
> 	return res;
> }
> -- 
> 2.9.5
>
kernel test robot Oct. 26, 2017, 3:13 a.m. UTC | #2
Hi Song,

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Song-Liu/tcp-add-tracepoint-trace_tcp_retransmit_synack/20171026-010651
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)


vim +281 include/trace/events/tcp.h

   241	
   242		TP_PROTO(const struct sock *sk, const struct request_sock *req),
   243	
   244		TP_ARGS(sk, req),
   245	
   246		TP_STRUCT__entry(
   247			__field(const void *, skaddr)
   248			__field(const void *, req)
   249			__field(__u16, sport)
   250			__field(__u16, dport)
   251			__array(__u8, saddr, 4)
   252			__array(__u8, daddr, 4)
   253			__array(__u8, saddr_v6, 16)
   254			__array(__u8, daddr_v6, 16)
   255		),
   256	
   257		TP_fast_assign(
   258			struct inet_request_sock *ireq = inet_rsk(req);
   259			struct in6_addr *pin6;
   260			__be32 *p32;
   261	
   262			__entry->skaddr = sk;
   263			__entry->req = req;
   264	
   265			__entry->sport = ireq->ir_num;
   266			__entry->dport = ntohs(ireq->ir_rmt_port);
   267	
   268			p32 = (__be32 *) __entry->saddr;
   269			*p32 = ireq->ir_loc_addr;
   270	
   271			p32 = (__be32 *) __entry->daddr;
   272			*p32 = ireq->ir_rmt_addr;
   273	
 > 274	#if IS_ENABLED(CONFIG_IPV6)
   275			if (sk->sk_family == AF_INET6) {
   276				pin6 = (struct in6_addr *)__entry->saddr_v6;
   277				*pin6 = ireq->ir_v6_loc_addr;
   278				pin6 = (struct in6_addr *)__entry->daddr_v6;
   279				*pin6 = ireq->ir_v6_rmt_addr;
   280			} else
 > 281	#endif
   282			{
   283				pin6 = (struct in6_addr *)__entry->saddr_v6;
   284				ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6);
   285				pin6 = (struct in6_addr *)__entry->daddr_v6;
   286				ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6);
   287			}
   288		),
   289	
   290		TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
   291			  __entry->sport, __entry->dport,
   292			  __entry->saddr, __entry->daddr,
   293			  __entry->saddr_v6, __entry->daddr_v6)
   294	);
   295	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Song Liu Oct. 26, 2017, 11:50 p.m. UTC | #3
> On Oct 25, 2017, at 8:13 PM, kbuild test robot <lkp@intel.com> wrote:
> 
> Hi Song,
> 
> [auto build test WARNING on net-next/master]
> 
> url:    https://github.com/0day-ci/linux/commits/Song-Liu/tcp-add-tracepoint-trace_tcp_retransmit_synack/20171026-010651
> reproduce:
>        # apt-get install sparse
>        make ARCH=x86_64 allmodconfig
>        make C=1 CF=-D__CHECK_ENDIAN__
> 
> 
> sparse warnings: (new ones prefixed by >>)
> 
> 
> vim +281 include/trace/events/tcp.h
> 
>   241	
>   242		TP_PROTO(const struct sock *sk, const struct request_sock *req),
>   243	
>   244		TP_ARGS(sk, req),
>   245	
>   246		TP_STRUCT__entry(
>   247			__field(const void *, skaddr)
>   248			__field(const void *, req)
>   249			__field(__u16, sport)
>   250			__field(__u16, dport)
>   251			__array(__u8, saddr, 4)
>   252			__array(__u8, daddr, 4)
>   253			__array(__u8, saddr_v6, 16)
>   254			__array(__u8, daddr_v6, 16)
>   255		),
>   256	
>   257		TP_fast_assign(
>   258			struct inet_request_sock *ireq = inet_rsk(req);
>   259			struct in6_addr *pin6;
>   260			__be32 *p32;
>   261	
>   262			__entry->skaddr = sk;
>   263			__entry->req = req;
>   264	
>   265			__entry->sport = ireq->ir_num;
>   266			__entry->dport = ntohs(ireq->ir_rmt_port);
>   267	
>   268			p32 = (__be32 *) __entry->saddr;
>   269			*p32 = ireq->ir_loc_addr;
>   270	
>   271			p32 = (__be32 *) __entry->daddr;
>   272			*p32 = ireq->ir_rmt_addr;
>   273	
>> 274	#if IS_ENABLED(CONFIG_IPV6)
>   275			if (sk->sk_family == AF_INET6) {
>   276				pin6 = (struct in6_addr *)__entry->saddr_v6;
>   277				*pin6 = ireq->ir_v6_loc_addr;
>   278				pin6 = (struct in6_addr *)__entry->daddr_v6;
>   279				*pin6 = ireq->ir_v6_rmt_addr;
>   280			} else
>> 281	#endif

In this case, we are putting CONFIG_IPV6 in TRACE_EVENT macro, which generates
warnings like:

./include/trace/events/tcp.h:274:1: error: directive in argument list
./include/trace/events/tcp.h:281:1: error: directive in argument list

It seems these warnings cannot be easily avoided. This is also the same pattern we
have been using in include/trace/events/tcp.h.

Any suggestions on how we should proceed from here?

Thanks,
Song
Cong Wang Oct. 27, 2017, 2:01 a.m. UTC | #4
On Thu, Oct 26, 2017 at 4:50 PM, Song Liu <songliubraving@fb.com> wrote:
> In this case, we are putting CONFIG_IPV6 in TRACE_EVENT macro, which generates
> warnings like:
>
> ./include/trace/events/tcp.h:274:1: error: directive in argument list
> ./include/trace/events/tcp.h:281:1: error: directive in argument list
>
> Seems these warning cannot be easily avoided. This is also the same pattern we
> have been using in include/trace/events/tcp.h.

Hmm, we use the same pattern elsewhere, so why does it only complain about this one?

>
> Any suggestions on how shall we proceed from here?
>

I think this warning is harmless, so it is perhaps not worth the time to
shut it up, unless sparse provides a simple way to do so.
Song Liu Oct. 27, 2017, 5:06 a.m. UTC | #5
> On Oct 26, 2017, at 7:01 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Thu, Oct 26, 2017 at 4:50 PM, Song Liu <songliubraving@fb.com> wrote:
>> In this case, we are putting CONFIG_IPV6 in TRACE_EVENT macro, which generates
>> warnings like:
>> 
>> ./include/trace/events/tcp.h:274:1: error: directive in argument list
>> ./include/trace/events/tcp.h:281:1: error: directive in argument list
>> 
>> Seems these warning cannot be easily avoided. This is also the same pattern we
>> have been using in include/trace/events/tcp.h.
> 
> Hmm, we use the same so why it only complains about this one?\

sparse reports the same warning for all the lines in tcp.h. I don't know why
the kbuild test bot only complains about this patch.

> 
>> 
>> Any suggestions on how shall we proceed from here?
>> 
> 
> I think this warning is harmless, so perhaps not worthy time to
> shut it up, unless sparse provides a simple way to do so.
Alexei Starovoitov Oct. 27, 2017, 7:57 p.m. UTC | #6
On 10/26/17 10:06 PM, Song Liu wrote:
>
>> On Oct 26, 2017, at 7:01 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>
>> On Thu, Oct 26, 2017 at 4:50 PM, Song Liu <songliubraving@fb.com> wrote:
>>> In this case, we are putting CONFIG_IPV6 in TRACE_EVENT macro, which generates
>>> warnings like:
>>>
>>> ./include/trace/events/tcp.h:274:1: error: directive in argument list
>>> ./include/trace/events/tcp.h:281:1: error: directive in argument list
>>>
>>> Seems these warning cannot be easily avoided. This is also the same pattern we
>>> have been using in include/trace/events/tcp.h.
>>
>> Hmm, we use the same so why it only complains about this one?\
>
> sparse reports same warning for all the lines in tcp.h. Don't know why
> kbuild test bot only complains about this patch.

It's safe to ignore this sparse error.
The #ifdefs inside TP_fast_assign() in net.h have been there for years
without issues.
$ make C=2 net/core/dev.o
../include/trace/events/net.h:170:1: error: directive in argument list
../include/trace/events/net.h:172:1: error: directive in argument list
../include/trace/events/net.h:174:1: error: directive in argument list
Alban Crequy Oct. 27, 2017, 8:38 p.m. UTC | #7
Hi,

On 25 October 2017 at 01:57, Song Liu <songliubraving@fb.com> wrote:
> This tracepoint can be used to trace synack retransmits. It maintains
> pointer to struct request_sock.
>
> We cannot simply reuse trace_tcp_retransmit_skb() here, because the
> sk here is the LISTEN socket. The IP addresses and ports should be
> extracted from struct request_sock.
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  include/trace/events/tcp.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++
>  net/ipv4/tcp_output.c      |  1 +
>  2 files changed, 57 insertions(+)
>
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index 03699ba..07cccca 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -237,6 +237,62 @@ TRACE_EVENT(tcp_set_state,
>                   show_tcp_state_name(__entry->newstate))
>  );
>
> +TRACE_EVENT(tcp_retransmit_synack,
> +
> +       TP_PROTO(const struct sock *sk, const struct request_sock *req),
> +
> +       TP_ARGS(sk, req),
> +
> +       TP_STRUCT__entry(
> +               __field(const void *, skaddr)
> +               __field(const void *, req)
> +               __field(__u16, sport)
> +               __field(__u16, dport)
> +               __array(__u8, saddr, 4)
> +               __array(__u8, daddr, 4)
> +               __array(__u8, saddr_v6, 16)
> +               __array(__u8, daddr_v6, 16)

Would it make sense to add the inode of the network namespace that
owns the socket? (along with the major/minor of the nsfs)

If the kernel later gains tracepoints for TCP connect, accept, close
including the netns ino, then I might be able to replace some
ebpf-kprobes code by ebpf-tracepoints code :)

> [...]

Thanks,
Alban
Alexei Starovoitov Oct. 27, 2017, 8:58 p.m. UTC | #8
On 10/27/17 1:38 PM, Alban Crequy wrote:
> Hi,
>
> On 25 October 2017 at 01:57, Song Liu <songliubraving@fb.com> wrote:
>> This tracepoint can be used to trace synack retransmits. It maintains
>> pointer to struct request_sock.
>>
>> We cannot simply reuse trace_tcp_retransmit_skb() here, because the
>> sk here is the LISTEN socket. The IP addresses and ports should be
>> extracted from struct request_sock.
>>
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>>  include/trace/events/tcp.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++
>>  net/ipv4/tcp_output.c      |  1 +
>>  2 files changed, 57 insertions(+)
>>
>> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
>> index 03699ba..07cccca 100644
>> --- a/include/trace/events/tcp.h
>> +++ b/include/trace/events/tcp.h
>> @@ -237,6 +237,62 @@ TRACE_EVENT(tcp_set_state,
>>                   show_tcp_state_name(__entry->newstate))
>>  );
>>
>> +TRACE_EVENT(tcp_retransmit_synack,
>> +
>> +       TP_PROTO(const struct sock *sk, const struct request_sock *req),
>> +
>> +       TP_ARGS(sk, req),
>> +
>> +       TP_STRUCT__entry(
>> +               __field(const void *, skaddr)
>> +               __field(const void *, req)
>> +               __field(__u16, sport)
>> +               __field(__u16, dport)
>> +               __array(__u8, saddr, 4)
>> +               __array(__u8, daddr, 4)
>> +               __array(__u8, saddr_v6, 16)
>> +               __array(__u8, daddr_v6, 16)
>
> Would it make sense to add the inode of the network namespace that
> owns the socket? (along with the major/minor of the nsfs)

We cannot do this.
The netns ino is not a unique identifier of a netns.
We can do such a hack only inside programs, by
walking skb->dev->nd_net->net with bpf_probe_read() and realizing
that this is an unstable interface and not technically correct.

> If the kernel later gains tracepoints for TCP connect, accept, close
> including the netns ino, then I might be able to replace some
> ebpf-kprobes code by ebpf-tracepoints code :)

What is the use case for tracepoints in connect/accept/close?
Just because some _useful_ bcc script is using a kprobe in a particular
kernel function, it doesn't yet mean that we need a tracepoint there.
IMO the general rule for tracepoints is to only add them when it's 100%
certain that this is the right place for it and the kprobe approach
is not enough or not possible.
In the case of the recent addition of tcp tracepoints, the main thing
they achieve (vs. our old kprobe approach) is that they are accurate.
In this particular patch, a kprobe on tcp_rtx_synack() is not
the same as trace_tcp_retransmit_synack(), since it incorrectly
counts failed send_synack(). It's solvable via a kretprobe
on tcp_rtx_synack() and checking %rax inside the bpf program,
but kretprobes add runtime overhead and are much slower than tracepoints.
diff mbox series

Patch

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 03699ba..07cccca 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -237,6 +237,62 @@  TRACE_EVENT(tcp_set_state,
 		  show_tcp_state_name(__entry->newstate))
 );
 
+TRACE_EVENT(tcp_retransmit_synack,
+
+	TP_PROTO(const struct sock *sk, const struct request_sock *req),
+
+	TP_ARGS(sk, req),
+
+	TP_STRUCT__entry(
+		__field(const void *, skaddr)
+		__field(const void *, req)
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__array(__u8, saddr, 4)
+		__array(__u8, daddr, 4)
+		__array(__u8, saddr_v6, 16)
+		__array(__u8, daddr_v6, 16)
+	),
+
+	TP_fast_assign(
+		struct inet_request_sock *ireq = inet_rsk(req);
+		struct in6_addr *pin6;
+		__be32 *p32;
+
+		__entry->skaddr = sk;
+		__entry->req = req;
+
+		__entry->sport = ireq->ir_num;
+		__entry->dport = ntohs(ireq->ir_rmt_port);
+
+		p32 = (__be32 *) __entry->saddr;
+		*p32 = ireq->ir_loc_addr;
+
+		p32 = (__be32 *) __entry->daddr;
+		*p32 = ireq->ir_rmt_addr;
+
+#if IS_ENABLED(CONFIG_IPV6)
+		if (sk->sk_family == AF_INET6) {
+			pin6 = (struct in6_addr *)__entry->saddr_v6;
+			*pin6 = ireq->ir_v6_loc_addr;
+			pin6 = (struct in6_addr *)__entry->daddr_v6;
+			*pin6 = ireq->ir_v6_rmt_addr;
+		} else
+#endif
+		{
+			pin6 = (struct in6_addr *)__entry->saddr_v6;
+			ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6);
+			pin6 = (struct in6_addr *)__entry->daddr_v6;
+			ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6);
+		}
+	),
+
+	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
+		  __entry->sport, __entry->dport,
+		  __entry->saddr, __entry->daddr,
+		  __entry->saddr_v6, __entry->daddr_v6)
+);
+
 #endif /* _TRACE_TCP_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1f01f4c..6a728a5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3735,6 +3735,7 @@  int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
 		if (unlikely(tcp_passive_fastopen(sk)))
 			tcp_sk(sk)->total_retrans++;
+		trace_tcp_retransmit_synack(sk, req);
 	}
 	return res;
 }