diff mbox series

[v2,3/4] mptcp: add MPTCP socket diag interface

Message ID dd16dc56a9117d1b2e8db598e81d0e60664eb1ba.1593192703.git.pabeni@redhat.com
State Superseded, archived
Delegated to: Mat Martineau
Headers show
Series mptcp: msk diag support | expand

Commit Message

Paolo Abeni June 26, 2020, 5:33 p.m. UTC
exposes basic inet socket attribute, plus some MPTCP socket
fields comprising PM status and MPTCP-level sequence numbers.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
 - fix dump on skb full event
 - use 'flags' for fallback, can_ack
---
 include/uapi/linux/mptcp.h |  17 ++++
 net/mptcp/Kconfig          |   4 +
 net/mptcp/Makefile         |   2 +
 net/mptcp/mptcp_diag.c     | 167 +++++++++++++++++++++++++++++++++++++
 4 files changed, 190 insertions(+)
 create mode 100644 net/mptcp/mptcp_diag.c

Comments

Mat Martineau July 1, 2020, 12:59 a.m. UTC | #1
On Fri, 26 Jun 2020, Paolo Abeni wrote:

> exposes basic inet socket attribute, plus some MPTCP socket
> fields comprising PM status and MPTCP-level sequence numbers.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> v1 -> v2:
> - fix dump on skb full event
> - use 'flags' for fallback, can_ack
> ---
> include/uapi/linux/mptcp.h |  17 ++++
> net/mptcp/Kconfig          |   4 +
> net/mptcp/Makefile         |   2 +
> net/mptcp/mptcp_diag.c     | 167 +++++++++++++++++++++++++++++++++++++
> 4 files changed, 190 insertions(+)
> create mode 100644 net/mptcp/mptcp_diag.c
>
> diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
> index 5f2c77082d9e..9762660df741 100644
> --- a/include/uapi/linux/mptcp.h
> +++ b/include/uapi/linux/mptcp.h
> @@ -86,4 +86,21 @@ enum {
> 	__MPTCP_PM_CMD_AFTER_LAST
> };
>
> +#define MPTCP_INFO_FLAG_FALLBACK		_BITUL(0)
> +#define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED	_BITUL(1)
> +
> +struct mptcp_info {
> +	__u8	mptcpi_subflows;
> +	__u8	mptcpi_add_addr_signal;
> +	__u8	mptcpi_add_addr_accepted;
> +	__u8	mptcpi_subflows_max;
> +	__u8	mptcpi_add_addr_signal_max;
> +	__u8	mptcpi_add_addr_accepted_max;
> +	__u32	mptcpi_flags;
> +	__u32	mptcpi_token;
> +	__u64	mptcpi_write_seq;
> +	__u64	mptcpi_snd_una;
> +	__u64	mptcpi_rcv_nxt;
> +};
> +
> #endif /* _UAPI_MPTCP_H */
> diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
> index 0ec787d8b9a2..97c29caa65f3 100644
> --- a/net/mptcp/Kconfig
> +++ b/net/mptcp/Kconfig
> @@ -14,6 +14,10 @@ config MPTCP
>
> if MPTCP
>
> +config INET_MPTCP_DIAG
> +	depends on INET_DIAG
> +	def_tristate INET_DIAG
> +
> config MPTCP_IPV6
> 	bool "MPTCP: IPv6 support for Multipath TCP"
> 	select IPV6
> diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
> index c53f9b845523..2360cbd27d59 100644
> --- a/net/mptcp/Makefile
> +++ b/net/mptcp/Makefile
> @@ -4,6 +4,8 @@ obj-$(CONFIG_MPTCP) += mptcp.o
> mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
> 	   mib.o pm_netlink.o
>
> +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
> +
> mptcp_crypto_test-objs := crypto_test.o
> mptcp_token_test-objs := token_test.o
> obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o
> diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
> new file mode 100644
> index 000000000000..30aa768e23bd
> --- /dev/null
> +++ b/net/mptcp/mptcp_diag.c
> @@ -0,0 +1,167 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* MPTCP socket monitoring support
> + *
> + * Copyright (c) 2020 Red Hat
> + *
> + * Author: Paolo Abeni <pabeni@redhat.com>
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/net.h>
> +#include <linux/inet_diag.h>
> +#include <net/netlink.h>
> +#include <uapi/linux/mptcp.h>
> +#include "protocol.h"
> +
> +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
> +			struct netlink_callback *cb,
> +			const struct inet_diag_req_v2 *req,
> +			struct nlattr *bc, bool net_admin)
> +{
> +	if (!inet_diag_bc_sk(bc, sk))
> +		return 0;
> +
> +	return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
> +				 net_admin);
> +}
> +
> +static int mptcp_diag_dump_one(struct netlink_callback *cb,
> +			       const struct inet_diag_req_v2 *req)
> +{
> +	struct sk_buff *in_skb = cb->skb;
> +	struct mptcp_sock *msk = NULL;
> +	struct sk_buff *rep;
> +	int err = -ENOENT;
> +	struct net *net;
> +	struct sock *sk;
> +
> +	net = sock_net(in_skb->sk);
> +	msk = mptcp_token_get_sock(req->id.idiag_cookie[0]);
> +	if (!msk)
> +		goto out_nosk;
> +
> +	err = -ENOMEM;
> +	sk = (struct sock *)msk;
> +	rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
> +			inet_diag_msg_attrs_size() +
> +			nla_total_size(sizeof(struct mptcp_info)) +
> +			nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
> +			GFP_KERNEL);
> +	if (!rep)
> +		goto out;
> +
> +	err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0,
> +				netlink_net_capable(in_skb, CAP_NET_ADMIN));
> +	if (err < 0) {
> +		WARN_ON(err == -EMSGSIZE);
> +		kfree_skb(rep);
> +		goto out;
> +	}
> +	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
> +			      MSG_DONTWAIT);
> +	if (err > 0)
> +		err = 0;
> +out:
> +	sock_put(sk);
> +
> +out_nosk:
> +	return err;
> +}
> +
> +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
> +			    const struct inet_diag_req_v2 *r)
> +{
> +	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
> +	struct net *net = sock_net(skb->sk);
> +	struct inet_diag_dump_data *cb_data;
> +	struct mptcp_sock *msk;
> +	struct nlattr *bc;
> +
> +	cb_data = cb->data;
> +	bc = cb_data->inet_diag_nla_bc;
> +
> +	for (msk = mptcp_token_iter_start(&cb->args[0], &cb->args[1]);
> +	     msk; msk = mptcp_token_iter_next(&cb->args[0], &cb->args[1])) {

Should there be a rcu_read_lock/rcu_read_unlock around this loop with 
mptcp_token_iter_next() using sk_nulls_for_each_rcu()? Or is that already 
handled somewhere up the call stack?

> +		struct inet_sock *inet;
> +		struct sock *sk;
> +
> +		inet = (struct inet_sock *)msk;
> +		sk = (struct sock *)msk;
> +		if (!net_eq(sock_net(sk), net))
> +			continue;
> +		if (!(r->idiag_states & (1 << sk->sk_state)))
> +			continue;
> +		if (r->sdiag_family != AF_UNSPEC &&
> +		    sk->sk_family != r->sdiag_family)
> +			continue;
> +		if (r->id.idiag_sport != inet->inet_sport &&
> +		    r->id.idiag_sport)
> +			continue;
> +		if (r->id.idiag_dport != inet->inet_dport &&
> +		    r->id.idiag_dport)
> +			continue;
> +
> +		if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
> +			mptcp_token_iter_stop(cb->args[0]);
> +
> +			/* will retry on the same position */
> +			cb->args[1]--;
> +			break;
> +		}
> +	}
> +}
> +
> +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
> +				void *_info)
> +{
> +	struct mptcp_sock *msk = mptcp_sk(sk);
> +	struct mptcp_info *info = _info;
> +	u32 flags = 0;
> +	u8 val;
> +
> +	r->idiag_rqueue = sk_rmem_alloc_get(sk);
> +	r->idiag_wqueue = sk_wmem_alloc_get(sk);
> +	if (!info)
> +		return;
> +
> +	info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
> +	info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
> +	info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
> +	info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max);
> +	val = READ_ONCE(msk->pm.add_addr_signal_max);
> +	info->mptcpi_add_addr_signal_max = val;
> +	val = READ_ONCE(msk->pm.add_addr_accept_max);
> +	info->mptcpi_add_addr_accepted_max = val;
> +	if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
> +		flags |= MPTCP_INFO_FLAG_FALLBACK;
> +	if (READ_ONCE(msk->can_ack))
> +		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
> +	info->mptcpi_flags = flags;
> +	info->mptcpi_token = READ_ONCE(msk->token);
> +	info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
> +	info->mptcpi_snd_una = atomic64_read(&msk->snd_una);
> +	info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);

I noticed that tcp_get_info() uses lock_sock_fast() when it populates the 
tcp_info struct, is it worth using that here? It seems like the sequence 
numbers in particular could easily be inconsistent without locking.


Mat


> +}
> +
> +static const struct inet_diag_handler mptcp_diag_handler = {
> +	.dump		 = mptcp_diag_dump,
> +	.dump_one	 = mptcp_diag_dump_one,
> +	.idiag_get_info  = mptcp_diag_get_info,
> +	.idiag_type	 = IPPROTO_MPTCP,
> +	.idiag_info_size = sizeof(struct mptcp_info),
> +};
> +
> +int __init mptcp_diag_init(void)
> +{
> +	return inet_diag_register(&mptcp_diag_handler);
> +}
> +
> +static void __exit mptcp_diag_exit(void)
> +{
> +	inet_diag_unregister(&mptcp_diag_handler);
> +}
> +
> +module_init(mptcp_diag_init);
> +module_exit(mptcp_diag_exit);
> +MODULE_LICENSE("GPL");
> +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */);
> -- 
> 2.26.2

--
Mat Martineau
Intel
Paolo Abeni July 1, 2020, 10:30 a.m. UTC | #2
On Tue, 2020-06-30 at 17:59 -0700, Mat Martineau wrote:
> On Fri, 26 Jun 2020, Paolo Abeni wrote:
> 
> > exposes basic inet socket attribute, plus some MPTCP socket
> > fields comprising PM status and MPTCP-level sequence numbers.
> > 
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > v1 -> v2:
> > - fix dump on skb full event
> > - use 'flags' for fallback, can_ack
> > ---
> > include/uapi/linux/mptcp.h |  17 ++++
> > net/mptcp/Kconfig          |   4 +
> > net/mptcp/Makefile         |   2 +
> > net/mptcp/mptcp_diag.c     | 167 +++++++++++++++++++++++++++++++++++++
> > 4 files changed, 190 insertions(+)
> > create mode 100644 net/mptcp/mptcp_diag.c
> > 
> > diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
> > index 5f2c77082d9e..9762660df741 100644
> > --- a/include/uapi/linux/mptcp.h
> > +++ b/include/uapi/linux/mptcp.h
> > @@ -86,4 +86,21 @@ enum {
> > 	__MPTCP_PM_CMD_AFTER_LAST
> > };
> > 
> > +#define MPTCP_INFO_FLAG_FALLBACK		_BITUL(0)
> > +#define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED	_BITUL(1)
> > +
> > +struct mptcp_info {
> > +	__u8	mptcpi_subflows;
> > +	__u8	mptcpi_add_addr_signal;
> > +	__u8	mptcpi_add_addr_accepted;
> > +	__u8	mptcpi_subflows_max;
> > +	__u8	mptcpi_add_addr_signal_max;
> > +	__u8	mptcpi_add_addr_accepted_max;
> > +	__u32	mptcpi_flags;
> > +	__u32	mptcpi_token;
> > +	__u64	mptcpi_write_seq;
> > +	__u64	mptcpi_snd_una;
> > +	__u64	mptcpi_rcv_nxt;
> > +};
> > +
> > #endif /* _UAPI_MPTCP_H */
> > diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
> > index 0ec787d8b9a2..97c29caa65f3 100644
> > --- a/net/mptcp/Kconfig
> > +++ b/net/mptcp/Kconfig
> > @@ -14,6 +14,10 @@ config MPTCP
> > 
> > if MPTCP
> > 
> > +config INET_MPTCP_DIAG
> > +	depends on INET_DIAG
> > +	def_tristate INET_DIAG
> > +
> > config MPTCP_IPV6
> > 	bool "MPTCP: IPv6 support for Multipath TCP"
> > 	select IPV6
> > diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
> > index c53f9b845523..2360cbd27d59 100644
> > --- a/net/mptcp/Makefile
> > +++ b/net/mptcp/Makefile
> > @@ -4,6 +4,8 @@ obj-$(CONFIG_MPTCP) += mptcp.o
> > mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
> > 	   mib.o pm_netlink.o
> > 
> > +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
> > +
> > mptcp_crypto_test-objs := crypto_test.o
> > mptcp_token_test-objs := token_test.o
> > obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o
> > diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
> > new file mode 100644
> > index 000000000000..30aa768e23bd
> > --- /dev/null
> > +++ b/net/mptcp/mptcp_diag.c
> > @@ -0,0 +1,167 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* MPTCP socket monitoring support
> > + *
> > + * Copyright (c) 2020 Red Hat
> > + *
> > + * Author: Paolo Abeni <pabeni@redhat.com>
> > + */
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/net.h>
> > +#include <linux/inet_diag.h>
> > +#include <net/netlink.h>
> > +#include <uapi/linux/mptcp.h>
> > +#include "protocol.h"
> > +
> > +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
> > +			struct netlink_callback *cb,
> > +			const struct inet_diag_req_v2 *req,
> > +			struct nlattr *bc, bool net_admin)
> > +{
> > +	if (!inet_diag_bc_sk(bc, sk))
> > +		return 0;
> > +
> > +	return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
> > +				 net_admin);
> > +}
> > +
> > +static int mptcp_diag_dump_one(struct netlink_callback *cb,
> > +			       const struct inet_diag_req_v2 *req)
> > +{
> > +	struct sk_buff *in_skb = cb->skb;
> > +	struct mptcp_sock *msk = NULL;
> > +	struct sk_buff *rep;
> > +	int err = -ENOENT;
> > +	struct net *net;
> > +	struct sock *sk;
> > +
> > +	net = sock_net(in_skb->sk);
> > +	msk = mptcp_token_get_sock(req->id.idiag_cookie[0]);
> > +	if (!msk)
> > +		goto out_nosk;
> > +
> > +	err = -ENOMEM;
> > +	sk = (struct sock *)msk;
> > +	rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
> > +			inet_diag_msg_attrs_size() +
> > +			nla_total_size(sizeof(struct mptcp_info)) +
> > +			nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
> > +			GFP_KERNEL);
> > +	if (!rep)
> > +		goto out;
> > +
> > +	err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0,
> > +				netlink_net_capable(in_skb, CAP_NET_ADMIN));
> > +	if (err < 0) {
> > +		WARN_ON(err == -EMSGSIZE);
> > +		kfree_skb(rep);
> > +		goto out;
> > +	}
> > +	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
> > +			      MSG_DONTWAIT);
> > +	if (err > 0)
> > +		err = 0;
> > +out:
> > +	sock_put(sk);
> > +
> > +out_nosk:
> > +	return err;
> > +}
> > +
> > +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
> > +			    const struct inet_diag_req_v2 *r)
> > +{
> > +	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
> > +	struct net *net = sock_net(skb->sk);
> > +	struct inet_diag_dump_data *cb_data;
> > +	struct mptcp_sock *msk;
> > +	struct nlattr *bc;
> > +
> > +	cb_data = cb->data;
> > +	bc = cb_data->inet_diag_nla_bc;
> > +
> > +	for (msk = mptcp_token_iter_start(&cb->args[0], &cb->args[1]);
> > +	     msk; msk = mptcp_token_iter_next(&cb->args[0], &cb->args[1])) {
> 
> Should there be a rcu_read_lock/rcu_read_unlock around this loop with 
> mptcp_token_iter_next() using sk_nulls_for_each_rcu()? Or is that already 
> handled somewhere up the call stack?

This code tried to follow the same schema used for inet_hashtable(): it
acquires explicit per bucket spinlock and keep it held while dumping
the socket. Could be converted to RCU, but then we will get
sock_hold()/sock_put() for each entry and I was under the impression
that would be more overhead on busy servers with many mptcp sockets,
but...

> > +		struct inet_sock *inet;
> > +		struct sock *sk;
> > +
> > +		inet = (struct inet_sock *)msk;
> > +		sk = (struct sock *)msk;
> > +		if (!net_eq(sock_net(sk), net))
> > +			continue;
> > +		if (!(r->idiag_states & (1 << sk->sk_state)))
> > +			continue;
> > +		if (r->sdiag_family != AF_UNSPEC &&
> > +		    sk->sk_family != r->sdiag_family)
> > +			continue;
> > +		if (r->id.idiag_sport != inet->inet_sport &&
> > +		    r->id.idiag_sport)
> > +			continue;
> > +		if (r->id.idiag_dport != inet->inet_dport &&
> > +		    r->id.idiag_dport)
> > +			continue;
> > +
> > +		if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
> > +			mptcp_token_iter_stop(cb->args[0]);
> > +
> > +			/* will retry on the same position */
> > +			cb->args[1]--;
> > +			break;
> > +		}
> > +	}
> > +}
> > +
> > +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
> > +				void *_info)
> > +{
> > +	struct mptcp_sock *msk = mptcp_sk(sk);
> > +	struct mptcp_info *info = _info;
> > +	u32 flags = 0;
> > +	u8 val;
> > +
> > +	r->idiag_rqueue = sk_rmem_alloc_get(sk);
> > +	r->idiag_wqueue = sk_wmem_alloc_get(sk);
> > +	if (!info)
> > +		return;
> > +
> > +	info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
> > +	info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
> > +	info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
> > +	info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max);
> > +	val = READ_ONCE(msk->pm.add_addr_signal_max);
> > +	info->mptcpi_add_addr_signal_max = val;
> > +	val = READ_ONCE(msk->pm.add_addr_accept_max);
> > +	info->mptcpi_add_addr_accepted_max = val;
> > +	if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
> > +		flags |= MPTCP_INFO_FLAG_FALLBACK;
> > +	if (READ_ONCE(msk->can_ack))
> > +		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
> > +	info->mptcpi_flags = flags;
> > +	info->mptcpi_token = READ_ONCE(msk->token);
> > +	info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
> > +	info->mptcpi_snd_una = atomic64_read(&msk->snd_una);
> > +	info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
> 
> I noticed that tcp_get_info() uses lock_sock_fast() when it populates the 
> tcp_info struct, is it worth using that here? It seems like the sequence 
> numbers in particular could easily be inconsistent without locking.

... thanks for the pointer! I misread some of the inet_hashtable diag
related code (specifically, I mixed-up LISTEN and ESTABLISHED sockets
traversing). Having a better look, TCP does per established socket
sock_hold()/sock_lock_fast()/sock_unlock_fast()/sock_put(), so I guess
we can do the same and move to RCUs for traversing. I'll try to cook v3
soon.

Thanks!

Paolo
Paolo Abeni July 1, 2020, 4:05 p.m. UTC | #3
On Wed, 2020-07-01 at 12:30 +0200, Paolo Abeni wrote:
> On Tue, 2020-06-30 at 17:59 -0700, Mat Martineau wrote:
> > On Fri, 26 Jun 2020, Paolo Abeni wrote:
> > 
> > > exposes basic inet socket attribute, plus some MPTCP socket
> > > fields comprising PM status and MPTCP-level sequence numbers.
> > > 
> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > ---
> > > v1 -> v2:
> > > - fix dump on skb full event
> > > - use 'flags' for fallback, can_ack
> > > ---
> > > include/uapi/linux/mptcp.h |  17 ++++
> > > net/mptcp/Kconfig          |   4 +
> > > net/mptcp/Makefile         |   2 +
> > > net/mptcp/mptcp_diag.c     | 167 +++++++++++++++++++++++++++++++++++++
> > > 4 files changed, 190 insertions(+)
> > > create mode 100644 net/mptcp/mptcp_diag.c
> > > 
> > > diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
> > > index 5f2c77082d9e..9762660df741 100644
> > > --- a/include/uapi/linux/mptcp.h
> > > +++ b/include/uapi/linux/mptcp.h
> > > @@ -86,4 +86,21 @@ enum {
> > > 	__MPTCP_PM_CMD_AFTER_LAST
> > > };
> > > 
> > > +#define MPTCP_INFO_FLAG_FALLBACK		_BITUL(0)
> > > +#define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED	_BITUL(1)
> > > +
> > > +struct mptcp_info {
> > > +	__u8	mptcpi_subflows;
> > > +	__u8	mptcpi_add_addr_signal;
> > > +	__u8	mptcpi_add_addr_accepted;
> > > +	__u8	mptcpi_subflows_max;
> > > +	__u8	mptcpi_add_addr_signal_max;
> > > +	__u8	mptcpi_add_addr_accepted_max;
> > > +	__u32	mptcpi_flags;
> > > +	__u32	mptcpi_token;
> > > +	__u64	mptcpi_write_seq;
> > > +	__u64	mptcpi_snd_una;
> > > +	__u64	mptcpi_rcv_nxt;
> > > +};
> > > +
> > > #endif /* _UAPI_MPTCP_H */
> > > diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
> > > index 0ec787d8b9a2..97c29caa65f3 100644
> > > --- a/net/mptcp/Kconfig
> > > +++ b/net/mptcp/Kconfig
> > > @@ -14,6 +14,10 @@ config MPTCP
> > > 
> > > if MPTCP
> > > 
> > > +config INET_MPTCP_DIAG
> > > +	depends on INET_DIAG
> > > +	def_tristate INET_DIAG
> > > +
> > > config MPTCP_IPV6
> > > 	bool "MPTCP: IPv6 support for Multipath TCP"
> > > 	select IPV6
> > > diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
> > > index c53f9b845523..2360cbd27d59 100644
> > > --- a/net/mptcp/Makefile
> > > +++ b/net/mptcp/Makefile
> > > @@ -4,6 +4,8 @@ obj-$(CONFIG_MPTCP) += mptcp.o
> > > mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
> > > 	   mib.o pm_netlink.o
> > > 
> > > +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
> > > +
> > > mptcp_crypto_test-objs := crypto_test.o
> > > mptcp_token_test-objs := token_test.o
> > > obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o
> > > diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
> > > new file mode 100644
> > > index 000000000000..30aa768e23bd
> > > --- /dev/null
> > > +++ b/net/mptcp/mptcp_diag.c
> > > @@ -0,0 +1,167 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +/* MPTCP socket monitoring support
> > > + *
> > > + * Copyright (c) 2020 Red Hat
> > > + *
> > > + * Author: Paolo Abeni <pabeni@redhat.com>
> > > + */
> > > +
> > > +#include <linux/kernel.h>
> > > +#include <linux/net.h>
> > > +#include <linux/inet_diag.h>
> > > +#include <net/netlink.h>
> > > +#include <uapi/linux/mptcp.h>
> > > +#include "protocol.h"
> > > +
> > > +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
> > > +			struct netlink_callback *cb,
> > > +			const struct inet_diag_req_v2 *req,
> > > +			struct nlattr *bc, bool net_admin)
> > > +{
> > > +	if (!inet_diag_bc_sk(bc, sk))
> > > +		return 0;
> > > +
> > > +	return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
> > > +				 net_admin);
> > > +}
> > > +
> > > +static int mptcp_diag_dump_one(struct netlink_callback *cb,
> > > +			       const struct inet_diag_req_v2 *req)
> > > +{
> > > +	struct sk_buff *in_skb = cb->skb;
> > > +	struct mptcp_sock *msk = NULL;
> > > +	struct sk_buff *rep;
> > > +	int err = -ENOENT;
> > > +	struct net *net;
> > > +	struct sock *sk;
> > > +
> > > +	net = sock_net(in_skb->sk);
> > > +	msk = mptcp_token_get_sock(req->id.idiag_cookie[0]);
> > > +	if (!msk)
> > > +		goto out_nosk;
> > > +
> > > +	err = -ENOMEM;
> > > +	sk = (struct sock *)msk;
> > > +	rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
> > > +			inet_diag_msg_attrs_size() +
> > > +			nla_total_size(sizeof(struct mptcp_info)) +
> > > +			nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
> > > +			GFP_KERNEL);
> > > +	if (!rep)
> > > +		goto out;
> > > +
> > > +	err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0,
> > > +				netlink_net_capable(in_skb, CAP_NET_ADMIN));
> > > +	if (err < 0) {
> > > +		WARN_ON(err == -EMSGSIZE);
> > > +		kfree_skb(rep);
> > > +		goto out;
> > > +	}
> > > +	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
> > > +			      MSG_DONTWAIT);
> > > +	if (err > 0)
> > > +		err = 0;
> > > +out:
> > > +	sock_put(sk);
> > > +
> > > +out_nosk:
> > > +	return err;
> > > +}
> > > +
> > > +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
> > > +			    const struct inet_diag_req_v2 *r)
> > > +{
> > > +	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
> > > +	struct net *net = sock_net(skb->sk);
> > > +	struct inet_diag_dump_data *cb_data;
> > > +	struct mptcp_sock *msk;
> > > +	struct nlattr *bc;
> > > +
> > > +	cb_data = cb->data;
> > > +	bc = cb_data->inet_diag_nla_bc;
> > > +
> > > +	for (msk = mptcp_token_iter_start(&cb->args[0], &cb->args[1]);
> > > +	     msk; msk = mptcp_token_iter_next(&cb->args[0], &cb->args[1])) {
> > 
> > Should there be a rcu_read_lock/rcu_read_unlock around this loop with 
> > mptcp_token_iter_next() using sk_nulls_for_each_rcu()? Or is that already 
> > handled somewhere up the call stack?
> 
> This code tried to follow the same schema used for inet_hashtable(): it
> acquires explicit per bucket spinlock and keep it held while dumping
> the socket. Could be converted to RCU, but then we will get
> sock_hold()/sock_put() for each entry and I was under the impression
> that would be more overhead on busy servers with many mptcp sockets,
> but...
> 
> > > +		struct inet_sock *inet;
> > > +		struct sock *sk;
> > > +
> > > +		inet = (struct inet_sock *)msk;
> > > +		sk = (struct sock *)msk;
> > > +		if (!net_eq(sock_net(sk), net))
> > > +			continue;
> > > +		if (!(r->idiag_states & (1 << sk->sk_state)))
> > > +			continue;
> > > +		if (r->sdiag_family != AF_UNSPEC &&
> > > +		    sk->sk_family != r->sdiag_family)
> > > +			continue;
> > > +		if (r->id.idiag_sport != inet->inet_sport &&
> > > +		    r->id.idiag_sport)
> > > +			continue;
> > > +		if (r->id.idiag_dport != inet->inet_dport &&
> > > +		    r->id.idiag_dport)
> > > +			continue;
> > > +
> > > +		if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
> > > +			mptcp_token_iter_stop(cb->args[0]);
> > > +
> > > +			/* will retry on the same position */
> > > +			cb->args[1]--;
> > > +			break;
> > > +		}
> > > +	}
> > > +}
> > > +
> > > +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
> > > +				void *_info)
> > > +{
> > > +	struct mptcp_sock *msk = mptcp_sk(sk);
> > > +	struct mptcp_info *info = _info;
> > > +	u32 flags = 0;
> > > +	u8 val;
> > > +
> > > +	r->idiag_rqueue = sk_rmem_alloc_get(sk);
> > > +	r->idiag_wqueue = sk_wmem_alloc_get(sk);
> > > +	if (!info)
> > > +		return;
> > > +
> > > +	info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
> > > +	info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
> > > +	info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
> > > +	info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max);
> > > +	val = READ_ONCE(msk->pm.add_addr_signal_max);
> > > +	info->mptcpi_add_addr_signal_max = val;
> > > +	val = READ_ONCE(msk->pm.add_addr_accept_max);
> > > +	info->mptcpi_add_addr_accepted_max = val;
> > > +	if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
> > > +		flags |= MPTCP_INFO_FLAG_FALLBACK;
> > > +	if (READ_ONCE(msk->can_ack))
> > > +		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
> > > +	info->mptcpi_flags = flags;
> > > +	info->mptcpi_token = READ_ONCE(msk->token);
> > > +	info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
> > > +	info->mptcpi_snd_una = atomic64_read(&msk->snd_una);
> > > +	info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
> > 
> > I noticed that tcp_get_info() uses lock_sock_fast() when it populates the 
> > tcp_info struct, is it worth using that here? It seems like the sequence 
> > numbers in particular could easily be inconsistent without locking.
> 
> ... thanks for the pointer! I misread some of the inet_hashtable diag
> related code (specifically, I mixed-up LISTEN and ESTABLISHED sockets
> traversing). Having a better look, TCP does per established socket
> sock_hold()/sock_lock_fast()/sock_unlock_fast()/sock_put(), so I guess
> we can do the same and move to RCUs for traversing. I'll try to cook v3
> soon.

Reporting here things already shared on IRC for larger audience e
better trackability

An RCU based msk diag implementation does not fit well with
lock_sock[_fast]() as we have to release the RCU lock on each socket.

We will have the same constraint even with bucket lock - as in the
current implementation - to avoid possible issue due to lock order.

inet_diag_dump_icsk() tries to amortize the cost of releasing/acquiring
the lock for each collecting up to 16 sockets in a local array and the
filling the info individually.

That add a bit of complexity and also do not fit well with iterator:
the iterator helper will have to implement the filtering as required by
the diag interface (which looks like a layering violation).

The simpler solution I can think of is avoiding the lock_sock() in
mptcp_diag_get_info(). that means possible inconsistence between
write_seq and snd_una. e.g. snd_una could be greater/later then
write_seq do we really care?

/P
Mat Martineau July 1, 2020, 4:38 p.m. UTC | #4
On Wed, 1 Jul 2020, Paolo Abeni wrote:

> On Wed, 2020-07-01 at 12:30 +0200, Paolo Abeni wrote:
>> On Tue, 2020-06-30 at 17:59 -0700, Mat Martineau wrote:
>>> On Fri, 26 Jun 2020, Paolo Abeni wrote:
>>>
>>>> exposes basic inet socket attribute, plus some MPTCP socket
>>>> fields comprising PM status and MPTCP-level sequence numbers.
>>>>
>>>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>>>> ---
>>>> v1 -> v2:
>>>> - fix dump on skb full event
>>>> - use 'flags' for fallback, can_ack
>>>> ---
>>>> include/uapi/linux/mptcp.h |  17 ++++
>>>> net/mptcp/Kconfig          |   4 +
>>>> net/mptcp/Makefile         |   2 +
>>>> net/mptcp/mptcp_diag.c     | 167 +++++++++++++++++++++++++++++++++++++
>>>> 4 files changed, 190 insertions(+)
>>>> create mode 100644 net/mptcp/mptcp_diag.c
>>>>
>>>> diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
>>>> index 5f2c77082d9e..9762660df741 100644
>>>> --- a/include/uapi/linux/mptcp.h
>>>> +++ b/include/uapi/linux/mptcp.h
>>>> @@ -86,4 +86,21 @@ enum {
>>>> 	__MPTCP_PM_CMD_AFTER_LAST
>>>> };
>>>>
>>>> +#define MPTCP_INFO_FLAG_FALLBACK		_BITUL(0)
>>>> +#define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED	_BITUL(1)
>>>> +
>>>> +struct mptcp_info {
>>>> +	__u8	mptcpi_subflows;
>>>> +	__u8	mptcpi_add_addr_signal;
>>>> +	__u8	mptcpi_add_addr_accepted;
>>>> +	__u8	mptcpi_subflows_max;
>>>> +	__u8	mptcpi_add_addr_signal_max;
>>>> +	__u8	mptcpi_add_addr_accepted_max;
>>>> +	__u32	mptcpi_flags;
>>>> +	__u32	mptcpi_token;
>>>> +	__u64	mptcpi_write_seq;
>>>> +	__u64	mptcpi_snd_una;
>>>> +	__u64	mptcpi_rcv_nxt;
>>>> +};
>>>> +
>>>> #endif /* _UAPI_MPTCP_H */
>>>> diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
>>>> index 0ec787d8b9a2..97c29caa65f3 100644
>>>> --- a/net/mptcp/Kconfig
>>>> +++ b/net/mptcp/Kconfig
>>>> @@ -14,6 +14,10 @@ config MPTCP
>>>>
>>>> if MPTCP
>>>>
>>>> +config INET_MPTCP_DIAG
>>>> +	depends on INET_DIAG
>>>> +	def_tristate INET_DIAG
>>>> +
>>>> config MPTCP_IPV6
>>>> 	bool "MPTCP: IPv6 support for Multipath TCP"
>>>> 	select IPV6
>>>> diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
>>>> index c53f9b845523..2360cbd27d59 100644
>>>> --- a/net/mptcp/Makefile
>>>> +++ b/net/mptcp/Makefile
>>>> @@ -4,6 +4,8 @@ obj-$(CONFIG_MPTCP) += mptcp.o
>>>> mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
>>>> 	   mib.o pm_netlink.o
>>>>
>>>> +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
>>>> +
>>>> mptcp_crypto_test-objs := crypto_test.o
>>>> mptcp_token_test-objs := token_test.o
>>>> obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o
>>>> diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
>>>> new file mode 100644
>>>> index 000000000000..30aa768e23bd
>>>> --- /dev/null
>>>> +++ b/net/mptcp/mptcp_diag.c
>>>> @@ -0,0 +1,167 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/* MPTCP socket monitoring support
>>>> + *
>>>> + * Copyright (c) 2020 Red Hat
>>>> + *
>>>> + * Author: Paolo Abeni <pabeni@redhat.com>
>>>> + */
>>>> +
>>>> +#include <linux/kernel.h>
>>>> +#include <linux/net.h>
>>>> +#include <linux/inet_diag.h>
>>>> +#include <net/netlink.h>
>>>> +#include <uapi/linux/mptcp.h>
>>>> +#include "protocol.h"
>>>> +
>>>> +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
>>>> +			struct netlink_callback *cb,
>>>> +			const struct inet_diag_req_v2 *req,
>>>> +			struct nlattr *bc, bool net_admin)
>>>> +{
>>>> +	if (!inet_diag_bc_sk(bc, sk))
>>>> +		return 0;
>>>> +
>>>> +	return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
>>>> +				 net_admin);
>>>> +}
>>>> +
>>>> +static int mptcp_diag_dump_one(struct netlink_callback *cb,
>>>> +			       const struct inet_diag_req_v2 *req)
>>>> +{
>>>> +	struct sk_buff *in_skb = cb->skb;
>>>> +	struct mptcp_sock *msk = NULL;
>>>> +	struct sk_buff *rep;
>>>> +	int err = -ENOENT;
>>>> +	struct net *net;
>>>> +	struct sock *sk;
>>>> +
>>>> +	net = sock_net(in_skb->sk);
>>>> +	msk = mptcp_token_get_sock(req->id.idiag_cookie[0]);
>>>> +	if (!msk)
>>>> +		goto out_nosk;
>>>> +
>>>> +	err = -ENOMEM;
>>>> +	sk = (struct sock *)msk;
>>>> +	rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
>>>> +			inet_diag_msg_attrs_size() +
>>>> +			nla_total_size(sizeof(struct mptcp_info)) +
>>>> +			nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
>>>> +			GFP_KERNEL);
>>>> +	if (!rep)
>>>> +		goto out;
>>>> +
>>>> +	err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0,
>>>> +				netlink_net_capable(in_skb, CAP_NET_ADMIN));
>>>> +	if (err < 0) {
>>>> +		WARN_ON(err == -EMSGSIZE);
>>>> +		kfree_skb(rep);
>>>> +		goto out;
>>>> +	}
>>>> +	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
>>>> +			      MSG_DONTWAIT);
>>>> +	if (err > 0)
>>>> +		err = 0;
>>>> +out:
>>>> +	sock_put(sk);
>>>> +
>>>> +out_nosk:
>>>> +	return err;
>>>> +}
>>>> +
>>>> +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
>>>> +			    const struct inet_diag_req_v2 *r)
>>>> +{
>>>> +	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
>>>> +	struct net *net = sock_net(skb->sk);
>>>> +	struct inet_diag_dump_data *cb_data;
>>>> +	struct mptcp_sock *msk;
>>>> +	struct nlattr *bc;
>>>> +
>>>> +	cb_data = cb->data;
>>>> +	bc = cb_data->inet_diag_nla_bc;
>>>> +
>>>> +	for (msk = mptcp_token_iter_start(&cb->args[0], &cb->args[1]);
>>>> +	     msk; msk = mptcp_token_iter_next(&cb->args[0], &cb->args[1])) {
>>>
>>> Should there be a rcu_read_lock/rcu_read_unlock around this loop with
>>> mptcp_token_iter_next() using sk_nulls_for_each_rcu()? Or is that already
>>> handled somewhere up the call stack?
>>
>> This code tried to follow the same schema used for inet_hashtable(): it
>> acquires explicit per bucket spinlock and keep it held while dumping
>> the socket. Could be converted to RCU, but then we will get
>> sock_hold()/sock_put() for each entry and I was under the impression
>> that would be more overhead on busy servers with many mptcp sockets,
>> but...
>>
>>>> +		struct inet_sock *inet;
>>>> +		struct sock *sk;
>>>> +
>>>> +		inet = (struct inet_sock *)msk;
>>>> +		sk = (struct sock *)msk;
>>>> +		if (!net_eq(sock_net(sk), net))
>>>> +			continue;
>>>> +		if (!(r->idiag_states & (1 << sk->sk_state)))
>>>> +			continue;
>>>> +		if (r->sdiag_family != AF_UNSPEC &&
>>>> +		    sk->sk_family != r->sdiag_family)
>>>> +			continue;
>>>> +		if (r->id.idiag_sport != inet->inet_sport &&
>>>> +		    r->id.idiag_sport)
>>>> +			continue;
>>>> +		if (r->id.idiag_dport != inet->inet_dport &&
>>>> +		    r->id.idiag_dport)
>>>> +			continue;
>>>> +
>>>> +		if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
>>>> +			mptcp_token_iter_stop(cb->args[0]);
>>>> +
>>>> +			/* will retry on the same position */
>>>> +			cb->args[1]--;
>>>> +			break;
>>>> +		}
>>>> +	}
>>>> +}
>>>> +
>>>> +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
>>>> +				void *_info)
>>>> +{
>>>> +	struct mptcp_sock *msk = mptcp_sk(sk);
>>>> +	struct mptcp_info *info = _info;
>>>> +	u32 flags = 0;
>>>> +	u8 val;
>>>> +
>>>> +	r->idiag_rqueue = sk_rmem_alloc_get(sk);
>>>> +	r->idiag_wqueue = sk_wmem_alloc_get(sk);
>>>> +	if (!info)
>>>> +		return;
>>>> +
>>>> +	info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
>>>> +	info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
>>>> +	info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
>>>> +	info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max);
>>>> +	val = READ_ONCE(msk->pm.add_addr_signal_max);
>>>> +	info->mptcpi_add_addr_signal_max = val;
>>>> +	val = READ_ONCE(msk->pm.add_addr_accept_max);
>>>> +	info->mptcpi_add_addr_accepted_max = val;
>>>> +	if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
>>>> +		flags |= MPTCP_INFO_FLAG_FALLBACK;
>>>> +	if (READ_ONCE(msk->can_ack))
>>>> +		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
>>>> +	info->mptcpi_flags = flags;
>>>> +	info->mptcpi_token = READ_ONCE(msk->token);
>>>> +	info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
>>>> +	info->mptcpi_snd_una = atomic64_read(&msk->snd_una);
>>>> +	info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
>>>
>>> I noticed that tcp_get_info() uses lock_sock_fast() when it populates the
>>> tcp_info struct, is it worth using that here? It seems like the sequence
>>> numbers in particular could easily be inconsistent without locking.
>>
>> ... thanks for the pointer! I misread some of the inet_hashtable diag
>> related code (specifically, I mixed-up LISTEN and ESTABLISHED sockets
>> traversing). Having a better look, TCP does per established socket
>> sock_hold()/sock_lock_fast()/sock_unlock_fast()/sock_put(), so I guess
>> we can do the same and move to RCUs for traversing. I'll try to cook v3
>> soon.
>
> Reporting here things already shared on IRC for larger audience e
> better trackability
>
> An RCU based msk diag implementation does not fit well with
> lock_sock[_fast]() as we have to release the RCU lock on each socket.
>
> We will have the same constraint even with bucket lock - as in the
> current implementation - to avoid possible issue due to lock order.
>
> inet_diag_dump_icsk() tries to amortize the cost of releasing/acquiring
> the lock for each collecting up to 16 sockets in a local array and the
> filling the info individually.
>
> That add a bit of complexity and also do not fit well with iterator:
> the iterator helper will have to implement the filtering as required by
> the diag interface (which looks like a layering violation).
>
> The simpler solution I can think of is avoiding the lock_sock() in
> mptcp_diag_get_info(). that means possible inconsistence between
> write_seq and snd_una. e.g. snd_una could be greater/later then
> write_seq do we really care?

And for further IRC capture:

Regarding the lock_sock question, we can probably minimize the 
inconsistencies by ordering the reads. For example, if we read snd_una 
before write_seq, then snd_una wouldn't show as later. As someone working 
on the protocol it's helpful to have a coherent snapshot of the values but 
I don't want to add too much complexity to guarantee that.

So, I'm ok with reordering some of the reads and avoiding the additional 
locking. Still interested if others have comments!

--
Mat Martineau
Intel
diff mbox series

Patch

diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 5f2c77082d9e..9762660df741 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -86,4 +86,21 @@  enum {
 	__MPTCP_PM_CMD_AFTER_LAST
 };
 
+#define MPTCP_INFO_FLAG_FALLBACK		_BITUL(0)
+#define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED	_BITUL(1)
+
+struct mptcp_info {
+	__u8	mptcpi_subflows;
+	__u8	mptcpi_add_addr_signal;
+	__u8	mptcpi_add_addr_accepted;
+	__u8	mptcpi_subflows_max;
+	__u8	mptcpi_add_addr_signal_max;
+	__u8	mptcpi_add_addr_accepted_max;
+	__u32	mptcpi_flags;
+	__u32	mptcpi_token;
+	__u64	mptcpi_write_seq;
+	__u64	mptcpi_snd_una;
+	__u64	mptcpi_rcv_nxt;
+};
+
 #endif /* _UAPI_MPTCP_H */
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
index 0ec787d8b9a2..97c29caa65f3 100644
--- a/net/mptcp/Kconfig
+++ b/net/mptcp/Kconfig
@@ -14,6 +14,10 @@  config MPTCP
 
 if MPTCP
 
+config INET_MPTCP_DIAG
+	depends on INET_DIAG
+	def_tristate INET_DIAG
+
 config MPTCP_IPV6
 	bool "MPTCP: IPv6 support for Multipath TCP"
 	select IPV6
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index c53f9b845523..2360cbd27d59 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -4,6 +4,8 @@  obj-$(CONFIG_MPTCP) += mptcp.o
 mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
 	   mib.o pm_netlink.o
 
+obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
+
 mptcp_crypto_test-objs := crypto_test.o
 mptcp_token_test-objs := token_test.o
 obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
new file mode 100644
index 000000000000..30aa768e23bd
--- /dev/null
+++ b/net/mptcp/mptcp_diag.c
@@ -0,0 +1,167 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* MPTCP socket monitoring support
+ *
+ * Copyright (c) 2020 Red Hat
+ *
+ * Author: Paolo Abeni <pabeni@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/inet_diag.h>
+#include <net/netlink.h>
+#include <uapi/linux/mptcp.h>
+#include "protocol.h"
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+			struct netlink_callback *cb,
+			const struct inet_diag_req_v2 *req,
+			struct nlattr *bc, bool net_admin)
+{
+	if (!inet_diag_bc_sk(bc, sk))
+		return 0;
+
+	return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
+				 net_admin);
+}
+
+static int mptcp_diag_dump_one(struct netlink_callback *cb,
+			       const struct inet_diag_req_v2 *req)
+{
+	struct sk_buff *in_skb = cb->skb;
+	struct mptcp_sock *msk = NULL;
+	struct sk_buff *rep;
+	int err = -ENOENT;
+	struct net *net;
+	struct sock *sk;
+
+	net = sock_net(in_skb->sk);
+	msk = mptcp_token_get_sock(req->id.idiag_cookie[0]);
+	if (!msk)
+		goto out_nosk;
+
+	err = -ENOMEM;
+	sk = (struct sock *)msk;
+	rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+			inet_diag_msg_attrs_size() +
+			nla_total_size(sizeof(struct mptcp_info)) +
+			nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
+			GFP_KERNEL);
+	if (!rep)
+		goto out;
+
+	err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0,
+				netlink_net_capable(in_skb, CAP_NET_ADMIN));
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(rep);
+		goto out;
+	}
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+			      MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+out:
+	sock_put(sk);
+
+out_nosk:
+	return err;
+}
+
+static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			    const struct inet_diag_req_v2 *r)
+{
+	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+	struct net *net = sock_net(skb->sk);
+	struct inet_diag_dump_data *cb_data;
+	struct mptcp_sock *msk;
+	struct nlattr *bc;
+
+	cb_data = cb->data;
+	bc = cb_data->inet_diag_nla_bc;
+
+	for (msk = mptcp_token_iter_start(&cb->args[0], &cb->args[1]);
+	     msk; msk = mptcp_token_iter_next(&cb->args[0], &cb->args[1])) {
+		struct inet_sock *inet;
+		struct sock *sk;
+
+		inet = (struct inet_sock *)msk;
+		sk = (struct sock *)msk;
+		if (!net_eq(sock_net(sk), net))
+			continue;
+		if (!(r->idiag_states & (1 << sk->sk_state)))
+			continue;
+		if (r->sdiag_family != AF_UNSPEC &&
+		    sk->sk_family != r->sdiag_family)
+			continue;
+		if (r->id.idiag_sport != inet->inet_sport &&
+		    r->id.idiag_sport)
+			continue;
+		if (r->id.idiag_dport != inet->inet_dport &&
+		    r->id.idiag_dport)
+			continue;
+
+		if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
+			mptcp_token_iter_stop(cb->args[0]);
+
+			/* will retry on the same position */
+			cb->args[1]--;
+			break;
+		}
+	}
+}
+
+static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+				void *_info)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_info *info = _info;
+	u32 flags = 0;
+	u8 val;
+
+	r->idiag_rqueue = sk_rmem_alloc_get(sk);
+	r->idiag_wqueue = sk_wmem_alloc_get(sk);
+	if (!info)
+		return;
+
+	info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
+	info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
+	info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
+	info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max);
+	val = READ_ONCE(msk->pm.add_addr_signal_max);
+	info->mptcpi_add_addr_signal_max = val;
+	val = READ_ONCE(msk->pm.add_addr_accept_max);
+	info->mptcpi_add_addr_accepted_max = val;
+	if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
+		flags |= MPTCP_INFO_FLAG_FALLBACK;
+	if (READ_ONCE(msk->can_ack))
+		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
+	info->mptcpi_flags = flags;
+	info->mptcpi_token = READ_ONCE(msk->token);
+	info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
+	info->mptcpi_snd_una = atomic64_read(&msk->snd_una);
+	info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
+}
+
+static const struct inet_diag_handler mptcp_diag_handler = {
+	.dump		 = mptcp_diag_dump,
+	.dump_one	 = mptcp_diag_dump_one,
+	.idiag_get_info  = mptcp_diag_get_info,
+	.idiag_type	 = IPPROTO_MPTCP,
+	.idiag_info_size = sizeof(struct mptcp_info),
+};
+
+int __init mptcp_diag_init(void)
+{
+	return inet_diag_register(&mptcp_diag_handler);
+}
+
+static void __exit mptcp_diag_exit(void)
+{
+	inet_diag_unregister(&mptcp_diag_handler);
+}
+
+module_init(mptcp_diag_init);
+module_exit(mptcp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */);