diff mbox series

[02/10] mptcp: tag sequence_seq with socket state

Message ID 20210408184936.6245-3-fw@strlen.de
State Superseded, archived
Delegated to: Paolo Abeni
Headers show
Series mptcp: add SOL_SOCKET support | expand

Commit Message

Florian Westphal April 8, 2021, 6:49 p.m. UTC
Paolo Abeni suggested to avoid re-syncing new subflows because
they inherit options from listener. In case options were set on
listener but are not set on mptcp-socket there is no need to
do any synchronisation for new subflows.

This change sets sockopt_seq of new mptcp sockets to the seq of
the mptcp listener sock.

Subflow sequence is set to the embedded tcp listener sk.
Add a comment explaining why sk_state is involved in sockopt_seq
generation.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/mptcp/protocol.c | 16 ++++++++++-----
 net/mptcp/protocol.h |  4 ++++
 net/mptcp/sockopt.c  | 47 ++++++++++++++++++++++++++++++++++++++++++--
 net/mptcp/subflow.c  |  4 ++++
 4 files changed, 64 insertions(+), 7 deletions(-)

Comments

Mat Martineau April 10, 2021, 12:12 a.m. UTC | #1
On Thu, 8 Apr 2021, Florian Westphal wrote:

> Paolo Abeni suggested to avoid re-syncing new subflows because
> they inherit options from listener. In case options were set on
> listener but are not set on mptcp-socket there is no need to
> do any synchronisation for new subflows.
>
> This change sets sockopt_seq of new mptcp sockets to the seq of
> the mptcp listener sock.
>
> Subflow sequence is set to the embedded tcp listener sk.
> Add a comment explaining why sk_state is involved in sockopt_seq
> generation.
>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
> net/mptcp/protocol.c | 16 ++++++++++-----
> net/mptcp/protocol.h |  4 ++++
> net/mptcp/sockopt.c  | 47 ++++++++++++++++++++++++++++++++++++++++++--
> net/mptcp/subflow.c  |  4 ++++
> 4 files changed, 64 insertions(+), 7 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 2233baf03c3a..26e3ffe8840e 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -729,11 +729,14 @@ static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
> 	if (likely(list_empty(&msk->join_list)))
> 		return false;
>
> -	ret = true;
> 	spin_lock_bh(&msk->join_list_lock);
> -	list_for_each_entry(subflow, &msk->join_list, node)
> -		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
> +	list_for_each_entry(subflow, &msk->join_list, node) {
> +		u32 sseq = READ_ONCE(subflow->setsockopt_seq);
>
> +		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
> +		if (READ_ONCE(msk->setsockopt_seq) != sseq)
> +			ret = true;
> +	}
> 	list_splice_tail_init(&msk->join_list, &msk->conn_list);
> 	spin_unlock_bh(&msk->join_list_lock);
>
> @@ -745,7 +748,8 @@ void __mptcp_flush_join_list(struct mptcp_sock *msk)
> 	if (likely(!mptcp_do_flush_join_list(msk)))
> 		return;
>
> -	if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
> +	if (msk->setsockopt_seq &&
> +	    !test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
> 		mptcp_schedule_work((struct sock *)msk);
> }
>
> @@ -758,7 +762,8 @@ static void mptcp_flush_join_list(struct mptcp_sock *msk)
> 	if (!mptcp_do_flush_join_list(msk) && !sync_needed)
> 		return;
>
> -	mptcp_sockopt_sync_all(msk);
> +	if (msk->setsockopt_seq)
> +		mptcp_sockopt_sync_all(msk);
> }
>
> static bool mptcp_timer_pending(struct sock *sk)
> @@ -2712,6 +2717,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
> 	msk->snd_nxt = msk->write_seq;
> 	msk->snd_una = msk->write_seq;
> 	msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
> +	msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
>
> 	if (mp_opt->mp_capable) {
> 		msk->can_ack = true;
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 55fe05a40216..edc0128730df 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -256,6 +256,8 @@ struct mptcp_sock {
> 		u64	time;	/* start time of measurement window */
> 		u64	rtt_us; /* last maximum rtt of subflows */
> 	} rcvq_space;
> +
> +	u32 setsockopt_seq;
> };
>
> #define mptcp_lock_sock(___sk, cb) do {					\
> @@ -414,6 +416,8 @@ struct mptcp_subflow_context {
> 	long	delegated_status;
> 	struct	list_head delegated_node;   /* link into delegated_action, protected by local BH */
>
> +	u32 setsockopt_seq;
> +
> 	struct	sock *tcp_sock;	    /* tcp sk backpointer */
> 	struct	sock *conn;	    /* parent mptcp_sock */
> 	const	struct inet_connection_sock_af_ops *icsk_af_ops;
> diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> index 4fdc0ad6acf7..b1950be15abe 100644
> --- a/net/mptcp/sockopt.c
> +++ b/net/mptcp/sockopt.c
> @@ -24,6 +24,27 @@ static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
> 	return msk->first;
> }
>
> +static u32 sockopt_seq_reset(const struct sock *sk)
> +{
> +	sock_owned_by_me(sk);
> +
> +	/* Highbits contain state.  Allows to distinguish sockopt_seq
> +	 * of listener and established:
> +	 * s0 = new_listener()
> +	 * sockopt(s0) - seq is 1
> +	 * s1 = accept(s0) - s1 inherits seq 1 of listener sk (s0)
> +	 * sockopt(s0) - seq increments to 2 on s0
> +	 * sockopt(s1) // seq increments to 2 on s1 (different option)
> +	 * new ssk completes join, inherits options from s0 // seq 2
> +	 * Needs sync from mptcp join logic, but ssk->seq == msk->seq
> +	 *
> +	 * Set High order bits to sk_state so ssk->seq == msk->seq test
> +	 * will fail.
> +	 */
> +
> +	return sk->sk_state << 24u;

I think this works because sk->sk_state gets automatically promoted to an 
int, but maybe "(u32)sk->sk_state << 24" is the intent?

> +}
> +
> static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
> 				       sockptr_t optval, unsigned int optlen)
> {
> @@ -350,22 +371,44 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname,
> 	return -EOPNOTSUPP;
> }
>
> +static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
> +{
> +}
> +
> void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
> {
> +	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
> +
> 	msk_owned_by_me(msk);
> +
> +	if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
> +		__mptcp_sockopt_sync(msk, ssk);
> +
> +		subflow->setsockopt_seq = msk->setsockopt_seq;
> +	}
> }
>
> void mptcp_sockopt_sync_all(struct mptcp_sock *msk)
> {
> 	struct mptcp_subflow_context *subflow;
> +	struct sock *sk = (struct sock *)msk;
> +	u32 seq;
>
> -	msk_owned_by_me(msk);
> +	seq = sockopt_seq_reset(sk);
>
> 	mptcp_for_each_subflow(msk, subflow) {
> 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> +		u32 sseq = READ_ONCE(subflow->setsockopt_seq);
>
> -		mptcp_sockopt_sync(msk, ssk);
> +		if (sseq != msk->setsockopt_seq) {
> +			__mptcp_sockopt_sync(msk, ssk);
> +			WRITE_ONCE(subflow->setsockopt_seq, seq);
> +		} else if (sseq != seq) {
> +			WRITE_ONCE(subflow->setsockopt_seq, seq);
> +		}
>
> 		cond_resched();
> 	}
> +
> +	msk->setsockopt_seq = seq;

One of my comments on the first RFC of this series was a suggestion to 
avoid setsockopt_seq overflow by resetting it in mptcp_sockopt_sync_all() 
- which this code does!

What I didn't realize at that time was that mptcp_sockopt_sync_all() might 
not be called for a very long time (or at all?) if there are no joins to 
trigger it. To really protect against overflow, mptcp_setsockopt() can 
call (or schedule) mptcp_sockopt_sync_all() if the non-sk_state bits of 
msk->setsockopt_seq exceed a threshold.

> }
> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> index fc7107f8d81b..82e91b00ad39 100644
> --- a/net/mptcp/subflow.c
> +++ b/net/mptcp/subflow.c
> @@ -681,6 +681,9 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
> 			goto out;
> 		}
>
> +		/* ssk inherits options of listener sk */
> +		ctx->setsockopt_seq = listener->setsockopt_seq;
> +
> 		if (ctx->mp_capable) {
> 			/* this can't race with mptcp_close(), as the msk is
> 			 * not yet exposed to user-space
> @@ -696,6 +699,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
> 			 * created mptcp socket
> 			 */
> 			new_msk->sk_destruct = mptcp_sock_destruct;
> +			mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq;
> 			mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
> 			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
> 			ctx->conn = new_msk;
> -- 
> 2.26.3
>
>
>

--
Mat Martineau
Intel
Florian Westphal April 12, 2021, 8:05 a.m. UTC | #2
Mat Martineau <mathew.j.martineau@linux.intel.com> wrote:
> On Thu, 8 Apr 2021, Florian Westphal wrote:
> 
> > +	return sk->sk_state << 24u;
> 
> I think this works because sk->sk_state gets automatically promoted to an
> int, but maybe "(u32)sk->sk_state << 24" is the intent?

Yes, I can add an explicit cast.

> What I didn't realize at that time was that mptcp_sockopt_sync_all() might
> not be called for a very long time (or at all?) if there are no joins to
> trigger it. To really protect against overflow, mptcp_setsockopt() can call
> (or schedule) mptcp_sockopt_sync_all() if the non-sk_state bits of
> msk->setsockopt_seq exceed a threshold.

Next patch adds sockopt_seq_inc() which is supposed to guarantee that it
never wraps to 0.  I can change it so it will wrap in the 24bit sequence
space only, so it would cycle back to sk->sk_state << 24u.
Paolo Abeni April 13, 2021, 9:58 a.m. UTC | #3
On Thu, 2021-04-08 at 20:49 +0200, Florian Westphal wrote:
> Paolo Abeni suggested to avoid re-syncing new subflows because
> they inherit options from listener. In case options were set on
> listener but are not set on mptcp-socket there is no need to
> do any synchronisation for new subflows.
> 
> This change sets sockopt_seq of new mptcp sockets to the seq of
> the mptcp listener sock.
> 
> Subflow sequence is set to the embedded tcp listener sk.
> Add a comment explaining why sk_state is involved in sockopt_seq
> generation.
> 
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  net/mptcp/protocol.c | 16 ++++++++++-----
>  net/mptcp/protocol.h |  4 ++++
>  net/mptcp/sockopt.c  | 47 ++++++++++++++++++++++++++++++++++++++++++--
>  net/mptcp/subflow.c  |  4 ++++
>  4 files changed, 64 insertions(+), 7 deletions(-)
> 
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 2233baf03c3a..26e3ffe8840e 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -729,11 +729,14 @@ static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
>  	if (likely(list_empty(&msk->join_list)))
>  		return false;
>  
> -	ret = true;
>  	spin_lock_bh(&msk->join_list_lock);
> -	list_for_each_entry(subflow, &msk->join_list, node)
> -		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
> +	list_for_each_entry(subflow, &msk->join_list, node) {
> +		u32 sseq = READ_ONCE(subflow->setsockopt_seq);
>  
> +		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
> +		if (READ_ONCE(msk->setsockopt_seq) != sseq)
> +			ret = true;

I'm a bit lost WRT 'ret' scope. I don't see where it's initialized.
Does the compiler emit a warning here?

Perhaps moving the 'ret' variable definition to this patch will makes
the things clearer?!?

> +	}
>  	list_splice_tail_init(&msk->join_list, &msk->conn_list);
>  	spin_unlock_bh(&msk->join_list_lock);
>  
> @@ -745,7 +748,8 @@ void __mptcp_flush_join_list(struct mptcp_sock *msk)
>  	if (likely(!mptcp_do_flush_join_list(msk)))
>  		return;
>  
> -	if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
> +	if (msk->setsockopt_seq &&
> +	    !test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
>  		mptcp_schedule_work((struct sock *)msk);

It's not obvious to me why the 'msk->setsockopt_seq' check is needed
here (and in the below chunk) ???

Perhaps a comment would help?

Cheers,

Paolo
Florian Westphal April 13, 2021, 2:58 p.m. UTC | #4
Paolo Abeni <pabeni@redhat.com> wrote:
> > +	list_for_each_entry(subflow, &msk->join_list, node) {
> > +		u32 sseq = READ_ONCE(subflow->setsockopt_seq);
> >  
> > +		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
> > +		if (READ_ONCE(msk->setsockopt_seq) != sseq)
> > +			ret = true;
> 
> I'm a bit lost WRT 'ret' scope. I don't see where it's initialized.
> Does the compiler emit a warning here?

Did not see one, but it looks broken.

> Perhaps moving the 'ret' variable definition to this patch will makes
> the things clearer?!?

Done.

> > -	if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
> > +	if (msk->setsockopt_seq &&
> > +	    !test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
> >  		mptcp_schedule_work((struct sock *)msk);
> 
> It's not obvious to me why the 'msk->setsockopt_seq' check is needed
> here (and in the below chunk) ???
> 
> Perhaps a comment would help?

Perhaps:
+static bool setsockopt_was_called(const struct mptcp_sock *msk)
+{
+       return msk->setsockopt_seq > 0;
+}
+

if (setsockopt_was_called(msk) && ..

?
diff mbox series

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 2233baf03c3a..26e3ffe8840e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -729,11 +729,14 @@  static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
 	if (likely(list_empty(&msk->join_list)))
 		return false;
 
-	ret = true;
 	spin_lock_bh(&msk->join_list_lock);
-	list_for_each_entry(subflow, &msk->join_list, node)
-		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
+	list_for_each_entry(subflow, &msk->join_list, node) {
+		u32 sseq = READ_ONCE(subflow->setsockopt_seq);
 
+		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
+		if (READ_ONCE(msk->setsockopt_seq) != sseq)
+			ret = true;
+	}
 	list_splice_tail_init(&msk->join_list, &msk->conn_list);
 	spin_unlock_bh(&msk->join_list_lock);
 
@@ -745,7 +748,8 @@  void __mptcp_flush_join_list(struct mptcp_sock *msk)
 	if (likely(!mptcp_do_flush_join_list(msk)))
 		return;
 
-	if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
+	if (msk->setsockopt_seq &&
+	    !test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
 		mptcp_schedule_work((struct sock *)msk);
 }
 
@@ -758,7 +762,8 @@  static void mptcp_flush_join_list(struct mptcp_sock *msk)
 	if (!mptcp_do_flush_join_list(msk) && !sync_needed)
 		return;
 
-	mptcp_sockopt_sync_all(msk);
+	if (msk->setsockopt_seq)
+		mptcp_sockopt_sync_all(msk);
 }
 
 static bool mptcp_timer_pending(struct sock *sk)
@@ -2712,6 +2717,7 @@  struct sock *mptcp_sk_clone(const struct sock *sk,
 	msk->snd_nxt = msk->write_seq;
 	msk->snd_una = msk->write_seq;
 	msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
+	msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
 
 	if (mp_opt->mp_capable) {
 		msk->can_ack = true;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 55fe05a40216..edc0128730df 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -256,6 +256,8 @@  struct mptcp_sock {
 		u64	time;	/* start time of measurement window */
 		u64	rtt_us; /* last maximum rtt of subflows */
 	} rcvq_space;
+
+	u32 setsockopt_seq;
 };
 
 #define mptcp_lock_sock(___sk, cb) do {					\
@@ -414,6 +416,8 @@  struct mptcp_subflow_context {
 	long	delegated_status;
 	struct	list_head delegated_node;   /* link into delegated_action, protected by local BH */
 
+	u32 setsockopt_seq;
+
 	struct	sock *tcp_sock;	    /* tcp sk backpointer */
 	struct	sock *conn;	    /* parent mptcp_sock */
 	const	struct inet_connection_sock_af_ops *icsk_af_ops;
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 4fdc0ad6acf7..b1950be15abe 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -24,6 +24,27 @@  static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
 	return msk->first;
 }
 
+static u32 sockopt_seq_reset(const struct sock *sk)
+{
+	sock_owned_by_me(sk);
+
+	/* Highbits contain state.  Allows to distinguish sockopt_seq
+	 * of listener and established:
+	 * s0 = new_listener()
+	 * sockopt(s0) - seq is 1
+	 * s1 = accept(s0) - s1 inherits seq 1 of listener sk (s0)
+	 * sockopt(s0) - seq increments to 2 on s0
+	 * sockopt(s1) // seq increments to 2 on s1 (different option)
+	 * new ssk completes join, inherits options from s0 // seq 2
+	 * Needs sync from mptcp join logic, but ssk->seq == msk->seq
+	 *
+	 * Set High order bits to sk_state so ssk->seq == msk->seq test
+	 * will fail.
+	 */
+
+	return sk->sk_state << 24u;
+}
+
 static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
 				       sockptr_t optval, unsigned int optlen)
 {
@@ -350,22 +371,44 @@  int mptcp_getsockopt(struct sock *sk, int level, int optname,
 	return -EOPNOTSUPP;
 }
 
+static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
+{
+}
+
 void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
 {
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
 	msk_owned_by_me(msk);
+
+	if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
+		__mptcp_sockopt_sync(msk, ssk);
+
+		subflow->setsockopt_seq = msk->setsockopt_seq;
+	}
 }
 
 void mptcp_sockopt_sync_all(struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow;
+	struct sock *sk = (struct sock *)msk;
+	u32 seq;
 
-	msk_owned_by_me(msk);
+	seq = sockopt_seq_reset(sk);
 
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+		u32 sseq = READ_ONCE(subflow->setsockopt_seq);
 
-		mptcp_sockopt_sync(msk, ssk);
+		if (sseq != msk->setsockopt_seq) {
+			__mptcp_sockopt_sync(msk, ssk);
+			WRITE_ONCE(subflow->setsockopt_seq, seq);
+		} else if (sseq != seq) {
+			WRITE_ONCE(subflow->setsockopt_seq, seq);
+		}
 
 		cond_resched();
 	}
+
+	msk->setsockopt_seq = seq;
 }
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index fc7107f8d81b..82e91b00ad39 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -681,6 +681,9 @@  static struct sock *subflow_syn_recv_sock(const struct sock *sk,
 			goto out;
 		}
 
+		/* ssk inherits options of listener sk */
+		ctx->setsockopt_seq = listener->setsockopt_seq;
+
 		if (ctx->mp_capable) {
 			/* this can't race with mptcp_close(), as the msk is
 			 * not yet exposted to user-space (fix typo: "exposed")
@@ -696,6 +699,7 @@  static struct sock *subflow_syn_recv_sock(const struct sock *sk,
 			 * created mptcp socket
 			 */
 			new_msk->sk_destruct = mptcp_sock_destruct;
+			mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq;
 			mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
 			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
 			ctx->conn = new_msk;