diff mbox series

[RFC,11/12] mptcp: allow picking different xmit subflows

Message ID 960519a1162d5a416245fa239e57392ab0efa0be.1596216310.git.pabeni@redhat.com
State Superseded, archived
Headers show
Series mptcp: multiple xmit substreams support | expand

Commit Message

Paolo Abeni July 31, 2020, 5:39 p.m. UTC
Update the scheduler to a less trivial heuristic: cache
the last used subflow, and try to send a reasonably
long burst of data on it. When the burst or the subflow send
space is exhausted, move to the next one with available
send space.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c | 112 +++++++++++++++++++++++++++++++++++++------
 net/mptcp/protocol.h |   6 ++-
 2 files changed, 101 insertions(+), 17 deletions(-)

Comments

Florian Westphal Aug. 1, 2020, 11:50 p.m. UTC | #1
Paolo Abeni <pabeni@redhat.com> wrote:
> Update the scheduler to less trivial heuristic: cache
> the last used subflow, and try to send on it a reasonably
> long burst of data. When the burst or the subflow send
> space is exausted, move to the next one with available
> send space.
> 
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  net/mptcp/protocol.c | 112 +++++++++++++++++++++++++++++++++++++------
>  net/mptcp/protocol.h |   6 ++-
>  2 files changed, 101 insertions(+), 17 deletions(-)
> 
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index ef5b68c4ff49..89c0400593d1 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1012,39 +1012,112 @@ static void mptcp_nospace(struct sock *sk, struct sock *ssk)
>  		set_bit(SOCK_NOSPACE, &sock->flags);
>  }
>  
> +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow,
> +				 bool fallback)
> +{
> +	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> +
> +	/* subflow must be open for write */
> +	if ((1 << ssk->sk_state) &
> +	    (TCPF_CLOSE | TCPF_LAST_ACK | TCPF_CLOSING | TCPF_FIN_WAIT2 |
> +	     TCPF_FIN_WAIT1))
> +		return false;

Hmmm, why is this not checking for TCP_ESTABLISHED -> true, rather than
checking 'states i do not want'?

> +	/* we can xmit on MPC and fallen back subflows in
> +	 * TCP_SYN_SENT/TCP_SYN_RECV status, but we need fully established
> +	 * MP_JOIN subflows.
> +	 */

I don't understand this comment.  The part with 'fully established
MP_JOIN' is clear to me.

Is this about SYN_SENT state is fine if ssk tested is the 'msk->first'
subflow?

If so, it might make sense to re-arrange the check to something like
'subflow->request_join && established && subflow->fully_established &&
!fallback' -> return true
else, return state == ESTABLISHED?

or even, '.. else subflow equal to msk->first and state eq ESTABLISHED
return true'?

>  static void ssk_check_wmem(struct sock *sk, struct sock *ssk)
> @@ -1142,6 +1215,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  		}
>  
>  		copied += ret;
> +		msk->snd_burst -= ret;

Maybe add a comment that ret > snd_burst is fine.

>  		tx_ok = msg_data_left(msg);
>  		if (!tx_ok)
> @@ -1358,6 +1432,10 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
>  	unsigned int moved = 0;
>  	bool done;
>  
> +	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
> +		return false;

This looks funny -- why is this needed?

>  static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
>  {
> +	bool fallback = __mptcp_check_fallback(msk);
>  	struct mptcp_subflow_context *subflow;
>  	struct sock *backup = NULL;
>  
> @@ -1521,6 +1600,9 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
>  	mptcp_for_each_subflow(msk, subflow) {
>  		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
>  
> +		if (!mptcp_subflow_active(subflow, fallback))
> +			continue;

Hmm, should this be a separate bug fix (prevent non-fully-established-join)?
Paolo Abeni Aug. 3, 2020, 11:13 a.m. UTC | #2
On Sun, 2020-08-02 at 01:50 +0200, Florian Westphal wrote:
> Paolo Abeni <pabeni@redhat.com> wrote:
> > Update the scheduler to less trivial heuristic: cache
> > the last used subflow, and try to send on it a reasonably
> > long burst of data. When the burst or the subflow send
> > space is exausted, move to the next one with available
> > send space.
> > 
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> >  net/mptcp/protocol.c | 112 +++++++++++++++++++++++++++++++++++++------
> >  net/mptcp/protocol.h |   6 ++-
> >  2 files changed, 101 insertions(+), 17 deletions(-)
> > 
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index ef5b68c4ff49..89c0400593d1 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -1012,39 +1012,112 @@ static void mptcp_nospace(struct sock *sk, struct sock *ssk)
> >  		set_bit(SOCK_NOSPACE, &sock->flags);
> >  }
> >  
> > +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow,
> > +				 bool fallback)
> > +{
> > +	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> > +
> > +	/* subflow must be open for write */
> > +	if ((1 << ssk->sk_state) &
> > +	    (TCPF_CLOSE | TCPF_LAST_ACK | TCPF_CLOSING | TCPF_FIN_WAIT2 |
> > +	     TCPF_FIN_WAIT1))
> > +		return false;
> 
> Hmmm, why is this not checking for TCP_ESTABLISHED -> true, rather than
> checking 'states i do not want'?

I think we can pick a subflow for xmit even after receiving a TCP FIN
on such subflows, so the subflow status could be different from
TCP_ESTABLISHED. 

To my twisted mind it was more natural to check for the disallowed
states; I can flip to the allowed ones if needed.

> > +	/* we can xmit on MPC and fallen back subflows in
> > +	 * TCP_SYN_SENT/TCP_SYN_RECV status, but we need fully established
> > +	 * MP_JOIN subflows.
> > +	 */
> 
> I don't understand this comment.  The part with 'fully established
> MP_JOIN' is clear to me.
> 
> Is this about SYN_SENT state is fine if ssk tested is the 'msk->first'
> subflow?

yes, exactly.

> If so, it might make sense to re-arrange the check to something like
> 'subflow->request_join && established && subflow->fully_established &&
> !fallback' -> return true
> else, return state == ESTABLISHED?
> 
> or even, ".. else subflow requal to msk->first and state eq ESTABLISHED
> return true'?

I think we need to check for more TCP states than ESTABLISHED for both
first and MP_JOIN subflow. For '->first' we should additionally
allow SYN_SENT and TCP_SYN_RECV. Not sure the above will simplify ?!?
> 
> >  static void ssk_check_wmem(struct sock *sk, struct sock *ssk)
> > @@ -1142,6 +1215,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> >  		}
> >  
> >  		copied += ret;
> > +		msk->snd_burst -= ret;
> 
> Maybe add a comment that ret > snd_burst is fine.
> 
> >  		tx_ok = msg_data_left(msg);
> >  		if (!tx_ok)
> > @@ -1358,6 +1432,10 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> >  	unsigned int moved = 0;
> >  	bool done;
> >  
> > +	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
> > +		return false;
> 
> This looks funny -- why is this needed?

The mptcp workqueue can be invoked on a subflow with an empty rx queue -
flushed by __mptcp_close_ssk - and a non-null data_avail; in that case
__mptcp_move_skbs() will loop forever. I can likely clear data_avail in
__mptcp_close_ssk(), but never tried.

> >  static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
> >  {
> > +	bool fallback = __mptcp_check_fallback(msk);
> >  	struct mptcp_subflow_context *subflow;
> >  	struct sock *backup = NULL;
> >  
> > @@ -1521,6 +1600,9 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
> >  	mptcp_for_each_subflow(msk, subflow) {
> >  		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> >  
> > +		if (!mptcp_subflow_active(subflow, fallback))
> > +			continue;
> 
> Hmm, should this be a separate bug fix (prevent non-fully-established-join)?

I think the bug is only apparent after we have non backup subflows
and/or remove addr support ?!?

reinjection will likely need revisiting, but I haven't looked at that
yet.

/P
Florian Westphal Aug. 3, 2020, 12:56 p.m. UTC | #3
Paolo Abeni <pabeni@redhat.com> wrote:
> On Sun, 2020-08-02 at 01:50 +0200, Florian Westphal wrote:
> > Paolo Abeni <pabeni@redhat.com> wrote:
> > > Update the scheduler to less trivial heuristic: cache
> > > the last used subflow, and try to send on it a reasonably
> > > long burst of data. When the burst or the subflow send
> > > space is exausted, move to the next one with available
> > > send space.
> > > 
> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > ---
> > >  net/mptcp/protocol.c | 112 +++++++++++++++++++++++++++++++++++++------
> > >  net/mptcp/protocol.h |   6 ++-
> > >  2 files changed, 101 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > > index ef5b68c4ff49..89c0400593d1 100644
> > > --- a/net/mptcp/protocol.c
> > > +++ b/net/mptcp/protocol.c
> > > @@ -1012,39 +1012,112 @@ static void mptcp_nospace(struct sock *sk, struct sock *ssk)
> > >  		set_bit(SOCK_NOSPACE, &sock->flags);
> > >  }
> > >  
> > > +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow,
> > > +				 bool fallback)
> > > +{
> > > +	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> > > +
> > > +	/* subflow must be open for write */
> > > +	if ((1 << ssk->sk_state) &
> > > +	    (TCPF_CLOSE | TCPF_LAST_ACK | TCPF_CLOSING | TCPF_FIN_WAIT2 |
> > > +	     TCPF_FIN_WAIT1))
> > > +		return false;
> > 
> > Hmmm, why is this not checking for TCP_ESTABLISHED -> true, rather than
> > checking 'states i do not want'?
> 
> I think we can pick a subflow for xmit even after receiving a TCP FIN
> on such subflows, so the subflow status could be different from
> TCP_ESTABLISHED. 
>
> To my twiested mind was more natural check for not allowed states, I
> can flip to allowed one if needed.

I would prefer that.  Otherwise I have to invert manually during review,
and this makes no sense to me when I do the negation:

States:
TCPF_ESTABLISHED = (1 << TCP_ESTABLISHED),
TCPF_SYN_SENT    = (1 << TCP_SYN_SENT),
TCPF_SYN_RECV    = (1 << TCP_SYN_RECV),
TCPF_FIN_WAIT1   = (1 << TCP_FIN_WAIT1),
TCPF_FIN_WAIT2   = (1 << TCP_FIN_WAIT2),
TCPF_TIME_WAIT   = (1 << TCP_TIME_WAIT),
TCPF_CLOSE       = (1 << TCP_CLOSE),
TCPF_CLOSE_WAIT  = (1 << TCP_CLOSE_WAIT),
TCPF_LAST_ACK    = (1 << TCP_LAST_ACK),
TCPF_LISTEN      = (1 << TCP_LISTEN),
TCPF_CLOSING     = (1 << TCP_CLOSING),
TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),

So, above checks disables:
TCPF_CLOSE       = (1 << TCP_CLOSE),
TCPF_LAST_ACK    = (1 << TCP_LAST_ACK),
TCPF_CLOSING     = (1 << TCP_CLOSING),
TCPF_FIN_WAIT1   = (1 << TCP_FIN_WAIT1),
TCPF_FIN_WAIT2   = (1 << TCP_FIN_WAIT2),

... which makes following ok:

TCPF_ESTABLISHED = (1 << TCP_ESTABLISHED),

Agree on that.

TCPF_SYN_SENT    = (1 << TCP_SYN_SENT),

we don't know yet if connection will complete, or if
mptcp will be available.

TCPF_SYN_RECV    = (1 << TCP_SYN_RECV),

same.

TCPF_TIME_WAIT   = (1 << TCP_TIME_WAIT),

This is also strange: this is a quiescence period to catch
in-flight packets.  I don't think this subflow should still
be used for xmit.

TCPF_CLOSE_WAIT  = (1 << TCP_CLOSE_WAIT),

Agree on this one as well -- its ok to xmit here.

TCPF_LISTEN      = (1 << TCP_LISTEN),

Should not happen of course, but its definitely
not usable :-)

TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),

and that is... mhhh... strange.  Why is this one ok?

Checking 'fallback' made no sense to me.  Why do we need to check for this?

Am I missing anything?  If yes, please add a comment explaining that
scenario.  Same for fallback, I do not understand how fallback yes/no makes
a difference here.

> > I don't understand this comment.  The part with 'fully established
> > MP_JOIN' is clear to me.
> > 
> > Is this about SYN_SENT state is fine if ssk tested is the 'msk->first'
> > subflow?
> 
> yes, exactly.

Ok.  Would still like to understand how this can occur (data to transmit,
first connection not established yet).

> > If so, it might make sense to re-arrange the check to something like
> > 'subflow->request_join && established && subflow->fully_established &&
> > !fallback' -> return true
> > else, return state == ESTABLISHED?
> > 
> > or even, ".. else subflow requal to msk->first and state eq ESTABLISHED
> > return true'?
> 
> I think we need to check for more TCP states than ESTABLISHED for both
> first and MP_JOIN subflow. For '->first' we should additionally
> allow SYN_SENT and TCP_SYN_RECV. Not sure the above will simplify ?!?

Check for syn sent/recv makes no sense to me.  How can we get
data to send at this point?

I would propose something like:

/* Tell whether @subflow may currently be selected for transmission.
 *
 * An MP_JOIN subflow qualifies only once it is fully established; any
 * subflow qualifies only while its TCP socket is in ESTABLISHED or
 * CLOSE_WAIT state.
 */
static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
	bool join_pending;

	/* a JOIN that hasn't completed yet is not usable for mptcp */
	join_pending = subflow->request_join && !subflow->fully_established;
	if (join_pending)
		return false;

	/* xmit is allowed only as long as our side has not closed */
	return ((1 << ssk->sk_state) &
		(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) != 0;
}

Way less cases/tests.  What breaks/is missing here?

> > > @@ -1358,6 +1432,10 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> > >  	unsigned int moved = 0;
> > >  	bool done;
> > >  
> > > +	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
> > > +		return false;
> > 
> > This looks funny -- why is this needed?
> 
> mptcp workqueue is called on a subflow with empty rx queue - flushed by 
> __mptcp_close_ssk - and not null data_avail. __mptcp_move_skbs() will
> loop forever. I can likely clear data_avail in __mptcp_close_ssk(), but
> never tried.

Ok, if that doesn't work please just add a small comment with above
explanation.  Thanks.

> > Hmm, should this be a separate bug fix (prevent non-fully-established-join)?
> 
> I think the bug is only apparent after we have non backup subflows
> and/or remove addr support ?!?

What about other mptcp stack peer?  Couldn't they create a non-backup flow?

[ I don't mind if you don't want a separate patch for this, it just
looked like a small fix to me that could be applied already ]
Paolo Abeni Aug. 3, 2020, 1:23 p.m. UTC | #4
On Mon, 2020-08-03 at 14:56 +0200, Florian Westphal wrote:
> Paolo Abeni <pabeni@redhat.com> wrote:
> > On Sun, 2020-08-02 at 01:50 +0200, Florian Westphal wrote:
> > > Paolo Abeni <pabeni@redhat.com> wrote:
> > > > Update the scheduler to less trivial heuristic: cache
> > > > the last used subflow, and try to send on it a reasonably
> > > > long burst of data. When the burst or the subflow send
> > > > space is exausted, move to the next one with available
> > > > send space.
> > > > 
> > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > > ---
> > > >  net/mptcp/protocol.c | 112 +++++++++++++++++++++++++++++++++++++------
> > > >  net/mptcp/protocol.h |   6 ++-
> > > >  2 files changed, 101 insertions(+), 17 deletions(-)
> > > > 
> > > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > > > index ef5b68c4ff49..89c0400593d1 100644
> > > > --- a/net/mptcp/protocol.c
> > > > +++ b/net/mptcp/protocol.c
> > > > @@ -1012,39 +1012,112 @@ static void mptcp_nospace(struct sock *sk, struct sock *ssk)
> > > >  		set_bit(SOCK_NOSPACE, &sock->flags);
> > > >  }
> > > >  
> > > > +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow,
> > > > +				 bool fallback)
> > > > +{
> > > > +	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> > > > +
> > > > +	/* subflow must be open for write */
> > > > +	if ((1 << ssk->sk_state) &
> > > > +	    (TCPF_CLOSE | TCPF_LAST_ACK | TCPF_CLOSING | TCPF_FIN_WAIT2 |
> > > > +	     TCPF_FIN_WAIT1))
> > > > +		return false;
> > > 
> > > Hmmm, why is this not checking for TCP_ESTABLISHED -> true, rather than
> > > checking 'states i do not want'?
> > 
> > I think we can pick a subflow for xmit even after receiving a TCP FIN
> > on such subflows, so the subflow status could be different from
> > TCP_ESTABLISHED. 
> > 
> > To my twiested mind was more natural check for not allowed states, I
> > can flip to allowed one if needed.
> 
> I would prefer that.  Otherwise I have to invert manually during review,
> and this makes no sense to me when I do the negation:
> 
> States:
> TCPF_ESTABLISHED = (1 << TCP_ESTABLISHED),
> TCPF_SYN_SENT    = (1 << TCP_SYN_SENT),
> TCPF_SYN_RECV    = (1 << TCP_SYN_RECV),
> TCPF_FIN_WAIT1   = (1 << TCP_FIN_WAIT1),
> TCPF_FIN_WAIT2   = (1 << TCP_FIN_WAIT2),
> TCPF_TIME_WAIT   = (1 << TCP_TIME_WAIT),
> TCPF_CLOSE       = (1 << TCP_CLOSE),
> TCPF_CLOSE_WAIT  = (1 << TCP_CLOSE_WAIT),
> TCPF_LAST_ACK    = (1 << TCP_LAST_ACK),
> TCPF_LISTEN      = (1 << TCP_LISTEN),
> TCPF_CLOSING     = (1 << TCP_CLOSING),
> TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),
> 
> So, above checks disables:
> TCPF_CLOSE       = (1 << TCP_CLOSE),
> TCPF_LAST_ACK    = (1 << TCP_LAST_ACK),
> TCPF_CLOSING     = (1 << TCP_CLOSING),
> TCPF_FIN_WAIT1   = (1 << TCP_FIN_WAIT1),
> TCPF_FIN_WAIT2   = (1 << TCP_FIN_WAIT2),
> 
> ... which makes following ok:
> 
> TCPF_ESTABLISHED = (1 << TCP_ESTABLISHED),
> 
> Agree on that.
> 
> TCPF_SYN_SENT    = (1 << TCP_SYN_SENT),
> 
> we don't know yet if connection will complete, or if
> mptcp will be available.
> 
> TCPF_SYN_RECV    = (1 << TCP_SYN_RECV),
> 
> same.
> 
> TCPF_TIME_WAIT   = (1 << TCP_TIME_WAIT),
> 
> This is also strange, this is a quiesence period to catch
> in-flight packets.  I don't think this subflow should still
> be used for xmit.
> 
> TCPF_CLOSE_WAIT  = (1 << TCP_CLOSE_WAIT),
> 
> Agree on this one as well -- its ok to xmit here.
> 
> TCPF_LISTEN      = (1 << TCP_LISTEN),
> 
> Should not happen of course, but its definitely
> not usable :-)
> 
> TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),
> 
> and that is... mhhh... strange.  Why is this one ok?

Agreed to the above - we also already have sk_stream_wait_connect() if
state is not TCPF_ESTABLISHED | TCPF_CLOSE_WAIT.

> Checking 'fallback' made no sense to me.  Why do we need to check for this?

In theory we should not have additional subflows in the fallback status,
but we can have an MPJ handshake and fallback racing together, so we
end up with the msk in fallback status and one or more fully established
subflows. If/when that happens we must avoid picking MPJ subflows. 

Perhaps it is just easier/clearer to add an explicit (unlikely???) case
in mptcp_subflow_get_send() and mptcp_subflow_get_retrans()

> > > Hmm, should this be a separate bug fix (prevent non-fully-established-join)?
> > 
> > I think the bug is only apparent after we have non backup subflows
> > and/or remove addr support ?!?
> 
> What about other mptcp stack peer?  Couldn't they create a non-backup flow?
> 
> [ I don't mind if you don't want a separate patch for this, it just
> looked like a small fix to me that could be applied already ]

yep, we can have non backup subflows with mptcp.org peers. I still
would opt for a single patch ;)

/P
Florian Westphal Aug. 3, 2020, 1:49 p.m. UTC | #5
Paolo Abeni <pabeni@redhat.com> wrote:
> > Checking 'fallback' made no sense to me.  Why do we need to check for this?
> 
> In teory we should not have additional subflows in the fallback status,
> but we can have an MPJ handskake and fallback racing togethar, so we
> end-up with msk in fallback status and one or more fully established
> subflows. If/when that happen we must avoid picking mpj subflows. 

That sounds bad.  I'd suggest to make sure that we only use
msk->first if msk is marked fallback.

> Perhaps is just easier/clearer add an explict (unlikely???) case
> in mptcp_subflow_get_send() and mptcp_subflow_get_retrans()

Agree, its easier to understand, at least.

> > > > Hmm, should this be a separate bug fix (prevent non-fully-established-join)?
> > > 
> > > I think the bug is only apparent after we have non backup subflows
> > > and/or remove addr support ?!?
> > 
> > What about other mptcp stack peer?  Couldn't they create a non-backup flow?
> > 
> > [ I don't mind if you don't want a separate patch for this, it just
> > looked like a small fix to me that could be applied already ]
> 
> yep, we can have non backup subflows with mptcp.org peers. I still
> would opt for a single patch ;)

Ok, fair enough, I thought you could split that from the series and
get it merged already.
diff mbox series

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index ef5b68c4ff49..89c0400593d1 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1012,39 +1012,112 @@  static void mptcp_nospace(struct sock *sk, struct sock *ssk)
 		set_bit(SOCK_NOSPACE, &sock->flags);
 }
 
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow,
+				 bool fallback)
+{
+	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+	/* subflow must be open for write */
+	if ((1 << ssk->sk_state) &
+	    (TCPF_CLOSE | TCPF_LAST_ACK | TCPF_CLOSING | TCPF_FIN_WAIT2 |
+	     TCPF_FIN_WAIT1))
+		return false;
+
+	/* we can xmit on MPC and fallen back subflows in
+	 * TCP_SYN_SENT/TCP_SYN_RECV status, but we need fully established
+	 * MP_JOIN subflows.
+	 */
+	return !subflow->request_join ||
+	       (subflow->fully_established && !fallback);
+}
+
+#define MPTCP_SEND_BURST_SIZE		(1 << 15)
+
+static struct list_head *mptcp_next_subflow(struct mptcp_sock *msk,
+					    struct list_head *pos,
+					    bool wrap_around)
+{
+	if (list_is_last(pos, &msk->conn_list) && wrap_around)
+		return msk->conn_list.next;
+	return pos->next;
+}
+
 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 					   int *sndbuf)
 {
+	int next_wspace = 0, next_bu_wspace = 0, last_snd_wspace = 0;
+	bool fallback = __mptcp_check_fallback(msk);
 	struct mptcp_subflow_context *subflow;
-	struct sock *sk = (struct sock *)msk;
-	struct sock *backup = NULL;
+	struct sock *next_backup = NULL;
+	struct list_head *pos, *start;
+	struct sock *next_ssk = NULL;
+	bool wrap_around;
 
-	sock_owned_by_me(sk);
+	sock_owned_by_me((struct sock *)msk);
 
 	*sndbuf = 0;
 	if (!mptcp_ext_cache_refill(msk))
 		return NULL;
 
-	mptcp_for_each_subflow(msk, subflow) {
-		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+	/* lookup the first writable subflow and first writable backup subflow
+	 * starting from last used, with wrap-around
+	 */
+	if (msk->last_snd) {
+		start = &mptcp_subflow_ctx(msk->last_snd)->node;
+		wrap_around = true;
+	} else {
+		start = &msk->conn_list;
+		wrap_around = false;
+	}
+	pr_debug("msk=%p start=%p pos=%p wrap_around=%d last=%p", msk, start,
+		 mptcp_next_subflow(msk, start, wrap_around), wrap_around,
+		 msk->last_snd);
+	for (pos = mptcp_next_subflow(msk, start, wrap_around); pos != start;
+	     pos = mptcp_next_subflow(msk, pos, wrap_around)) {
+		struct sock *ssk;
+		int wspace;
 
-		if (!sk_stream_memory_free(sk)) {
-			mptcp_nospace(sk, ssk);
-			return NULL;
-		}
+		subflow = list_entry(pos, struct mptcp_subflow_context, node);
+		ssk =  mptcp_subflow_tcp_sock(subflow);
+		if (!mptcp_subflow_active(subflow, fallback))
+			continue;
 
 		*sndbuf = max(ssk->sk_sndbuf, *sndbuf);
-		if (subflow->backup) {
-			if (!backup)
-				backup = ssk;
-
+		wspace = sk_stream_wspace(ssk);
+		if (wspace <= 0)
 			continue;
+
+		if (!subflow->backup && !next_ssk) {
+			next_ssk = ssk;
+			next_wspace = wspace;
 		}
 
-		return ssk;
+		if (subflow->backup && !next_backup) {
+			next_backup = ssk;
+			next_bu_wspace = wspace;
+		}
+	}
+	if (!next_ssk) {
+		next_ssk = next_backup;
+		next_wspace = next_bu_wspace;
 	}
+	if (msk->last_snd) {
+		*sndbuf = max(msk->last_snd->sk_sndbuf, *sndbuf);
+		last_snd_wspace = sk_stream_wspace(msk->last_snd);
+	}
+	pr_debug("msk=%p ssk=%p last=%p wspace=%d last wspace=%d burst=%d", msk,
+		 next_ssk, msk->last_snd, next_wspace, last_snd_wspace,
+		 msk->snd_burst);
 
-	return backup;
+	/* use the looked-up subflow if the previously used has exhausted the burst
+	 * or is not writable
+	 */
+	if (next_ssk && (last_snd_wspace <= 0 || msk->snd_burst <= 0)) {
+		msk->last_snd = next_ssk;
+		msk->snd_burst = min(MPTCP_SEND_BURST_SIZE, next_wspace);
+		last_snd_wspace = next_wspace;
+	}
+	return last_snd_wspace > 0 ? msk->last_snd : NULL;
 }
 
 static void ssk_check_wmem(struct sock *sk, struct sock *ssk)
@@ -1142,6 +1215,7 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		}
 
 		copied += ret;
+		msk->snd_burst -= ret;
 
 		tx_ok = msg_data_left(msg);
 		if (!tx_ok)
@@ -1358,6 +1432,10 @@  static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 	unsigned int moved = 0;
 	bool done;
 
+	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
+		return false;
+
+	__mptcp_flush_join_list(msk);
 	do {
 		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
 
@@ -1513,6 +1591,7 @@  static void mptcp_retransmit_timer(struct timer_list *t)
  */
 static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
 {
+	bool fallback = __mptcp_check_fallback(msk);
 	struct mptcp_subflow_context *subflow;
 	struct sock *backup = NULL;
 
@@ -1521,6 +1600,9 @@  static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
+		if (!mptcp_subflow_active(subflow, fallback))
+			continue;
+
 		/* still data outstanding at TCP level?  Don't retransmit. */
 		if (!tcp_write_queue_empty(ssk))
 			return NULL;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 5467eb1adebd..0381e63d866d 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -196,6 +196,8 @@  struct mptcp_sock {
 	u64		write_seq;
 	u64		ack_seq;
 	u64		rcv_data_fin_seq;
+	struct sock	*last_snd;
+	int		snd_burst;
 	atomic64_t	snd_una;
 	unsigned long	timer_ival;
 	u32		token;
@@ -469,12 +471,12 @@  static inline bool before64(__u64 seq1, __u64 seq2)
 
 void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
 
-static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
 {
 	return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
 }
 
-static inline bool mptcp_check_fallback(struct sock *sk)
+static inline bool mptcp_check_fallback(const struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);