Message ID | 20200905103324.27066-3-fw@strlen.de |
---|---|
State | Superseded, archived |
Commit | bef7797391292b1210d0ecd4b4a90483e5747e05 |
Delegated to: | Florian Westphal |
Headers | show |
Series | mptcp: rework poll+nospace handling | expand |
On Sat, 2020-09-05 at 12:33 +0200, Florian Westphal wrote: > At this time, MPTCP maintains a status bit, MPTCP_SEND_SPACE, > that is set when at least one subflow and the mptcp socket itself > are writeable. > > poll path set EPOLLOUT if the bit is set. > sendmsg path makes sure MPTCP_SEND_SPACE gets cleared when last > write has used up all subflows or the mptcp socket wmem. > > This reworks nospace handling as follows: > > MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE. > This bit is set when the mptcp socket is not writeable anymore > and makes the mptcp-level ack path schedule the mptcp worker. > > This is needed to wake up userspace processes that wait for a POLLOUT > event. > > sendmsg will set MPTCP_NOSPACE only when it has to wait for more > wmem (blocking I/O case). > > poll path will also set MPTCP_NOSPACE in case the mptcp socket > is not writeable. > > Normal tcp-level notification (SOCK_NOSPACE) is only enabled > in case the subflow socket has no available wmem. > > Signed-off-by: Florian Westphal <fw@strlen.de> > --- > net/mptcp/protocol.c | 98 ++++++++++++++++++++++++++++++-------------- > net/mptcp/protocol.h | 2 +- > net/mptcp/subflow.c | 11 +++-- > 3 files changed, 74 insertions(+), 37 deletions(-) > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c > index 15ecb471602c..bc5231d5b06d 100644 > --- a/net/mptcp/protocol.c > +++ b/net/mptcp/protocol.c > @@ -680,10 +680,12 @@ static void mptcp_reset_timer(struct sock *sk) > > void mptcp_data_acked(struct sock *sk) > { > + struct mptcp_sock *msk = mptcp_sk(sk); > + > mptcp_reset_timer(sk); > > - if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || > - (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && > + if ((test_bit(MPTCP_NOSPACE, &msk->flags) || > + (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && > schedule_work(&mptcp_sk(sk)->work)) > sock_hold(sk); > } > @@ -841,12 +843,7 @@ static void mptcp_clean_una(struct sock *sk) > > /* Only wake up writers if a subflow is ready */ > if 
(mptcp_is_writeable(msk)) { > - set_bit(MPTCP_SEND_SPACE, &msk->flags); > - smp_mb__after_atomic(); > - > - /* set SEND_SPACE before sk_stream_write_space clears > - * NOSPACE > - */ > + clear_bit(MPTCP_NOSPACE, &msk->flags); > sk_stream_write_space(sk); > } > } > @@ -1038,21 +1035,37 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, > return ret; > } > > -static void mptcp_nospace(struct mptcp_sock *msk) > +/* set SOCK_NOSPACE for any subflow that has no more wmem. > + * > + * Returns true if at least one subflow is writeable. > + */ > +static bool mptcp_set_subflow_nospace(struct mptcp_sock *msk) > { > struct mptcp_subflow_context *subflow; > - > - clear_bit(MPTCP_SEND_SPACE, &msk->flags); > - smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ > + bool ssk_any_writeable = false; > > mptcp_for_each_subflow(msk, subflow) { Not strictly related to this patch, but perhpas we need a __mptcp_flush_join_list() call before traversing the subflow list?!? > struct sock *ssk = mptcp_subflow_tcp_sock(subflow); > + bool ssk_writeable = sk_stream_is_writeable(ssk); > struct socket *sock = READ_ONCE(ssk->sk_socket); > > + if (ssk_writeable) { > + ssk_any_writeable = true; > + continue; > + } > + > + if (!sock) > + continue; > + > /* enables ssk->write_space() callbacks */ > - if (sock) > - set_bit(SOCK_NOSPACE, &sock->flags); > + set_bit(SOCK_NOSPACE, &sock->flags); > + > + ssk_writeable = sk_stream_is_writeable(ssk); > + if (ssk_writeable) > + ssk_any_writeable = true; > } > + > + return ssk_any_writeable; > } > > static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) > @@ -1160,10 +1173,28 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, > return free ? 
msk->last_snd : NULL; > } > > -static void ssk_check_wmem(struct mptcp_sock *msk) > +static bool __mptcp_check_writeable(struct mptcp_sock *msk) > { > - if (unlikely(!mptcp_is_writeable(msk))) > - mptcp_nospace(msk); > + struct sock *sk = (struct sock *)msk; > + bool mptcp_writable; > + > + __mptcp_clean_una(sk); > + mptcp_writable = sk_stream_is_writeable(sk); > + if (!mptcp_writable) { > + set_bit(MPTCP_NOSPACE, &msk->flags); > + smp_mb__after_atomic(); > + __mptcp_clean_una(sk); > + > + mptcp_writable = sk_stream_is_writeable(sk); > + if (likely(!mptcp_writable)) > + return false; > + > + /* MPTCP ack after MPTCP_NOSPACE was set */ > + clear_bit(MPTCP_NOSPACE, &msk->flags); > + } > + > + /* mptcp socket is writeable, now check subflows */ > + return mptcp_set_subflow_nospace(msk); > } > > static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) > @@ -1217,7 +1248,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) > mptcp_reset_timer(sk); > } > > - mptcp_nospace(msk); > + __mptcp_check_writeable(msk); > + Again not strictily related to this patch, but currently we start sleeping/blocking if msk/ssk are not writeable. I think we should do that instead when !sk_stream_memory_free() ?!? > ret = sk_stream_wait_memory(sk, &timeo); > if (ret) > goto out; > @@ -1314,7 +1346,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) > > release_sock(ssk); > out: > - ssk_check_wmem(msk); > release_sock(sk); > return copied ? 
: ret; > } > @@ -1811,7 +1842,6 @@ static int __mptcp_init_sock(struct sock *sk) > INIT_LIST_HEAD(&msk->conn_list); > INIT_LIST_HEAD(&msk->join_list); > INIT_LIST_HEAD(&msk->rtx_queue); > - __set_bit(MPTCP_SEND_SPACE, &msk->flags); > INIT_WORK(&msk->work, mptcp_worker); > msk->out_of_order_queue = RB_ROOT; > > @@ -2422,13 +2452,6 @@ bool mptcp_finish_join(struct sock *sk) > return true; > } > > -static bool mptcp_memory_free(const struct sock *sk, int wake) > -{ > - struct mptcp_sock *msk = mptcp_sk(sk); > - > - return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; > -} > - > static struct proto mptcp_prot = { > .name = "MPTCP", > .owner = THIS_MODULE, > @@ -2449,7 +2472,6 @@ static struct proto mptcp_prot = { > .sockets_allocated = &mptcp_sockets_allocated, > .memory_allocated = &tcp_memory_allocated, > .memory_pressure = &tcp_memory_pressure, > - .stream_memory_free = mptcp_memory_free, > .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), > .sysctl_mem = sysctl_tcp_mem, > .obj_size = sizeof(struct mptcp_sock), > @@ -2622,6 +2644,23 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) > 0; > } > > +static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) > +{ > + struct sock *sk = (struct sock *)msk; > + __poll_t ret = 0; > + bool slow; > + > + if (sk->sk_shutdown & SEND_SHUTDOWN) > + return 0; > + > + slow = lock_sock_fast(sk); > + if (__mptcp_check_writeable(msk)) > + ret = EPOLLOUT | EPOLLWRNORM; > + > + unlock_sock_fast(sk, slow); It's a pity we need to acquire the lock in mptcp_poll()... perhaps we could try to acquire it only if sk_stream_is_writeable(msk)? 
> + return ret; > +} > + > static __poll_t mptcp_poll(struct file *file, struct socket *sock, > struct poll_table_struct *wait) > { > @@ -2640,8 +2679,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, > > if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { > mask |= mptcp_check_readable(msk); > - if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) > - mask |= EPOLLOUT | EPOLLWRNORM; > + mask |= mptcp_check_writeable(msk); > } > if (sk->sk_shutdown & RCV_SHUTDOWN) > mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h > index ba253a6947b0..d18af92723e4 100644 > --- a/net/mptcp/protocol.h > +++ b/net/mptcp/protocol.h > @@ -86,7 +86,7 @@ > > /* MPTCP socket flags */ > #define MPTCP_DATA_READY 0 > -#define MPTCP_SEND_SPACE 1 > +#define MPTCP_NOSPACE 1 > #define MPTCP_WORK_RTX 2 > #define MPTCP_WORK_EOF 3 > #define MPTCP_FALLBACK_DONE 4 > diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c > index e31900ff0c56..bdf28d1baba8 100644 > --- a/net/mptcp/subflow.c > +++ b/net/mptcp/subflow.c > @@ -972,17 +972,16 @@ static void subflow_data_ready(struct sock *sk) > static void subflow_write_space(struct sock *sk) > { > struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); > + struct socket *sock = READ_ONCE(sk->sk_socket); > struct sock *parent = subflow->conn; > > if (!sk_stream_is_writeable(sk)) > return; > > - if (sk_stream_is_writeable(parent)) { > - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); > - smp_mb__after_atomic(); > - /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ > - sk_stream_write_space(parent); > - } > + if (sock) > + clear_bit(SOCK_NOSPACE, &sock->flags); for server socket 'sock' is 'shared' among all the subflows (ssk- >sk_socket points to the same data). The above is disabling write space notification on all subflows, is that intended/correct? 
> + > + sk_stream_write_space(parent); If I read correctly, this means that the msk could be woken-up even if not writeable (a subflow is writeable, but msk does not have enough write space), am I correct? Thanks! Paolo
On Sun, 2020-09-06 at 17:58 +0200, Paolo Abeni wrote: > On Sat, 2020-09-05 at 12:33 +0200, Florian Westphal wrote: > > +static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) > > +{ > > + struct sock *sk = (struct sock *)msk; > > + __poll_t ret = 0; > > + bool slow; > > + > > + if (sk->sk_shutdown & SEND_SHUTDOWN) > > + return 0; > > + > > + slow = lock_sock_fast(sk); > > + if (__mptcp_check_writeable(msk)) > > + ret = EPOLLOUT | EPOLLWRNORM; > > + > > + unlock_sock_fast(sk, slow); > > It's a pity we need to acquire the lock in mptcp_poll()... perhaps we > could try to acquire it only if sk_stream_is_writeable(msk)? I'm also wondering if checking sk_stream_is_writeable(msk) could be enough??? perhaps with a different sndbuf auto-tuning strategy. e.g. if msk sndbuf <= max (ssk sndbuf), I *think* that sk_stream_is_writeable(msk) implies at least a ssk is writeable, too /P
Paolo Abeni <pabeni@redhat.com> wrote: > On Sun, 2020-09-06 at 17:58 +0200, Paolo Abeni wrote: > > On Sat, 2020-09-05 at 12:33 +0200, Florian Westphal wrote: > > > +static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) > > > +{ > > > + struct sock *sk = (struct sock *)msk; > > > + __poll_t ret = 0; > > > + bool slow; > > > + > > > + if (sk->sk_shutdown & SEND_SHUTDOWN) > > > + return 0; > > > + > > > + slow = lock_sock_fast(sk); > > > + if (__mptcp_check_writeable(msk)) > > > + ret = EPOLLOUT | EPOLLWRNORM; > > > + > > > + unlock_sock_fast(sk, slow); > > > > It's a pity we need to acquire the lock in mptcp_poll()... perhaps we > > could try to acquire it only if sk_stream_is_writeable(msk)? > > I'm also wondering if checking sk_stream_is_writeable(msk) could be > enough??? perhaps with a different sndbuf auto-tuning strategy. Not right now. But perhaps this is worth exploring -- we would need to change mptcp_sendmsg to append data to the rtx queue in case no ssk can be found. I'd give this a try unless you or anyone else has objections. > e.g. if msk sndbuf <= max (ssk sndbuf), I *think* > that sk_stream_is_writeable(msk) implies at least a ssk is writeable, > too You mean mptcp_is_writeable()? [ Question above still stands though ]
Paolo Abeni <pabeni@redhat.com> wrote: > On Sat, 2020-09-05 at 12:33 +0200, Florian Westphal wrote: > > At this time, MPTCP maintains a status bit, MPTCP_SEND_SPACE, > > that is set when at least one subflow and the mptcp socket itself > > are writeable. > > > > poll path set EPOLLOUT if the bit is set. > > sendmsg path makes sure MPTCP_SEND_SPACE gets cleared when last > > write has used up all subflows or the mptcp socket wmem. > > > > This reworks nospace handling as follows: > > > > MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE. > > This bit is set when the mptcp socket is not writeable anymore > > and makes the mptcp-level ack path schedule the mptcp worker. > > > > This is needed to wake up userspace processes that wait for a POLLOUT > > event. > > > > sendmsg will set MPTCP_NOSPACE only when it has to wait for more > > wmem (blocking I/O case). > > > > poll path will also set MPTCP_NOSPACE in case the mptcp socket > > is not writeable. > > > > Normal tcp-level notification (SOCK_NOSPACE) is only enabled > > in case the subflow socket has no available wmem. 
> > > > Signed-off-by: Florian Westphal <fw@strlen.de> > > --- > > net/mptcp/protocol.c | 98 ++++++++++++++++++++++++++++++-------------- > > net/mptcp/protocol.h | 2 +- > > net/mptcp/subflow.c | 11 +++-- > > 3 files changed, 74 insertions(+), 37 deletions(-) > > > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c > > index 15ecb471602c..bc5231d5b06d 100644 > > --- a/net/mptcp/protocol.c > > +++ b/net/mptcp/protocol.c > > @@ -680,10 +680,12 @@ static void mptcp_reset_timer(struct sock *sk) > > > > void mptcp_data_acked(struct sock *sk) > > { > > + struct mptcp_sock *msk = mptcp_sk(sk); > > + > > mptcp_reset_timer(sk); > > > > - if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || > > - (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && > > + if ((test_bit(MPTCP_NOSPACE, &msk->flags) || > > + (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && > > schedule_work(&mptcp_sk(sk)->work)) > > sock_hold(sk); > > } > > @@ -841,12 +843,7 @@ static void mptcp_clean_una(struct sock *sk) > > > > /* Only wake up writers if a subflow is ready */ > > if (mptcp_is_writeable(msk)) { > > - set_bit(MPTCP_SEND_SPACE, &msk->flags); > > - smp_mb__after_atomic(); > > - > > - /* set SEND_SPACE before sk_stream_write_space clears > > - * NOSPACE > > - */ > > + clear_bit(MPTCP_NOSPACE, &msk->flags); > > sk_stream_write_space(sk); > > } > > } > > @@ -1038,21 +1035,37 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, > > return ret; > > } > > > > -static void mptcp_nospace(struct mptcp_sock *msk) > > +/* set SOCK_NOSPACE for any subflow that has no more wmem. > > + * > > + * Returns true if at least one subflow is writeable. 
> > + */ > > +static bool mptcp_set_subflow_nospace(struct mptcp_sock *msk) > > { > > struct mptcp_subflow_context *subflow; > > - > > - clear_bit(MPTCP_SEND_SPACE, &msk->flags); > > - smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ > > + bool ssk_any_writeable = false; > > > > mptcp_for_each_subflow(msk, subflow) { > > Not strictly related to this patch, but perhaps we need a > __mptcp_flush_join_list() call before traversing the subflow list?!? Yes, I think it makes sense to update the list here. > > - mptcp_nospace(msk); > > + __mptcp_check_writeable(msk); > > + > Again not strictly related to this patch, but currently we start > sleeping/blocking if msk/ssk are not writeable. > > I think we should do that instead when !sk_stream_memory_free() ?!? Do what? Sleep only if mptcp socket has not enough wspace? We can do that, requires change to mptcp_sendmsg to append to rtx queue (and make sure retrans timer is on), also see my other reply. Is that what you meant? > > + slow = lock_sock_fast(sk); > > + if (__mptcp_check_writeable(msk)) > > + ret = EPOLLOUT | EPOLLWRNORM; > > + > > + unlock_sock_fast(sk, slow); > > It's a pity we need to acquire the lock in mptcp_poll()... perhaps we > could try to acquire it only if sk_stream_is_writeable(msk)? I think we need to determine desired behaviour of send path when no ssk is ready (but mptcp sk has wspace). I think we should change it to queue the data even if no ssk is available (but wmem permits it). Then we could simplify poll too, as ssk state becomes irrelevant. > > + if (sock) > > + clear_bit(SOCK_NOSPACE, &sock->flags); > > for server socket 'sock' is 'shared' among all the subflows (ssk- > >sk_socket points to the same data). The above is disabling write space > notification on all subflows, is that intended/correct? Yes, current sendmsg path accepts data if mptcp socket has wmem and one subflow is ready, so it only sleeps if mptcp wmem is also exhausted. 
And for that, the MPTCP_NOSPACE bit has been toggled so next data-ack wakes a blocking poll or blocking sendmsg. > > + > > + sk_stream_write_space(parent); > > If I read correctly, this means that the msk could be woken-up even if > not writeable (a subflow is writeable, but msk does not have enough > wirte space), am I correct? No, this is in a mptcp_is_writeable() conditional so msk has enough wmem. My main takeaway from your response is that I should rework this towards making the mptcp write buffer space the only relevant criteria wrt. when to signal/wake userspace. Does that sound good to you?
On Sun, 6 Sep 2020, Florian Westphal wrote: > Paolo Abeni <pabeni@redhat.com> wrote: >> On Sun, 2020-09-06 at 17:58 +0200, Paolo Abeni wrote: >>> On Sat, 2020-09-05 at 12:33 +0200, Florian Westphal wrote: >>>> +static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) >>>> +{ >>>> + struct sock *sk = (struct sock *)msk; >>>> + __poll_t ret = 0; >>>> + bool slow; >>>> + >>>> + if (sk->sk_shutdown & SEND_SHUTDOWN) >>>> + return 0; >>>> + >>>> + slow = lock_sock_fast(sk); >>>> + if (__mptcp_check_writeable(msk)) >>>> + ret = EPOLLOUT | EPOLLWRNORM; >>>> + >>>> + unlock_sock_fast(sk, slow); >>> >>> It's a pity we need to acquire the lock in mptcp_poll()... perhaps we >>> could try to acquire it only if sk_stream_is_writeable(msk)? >> >> I'm also wondering if checking sk_stream_is_writeable(msk) could be >> enough??? perhaps with a different sndbuf auto-tuning strategy. > > Not right now. But perhaps this is worth exploring -- we would need > to change mptcp_sendmsg to append data to the rtx queue in case no ssk > can be found. > > I'd give this a try unless you or anyone else has objections. No objection here, I think it would be good to explore - might fit the "easier to follow" criteria you mention in the cover letter. Having mptcp_poll() not only locking, but also doing the __mptcp_clean_una() work, seems kind of heavyweight for a poll operation. > >> e.g. if msk sndbuf <= max (ssk sndbuf), I *think* >> that sk_stream_is_writeable(msk) implies at least a ssk is writeable, >> too > > You mean mptcp_is_writeable()? > [ Question above still stands though ] -- Mat Martineau Intel
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 15ecb471602c..bc5231d5b06d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -680,10 +680,12 @@ static void mptcp_reset_timer(struct sock *sk) void mptcp_data_acked(struct sock *sk) { + struct mptcp_sock *msk = mptcp_sk(sk); + mptcp_reset_timer(sk); - if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || - (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && + if ((test_bit(MPTCP_NOSPACE, &msk->flags) || + (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && schedule_work(&mptcp_sk(sk)->work)) sock_hold(sk); } @@ -841,12 +843,7 @@ static void mptcp_clean_una(struct sock *sk) /* Only wake up writers if a subflow is ready */ if (mptcp_is_writeable(msk)) { - set_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - - /* set SEND_SPACE before sk_stream_write_space clears - * NOSPACE - */ + clear_bit(MPTCP_NOSPACE, &msk->flags); sk_stream_write_space(sk); } } @@ -1038,21 +1035,37 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, return ret; } -static void mptcp_nospace(struct mptcp_sock *msk) +/* set SOCK_NOSPACE for any subflow that has no more wmem. + * + * Returns true if at least one subflow is writeable. 
+ */ +static bool mptcp_set_subflow_nospace(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ + bool ssk_any_writeable = false; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool ssk_writeable = sk_stream_is_writeable(ssk); struct socket *sock = READ_ONCE(ssk->sk_socket); + if (ssk_writeable) { + ssk_any_writeable = true; + continue; + } + + if (!sock) + continue; + /* enables ssk->write_space() callbacks */ - if (sock) - set_bit(SOCK_NOSPACE, &sock->flags); + set_bit(SOCK_NOSPACE, &sock->flags); + + ssk_writeable = sk_stream_is_writeable(ssk); + if (ssk_writeable) + ssk_any_writeable = true; } + + return ssk_any_writeable; } static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) @@ -1160,10 +1173,28 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, return free ? msk->last_snd : NULL; } -static void ssk_check_wmem(struct mptcp_sock *msk) +static bool __mptcp_check_writeable(struct mptcp_sock *msk) { - if (unlikely(!mptcp_is_writeable(msk))) - mptcp_nospace(msk); + struct sock *sk = (struct sock *)msk; + bool mptcp_writable; + + __mptcp_clean_una(sk); + mptcp_writable = sk_stream_is_writeable(sk); + if (!mptcp_writable) { + set_bit(MPTCP_NOSPACE, &msk->flags); + smp_mb__after_atomic(); + __mptcp_clean_una(sk); + + mptcp_writable = sk_stream_is_writeable(sk); + if (likely(!mptcp_writable)) + return false; + + /* MPTCP ack after MPTCP_NOSPACE was set */ + clear_bit(MPTCP_NOSPACE, &msk->flags); + } + + /* mptcp socket is writeable, now check subflows */ + return mptcp_set_subflow_nospace(msk); } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) @@ -1217,7 +1248,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) mptcp_reset_timer(sk); } - mptcp_nospace(msk); + __mptcp_check_writeable(msk); + ret = 
sk_stream_wait_memory(sk, &timeo); if (ret) goto out; @@ -1314,7 +1346,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) release_sock(ssk); out: - ssk_check_wmem(msk); release_sock(sk); return copied ? : ret; } @@ -1811,7 +1842,6 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); - __set_bit(MPTCP_SEND_SPACE, &msk->flags); INIT_WORK(&msk->work, mptcp_worker); msk->out_of_order_queue = RB_ROOT; @@ -2422,13 +2452,6 @@ bool mptcp_finish_join(struct sock *sk) return true; } -static bool mptcp_memory_free(const struct sock *sk, int wake) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; -} - static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, @@ -2449,7 +2472,6 @@ static struct proto mptcp_prot = { .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, - .stream_memory_free = mptcp_memory_free, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), @@ -2622,6 +2644,23 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) 0; } +static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + __poll_t ret = 0; + bool slow; + + if (sk->sk_shutdown & SEND_SHUTDOWN) + return 0; + + slow = lock_sock_fast(sk); + if (__mptcp_check_writeable(msk)) + ret = EPOLLOUT | EPOLLWRNORM; + + unlock_sock_fast(sk, slow); + return ret; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -2640,8 +2679,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); - if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) - mask |= EPOLLOUT | 
EPOLLWRNORM; + mask |= mptcp_check_writeable(msk); } if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ba253a6947b0..d18af92723e4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -86,7 +86,7 @@ /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 -#define MPTCP_SEND_SPACE 1 +#define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e31900ff0c56..bdf28d1baba8 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -972,17 +972,16 @@ static void subflow_data_ready(struct sock *sk) static void subflow_write_space(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct socket *sock = READ_ONCE(sk->sk_socket); struct sock *parent = subflow->conn; if (!sk_stream_is_writeable(sk)) return; - if (sk_stream_is_writeable(parent)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); - smp_mb__after_atomic(); - /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ - sk_stream_write_space(parent); - } + if (sock) + clear_bit(SOCK_NOSPACE, &sock->flags); + + sk_stream_write_space(parent); } static struct inet_connection_sock_af_ops *
At this time, MPTCP maintains a status bit, MPTCP_SEND_SPACE, that is set when at least one subflow and the mptcp socket itself are writeable. poll path set EPOLLOUT if the bit is set. sendmsg path makes sure MPTCP_SEND_SPACE gets cleared when last write has used up all subflows or the mptcp socket wmem. This reworks nospace handling as follows: MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE. This bit is set when the mptcp socket is not writeable anymore and makes the mptcp-level ack path schedule the mptcp worker. This is needed to wake up userspace processes that wait for a POLLOUT event. sendmsg will set MPTCP_NOSPACE only when it has to wait for more wmem (blocking I/O case). poll path will also set MPTCP_NOSPACE in case the mptcp socket is not writeable. Normal tcp-level notification (SOCK_NOSPACE) is only enabled in case the subflow socket has no available wmem. Signed-off-by: Florian Westphal <fw@strlen.de> --- net/mptcp/protocol.c | 98 ++++++++++++++++++++++++++++++-------------- net/mptcp/protocol.h | 2 +- net/mptcp/subflow.c | 11 +++-- 3 files changed, 74 insertions(+), 37 deletions(-)