Message ID | aa8d90d38dbbb5f97a5a883389378d979f32b186.camel@redhat.com
---|---
State | RFC, archived
Series | MPTCP stream performances
Hello,

adding some comments to give a little more context. This is getting
close to my stream of consciousness, so I guess it will not be much
more readable than "Ulysses"...

On Mon, 2020-10-26 at 16:46 +0100, Paolo Abeni wrote:
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 4e02c259134f..2d07d099d038 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -530,6 +530,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
>                  opts->ext_copy.ack64 = 0;
>          }
>          opts->ext_copy.use_ack = 1;
> +        WRITE_ONCE(msk->old_wspace, tcp_space((struct sock *)msk));

here a cmpxchg would be needed to make the above safe, but at worst we
will have an additional unneeded call to tcp_cleanup_rbuf(), and the
latter will avoid dup acks, so I think it's not tragic.

> 
>          /* Add kind/length/subtype/flag overhead if mapping is not populated */
>          if (dss_size == 0)
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index e69852fa32e2..328aa98c51f2 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -406,7 +406,7 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
>          mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
>  }
> 
> -static void mptcp_send_ack(struct mptcp_sock *msk)
> +static void mptcp_send_ack(struct mptcp_sock *msk, bool force)
>  {
>          struct mptcp_subflow_context *subflow;
> 
> @@ -414,8 +414,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk)
>                  struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> 
>                  lock_sock(ssk);
> -                tcp_send_ack(ssk);
> +                if (force)
> +                        tcp_send_ack(ssk);
> +                else
> +                        tcp_cleanup_rbuf(ssk, 1);
>                  release_sock(ssk);
> +                if (!force)
> +                        break;
>          }
>  }
> 
> @@ -467,7 +472,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
> 
>                  ret = true;
>                  mptcp_set_timeout(sk, NULL);
> -                mptcp_send_ack(msk);
> +                mptcp_send_ack(msk, true);
>                  if (sock_flag(sk, SOCK_DEAD))
>                          return ret;
> 
> @@ -490,7 +495,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>          unsigned int moved = 0;
>          bool more_data_avail;
>          struct tcp_sock *tp;
> -        u32 old_copied_seq;
>          bool done = false;
>          int sk_rbuf;
> 
> @@ -507,7 +511,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
> 
>          pr_debug("msk=%p ssk=%p", msk, ssk);
>          tp = tcp_sk(ssk);
> -        old_copied_seq = tp->copied_seq;
>          do {
>                  u32 map_remaining, offset;
>                  u32 seq = tp->copied_seq;
> @@ -572,10 +575,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>                  }
>          } while (more_data_avail);
> 
> -        *bytes += moved;
> -        if (tp->copied_seq != old_copied_seq)
> -                tcp_cleanup_rbuf(ssk, 1);

__mptcp_move_skbs_from_subflow() is invoked by the subflow receive
path, via subflow_data_ready()/mptcp_data_ready(), and by
mptcp_recvmsg(), via __mptcp_move_skbs().

In the first case we don't need to send additional acks: the TCP stack
will do that for us in tcp_rcv_established() via tcp_ack_snd_check(),
just after updating msk->ack_seq. To cover the second case, it is
better to move the ack hook elsewhere; see below.

Note: in tcp_rcv_established() we always hit the TCP 'slow path', as
TCP header prediction always fails for packets carrying any kind of
MPTCP option. We could change that, e.g. by including a DSS64 in the
predicted header len, but then the TCP ack would be generated _before_
msk->ack_seq is updated, so we are better off with header
misprediction ;)

> -
> +        *bytes = moved;
>          return done;
>  }
> 
> @@ -1526,7 +1526,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
> 
>  static bool __mptcp_move_skbs(struct mptcp_sock *msk)
>  {
> -        unsigned int moved = 0;
> +        unsigned int moved_total = 0;
>          bool done;
> 
>          /* avoid looping forever below on racing close */
> @@ -1536,6 +1536,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
>          __mptcp_flush_join_list(msk);
>          do {
>                  struct sock *ssk = mptcp_subflow_recv_lookup(msk);
> +                unsigned int moved;
>                  bool slowpath;
> 
>                  if (!ssk)
> @@ -1543,12 +1544,14 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> 
>                  slowpath = lock_sock_fast(ssk);
>                  done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
> +                if (moved)
> +                        tcp_send_ack(ssk);

This may generate a high volume of TCP dup acks, but calling
tcp_cleanup_rbuf() here is not enough: the packets are still accounted
to the msk receive buffer, so the msk receive window is not changed by
the above call, and neither is the subflow receive window.
tcp_cleanup_rbuf() will likely _not_ generate a new ack when needed.

Even replacing tcp_send_ack() with __tcp_ack_snd_check() will not
improve much: since the rcv window is not changed, and the relevant
segment has already been acked at the TCP level, such a call will
either generate a dup ack or no ack at all. Additionally,
__tcp_ack_snd_check() is currently static.

All the above makes me think that trying to remove the workqueue from
the receive path could be a better solution: if incoming pkts are
always moved into the msk receive queue by subflow_data_ready(), the
ACK packet generated by the TCP stack will be enough to cover all the
above, without any need for dup acks.

With a caveat: currently we leave some packets in the subflow receive
queue while the msk receive queue is full. Possibly keeping the
incoming skbs always accounted to the ingress ssk would avoid that
case - currently we move the accounting from ssk to msk in
__mptcp_move_skbs_from_subflow(). Handling the skb freeing at receive
time and subflow tear down will not be trivial.

/P
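For reference, the cmpxchg variant hinted at above could look roughly like the sketch below. This is only an illustration of the idea, not actual MPTCP code: the helper name is invented, and only msk->old_wspace and tcp_space() come from the patch.

```c
/* Hypothetical sketch of the cmpxchg-based update mentioned above;
 * the helper name is made up. Publish the new snapshot only if no
 * other subflow raced with us; losing the race leaves a stale value
 * behind, which at worst causes one extra, harmless
 * tcp_cleanup_rbuf() call later on.
 */
static void mptcp_write_old_wspace(struct mptcp_sock *msk)
{
        int old = READ_ONCE(msk->old_wspace);

        cmpxchg(&msk->old_wspace, old, tcp_space((struct sock *)msk));
}
```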
Paolo Abeni <pabeni@redhat.com> wrote:
> On Mon, 2020-10-26 at 16:46 +0100, Paolo Abeni wrote:
> > diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> > index 4e02c259134f..2d07d099d038 100644
> > --- a/net/mptcp/options.c
> > +++ b/net/mptcp/options.c
> > @@ -530,6 +530,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
> >                  opts->ext_copy.ack64 = 0;
> >          }
> >          opts->ext_copy.use_ack = 1;
> > +        WRITE_ONCE(msk->old_wspace, tcp_space((struct sock *)msk));
>
> here a cmpxchg would be needed to make the above safe, but at worst we
> will have an additional unneeded call to tcp_cleanup_rbuf(), and the
> latter will avoid dup acks, so I think it's not tragic.

Agree.

> __mptcp_move_skbs_from_subflow() is invoked by the subflow receive
> path, via subflow_data_ready()/mptcp_data_ready(), and by
> mptcp_recvmsg(), via __mptcp_move_skbs().
>
> In the first case we don't need to send additional acks: the TCP stack
> will do that for us in tcp_rcv_established() via tcp_ack_snd_check(),
> just after updating msk->ack_seq. To cover the second case, it is
> better to move the ack hook elsewhere; see below.

[..]

> > -
> > +        *bytes = moved;
> >          return done;
> >  }
> > 
> > @@ -1526,7 +1526,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
> > 
> >  static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> >  {
> > -        unsigned int moved = 0;
> > +        unsigned int moved_total = 0;
> >          bool done;
> > 
> >          /* avoid looping forever below on racing close */
> > @@ -1536,6 +1536,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> >          __mptcp_flush_join_list(msk);
> >          do {
> >                  struct sock *ssk = mptcp_subflow_recv_lookup(msk);
> > +                unsigned int moved;
> >                  bool slowpath;
> > 
> >                  if (!ssk)
> > @@ -1543,12 +1544,14 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> > 
> >                  slowpath = lock_sock_fast(ssk);
> >                  done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
> > +                if (moved)
> > +                        tcp_send_ack(ssk);
>
> This may generate a high volume of TCP dup acks, but calling
> tcp_cleanup_rbuf() here is not enough: the packets are still accounted
> to the msk receive buffer, so the msk receive window is not changed by
> the above call, and neither is the subflow receive window.
> tcp_cleanup_rbuf() will likely _not_ generate a new ack when needed.

Hmm, right.

> Even replacing tcp_send_ack() with __tcp_ack_snd_check() will not
> improve much: since the rcv window is not changed, and the relevant
> segment has already been acked at the TCP level, such a call will
> either generate a dup ack or no ack at all. Additionally,
> __tcp_ack_snd_check() is currently static.
>
> All the above makes me think that trying to remove the workqueue from
> the receive path could be a better solution: if incoming pkts are
> always moved into the msk receive queue by subflow_data_ready(), the
> ACK packet generated by the TCP stack will be enough to cover all the
> above, without any need for dup acks.

Agree. We will need to either remove or largely avoid MPTCP-level
ack_seq advancement: waiting for the work queue to schedule makes the
MPTCP-level ack updates too bursty, so we should try to send those
updates more smoothly.

> With a caveat: currently we leave some packets in the subflow receive
> queue while the msk receive queue is full. Possibly keeping the
> incoming skbs always accounted to the ingress ssk would avoid that
> case - currently we move the accounting from ssk to msk in
> __mptcp_move_skbs_from_subflow(). Handling the skb freeing at receive
> time and subflow tear down will not be trivial.

Problem is that we rely on the MPTCP-socket accounting as well for the
receive window.

I guess you should try to make this patch less hack-ish, probably by
completely avoiding the __mptcp_move_skbs() change, and then trying to
change the rx path to place skbs into the msk queue directly.

What about a 'pre queue' with a dedicated lock, and then having recv()
or the worker splice it to the msk rx queue? This would be similar to
what UDP is doing.
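To make the 'pre queue' idea more concrete, here is a rough sketch of the shape it could take. Everything below is hypothetical - msk->pre_queue, msk->pre_lock and both helper names do not exist in the current code - and is meant only to illustrate the split between a cheap softirq-side producer and a process-context consumer, similar in spirit to how UDP shuffles skbs between sk_receive_queue and its private reader_queue.

```c
/* Hypothetical 'pre queue' sketch: a dedicated spinlock protects a
 * private skb list, so the subflow receive path never has to contend
 * for the msk socket lock.
 */

/* producer: called from the subflow receive path (data_ready) */
static void mptcp_pre_queue_tail(struct mptcp_sock *msk, struct sk_buff *skb)
{
        spin_lock_bh(&msk->pre_lock);
        __skb_queue_tail(&msk->pre_queue, skb);
        spin_unlock_bh(&msk->pre_lock);
}

/* consumer: recv() or the worker moves the whole batch in one step */
static void mptcp_splice_pre_queue(struct mptcp_sock *msk)
{
        struct sock *sk = (struct sock *)msk;

        spin_lock_bh(&msk->pre_lock);
        skb_queue_splice_tail_init(&msk->pre_queue, &sk->sk_receive_queue);
        spin_unlock_bh(&msk->pre_lock);
}
```

The appeal is the same as in UDP: contention between the softirq producer and the reader is confined to the short pre_lock critical sections, while the per-skb work (mapping checks, accounting) happens on the spliced batch without blocking new arrivals.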
```diff
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 4e02c259134f..2d07d099d038 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -530,6 +530,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
                 opts->ext_copy.ack64 = 0;
         }
         opts->ext_copy.use_ack = 1;
+        WRITE_ONCE(msk->old_wspace, tcp_space((struct sock *)msk));
 
         /* Add kind/length/subtype/flag overhead if mapping is not populated */
         if (dss_size == 0)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e69852fa32e2..328aa98c51f2 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -406,7 +406,7 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
         mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
 }
 
-static void mptcp_send_ack(struct mptcp_sock *msk)
+static void mptcp_send_ack(struct mptcp_sock *msk, bool force)
 {
         struct mptcp_subflow_context *subflow;
 
@@ -414,8 +414,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk)
                 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
                 lock_sock(ssk);
-                tcp_send_ack(ssk);
+                if (force)
+                        tcp_send_ack(ssk);
+                else
+                        tcp_cleanup_rbuf(ssk, 1);
                 release_sock(ssk);
+                if (!force)
+                        break;
         }
 }
 
@@ -467,7 +472,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
 
                 ret = true;
                 mptcp_set_timeout(sk, NULL);
-                mptcp_send_ack(msk);
+                mptcp_send_ack(msk, true);
                 if (sock_flag(sk, SOCK_DEAD))
                         return ret;
 
@@ -490,7 +495,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
         unsigned int moved = 0;
         bool more_data_avail;
         struct tcp_sock *tp;
-        u32 old_copied_seq;
         bool done = false;
         int sk_rbuf;
 
@@ -507,7 +511,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 
         pr_debug("msk=%p ssk=%p", msk, ssk);
         tp = tcp_sk(ssk);
-        old_copied_seq = tp->copied_seq;
         do {
                 u32 map_remaining, offset;
                 u32 seq = tp->copied_seq;
@@ -572,10 +575,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
                 }
         } while (more_data_avail);
 
-        *bytes += moved;
-        if (tp->copied_seq != old_copied_seq)
-                tcp_cleanup_rbuf(ssk, 1);
-
+        *bytes = moved;
         return done;
 }
 
@@ -1526,7 +1526,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 
 static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 {
-        unsigned int moved = 0;
+        unsigned int moved_total = 0;
         bool done;
 
         /* avoid looping forever below on racing close */
@@ -1536,6 +1536,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
         __mptcp_flush_join_list(msk);
         do {
                 struct sock *ssk = mptcp_subflow_recv_lookup(msk);
+                unsigned int moved;
                 bool slowpath;
 
                 if (!ssk)
@@ -1543,12 +1544,14 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 
                 slowpath = lock_sock_fast(ssk);
                 done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+                if (moved)
+                        tcp_send_ack(ssk);
                 unlock_sock_fast(ssk, slowpath);
+                moved_total += moved;
         } while (!done);
 
-        if (mptcp_ofo_queue(msk) || moved > 0) {
-                if (!mptcp_check_data_fin((struct sock *)msk))
-                        mptcp_send_ack(msk);
+        if (mptcp_ofo_queue(msk) || moved_total > 0) {
+                mptcp_check_data_fin((struct sock *)msk);
                 return true;
         }
         return false;
@@ -1573,7 +1576,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
         __mptcp_flush_join_list(msk);
         while (len > (size_t)copied) {
-                int bytes_read;
+                int bytes_read, old_space;
 
                 bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
                 if (unlikely(bytes_read < 0)) {
@@ -1588,6 +1591,11 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                     __mptcp_move_skbs(msk))
                         continue;
 
+                /* be sure to advertize window change */
+                old_space = READ_ONCE(msk->old_wspace);
+                if ((tcp_space(sk) - old_space) >= old_space)
+                        mptcp_send_ack(msk, false);
+
                 /* only the master socket status is relevant here. The exit
                  * conditions mirror closely tcp_recvmsg()
                  */
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 5117093a4de4..091ecb586f58 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -220,6 +220,7 @@ struct mptcp_sock {
         u64             rcv_data_fin_seq;
         struct sock     *last_snd;
         int             snd_burst;
+        int             old_wspace;
         atomic64_t      snd_una;
         atomic64_t      wnd_end;
         unsigned long   timer_ival;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index b210bc92fc32..07e44ffbeb30 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -850,8 +850,6 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
         sk_eat_skb(ssk, skb);
         if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
                 subflow->map_valid = 0;
-        if (incr)
-                tcp_cleanup_rbuf(ssk, incr);
 }
 
 static bool subflow_check_data_avail(struct sock *ssk)
```
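One detail of the mptcp_recvmsg() hunk worth spelling out: the non-forced ack path only fires once the free receive space has at least doubled since the snapshot taken in mptcp_established_options_dss(). A minimal restatement of that predicate, with an invented helper name:

```c
/* Restates the window-change check from the mptcp_recvmsg() hunk
 * above (the helper name is invented). tcp_space(sk) - old_space >=
 * old_space is simply tcp_space(sk) >= 2 * old_space, i.e. the
 * receive space has doubled since the last DSS ack advertised it.
 */
static bool mptcp_space_doubled(const struct sock *sk, int old_space)
{
        return tcp_space(sk) - old_space >= old_space;
}
```

For example, with old_wspace snapshotted at 64 KB, recvmsg() stays quiet until tcp_space() reaches 128 KB; only then does mptcp_send_ack(msk, false) run tcp_cleanup_rbuf() on the first subflow to advertise the grown window.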