@@ -1063,26 +1063,25 @@ static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
}
-#define MPTCP_SEND_BURST_SIZE (1 << 15)
+#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
+ sizeof(struct tcphdr) - \
+ MAX_TCP_OPTION_SPACE - \
+ sizeof(struct ipv6hdr) - \
+ sizeof(struct frag_hdr))
-static struct list_head *mptcp_next_subflow(struct mptcp_sock *msk,
- struct list_head *pos,
- bool wrap_around)
-{
- if (list_is_last(pos, &msk->conn_list) && wrap_around)
- return msk->conn_list.next;
- return pos->next;
-}
+struct subflow_send_info {
+ struct sock *ssk;
+ uint64_t ratio;
+};
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
u32 *sndbuf)
{
+ struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
- struct sock *next_backup = NULL;
- struct list_head *pos, *start;
- struct sock *next_ssk = NULL;
- bool wrap_around;
- bool free;
+ int nr_active = 0;
+ struct sock *ssk;
+ int64_t ratio;
sock_owned_by_me((struct sock *)msk);
@@ -1095,65 +1094,57 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
return msk->first;
}
- /* lookup the first writeable subflow and first writable back subflow
- * starting from last used, with wrap-around
- */
- if (msk->last_snd) {
- start = &mptcp_subflow_ctx(msk->last_snd)->node;
- wrap_around = true;
- } else {
- start = &msk->conn_list;
- wrap_around = false;
- }
- pr_debug("msk=%p start=%p pos=%p wrap_around=%d last=%p", msk, start,
- mptcp_next_subflow(msk, start, wrap_around), wrap_around,
- msk->last_snd);
- for (pos = mptcp_next_subflow(msk, start, wrap_around); pos != start;
- pos = mptcp_next_subflow(msk, pos, wrap_around)) {
- struct sock *ssk;
+ /* re-use the last subflow, if the burst allows that */
+ if (msk->last_snd && msk->snd_burst > 0 &&
+ sk_stream_memory_free(msk->last_snd) &&
+ mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+ mptcp_for_each_subflow(msk, subflow) {
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+ }
+ return msk->last_snd;
+ }
- subflow = list_entry(pos, struct mptcp_subflow_context, node);
+ /* pick the subflow with the lower wmem/wspace ratio */
+ send_info[0].ssk = NULL;
+ send_info[0].ratio = -1;
+ send_info[1].ssk = NULL;
+ send_info[1].ratio = -1;
+ mptcp_for_each_subflow(msk, subflow) {
ssk = mptcp_subflow_tcp_sock(subflow);
if (!mptcp_subflow_active(subflow))
continue;
+ nr_active += !subflow->backup;
*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
- if (next_backup || next_ssk)
+ if (!sk_stream_memory_free(subflow->tcp_sock))
continue;
- free = sk_stream_is_writeable(subflow->tcp_sock);
- if (!free)
+ if (!READ_ONCE(ssk->sk_pacing_rate))
continue;
- if (!subflow->backup && !next_ssk)
- next_ssk = ssk;
-
- if (subflow->backup && !next_backup)
- next_backup = ssk;
+ ratio = ((int64_t)READ_ONCE(ssk->sk_wmem_queued) << 32) /
+ READ_ONCE(ssk->sk_pacing_rate) >> 12;
+ if (ratio < send_info[subflow->backup].ratio) {
+ send_info[subflow->backup].ssk = ssk;
+ send_info[subflow->backup].ratio = ratio;
+ }
}
- if (!next_ssk)
- next_ssk = next_backup;
- if (msk->last_snd) {
- *sndbuf = max(tcp_sk(msk->last_snd)->snd_wnd, *sndbuf);
- free = sk_stream_memory_free(msk->last_snd);
- } else {
- free = false;
- }
- pr_debug("msk=%p ssk=%p:%d last=%p last free=%d burst=%d", msk,
- next_ssk, next_ssk ? sk_stream_wspace(next_ssk) : 0,
- msk->last_snd, free, msk->snd_burst);
+ /* pick the best backup if no other subflow is active */
+ if (!nr_active)
+ send_info[0].ssk = send_info[1].ssk;
- /* use the looked-up subflow if the previusly one has exauted the burst
- * or is not writable
- */
- if (next_ssk && (!free || msk->snd_burst <= 0)) {
- msk->last_snd = next_ssk;
+ pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
+ msk, nr_active, send_info[0].ssk, send_info[0].ratio,
+ send_info[1].ssk, send_info[1].ratio);
+ if (send_info[0].ssk) {
+ msk->last_snd = send_info[0].ssk;
msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
- sk_stream_wspace(next_ssk));
- free = true;
+ sk_stream_wspace(msk->last_snd));
+ return msk->last_snd;
}
- return free ? msk->last_snd : NULL;
+ return NULL;
}
static void ssk_check_wmem(struct mptcp_sock *msk)
This improves the current packet scheduler for scenarios where the msk
socket is using different links with different bandwidth limits. The
commit comment should be changed into:

"""
Update the scheduler to a less trivial heuristic: cache the last used
subflow, and try to send on it a reasonably long burst of data. When the
burst or the subflow send space is exhausted, pick the subflow with the
lower ratio between write space and send buffer - that is, the subflow
with the greater relative amount of free space.
"""

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
Note:
- this is almost an RFC ;)
- subflows are now selected when sk_stream_memory_free(ssk), while before
  we used sk_stream_is_writeable(ssk). The current choice is closer to
  plain TCP and IMHO more correct, but even the old one should not be
  buggy. Anyhow, I saw reproducible write errors (EAGAIN) when using
  sk_stream_is_writeable() with this strategy and was unable to track
  them down, so possibly there is some other bug there I could not find.
- the simultaneous xmit self-tests are still failing, even if we are now
  closer to success. The failures are now caused by the high delay
  configured on the virtual link. This problem would be addressed by a
  larger initial send buffer, but the current xmit path is unable to cope
  with that correctly: with a larger send buffer it will send packets
  outside the MPTCP window. To properly address the above we additionally
  need something like "mptcp: keep track of receivers advertised window"
  from Florian, plus the ability to enqueue data at the MPTCP level and
  send it when the MPTCP-level window is open.
---
 net/mptcp/protocol.c | 107 ++++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 58 deletions(-)
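
For reviewers less familiar with the new heuristic, below is a small
standalone userspace sketch of the pick performed by the reworked
mptcp_subflow_get_send(). It is not part of the patch; the struct, the
field layout and the sample numbers are made up for illustration. The
idea it models is the same: for every sendable subflow, compute the
queued write memory scaled by 2^32 and divided by the pacing rate, then
keep the lowest value, tracking backup and non-backup subflows
separately.

/* Illustrative only: a userspace model of the ratio-based subflow pick;
 * the struct and the sample numbers below are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_subflow {
	const char *name;
	int backup;           /* non-zero for backup subflows */
	uint64_t wmem_queued; /* bytes already queued on the subflow */
	uint64_t pacing_rate; /* bytes/sec the subflow can drain */
};

int main(void)
{
	struct fake_subflow subflows[] = {
		{ "fast link", 0, 64 * 1024, 10 * 1000 * 1000 },
		{ "slow link", 0, 16 * 1024, 500 * 1000 },
		{ "backup",    1,  4 * 1024, 250 * 1000 },
	};
	uint64_t best_ratio[2] = { UINT64_MAX, UINT64_MAX };
	const char *best_name[2] = { NULL, NULL };
	unsigned int i;

	for (i = 0; i < sizeof(subflows) / sizeof(subflows[0]); i++) {
		struct fake_subflow *sf = &subflows[i];
		uint64_t ratio;

		if (!sf->pacing_rate)
			continue;
		/* lower queued/drain-rate ratio == relatively emptier pipe */
		ratio = (sf->wmem_queued << 32) / sf->pacing_rate;
		if (ratio < best_ratio[sf->backup]) {
			best_ratio[sf->backup] = ratio;
			best_name[sf->backup] = sf->name;
		}
	}
	/* index 0 holds the best non-backup pick, index 1 the best backup */
	printf("picked: %s (backup fallback: %s)\n",
	       best_name[0] ? best_name[0] : "none",
	       best_name[1] ? best_name[1] : "none");
	return 0;
}

In this example the fast link wins even though it has more bytes queued,
because its pacing rate drains that backlog sooner - the "greater
relative amount of free space" from the commit message. Once picked, the
subflow is reused for up to MPTCP_SEND_BURST_SIZE bytes; the new value
appears to be sized so that a full burst plus worst-case TCP options and
IPv6 headers still fits in a 64KB GSO packet
(65536 - 20 - 40 - 40 - 8 = 65428 bytes).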