diff mbox series

[net-next,3/3] tcp: keep tcp_collapse controllable even after processing starts

Message ID 20171014072745.23462-4-den@klaipeden.com
State Rejected, archived
Delegated to: David Miller
Headers show
Series optimisations and reorganisations of tcp_collapse | expand

Commit Message

Koichiro Den Oct. 14, 2017, 7:27 a.m. UTC
Combining actual collapsing with the reasoning for deciding the starting
point, we can apply its logic in a consistent manner such that we can
avoid costly yet not very useful collapsing. When collapsing is to be
triggered, it is not rare that most of the skbs in the receive or ooo
queue are large ones without much metadata overhead. This also
simplifies the code and makes it easier to apply the logic in a fair manner.

Subtle subsidiary changes included:
- When the end_seq of the skb we are trying to collapse was larger than
  the 'end' argument provided, we would end up copying up to 'end'
  even though we couldn't collapse the original one. Current users of
  tcp_collapse do not require such reserves, so redefine 'end' as the
  point beyond which skbs whose seq passes are guaranteed not to be
  collapsed.
- Naturally tcp_collapse_ofo_queue shapes up and we no longer need
  'tail' argument.

Signed-off-by: Koichiro Den <den@klaipeden.com>
---
 net/ipv4/tcp_input.c | 197 +++++++++++++++++++++++----------------------------
 1 file changed, 90 insertions(+), 107 deletions(-)

Comments

Eric Dumazet Oct. 14, 2017, 3:46 p.m. UTC | #1
On Sat, 2017-10-14 at 16:27 +0900, Koichiro Den wrote:
> Combining actual collapsing with reasoning for deciding the starting
> point, we can apply its logic in a consistent manner such that we can
> avoid costly yet not much useful collapsing. When collapsing to be
> triggered, it's not rare that most of the skbs in the receive or ooo
> queue are large ones without much metadata overhead. This also
> simplifies code and makes it easier to apply logic in a fair manner.
> 
> Subtle subsidiary changes included:
> - When the end_seq of the skb we are trying to collapse was larger than
>   the 'end' argument provided, we would end up copying to the 'end'
>   even though we couldn't collapse the original one. Current users of
>   tcp_collapse does not require such reserves so redefines it as the
>   point over which skbs whose seq passes guaranteed not to be collapsed.
> - Naturally tcp_collapse_ofo_queue shapes up and we no longer need
>   'tail' argument.


I am not inclined to review such a large change, without you providing
actual numbers.

We have a problem in TCP right now, that receiver announces a too big
window, and that is the main reason we trigger collapsing.

I would rather fix the root cause.
Koichiro Den Oct. 15, 2017, 2:45 a.m. UTC | #2
On Sat, 2017-10-14 at 08:46 -0700, Eric Dumazet wrote:
> On Sat, 2017-10-14 at 16:27 +0900, Koichiro Den wrote:
> > Combining actual collapsing with reasoning for deciding the starting
> > point, we can apply its logic in a consistent manner such that we can
> > avoid costly yet not much useful collapsing. When collapsing to be
> > triggered, it's not rare that most of the skbs in the receive or ooo
> > queue are large ones without much metadata overhead. This also
> > simplifies code and makes it easier to apply logic in a fair manner.
> > 
> > Subtle subsidiary changes included:
> > - When the end_seq of the skb we are trying to collapse was larger than
> >   the 'end' argument provided, we would end up copying to the 'end'
> >   even though we couldn't collapse the original one. Current users of
> >   tcp_collapse does not require such reserves so redefines it as the
> >   point over which skbs whose seq passes guaranteed not to be collapsed.
> > - Naturally tcp_collapse_ofo_queue shapes up and we no longer need
> >   'tail' argument.
> 
> 
> I am not inclined to review such a large change, without you providing
> actual numbers.
> 
> We have a problem in TCP right now, that receiver announces a too big
> window, and that is the main reason we trigger collapsing.
> 
> I would rather fix the root cause.
> 
> 
Ok I got it, thank you.
diff mbox series

Patch

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1a74457db0f3..5fb90cc0ae95 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4756,108 +4756,117 @@  void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 
 /* Collapse contiguous sequence of skbs head..tail with
  * sequence numbers start..end.
- *
- * If tail is NULL, this means until the end of the queue.
- *
- * Segments with FIN/SYN are not collapsed (only because this
- * simplifies code)
  */
 static void
 tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
-	     struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
+	     struct sk_buff *head, u32 start, u32 *end)
 {
-	struct sk_buff *skb = head, *n;
+	struct sk_buff *skb = head, *n, *nskb = NULL;
+	int copy = 0, offset, size;
 	struct sk_buff_head tmp;
-	bool end_of_skbs;
 
-	/* First, check that queue is collapsible and find
-	 * the point where collapsing can be useful.
-	 */
-restart:
-	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
-		/* If list is ooo queue, it will get purged when
-		 * this FIN will get moved to sk_receive_queue.
-		 * SYN packet is not expected here. We will get
-		 * error message when actual receiving.
-		 */
-		if (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_FIN | TCPHDR_SYN))
-			return;
+	if (!list)
+		__skb_queue_head_init(&tmp); /* To defer rb tree insertion */
 
-		n = tcp_skb_next(skb, list);
-
-		/* No new bits? It is possible on ofo queue. */
+	while (skb) {
 		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 			skb = tcp_collapse_one(sk, skb, list, root);
-			if (!skb)
-				break;
-			goto restart;
+			continue;
 		}
+		n = tcp_skb_next(skb, list);
 
-		/* The first skb to collapse is:
-		 * - bloated or contains data before "start" or
-		 *   overlaps to the next one.
-		 */
-		if (tcp_win_from_space(skb->truesize) > skb->len ||
-		    before(TCP_SKB_CB(skb)->seq, start)) {
-			end_of_skbs = false;
+examine:
+		/* Nothing beneficial to expect any more if SYN/FIN or last. */
+		if (!n ||
+		    TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_FIN | TCPHDR_SYN))
 			break;
+
+		/* If hole found, skip to the next. */
+		if (after(TCP_SKB_CB(n)->seq, TCP_SKB_CB(skb)->end_seq)) {
+			skb = n;
+			continue;
 		}
 
-		if (n && n != tail &&
-		    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
-			end_of_skbs = false;
-			break;
+		/* If the 2nd skb has no newer bits than the current one,
+		 * just collapse and advance it to re-examine.
+		 */
+		if (!after(TCP_SKB_CB(n)->end_seq, TCP_SKB_CB(skb)->end_seq)) {
+			n = tcp_collapse_one(sk, skb, list, root);
+			if (!n)
+				break;
+			goto examine;
 		}
 
-		/* Decided to skip this, advance start seq. */
-		start = TCP_SKB_CB(skb)->end_seq;
-	}
-	if (end_of_skbs ||
-	    (TCP_SKB_CB(skb)->seq == start && TCP_SKB_CB(skb)->end_seq == end))
-		return;
+		/* If the next skb passes the end hint, finish. */
+		if (end && !before(TCP_SKB_CB(n)->seq, *end))
+			break;
 
-	__skb_queue_head_init(&tmp);
+		/* If we can put more into the new skb created before, do so.
+		 * Otherwise we conclude the processing is worth the effort
+		 * only if the two skbs overlaps or either one is bloated.
+		 */
+		if ((!nskb || !after(TCP_SKB_CB(skb)->seq,
+		     TCP_SKB_CB(nskb)->end_seq)) &&
+		    TCP_SKB_CB(n)->seq == TCP_SKB_CB(skb)->end_seq &&
+		    tcp_win_from_space(skb->truesize) <= skb->len &&
+		    tcp_win_from_space(n->truesize) <= n->len) {
+			skb = n;
+			continue;
+		}
 
-	while (before(start, end)) {
-		int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
-		struct sk_buff *nskb;
+		/* Now we decide to take some action for the two.
+		 * Note: at this point, the followings hold true:
+		 * - start < TCP_SKB_CB(skb)->end_seq < TCP_SKB_CB(n)->end_seq
+		 * - TCP_SKB_CB(skb)->seq < TCP_SKB_CB(n)->seq < end
+		 *
+		 * TODO: (split +) shift + collapse if it deserves. */
+		while (after(TCP_SKB_CB(n)->end_seq, start)) {
+			if (!copy) {
+				copy = !end ? SKB_MAX_ORDER(0, 0) :
+				       min_t(int, SKB_MAX_ORDER(0, 0),
+					     *end - start);
+				nskb = alloc_skb(copy, GFP_ATOMIC);
+				if (!nskb)
+					/* Trying to trim the last nskb does not
+					 * help much, lets just abort.
+					 */
+					goto end;
 
-		nskb = alloc_skb(copy, GFP_ATOMIC);
-		if (!nskb)
-			break;
+				memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+				TCP_SKB_CB(nskb)->seq =
+				TCP_SKB_CB(nskb)->end_seq = start;
 
-		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
-		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
-		if (list)
-			__skb_queue_before(list, skb, nskb);
-		else
-			__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
-		skb_set_owner_r(nskb, sk);
+				if (list)
+					__skb_queue_before(list, skb, nskb);
+				else
+					__skb_queue_tail(&tmp, nskb);
+				skb_set_owner_r(nskb, sk);
+			}
 
-		/* Copy data, releasing collapsed skbs. */
-		while (copy > 0) {
-			int offset = start - TCP_SKB_CB(skb)->seq;
-			int size = TCP_SKB_CB(skb)->end_seq - start;
+			/* Copy data, releasing collapsed skbs. */
+			offset = start - TCP_SKB_CB(skb)->seq;
+			size = TCP_SKB_CB(skb)->end_seq - start;
 
 			BUG_ON(offset < 0);
-			if (size > 0) {
-				size = min(copy, size);
-				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
-					BUG();
-				TCP_SKB_CB(nskb)->end_seq += size;
-				copy -= size;
-				start += size;
-			}
-			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+			BUG_ON(size <= 0);
+
+			size = min(copy, size);
+			if (skb_copy_bits(skb, offset, skb_put(nskb, size),
+					  size))
+				BUG();
+
+			TCP_SKB_CB(nskb)->end_seq += size;
+			copy -= size;
+			start += size;
+
+			if (!before(start, TCP_SKB_CB(skb)->seq))
 				skb = tcp_collapse_one(sk, skb, list, root);
-				if (!skb || skb == tail)
-					goto end;
-			}
 		}
 	}
 end:
-	skb_queue_walk_safe(&tmp, skb, n)
-		tcp_rbtree_insert(root, skb);
+	if (!list)
+		skb_queue_walk_safe(&tmp, skb, n)
+			tcp_rbtree_insert(root, skb);
 }
 
 /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
@@ -4866,37 +4875,12 @@  tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
 static void tcp_collapse_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb, *head;
-	u32 start, end;
-
-	skb = skb_rb_first(&tp->out_of_order_queue);
-new_range:
-	if (!skb) {
-		tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
-		return;
-	}
-	start = TCP_SKB_CB(skb)->seq;
-	end = TCP_SKB_CB(skb)->end_seq;
+	struct sk_buff *head;
 
-	for (head = skb;;) {
-		skb = skb_rb_next(skb);
-
-		/* Range is terminated when we see a gap or when
-		 * we are at the queue end.
-		 */
-		if (!skb ||
-		    after(TCP_SKB_CB(skb)->seq, end) ||
-		    before(TCP_SKB_CB(skb)->end_seq, start)) {
-			tcp_collapse(sk, NULL, &tp->out_of_order_queue,
-				     head, skb, start, end);
-			goto new_range;
-		}
-
-		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
-			start = TCP_SKB_CB(skb)->seq;
-		if (after(TCP_SKB_CB(skb)->end_seq, end))
-			end = TCP_SKB_CB(skb)->end_seq;
-	}
+	head = skb_rb_first(&tp->out_of_order_queue);
+	tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+		     head, TCP_SKB_CB(head)->seq, NULL);
+	tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
 }
 
 /*
@@ -4965,8 +4949,7 @@  static int tcp_prune_queue(struct sock *sk)
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
 			     skb_peek(&sk->sk_receive_queue),
-			     NULL,
-			     tp->copied_seq, tp->rcv_nxt);
+			     tp->copied_seq, &tp->rcv_nxt);
 	sk_mem_reclaim(sk);
 
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)