diff mbox

[v2] tcp: input header length, prediction, and timestamp bugs

Message ID 4B50BFFC.8010108@gmail.com
State Deferred, archived
Delegated to: David Miller
Headers show

Commit Message

William Allen Simpson Jan. 15, 2010, 7:20 p.m. UTC
Fix incorrect header prediction flags documentation.

Don't use output calculated tp->tcp_header_len for input decisions.
While the output header is usually the same as the input (same options
in both directions), that's a poor assumption. In particular, Sack will
be different. Newer options are not guaranteed.

Moreover, in the fast path, that only saved a shift or two. The other
efficiencies in this patch more than make up the difference.

Instead, use tp->rx_opt.tstamp_ok to accurately predict header length.

Likewise, use tp->rx_opt.tstamp_ok for received MSS calculations.

Don't use "sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED" to guess that
the timestamp is present. This may have been OK in the days with fewer
possible options, but various combinations of newer options may yield
the same header length.  (This bug is in 3 places.)

Instead, use tp->rx_opt.saw_tstamp to determine a timestamp is present.

There's no need to test buffer length against header length; that is
already checked by tcp_v[4,6]_rcv(). Straighten code for minor efficiency gain.

Stand-alone patch, originally developed for TCPCT.

Requires:
   net: tcp_header_len_th and tcp_option_len_th
   tcp: harmonize tcp_vx_rcv header length assumptions

Signed-off-by: William.Allen.Simpson@gmail.com
---
  include/linux/tcp.h  |    6 +++-
  include/net/tcp.h    |    9 +++++-
  net/ipv4/tcp_input.c |   84 +++++++++++++++++++-------------------------------
  3 files changed, 45 insertions(+), 54 deletions(-)

Comments

William Allen Simpson Jan. 19, 2010, 5:35 p.m. UTC | #1
Over the weekend, I've been reviewing the .lst output that Andi
explained, and I've found a few interesting things.

1) The 4.4 compiler isn't very smart about shifts:

static inline unsigned int tcp_header_len_th(const struct tcphdr *th)
{
	return th->doff * 4;
c04ea93e:	0f b6 47 0c          	movzbl 0xc(%edi),%eax
c04ea942:	c0 e8 04             	shr    $0x4,%al
c04ea945:	0f b6 c0             	movzbl %al,%eax
c04ea948:	c1 e0 02             	shl    $0x2,%eax

That could be done with a single shr $0x2 (plus an and-mask) instead.

2) This "fast path" code is under quite a bit of register pressure on
the 32-bit i386.  There's a lot of saving and re-loading.

3) Particularly, the seldom used *th and len parameters are saved and
reloaded in the very beginning of the fast path, really wasting time.

4) Since both *th and len are actually indexed loads from *skb (which
the compiler keeps in a register most of the time), doing indexed loads
from the stack (%ebp) is the same, so they shouldn't be sent as
parameters at all!

5) There's already code, added back in 2006 for DMA, that directly
references skb->len instead of the len parameter.  Probably lack of
header documentation, so the coder failed to notice:

				if (copied_early)
					tcp_cleanup_rbuf(sk, skb->len);
c04ead5d:	8b 56 50             	mov    0x50(%esi),%edx
c04ead60:	89 d8                	mov    %ebx,%eax
c04ead62:	e8 fc ff ff ff       	call   c04ead63 <tcp_rcv_established+0x633>

Therefore, I'll resubmit this patch, removing the existing len parameter.
And maybe *th, too....
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
William Allen Simpson Jan. 19, 2010, 7:35 p.m. UTC | #2
William Allen Simpson wrote:
> Therefore, I'll resubmit this patch, removing the existing len parameter.
> And maybe *th, too....
> 
Just to quickly note that gcc 4.4 doesn't properly remember that it has
already loaded *th with this rampant use of an inline function (unlike the
older macro method):

c04ea739:	89 d3                	mov    %edx,%ebx

static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
{
	return (struct tcphdr *)skb_transport_header(skb);
c04ea743:	8b 92 94 00 00 00    	mov    0x94(%edx),%edx
	 *
	 *	Our current scheme is not silly either but we take the
	 *	extra cost of the net_bh soft interrupt processing...
	 *	We do checksum and copy also but from device to kernel.
	 */
	if ((tcp_flag_word(tcp_hdr(skb)) & TCP_HP_BITS) == tp->pred_flags &&
...


Note that the index is in both %edx and %ebx, but it uses replaced %edx.
Although by inspection that result stays in %edx, it reloaded twice more:

	res = tcp_validate_incoming(sk, skb, tcp_hdr(skb), 1);
c04ea78c:	8b 8b 94 00 00 00    	mov    0x94(%ebx),%ecx
c04ea792:	89 da                	mov    %ebx,%edx
c04ea794:	89 f0                	mov    %esi,%eax
c04ea796:	c7 04 24 01 00 00 00 	movl   $0x1,(%esp)
c04ea79d:	e8 0e c3 ff ff       	call   c04e6ab0 <tcp_validate_incoming>
	if (res <= 0)
c04ea7a2:	85 c0                	test   %eax,%eax
c04ea7a4:	0f 8e 8e 03 00 00    	jle    c04eab38 <tcp_rcv_established+0x408>

#else /* NET_SKBUFF_DATA_USES_OFFSET */

static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
{
	return skb->transport_header;
c04ea7aa:	8b 83 94 00 00 00    	mov    0x94(%ebx),%eax
c04ea7b0:	f6 40 0d 10          	testb  $0x10,0xd(%eax)
c04ea7b4:	0f 85 5e 03 00 00    	jne    c04eab18 <tcp_rcv_established+0x3e8>


This doesn't happen with the parameter *th (undisturbed in %edi):

c04ea78a:	89 f9                	mov    %edi,%ecx
c04ea78c:	89 f2                	mov    %esi,%edx
c04ea78e:	89 d8                	mov    %ebx,%eax
c04ea790:	c7 04 24 01 00 00 00 	movl   $0x1,(%esp)
c04ea797:	e8 14 c3 ff ff       	call   c04e6ab0 <tcp_validate_incoming>
	if (res <= 0)
c04ea79c:	85 c0                	test   %eax,%eax
c04ea79e:	0f 8e 8c 03 00 00    	jle    c04eab30 <tcp_rcv_established+0x400>
		return -res;

step5:
	if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
c04ea7a4:	f6 47 0d 10          	testb  $0x10,0xd(%edi)
c04ea7a8:	0f 85 62 03 00 00    	jne    c04eab10 <tcp_rcv_established+0x3e0>


Therefore, keeping the parameter *th.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
William Allen Simpson Jan. 19, 2010, 7:58 p.m. UTC | #3
William Allen Simpson wrote:
> Therefore, I'll resubmit this patch, removing the existing len parameter.
> 
And for record, there's a jtcp_rcv_established(), and the len isn't used
there either (it uses skb->len instead).

I also discovered that the related tcp_rcv_state_process() and
tcp_rcv_synsent_state_process() never use their len parameters.
So, I'll remove those, too.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 74728f7..2987ee8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -301,7 +301,11 @@  struct tcp_sock {
 
 /*
  *	Header prediction flags
- *	0x5?10 << 16 + snd_wnd in net byte order
+ *	S << 28 + TCP_FLAG_ACK + snd_wnd, in net byte order
+ *		(PSH flag is ignored)
+ *		S is 5 (no options), or 8 (timestamp aligned)
+ *	otherwise, 0 to turn it off -- for instance, when there are
+ *	holes in receive space.
  */
 	__be32	pred_flags;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 34f5cc2..30817b1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -533,9 +533,16 @@  static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
 	return (tp->srtt >> 3) + tp->rttvar;
 }
 
+static inline u16 __tcp_fast_path_header_length(const struct tcp_sock *tp)
+{
+	return tp->rx_opt.tstamp_ok
+		? sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED
+		: sizeof(struct tcphdr);
+}
+
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
-	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
+	tp->pred_flags = htonl((__tcp_fast_path_header_length(tp) << (28 - 2)) |
 			       ntohl(TCP_FLAG_ACK) |
 			       snd_wnd);
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28e0296..0aa2254 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -152,7 +152,7 @@  static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 			 * tcp header plus fixed timestamp option length.
 			 * Resulting "len" is MSS free of SACK jitter.
 			 */
-			len -= tcp_sk(sk)->tcp_header_len;
+			len -= __tcp_fast_path_header_length(tcp_sk(sk));
 			icsk->icsk_ack.last_seg_size = len;
 			if (len == lss) {
 				icsk->icsk_ack.rcv_mss = len;
@@ -5225,31 +5225,15 @@  int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	 *	extra cost of the net_bh soft interrupt processing...
 	 *	We do checksum and copy also but from device to kernel.
 	 */
-
-	tp->rx_opt.saw_tstamp = 0;
-
-	/*	pred_flags is 0xS?10 << 16 + snd_wnd
-	 *	if header_prediction is to be made
-	 *	'S' will always be tp->tcp_header_len >> 2
-	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
-	 *  turn it off	(when there are holes in the receive
-	 *	 space for instance)
-	 *	PSH flag is ignored.
-	 */
-
 	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
 	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
 	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
-		int tcp_header_len = tp->tcp_header_len;
-
-		/* Timestamp header prediction: tcp_header_len
-		 * is automatically equal to th->doff*4 due to pred_flags
-		 * match.
-		 */
+		int tcp_header_len = tcp_header_len_th(th);
 
-		/* Check timestamp */
-		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
-			/* No? Slow path! */
+		/* Timestamp header prediction */
+		if (tcp_header_len != sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) {
+			tp->rx_opt.saw_tstamp = 0; /* false */
+		} else {
 			if (!tcp_parse_aligned_timestamp(tp, th))
 				goto slow_path;
 
@@ -5264,30 +5248,7 @@  int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			 */
 		}
 
-		if (len <= tcp_header_len) {
-			/* Bulk data transfer: sender */
-			if (len == tcp_header_len) {
-				/* Predicted packet is in window by definition.
-				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
-				 * Hence, check seq<=rcv_wup reduces to:
-				 */
-				if (tcp_header_len ==
-				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
-				    tp->rcv_nxt == tp->rcv_wup)
-					tcp_store_ts_recent(tp);
-
-				/* We know that such packets are checksummed
-				 * on entry.
-				 */
-				tcp_ack(sk, skb, 0);
-				__kfree_skb(skb);
-				tcp_data_snd_check(sk);
-				return 0;
-			} else { /* Header too small */
-				TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
-				goto discard;
-			}
-		} else {
+		if (tcp_header_len < len) {
 			int eaten = 0;
 			int copied_early = 0;
 
@@ -5311,9 +5272,7 @@  int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 					 * Hence, check seq<=rcv_wup reduces to:
 					 */
-					if (tcp_header_len ==
-					    (sizeof(struct tcphdr) +
-					     TCPOLEN_TSTAMP_ALIGNED) &&
+					if (tp->rx_opt.saw_tstamp &&
 					    tp->rcv_nxt == tp->rcv_wup)
 						tcp_store_ts_recent(tp);
 
@@ -5334,8 +5293,7 @@  int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 				 * Hence, check seq<=rcv_wup reduces to:
 				 */
-				if (tcp_header_len ==
-				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+				if (tp->rx_opt.saw_tstamp &&
 				    tp->rcv_nxt == tp->rcv_wup)
 					tcp_store_ts_recent(tp);
 
@@ -5376,11 +5334,33 @@  no_ack:
 			else
 				sk->sk_data_ready(sk, 0);
 			return 0;
+		} else {
+			/* Bulk data transfer: sender
+			 *
+			 * tcp_header_len > len never happens,
+			 * already checked by tcp_v[4,6]_rcv()
+			 *
+			 * Predicted packet is in window by definition.
+			 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+			 * Hence, check seq<=rcv_wup reduces to:
+			 */
+			if (tp->rx_opt.saw_tstamp &&
+			    tp->rcv_nxt == tp->rcv_wup)
+				tcp_store_ts_recent(tp);
+
+			/* We know that such packets are checksummed
+			 * on entry.
+			 */
+			tcp_ack(sk, skb, 0);
+			__kfree_skb(skb);
+			tcp_data_snd_check(sk);
+			return 0;
 		}
 	}
 
 slow_path:
-	if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
+	/* Assumes header and options unchanged since checksum_init() */
+	if (tcp_checksum_complete_user(sk, skb))
 		goto csum_error;
 
 	/*