Patchwork ipv4: IP defragmentation must be ECN aware

login
register
mail settings
Submitter Eric Dumazet
Date Jan. 5, 2011, 1:59 p.m.
Message ID <1294235942.2775.191.camel@edumazet-laptop>
Download mbox | patch
Permalink /patch/77596/
State Superseded
Delegated to: David Miller
Headers show

Comments

Eric Dumazet - Jan. 5, 2011, 1:59 p.m.
RFC3168 (The Addition of Explicit Congestion Notification to IP)
states :

5.3.  Fragmentation

   ECN-capable packets MAY have the DF (Don't Fragment) bit set.
   Reassembly of a fragmented packet MUST NOT lose indications of
   congestion.  In other words, if any fragment of an IP packet to be
   reassembled has the CE codepoint set, then one of two actions MUST be
   taken:

      * Set the CE codepoint on the reassembled packet.  However, this
        MUST NOT occur if any of the other fragments contributing to
        this reassembly carries the Not-ECT codepoint.

      * The packet is dropped, instead of being reassembled, for any
        other reason.

This patch implements this requirement for IPv4, choosing the first
action : 

If one fragment had NO-ECT codepoint
	reassembled frame has NO-ECT
ElIf one fragment had CE codepoint
	reassembled frame has CE

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/ip_fragment.c |   34 ++++++++++++++++++++++++++++++++++
 1 files changed, 34 insertions(+)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger - Jan. 5, 2011, 5:13 p.m.
On Wed, 05 Jan 2011 14:59:02 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> +
> +static inline int ip4_frag_ecn(int tos)

Since tos is only a byte, this should be:

static inline u8 ip4_frag_ecn(u8 tos)
Eric Dumazet - Jan. 5, 2011, 5:41 p.m.
Le mercredi 05 janvier 2011 à 09:13 -0800, Stephen Hemminger a écrit :
> On Wed, 05 Jan 2011 14:59:02 +0100
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > +
> > +static inline int ip4_frag_ecn(int tos)
> 
> Since tos is only a byte, this should be:
> 
> static inline u8 ip4_frag_ecn(u8 tos)
> 
> 

In fact, generated code is the same on x86, but some arches have faster
arithmetic on WORD units.

And I added the 'inline' because on x86_64 gcc, compiler chose _not_ to
inline this 6 instruction sequence ! Code was much larger.

31 d2                   xor    %edx,%edx
0f b6 40 01             movzbl 0x1(%rax),%eax
83 e0 03                and    $0x3,%eax
ff c0                   inc    %eax
a8 02                   test   $0x2,%al
0f 44 d0                cmove  %eax,%edx


We do roughly the same (working on WORD arith) in 

static inline int IP_ECN_set_ce(struct iphdr *iph)
{
u32 ecn = (iph->tos + 1) & INET_ECN_MASK;
...
}

What others think ? I have no real strong opinion.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger - Jan. 5, 2011, 5:48 p.m.
On Wed, 05 Jan 2011 18:41:01 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Le mercredi 05 janvier 2011 à 09:13 -0800, Stephen Hemminger a écrit :
> > On Wed, 05 Jan 2011 14:59:02 +0100
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > 
> > > +
> > > +static inline int ip4_frag_ecn(int tos)
> > 
> > Since tos is only a byte, this should be:
> > 
> > static inline u8 ip4_frag_ecn(u8 tos)
> > 
> > 
> 
> In fact, generated code is the same on x86, but some arches have faster
> arithmetic on WORD units.
> 
> And I added the 'inline' because on x86_64 gcc, compiler chose _not_ to
> inline this 6 instruction sequence ! Code was much larger.
> 
> 31 d2                   xor    %edx,%edx
> 0f b6 40 01             movzbl 0x1(%rax),%eax
> 83 e0 03                and    $0x3,%eax
> ff c0                   inc    %eax
> a8 02                   test   $0x2,%al
> 0f 44 d0                cmove  %eax,%edx
> 
> 
> We do roughly the same (working on WORD arith) in 
>
At least make it unsigned int?

Patch

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e6215bd..e6b53a7 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -45,6 +45,7 @@ 
 #include <linux/udp.h>
 #include <linux/inet.h>
 #include <linux/netfilter_ipv4.h>
+#include <net/inet_ecn.h>
 
 /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
  * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -70,11 +71,28 @@  struct ipq {
 	__be32		daddr;
 	__be16		id;
 	u8		protocol;
+	u8		ecn; /* RFC3168 support */
 	int             iif;
 	unsigned int    rid;
 	struct inet_peer *peer;
 };
 
+#define IPFRAG_ECN_CLEAR  0x01 /* one frag had INET_ECN_NOT_ECT */
+#define IPFRAG_ECN_SET_CE 0x04 /* one frag had INET_ECN_CE */
+
+static inline int ip4_frag_ecn(int tos)
+{
+	tos = (tos & INET_ECN_MASK) + 1;
+	/*
+	 * After the last operation we have (in binary):
+	 * INET_ECN_NOT_ECT => 001
+	 * INET_ECN_ECT_1   => 010
+	 * INET_ECN_ECT_0   => 011
+	 * INET_ECN_CE      => 100
+	 */
+	return (tos & 2) ? 0 : tos;
+}
+
 static struct inet_frags ip4_frags;
 
 int ip_frag_nqueues(struct net *net)
@@ -137,6 +155,7 @@  static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 
 	qp->protocol = arg->iph->protocol;
 	qp->id = arg->iph->id;
+	qp->ecn = ip4_frag_ecn(arg->iph->tos);
 	qp->saddr = arg->iph->saddr;
 	qp->daddr = arg->iph->daddr;
 	qp->user = arg->user;
@@ -316,6 +335,7 @@  static int ip_frag_reinit(struct ipq *qp)
 	qp->q.fragments = NULL;
 	qp->q.fragments_tail = NULL;
 	qp->iif = 0;
+	qp->ecn = 0;
 
 	return 0;
 }
@@ -328,6 +348,7 @@  static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	int flags, offset;
 	int ihl, end;
 	int err = -ENOENT;
+	u8 ecn;
 
 	if (qp->q.last_in & INET_FRAG_COMPLETE)
 		goto err;
@@ -339,6 +360,7 @@  static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		goto err;
 	}
 
+	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
 	offset = ntohs(ip_hdr(skb)->frag_off);
 	flags = offset & ~IP_OFFSET;
 	offset &= IP_OFFSET;
@@ -472,6 +494,7 @@  found:
 	}
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
+	qp->ecn |= ecn;
 	atomic_add(skb->truesize, &qp->q.net->mem);
 	if (offset == 0)
 		qp->q.last_in |= INET_FRAG_FIRST_IN;
@@ -583,6 +606,17 @@  static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	iph = ip_hdr(head);
 	iph->frag_off = 0;
 	iph->tot_len = htons(len);
+	/* RFC3168 5.3 Fragmentation support :
+	 * If one fragment had INET_ECN_NOT_ECT,
+	 *	reassembled frame also has INET_ECN_NOT_ECT
+	 * Elif one fragment had INET_ECN_CE
+	 *	reassembled frame also has INET_ECN_CE
+	 */
+	if (qp->ecn & IPFRAG_ECN_CLEAR)
+		iph->tos &= ~INET_ECN_MASK;
+	else if (qp->ecn & IPFRAG_ECN_SET_CE)
+		iph->tos |= INET_ECN_CE;
+
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
 	qp->q.fragments_tail = NULL;