diff mbox

gre: copy ToS/DiffServ bits to outer IP header

Message ID 20090713133225.GA20946@urbino.open.ch
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Andreas Jaggi July 13, 2009, 1:32 p.m. UTC
When tunneling IP traffic with GRE this patch makes it possible to export the
ToS/DiffServ information to the outer IP header. This is particularly useful in
a scenario with ESP/AH where the inner IP header is encrypted but the packet
priority/DiffServ information should still be respected by the transporting
routers (for example in an MPLS backbone network).

The feature is disabled by default and can be enabled on per-interface basis.
The flag is stored in an unused bit of ip_tunnel_parm.o_flags, and can be
modified through the rntl_link interface.

Also does this bring Linux back in the game, as JunOS/IOS provide this for
quite some time:
http://www.cisco.com/en/US/docs/ios/11_3/feature/guide/greqos.html
http://www.juniper.net/techpubs/software/junos/junos94/swconfig-services/configuring-a-gre-tunnel-to-copy-tos-bits-to-the-outer-ip-header.html

Signed-off-by: Andreas Jaggi <aj@open.ch>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

stephen hemminger July 13, 2009, 3:33 p.m. UTC | #1
On Mon, 13 Jul 2009 15:32:25 +0200
Andreas Jaggi <aj@open.ch> wrote:

> When tunneling IP traffic with GRE this patch makes it possible to export the
> ToS/DiffServ information to the outer IP header. This is particularly useful in
> a scenario with ESP/AH where the inner IP header is encrypted but the packet
> priority/DiffServ information should still be respected by the transporting
> routers (for example in an MPLS backbone network).
>
> The feature is disabled by default and can be enabled on per-interface basis.
> The flag is stored in an unused bit of ip_tunnel_parm.o_flags, and can be
> modified through the rntl_link interface.

Why make it an option? Sounds like it should always be on.

> Also does this bring Linux back in the game, as JunOS/IOS provide this for
> quite some time:
> http://www.cisco.com/en/US/docs/ios/11_3/feature/guide/greqos.html
> http://www.juniper.net/techpubs/software/junos/junos94/swconfig-services/configuring-a-gre-tunnel-to-copy-tos-bits-to-the-outer-ip-header.html
> 
> Signed-off-by: Andreas Jaggi <aj@open.ch>
> 
> diff -urN vanilla-linux-2.6.29.4/include/linux/if_tunnel.h dev-gre/include/linux/if_tunnel.h
> --- vanilla-linux-2.6.29.4/include/linux/if_tunnel.h	2009-05-19 01:52:34.000000000 +0200
> +++ dev-gre/include/linux/if_tunnel.h	2009-07-13 15:15:26.000000000 +0200
> @@ -24,6 +24,7 @@
>  #define GRE_REC		__constant_htons(0x0700)
>  #define GRE_FLAGS	__constant_htons(0x00F8)
>  #define GRE_VERSION	__constant_htons(0x0007)
> +#define GRE_COPY_TOS	__constant_htons(0x0008)
>  
>  struct ip_tunnel_parm
>  {
> diff -urN vanilla-linux-2.6.29.4/net/ipv4/ip_gre.c dev-gre/net/ipv4/ip_gre.c
> --- vanilla-linux-2.6.29.4/net/ipv4/ip_gre.c	2009-05-19 01:52:34.000000000 +0200
> +++ dev-gre/net/ipv4/ip_gre.c	2009-07-01 15:30:44.000000000 +0200
> @@ -677,7 +677,7 @@
>  	}
>  
>  	tos = tiph->tos;
> -	if (tos&1) {
> +	if (tunnel->parms.o_flags&GRE_COPY_TOS || tos&1) {

This needs whitespace and parenthesis, to be safe and conform to
kernel coding style.

>  		if (skb->protocol == htons(ETH_P_IP))
>  			tos = old_iph->tos;
>  		tos &= ~1;
> @@ -804,7 +804,7 @@
>  			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
>  	}
>  
> -	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
> +	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags&~GRE_FLAGS;
more white space

>  	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
>  				   htons(ETH_P_TEB) : skb->protocol;
>  
> @@ -1080,7 +1080,7 @@
>  	__be16 *p = (__be16*)(iph+1);
>  
>  	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
> -	p[0]		= t->parms.o_flags;
> +	p[0]		= t->parms.o_flags&~GRE_FLAGS;
>  	p[1]		= htons(type);
>  
>  	/*
> @@ -1503,6 +1503,7 @@
>  	t->parms.iph.ttl = p.iph.ttl;
>  	t->parms.iph.tos = p.iph.tos;
>  	t->parms.iph.frag_off = p.iph.frag_off;
> +	t->parms.o_flags |= p.o_flags&GRE_COPY_TOS;
>  
>  	if (t->parms.link != p.link) {
>  		t->parms.link = p.link;
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller July 13, 2009, 5:44 p.m. UTC | #2
From: Andreas Jaggi <aj@open.ch>
Date: Mon, 13 Jul 2009 15:32:25 +0200

> @@ -24,6 +24,7 @@
>  #define GRE_REC		__constant_htons(0x0700)
>  #define GRE_FLAGS	__constant_htons(0x00F8)
>  #define GRE_VERSION	__constant_htons(0x0007)
> +#define GRE_COPY_TOS	__constant_htons(0x0008)
>  
>  struct ip_tunnel_parm
>  {

This seems dangerious, and unnecessary.

If some future RFC defines this flag, we'll have to change
this and add ugly compat code to keep older tools working.

Why not just create a new IFLA_GRE_* attribute to pass in
flags like this that purely are private to the GRE Linux
tunnel driver and have nothing to do with values defined
by GRE tunnel headers at all?

I'm definitely not applying this as-is.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Jaggi July 14, 2009, 7:14 a.m. UTC | #3
On Mon, Jul 13, 2009 at 10:44:33AM -0700, David Miller wrote:
> Why not just create a new IFLA_GRE_* attribute to pass in
> flags like this that purely are private to the GRE Linux
> tunnel driver and have nothing to do with values defined
> by GRE tunnel headers at all?

OK. What would be an appropriate place to store such a private flag in
the GRE driver?

I thought of extending the ip_tunnel_parms struct with '__u16 priv_flags'
(But this would break compatibility with the iproute2 tools?)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Jaggi July 14, 2009, 2:21 p.m. UTC | #4
On Tue, Jul 14, 2009 at 01:39:48PM +0100, Wojtek Sawasciuk wrote:
> I think I have similar problems to yours, and I spotted your email on  
> the lists when googling :)
> Andreas, sorry but I think your patch is unnecessary, at last in this form.
> From long time there is possibility to create tunnel with tos options,  
> you can use "inherit" or any other value to set it to encapsulated 
> packets.
>
> ip tun add mode gre name tun1 remote 1.2.3.5 local 10.0.0.234 tos inherit
> or
> ip tun add mode gre name tun2 remote 1.2.3.6 local 10.0.0.234 tos 2
>
> and its working, *almost* ok.
> only problem is when you set tos value with bit 1 set in it, regarding  
> this fragment of code from ip_gre.c , in ipgre_tunnel_xmit function:
>
>        tos = tiph->tos;
>        if (tos&1) {
>                if (skb->protocol == htons(ETH_P_IP))
>                        tos = old_iph->tos;
>                tos &= ~1;
>        }
>
> (ie. set tos to 2 - it will replicate it to 2, set tos to 3 - it will  
> change it to 2).
>
> IHMO it should be replaced to use maybe MBZ bit (look on RFC1349) ,  
> instead of bit 1 , to pass on value of tos , from tunnel parameter to IP  
> header.
> and this should be sufficient to make it work properly.

You're right, my patch in this form is not necessary.

But as you point out, there is a bug in the handling of the ToS/DiffServ
bits in GRE.
iproute2 and the Linux GRE driver use a tos value of 1 to indicate an
'inherit'/'copy-tos' behavior (and then clear this bit before using the
tos value in the IP header of the encapsulation GRE packet).

This was fine for RFC1349 where this bit is defined as MBZ/must-be-zero.
But with RFC3168, this bit (and the other previously unused one in the tos field)
are used for the ECN bits (ECT + CE). Therefore a tos value of 1 is 'legal' and
conflicts with the 'inherit'/'copy-tos' indication of iproute2 and the
Linux GRE driver.

This creates the problem you described, where IP packets who have a tos
value of 1 (with the ECT bit set to 1) are handled with the 'inherit'/'copy-tos'
mechanism and the resulting GRE IP header has a tos value of 0! (the ECT bit
set to 0 instead of 1)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller July 14, 2009, 4:35 p.m. UTC | #5
From: Andreas Jaggi <aj@open.ch>
Date: Tue, 14 Jul 2009 09:14:10 +0200

> On Mon, Jul 13, 2009 at 10:44:33AM -0700, David Miller wrote:
>> Why not just create a new IFLA_GRE_* attribute to pass in
>> flags like this that purely are private to the GRE Linux
>> tunnel driver and have nothing to do with values defined
>> by GRE tunnel headers at all?
> 
> OK. What would be an appropriate place to store such a private flag in
> the GRE driver?
> 
> I thought of extending the ip_tunnel_parms struct with '__u16 priv_flags'
> (But this would break compatibility with the iproute2 tools?)

You'll need to find a place.  Worst case you can make a top-level
new structure, that contains the user interface structure and
then the new __u16 priv_flags thing.  And allocate that per-tunnel
instead of what it does now.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff -urN vanilla-linux-2.6.29.4/include/linux/if_tunnel.h dev-gre/include/linux/if_tunnel.h
--- vanilla-linux-2.6.29.4/include/linux/if_tunnel.h	2009-05-19 01:52:34.000000000 +0200
+++ dev-gre/include/linux/if_tunnel.h	2009-07-13 15:15:26.000000000 +0200
@@ -24,6 +24,7 @@ 
 #define GRE_REC		__constant_htons(0x0700)
 #define GRE_FLAGS	__constant_htons(0x00F8)
 #define GRE_VERSION	__constant_htons(0x0007)
+#define GRE_COPY_TOS	__constant_htons(0x0008)
 
 struct ip_tunnel_parm
 {
diff -urN vanilla-linux-2.6.29.4/net/ipv4/ip_gre.c dev-gre/net/ipv4/ip_gre.c
--- vanilla-linux-2.6.29.4/net/ipv4/ip_gre.c	2009-05-19 01:52:34.000000000 +0200
+++ dev-gre/net/ipv4/ip_gre.c	2009-07-01 15:30:44.000000000 +0200
@@ -677,7 +677,7 @@ 
 	}
 
 	tos = tiph->tos;
-	if (tos&1) {
+	if (tunnel->parms.o_flags&GRE_COPY_TOS || tos&1) {
 		if (skb->protocol == htons(ETH_P_IP))
 			tos = old_iph->tos;
 		tos &= ~1;
@@ -804,7 +804,7 @@ 
 			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
 	}
 
-	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
+	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags&~GRE_FLAGS;
 	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
 				   htons(ETH_P_TEB) : skb->protocol;
 
@@ -1080,7 +1080,7 @@ 
 	__be16 *p = (__be16*)(iph+1);
 
 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
-	p[0]		= t->parms.o_flags;
+	p[0]		= t->parms.o_flags&~GRE_FLAGS;
 	p[1]		= htons(type);
 
 	/*
@@ -1503,6 +1503,7 @@ 
 	t->parms.iph.ttl = p.iph.ttl;
 	t->parms.iph.tos = p.iph.tos;
 	t->parms.iph.frag_off = p.iph.frag_off;
+	t->parms.o_flags |= p.o_flags&GRE_COPY_TOS;
 
 	if (t->parms.link != p.link) {
 		t->parms.link = p.link;