Patchwork ip6ip6 tunnel routing issue

login
register
mail settings
Submitter Takács András
Date Feb. 22, 2012, 9:25 a.m.
Message ID <4F44B48B.70504@wakoond.hu>
Download mbox | patch
Permalink /patch/142422/
State Rejected
Delegated to: David Miller
Headers show

Comments

Takács András - Feb. 22, 2012, 9:25 a.m.
Dear All,



We're using Mobile IPv6 for vehicle communication, and we have found a 
very annoying issue in IPv6 routing.

We're running kernel 2.6.35.14, but we think that the problem still
exists in the latest kernel tree as well.



The description of the problem:

The Mobile IPv6 implementation in Linux uses a different metric value for
each Care-of Address when it creates or modifies default routes. This
causes all packets to go out on the interface which has the
lowest metric value, independently of the tunnel interface. In
practice, this means that packets which are routed into ip6tnl2
go out on eth1 instead of eth2, even though the setup binds ip6tnl2 to eth2.

I have attached a test script, which reproduces the problem, without 
Mobile IPv6. The init argument initializes the environment.



I have tried to find the problem in the kernel source. I found the
following:

In the find_rr_leaf function (net/ipv6/route.c), the iteration continues only
while the route's metric is equal to the specified one. If you look at the
calling code, you will see that this metric value is the metric of the
first route info entry: http://pastebin.com/XfALRrrY

We have two nearly identical route entries, where only the interfaces and
the metric values are different:

default via fe80::20c:29ff:fe3b:4d16 dev eth1  proto ra  metric 1023 
mtu 1500 advmss 1440 hoplimit 0
default via fe80::20c:29ff:fe3b:4d20 dev eth2  proto ra  metric 1053 
mtu 1500 advmss 1440 hoplimit 0

In this case, the above loop will call find_match only once, for eth1.
When that happens, the find_match function cannot find eth2, which
belongs to ip6tnl2, and returns eth1.



Unfortunately, the (IPv6) routing mechanism is quite complex in the
kernel. I could find only a very ugly workaround for our problem:

The attached patch introduces a new route lookup flag:
RT6_LOOKUP_F_IP6TUNNEL. It is passed step by step down to find_rr_leaf,
which ignores the metric condition in the loop if this flag has been
set.

The ip6ip6-metric-fix.patch contains the fix for 2.6.35.14. We had to
change the definition of ip6_route_output. Because of this,
ip6ip6-metric-stuff.patch contains the corresponding changes to all
other places where it is called.


What is your opinion about this problem? What do you think about this
workaround? Could anybody help us find a more elegant solution to
this issue?


Best Regards,
András Takács

Index: /trunk/kernel/linux-2.6.35.14/net/netfilter/ipvs/ip_vs_xmit.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/netfilter/ipvs/ip_vs_xmit.c	(revision 269)
+++ /trunk/kernel/linux-2.6.35.14/net/netfilter/ipvs/ip_vs_xmit.c	(revision 384)
@@ -139,11 +139,11 @@
 					},
 				},
 			};
 
 			rt = (struct rt6_info *)ip6_route_output(&init_net,
-								 NULL, &fl);
+								 NULL, &fl, 0);
 			if (!rt) {
 				spin_unlock(&dest->dst_lock);
 				IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
 					     &dest->addr.in6);
 				return NULL;
@@ -165,11 +165,11 @@
 					},
 				},
 			},
 		};
 
-		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl, 0);
 		if (!rt) {
 			IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
 				     &cp->daddr.in6);
 			return NULL;
 		}
@@ -299,11 +299,11 @@
 				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
 	};
 
 	EnterFunction(10);
 
-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl, 0);
 	if (!rt) {
 		IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
 			     __func__, &iph->daddr);
 		goto tx_error_icmp;
 	}
Index: /trunk/kernel/linux-2.6.35.14/net/netfilter/ipvs/ip_vs_ctl.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/netfilter/ipvs/ip_vs_ctl.c	(revision 269)
+++ /trunk/kernel/linux-2.6.35.14/net/netfilter/ipvs/ip_vs_ctl.c	(revision 384)
@@ -110,11 +110,11 @@
 			.ip6_u = {
 				.daddr = *addr,
 				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
 	};
 
-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl, 0);
 	if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
 			return 1;
 
 	return 0;
 }
Index: /trunk/kernel/linux-2.6.35.14/net/netfilter/xt_TEE.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/netfilter/xt_TEE.c	(revision 269)
+++ /trunk/kernel/linux-2.6.35.14/net/netfilter/xt_TEE.c	(revision 384)
@@ -151,11 +151,11 @@
 		fl.oif = info->priv->oif;
 	}
 	fl.nl_u.ip6_u.daddr = info->gw.in6;
 	fl.nl_u.ip6_u.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
 				  (iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
-	dst = ip6_route_output(net, NULL, &fl);
+	dst = ip6_route_output(net, NULL, &fl, 0);
 	if (dst == NULL)
 		return false;
 
 	skb_dst_drop(skb);
 	skb_dst_set(skb, dst);
Index: /trunk/kernel/linux-2.6.35.14/net/sctp/ipv6.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/sctp/ipv6.c	(revision 68)
+++ /trunk/kernel/linux-2.6.35.14/net/sctp/ipv6.c	(revision 384)
@@ -256,11 +256,11 @@
 	if (saddr) {
 		ipv6_addr_copy(&fl.fl6_src, &saddr->v6.sin6_addr);
 		SCTP_DEBUG_PRINTK("SRC=%pI6 - ", &fl.fl6_src);
 	}
 
-	dst = ip6_route_output(&init_net, NULL, &fl);
+	dst = ip6_route_output(&init_net, NULL, &fl, 0);
 	if (!dst->error) {
 		struct rt6_info *rt;
 		rt = (struct rt6_info *)dst;
 		SCTP_DEBUG_PRINTK("rt6_dst:%pI6 rt6_src:%pI6\n",
 			&rt->rt6i_dst.addr, &rt->rt6i_src.addr);
Index: /trunk/kernel/linux-2.6.35.14/net/ipv6/ndisc.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/ipv6/ndisc.c	(revision 379)
+++ /trunk/kernel/linux-2.6.35.14/net/ipv6/ndisc.c	(revision 384)
@@ -1531,11 +1531,11 @@
 	}
 
 	icmpv6_flow_init(sk, &fl, NDISC_REDIRECT,
 			 &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
 
-	dst = ip6_route_output(net, NULL, &fl);
+	dst = ip6_route_output(net, NULL, &fl, 0);
 	if (dst == NULL)
 		return;
 
 	err = xfrm_lookup(net, &dst, &fl, NULL, 0);
 	if (err)
Index: /trunk/kernel/linux-2.6.35.14/net/ipv6/netfilter/ip6t_REJECT.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/ipv6/netfilter/ip6t_REJECT.c	(revision 379)
+++ /trunk/kernel/linux-2.6.35.14/net/ipv6/netfilter/ip6t_REJECT.c	(revision 384)
@@ -96,11 +96,11 @@
 	ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr);
 	ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr);
 	fl.fl_ip_sport = otcph.dest;
 	fl.fl_ip_dport = otcph.source;
 	security_skb_classify_flow(oldskb, &fl);
-	dst = ip6_route_output(net, NULL, &fl);
+	dst = ip6_route_output(net, NULL, &fl, 0);
 	if (dst == NULL || dst->error) {
 		dst_release(dst);
 		return;
 	}
 	if (xfrm_lookup(net, &dst, &fl, NULL, 0))
Index: /trunk/kernel/linux-2.6.35.14/net/ipv6/ip6_output.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/ipv6/ip6_output.c	(revision 68)
+++ /trunk/kernel/linux-2.6.35.14/net/ipv6/ip6_output.c	(revision 384)
@@ -927,11 +927,11 @@
 {
 	int err;
 	struct net *net = sock_net(sk);
 
 	if (*dst == NULL)
-		*dst = ip6_route_output(net, sk, fl);
+		*dst = ip6_route_output(net, sk, fl, 0);
 
 	if ((err = (*dst)->error))
 		goto out_err_release;
 
 	if (ipv6_addr_any(&fl->fl6_src)) {
@@ -970,11 +970,11 @@
 			 * default router instead
 			 */
 			dst_release(*dst);
 			memcpy(&fl_gw, fl, sizeof(struct flowi));
 			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
-			*dst = ip6_route_output(net, sk, &fl_gw);
+			*dst = ip6_route_output(net, sk, &fl_gw, 0);
 			if ((err = (*dst)->error))
 				goto out_err_release;
 		}
 	}
 #endif
Index: /trunk/kernel/linux-2.6.35.14/net/ipv6/xfrm6_policy.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/ipv6/xfrm6_policy.c	(revision 68)
+++ /trunk/kernel/linux-2.6.35.14/net/ipv6/xfrm6_policy.c	(revision 384)
@@ -36,11 +36,11 @@
 
 	memcpy(&fl.fl6_dst, daddr, sizeof(fl.fl6_dst));
 	if (saddr)
 		memcpy(&fl.fl6_src, saddr, sizeof(fl.fl6_src));
 
-	dst = ip6_route_output(net, NULL, &fl);
+	dst = ip6_route_output(net, NULL, &fl, 0);
 
 	err = dst->error;
 	if (dst->error) {
 		dst_release(dst);
 		dst = ERR_PTR(err);
Index: /trunk/kernel/linux-2.6.35.14/net/ipv6/netfilter.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/ipv6/netfilter.c	(revision 68)
+++ /trunk/kernel/linux-2.6.35.14/net/ipv6/netfilter.c	(revision 384)
@@ -22,11 +22,11 @@
 		{ .ip6_u =
 		  { .daddr = iph->daddr,
 		    .saddr = iph->saddr, } },
 	};
 
-	dst = ip6_route_output(net, skb->sk, &fl);
+	dst = ip6_route_output(net, skb->sk, &fl, 0);
 	if (dst->error) {
 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 		LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
 		dst_release(dst);
 		return -EINVAL;
@@ -91,11 +91,11 @@
 	return 0;
 }
 
 static int nf_ip6_route(struct dst_entry **dst, struct flowi *fl)
 {
-	*dst = ip6_route_output(&init_net, NULL, fl);
+	*dst = ip6_route_output(&init_net, NULL, fl, 0);
 	return (*dst)->error;
 }
 
 __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
 			     unsigned int dataoff, u_int8_t protocol)
Index: /trunk/kernel/linux-2.6.35.14/net/ipv6/ip6mr.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/ipv6/ip6mr.c	(revision 68)
+++ /trunk/kernel/linux-2.6.35.14/net/ipv6/ip6mr.c	(revision 384)
@@ -1845,11 +1845,11 @@
 		.nl_u = { .ip6_u =
 				{ .daddr = ipv6h->daddr, }
 		}
 	};
 
-	dst = ip6_route_output(net, NULL, &fl);
+	dst = ip6_route_output(net, NULL, &fl, 0);
 	if (!dst)
 		goto out_free;
 
 	skb_dst_drop(skb);
 	skb_dst_set(skb, dst);
Index: /trunk/kernel/linux-2.6.35.14/net/ipv6/icmp.c
===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/ipv6/icmp.c	(revision 68)
+++ /trunk/kernel/linux-2.6.35.14/net/ipv6/icmp.c	(revision 384)
@@ -175,11 +175,11 @@
 	/*
 	 * Look up the output route.
 	 * XXX: perhaps the expire for routing entries cloned by
 	 * this lookup should be more aggressive (not longer than timeout).
 	 */
-	dst = ip6_route_output(net, sk, fl);
+	dst = ip6_route_output(net, sk, fl, 0);
 	if (dst->error) {
 		IP6_INC_STATS(net, ip6_dst_idev(dst),
 			      IPSTATS_MIB_OUTNOROUTES);
 	} else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
 		res = 1;

Patch

Index: /trunk/kernel/linux-2.6.35.14/include/net/ip6_route.h

===================================================================
--- /trunk/kernel/linux-2.6.35.14/include/net/ip6_route.h	(revision 68)

+++ /trunk/kernel/linux-2.6.35.14/include/net/ip6_route.h	(revision 384)

@@ -34,10 +34,11 @@ 

 #define RT6_LOOKUP_F_REACHABLE		0x00000002
 #define RT6_LOOKUP_F_HAS_SADDR		0x00000004
 #define RT6_LOOKUP_F_SRCPREF_TMP	0x00000008
 #define RT6_LOOKUP_F_SRCPREF_PUBLIC	0x00000010
 #define RT6_LOOKUP_F_SRCPREF_COA	0x00000020
+#define RT6_LOOKUP_F_IP6TUNNEL      0x00000040

 
 /*
  * rt6_srcprefs2flags() and rt6_flags2srcprefs() translate
  * between IPV6_ADDR_PREFERENCES socket option values
  *	IPV6_PREFER_SRC_TMP    = 0x1
@@ -58,11 +59,11 @@ 

 
 extern void			ip6_route_input(struct sk_buff *skb);
 
 extern struct dst_entry *	ip6_route_output(struct net *net,
 						 struct sock *sk,
-						 struct flowi *fl);

+						 struct flowi *fl, int flags);

 
 extern int			ip6_route_init(void);
 extern void			ip6_route_cleanup(void);
 
 extern int			ipv6_route_ioctl(struct net *net,
Index: /trunk/kernel/linux-2.6.35.14/net/ipv6/ip6_tunnel.c

===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/ipv6/ip6_tunnel.c	(revision 68)

+++ /trunk/kernel/linux-2.6.35.14/net/ipv6/ip6_tunnel.c	(revision 384)

@@ -860,11 +860,11 @@ 

 	int pkt_len;
 
 	if ((dst = ip6_tnl_dst_check(t)) != NULL)
 		dst_hold(dst);
 	else {
-		dst = ip6_route_output(net, NULL, fl);

+		dst = ip6_route_output(net, NULL, fl, RT6_LOOKUP_F_IP6TUNNEL);

 
 		if (dst->error || xfrm_lookup(net, &dst, fl, NULL, 0) < 0)
 			goto tx_err_link_failure;
 	}
 
Index: /trunk/kernel/linux-2.6.35.14/net/ipv6/route.c

===================================================================
--- /trunk/kernel/linux-2.6.35.14/net/ipv6/route.c	(revision 68)

+++ /trunk/kernel/linux-2.6.35.14/net/ipv6/route.c	(revision 384)

@@ -398,27 +398,32 @@ 

 	return match;
 }
 
 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 				     struct rt6_info *rr_head,
-				     u32 metric, int oif, int strict)

+				     u32 metric, int oif, int flags, int reachable)

 {
 	struct rt6_info *rt, *match;
 	int mpri = -1;
+	int strict = 0;

+

+	strict |= flags & RT6_LOOKUP_F_IFACE;

 
 	match = NULL;
-	for (rt = rr_head; rt && rt->rt6i_metric == metric;

-	     rt = rt->u.dst.rt6_next)

+	for (rt = rr_head; rt && ((flags & RT6_LOOKUP_F_IP6TUNNEL) || rt->rt6i_metric == metric);

+	     rt = rt->u.dst.rt6_next) {

 		match = find_match(rt, oif, strict, &mpri, match);
-	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;

-	     rt = rt->u.dst.rt6_next)

+    }

+	for (rt = fn->leaf; rt && rt != rr_head && ((flags & RT6_LOOKUP_F_IP6TUNNEL) || rt->rt6i_metric == metric);

+	     rt = rt->u.dst.rt6_next) {

 		match = find_match(rt, oif, strict, &mpri, match);
+    }

 
 	return match;
 }
 
-static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)

+static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int flags, int reachable)

 {
 	struct rt6_info *match, *rt0;
 	struct net *net;
 
 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
@@ -426,14 +431,13 @@ 

 
 	rt0 = fn->rr_ptr;
 	if (!rt0)
 		fn->rr_ptr = rt0 = fn->leaf;
 
-	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);

-

-	if (!match &&

-	    (strict & RT6_LOOKUP_F_REACHABLE)) {

+	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, flags, reachable);

+

+	if (!match && reachable) {

 		struct rt6_info *next = rt0->u.dst.rt6_next;
 
 		/* no entries matched; do round-robin */
 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
 			next = fn->leaf;
@@ -703,25 +707,22 @@ 

 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
 				      struct flowi *fl, int flags)
 {
 	struct fib6_node *fn;
 	struct rt6_info *rt, *nrt;
-	int strict = 0;

 	int attempts = 3;
 	int err;
 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
 
-	strict |= flags & RT6_LOOKUP_F_IFACE;

-

 relookup:
 	read_lock_bh(&table->tb6_lock);
 
 restart_2:
 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 
 restart:
-	rt = rt6_select(fn, oif, strict | reachable);

+    rt = rt6_select(fn, oif, flags, reachable);

 
 	BACKTRACK(net, &fl->fl6_src);
 	if (rt == net->ipv6.ip6_null_entry ||
 	    rt->rt6i_flags & RTF_CACHE)
 		goto out;
@@ -768,11 +769,15 @@ 

 	read_unlock_bh(&table->tb6_lock);
 out2:
 	rt->u.dst.lastuse = jiffies;
 	rt->u.dst.__use++;
 
-	return rt;

+    if (flags & RT6_LOOKUP_F_IP6TUNNEL) {

+        printk(KERN_INFO "*** %s: %s\n", __FUNCTION__, rt->rt6i_dev->name);

+    }

+	

+    return rt;

 }
 
 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
 					    struct flowi *fl, int flags)
 {
@@ -808,14 +813,12 @@ 

 {
 	return ip6_pol_route(net, table, fl->oif, fl, flags);
 }
 
 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
-				    struct flowi *fl)

-{

-	int flags = 0;

-

+				    struct flowi *fl, int flags)

+{

 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
 		flags |= RT6_LOOKUP_F_IFACE;
 
 	if (!ipv6_addr_any(&fl->fl6_src))
 		flags |= RT6_LOOKUP_F_HAS_SADDR;
@@ -2381,11 +2384,11 @@ 

 	   through good chunk of routing engine.
 	 */
 	skb_reset_mac_header(skb);
 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
 
-	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);

+	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl, 0);

 	skb_dst_set(skb, &rt->u.dst);
 
 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
 			    nlh->nlmsg_seq, 0, 0, 0);