diff mbox

[net,v2,2/2] ipv6: fix ECMP route replacement

Message ID c1fd1b1527dc2a6905d4eeff04f1e4ed3f93964e.1431546602.git.mkubecek@suse.cz
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Michal Kubecek May 13, 2015, 7:59 p.m. UTC
When replacing an IPv6 multipath route with "ip route replace", i.e.
NLM_F_CREATE | NLM_F_REPLACE, fib6_add_rt2node() replaces only first
matching route without fixing its siblings, resulting in corrupted
siblings linked list; removing one of the siblings can then end in an
infinite loop.

Replacing the whole set of nexthops does IMHO make more sense than
replacing a random one. We also need to remove the NLM_F_REPLACE flag
after replacing old nexthops by first new so that each subsequent
nexthop does not replace previous one.

Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
 net/ipv6/ip6_fib.c | 17 ++++++++++++++---
 net/ipv6/route.c   |  8 +++++---
 2 files changed, 19 insertions(+), 6 deletions(-)

Comments

Nicolas Dichtel May 14, 2015, 6:58 p.m. UTC | #1
Le 13/05/2015 21:59, Michal Kubecek a écrit :
> When replacing an IPv6 multipath route with "ip route replace", i.e.
> NLM_F_CREATE | NLM_F_REPLACE, fib6_add_rt2node() replaces only first
> matching route without fixing its siblings, resulting in corrupted
> siblings linked list; removing one of the siblings can then end in an
> infinite loop.
>
> Replacing the whole set of nexthops does IMHO make more sense than
> replacing a random one. We also need to remove the NLM_F_REPLACE flag
> after replacing old nexthops by first new so that each subsequent
> nexthop does not replace previous one.
>
> Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
> Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
> ---
>   net/ipv6/ip6_fib.c | 17 ++++++++++++++---
>   net/ipv6/route.c   |  8 +++++---
>   2 files changed, 19 insertions(+), 6 deletions(-)
>
> diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
> index 96dbffff5a24..abf4e4e5bdab 100644
> --- a/net/ipv6/ip6_fib.c
> +++ b/net/ipv6/ip6_fib.c
> @@ -815,6 +815,8 @@ add:
>   		}
>
>   	} else {
> +		struct rt6_info *next;
> +
>   		if (!found) {
>   			if (add)
>   				goto add;
> @@ -828,15 +830,24 @@ add:
>
>   		*ins = rt;
>   		rt->rt6i_node = fn;
> -		rt->dst.rt6_next = iter->dst.rt6_next;
> +
> +		/* skip potential siblings */
> +		next = iter->dst.rt6_next;
> +		while (next && next->rt6i_metric == rt->rt6i_metric)
> +			next = next->dst.rt6_next;
I wonder if we should not loop over the siblings list here (rt->rt6i_siblings).
Only routes that match 'rt6_qualify_for_ecmp()' are siblings.

> +		rt->dst.rt6_next = next;
> +
>   		atomic_inc(&rt->rt6i_ref);
>   		inet6_rt_notify(RTM_NEWROUTE, rt, info);
>   		if (!(fn->fn_flags & RTN_RTINFO)) {
>   			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
>   			fn->fn_flags |= RTN_RTINFO;
>   		}
> -		fib6_purge_rt(iter, fn, info->nl_net);
> -		rt6_release(iter);
> +		while (iter != next) {
> +			fib6_purge_rt(iter, fn, info->nl_net);
> +			rt6_release(iter);
> +			iter = iter->dst.rt6_next;
> +		}
Same here.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michal Kubecek May 14, 2015, 9:49 p.m. UTC | #2
On Thu, May 14, 2015 at 08:58:59PM +0200, Nicolas Dichtel wrote:
> Le 13/05/2015 21:59, Michal Kubecek a écrit :
> >When replacing an IPv6 multipath route with "ip route replace", i.e.
> >NLM_F_CREATE | NLM_F_REPLACE, fib6_add_rt2node() replaces only first
> >matching route without fixing its siblings, resulting in corrupted
> >siblings linked list; removing one of the siblings can then end in an
> >infinite loop.
> >
> >Replacing the whole set of nexthops does IMHO make more sense than
> >replacing a random one. We also need to remove the NLM_F_REPLACE flag
> >after replacing old nexthops by first new so that each subsequent
> >nexthop does not replace previous one.
> >
> >Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
> >Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
> >---
> >  net/ipv6/ip6_fib.c | 17 ++++++++++++++---
> >  net/ipv6/route.c   |  8 +++++---
> >  2 files changed, 19 insertions(+), 6 deletions(-)
> >
> >diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
> >index 96dbffff5a24..abf4e4e5bdab 100644
> >--- a/net/ipv6/ip6_fib.c
> >+++ b/net/ipv6/ip6_fib.c
> >@@ -815,6 +815,8 @@ add:
> >  		}
> >
> >  	} else {
> >+		struct rt6_info *next;
> >+
> >  		if (!found) {
> >  			if (add)
> >  				goto add;
> >@@ -828,15 +830,24 @@ add:
> >
> >  		*ins = rt;
> >  		rt->rt6i_node = fn;
> >-		rt->dst.rt6_next = iter->dst.rt6_next;
> >+
> >+		/* skip potential siblings */
> >+		next = iter->dst.rt6_next;
> >+		while (next && next->rt6i_metric == rt->rt6i_metric)
> >+			next = next->dst.rt6_next;
> I wonder if we should not loop over the siblings list here
> (rt->rt6i_siblings).  Only routes that match 'rt6_qualify_for_ecmp()'
> are siblings.

Problem with looping over the siblings list is that then we would have
to find each of them in the (unidirectional) list linked by dst.rt6_next
to be able to delete them from this list.  Do we at least know that all
routes in this list with matching metric and rt6_qualify_for_ecmp() are
siblings? If so, we could still do the cleanup on one pass over the
dst.rt6_next list.

                                                         Michal Kubecek

> 
> >+		rt->dst.rt6_next = next;
> >+
> >  		atomic_inc(&rt->rt6i_ref);
> >  		inet6_rt_notify(RTM_NEWROUTE, rt, info);
> >  		if (!(fn->fn_flags & RTN_RTINFO)) {
> >  			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
> >  			fn->fn_flags |= RTN_RTINFO;
> >  		}
> >-		fib6_purge_rt(iter, fn, info->nl_net);
> >-		rt6_release(iter);
> >+		while (iter != next) {
> >+			fib6_purge_rt(iter, fn, info->nl_net);
> >+			rt6_release(iter);
> >+			iter = iter->dst.rt6_next;
> >+		}
> Same here.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michal Kubecek May 15, 2015, 8:51 a.m. UTC | #3
On Thu, May 14, 2015 at 11:49:07PM +0200, Michal Kubecek wrote:
> On Thu, May 14, 2015 at 08:58:59PM +0200, Nicolas Dichtel wrote:
> > Le 13/05/2015 21:59, Michal Kubecek a écrit :
> > >When replacing an IPv6 multipath route with "ip route replace", i.e.
> > >NLM_F_CREATE | NLM_F_REPLACE, fib6_add_rt2node() replaces only first
> > >matching route without fixing its siblings, resulting in corrupted
> > >siblings linked list; removing one of the siblings can then end in an
> > >infinite loop.
> > >
> > >Replacing the whole set of nexthops does IMHO make more sense than
> > >replacing a random one. We also need to remove the NLM_F_REPLACE flag
> > >after replacing old nexthops by first new so that each subsequent
> > >nexthop does not replace previous one.
> > >
> > >Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
> > >Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
> > >---
> > >  net/ipv6/ip6_fib.c | 17 ++++++++++++++---
> > >  net/ipv6/route.c   |  8 +++++---
> > >  2 files changed, 19 insertions(+), 6 deletions(-)
> > >
> > >diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
> > >index 96dbffff5a24..abf4e4e5bdab 100644
> > >--- a/net/ipv6/ip6_fib.c
> > >+++ b/net/ipv6/ip6_fib.c
> > >@@ -815,6 +815,8 @@ add:
> > >  		}
> > >
> > >  	} else {
> > >+		struct rt6_info *next;
> > >+
> > >  		if (!found) {
> > >  			if (add)
> > >  				goto add;
> > >@@ -828,15 +830,24 @@ add:
> > >
> > >  		*ins = rt;
> > >  		rt->rt6i_node = fn;
> > >-		rt->dst.rt6_next = iter->dst.rt6_next;
> > >+
> > >+		/* skip potential siblings */
> > >+		next = iter->dst.rt6_next;
> > >+		while (next && next->rt6i_metric == rt->rt6i_metric)
> > >+			next = next->dst.rt6_next;
> > I wonder if we should not loop over the siblings list here
> > (rt->rt6i_siblings).  Only routes that match 'rt6_qualify_for_ecmp()'
> > are siblings.
> 
> Problem with looping over the siblings list is that then we would have
> to find each of them in the (unidirectional) list linked by dst.rt6_next
> to be able to delete them from this list.  Do we at least know that all
> routes in this list with matching metric and rt6_qualify_for_ecmp() are
> siblings? If so, we could still do the cleanup on one pass over the
> dst.rt6_next list.

Hm... it's still a bit more complicated. In the "replace" case, we break
the loop once we find any route with matching metric, i.e. we can find a
non-ECMP one even if there are some ECMP siblings farther in the chain.
As far as I can see, replacing such route would cause an inconsistency
as nsiblings would no longer match the number of ECMP-able routes in the
chain.

IMHO it's not completely clear what the "replace" semantics should be
for multiple routes (and, worse, for a mix of non-ECMP and ECMP ones).
One possible approach would be

  - when new route is ECMP-able, try to find an ECMP-able route and
    replace it with all its siblings
  - if there is none, fall back to replacing first matching non-ECMP
    route (or just add if creating is allowed)
  - when new route is not ECMP-able (can this really happen with
    NLM_F_REPLACE?), replace first matching non-ECMP or insert new one

But I still rather feel like replacing all existing matching routes
would better reflect what I expect "replace" to do.

                                                        Michal Kubecek

> > >+		rt->dst.rt6_next = next;
> > >+
> > >  		atomic_inc(&rt->rt6i_ref);
> > >  		inet6_rt_notify(RTM_NEWROUTE, rt, info);
> > >  		if (!(fn->fn_flags & RTN_RTINFO)) {
> > >  			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
> > >  			fn->fn_flags |= RTN_RTINFO;
> > >  		}
> > >-		fib6_purge_rt(iter, fn, info->nl_net);
> > >-		rt6_release(iter);
> > >+		while (iter != next) {
> > >+			fib6_purge_rt(iter, fn, info->nl_net);
> > >+			rt6_release(iter);
> > >+			iter = iter->dst.rt6_next;
> > >+		}
> > Same here.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller May 15, 2015, 4:12 p.m. UTC | #4
From: Michal Kubecek <mkubecek@suse.cz>
Date: Fri, 15 May 2015 10:51:52 +0200

> But I still rather feel like replacing all existing matching routes
> would better reflect what I expect "replace" to do.

What does IPV4 do?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michal Kubecek May 15, 2015, 5:41 p.m. UTC | #5
On Fri, May 15, 2015 at 12:12:12PM -0400, David Miller wrote:
> From: Michal Kubecek <mkubecek@suse.cz>
> Date: Fri, 15 May 2015 10:51:52 +0200
> 
> > But I still rather feel like replacing all existing matching routes
> > would better reflect what I expect "replace" to do.
> 
> What does IPV4 do?

Apparently there is some problem too as on 4.1-rc3, "ip route replace"
seems to always remove first matching IPv4 route (multipath or not) but
doesn't add the new one unless there was no matching route before
calling the command (then it behaves like "add").

I also tried 3.0 and it replaces first matching route, whether it was
multipath or not and whether the new one is multipath or not. This
behaviour cannot be directly transfered to IPv6 as there are no real
IPv6 multipath routes; instead, the set of all routes under a node with
the same metric satisfying rt6_qualify_for_ecmp() is handled as a
multipath route (so that there can be only one). The semantics I
suggested in my previous mail might be a reasonable approximation.

                                                         Michal Kubecek

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller May 16, 2015, 9:18 p.m. UTC | #6
From: Michal Kubecek <mkubecek@suse.cz>
Date: Fri, 15 May 2015 19:41:56 +0200

> The semantics I suggested in my previous mail might be a reasonable
> approximation.

Ok, then let's try for that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michal Kubecek May 18, 2015, 6:53 p.m. UTC | #7
(1) When adding a nexthop of a multipath route fails (e.g. because of a
conflict with an existing route), we are supposed to delete nexthops
already added. However, currently we try to also delete all nexthops we
haven't even tried to add yet so that a "ip route add" command can
actually remove pre-existing routes if it fails.

(2) Attempt to replace a multipath route results in a broken siblings
linked list. Following commands (like "ip route del") can then either
follow a link into freed memory or end in an infinite loop (if the slab
object has been reused).

v2: fix an omission in first patch

v3: change the semantics of replace operation to better match IPv4

Michal Kubecek (2):
  ipv6: do not delete previously existing ECMP routes if add fails
  ipv6: fix ECMP route replacement

 net/ipv6/ip6_fib.c | 39 +++++++++++++++++++++++++++++++++++++--
 net/ipv6/route.c   | 14 +++++++++-----
 2 files changed, 46 insertions(+), 7 deletions(-)
David Miller May 20, 2015, 4:03 p.m. UTC | #8
From: Michal Kubecek <mkubecek@suse.cz>
Date: Mon, 18 May 2015 20:53:50 +0200 (CEST)

> (1) When adding a nexthop of a multipath route fails (e.g. because of a
> conflict with an existing route), we are supposed to delete nexthops
> already added. However, currently we try to also delete all nexthops we
> haven't even tried to add yet so that a "ip route add" command can
> actually remove pre-existing routes if it fails.
> 
> (2) Attempt to replace a multipath route results in a broken siblings
> linked list. Following commands (like "ip route del") can then either
> follow a link into freed memory or end in an infinite loop (if the slab
> object has been reused).
> 
> v2: fix an omission in first patch
> 
> v3: change the semantics of replace operation to better match IPv4

Series applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 96dbffff5a24..abf4e4e5bdab 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -815,6 +815,8 @@  add:
 		}
 
 	} else {
+		struct rt6_info *next;
+
 		if (!found) {
 			if (add)
 				goto add;
@@ -828,15 +830,24 @@  add:
 
 		*ins = rt;
 		rt->rt6i_node = fn;
-		rt->dst.rt6_next = iter->dst.rt6_next;
+
+		/* skip potential siblings */
+		next = iter->dst.rt6_next;
+		while (next && next->rt6i_metric == rt->rt6i_metric)
+			next = next->dst.rt6_next;
+		rt->dst.rt6_next = next;
+
 		atomic_inc(&rt->rt6i_ref);
 		inet6_rt_notify(RTM_NEWROUTE, rt, info);
 		if (!(fn->fn_flags & RTN_RTINFO)) {
 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 			fn->fn_flags |= RTN_RTINFO;
 		}
-		fib6_purge_rt(iter, fn, info->nl_net);
-		rt6_release(iter);
+		while (iter != next) {
+			fib6_purge_rt(iter, fn, info->nl_net);
+			rt6_release(iter);
+			iter = iter->dst.rt6_next;
+		}
 	}
 
 	return 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3821a3517478..cf842cb92b2f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2541,11 +2541,13 @@  beginning:
 			}
 		}
 		/* Because each route is added like a single route we remove
-		 * this flag after the first nexthop (if there is a collision,
+		 * these flags after the first nexthop: if there is a collision,
 		 * we have already fail to add the first nexthop:
-		 * fib6_add_rt2node() has reject it).
+		 * fib6_add_rt2node() has reject it; when replacing, old
+		 * nexthops are removed by first new, the rest is added to it.
 		 */
-		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL;
+		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
+						     NLM_F_REPLACE);
 		rtnh = rtnh_next(rtnh, &remaining);
 	}