diff mbox

Unable to flush ICMP redirect routes in kernel 3.0+

Message ID 1321632128.3277.29.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet Nov. 18, 2011, 4:02 p.m. UTC
David, unless I missed something, we should revert commit f39925dbde77
(ipv4: Cache learned redirect information in inetpeer.)

With following patch, redirects now work for me.

Thanks !



[PATCH net-next] ipv4: fix redirect handling

commit f39925dbde77 (ipv4: Cache learned redirect information in
inetpeer.) introduced a regression in ICMP redirect handling.

It assumed ipv4_dst_check() would be called because all possible routes
were attached to the inetpeer we modify in ip_rt_redirect(), but that's
not true.

commit 7cc9150ebe (route: fix ICMP redirect validation) tried to fix
this, but the solution was not complete (it fixed only one route).

So we must look up existing routes (including different TOS values) and
call check_peer_redir() on them.

Reported-by: Ivan Zahariev <famzah@icdsoft.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Flavio Leitner <fbl@redhat.com>
---
 net/ipv4/route.c |  110 ++++++++++++++++++++++++---------------------
 1 file changed, 59 insertions(+), 51 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Flavio Leitner Nov. 18, 2011, 4:30 p.m. UTC | #1
On Fri, 18 Nov 2011 17:02:08 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> 
> 
> David, unless I missed something, we should revert commit f39925dbde77
> ipv4: Cache learned redirect information in inetpeer.)
> 
> With following patch, redirects now work for me.
> 
> Thanks !
> 
> 
> 
> [PATCH net-next] ipv4: fix redirect handling
> 
> commit f39925dbde77 (ipv4: Cache learned redirect information in
> inetpeer.) introduced a regression in ICMP redirect handling.
> 
> It assumed ipv4_dst_check() would be called because all possible
> routes were attached to the inetpeer we modify in ip_rt_redirect(),
> but thats not true.
> 
> commit 7cc9150ebe (route: fix ICMP redirect validation) tried to fix
> this but solution was not complete. (It fixed only one route)
> 
> So we must lookup existing routes (including different TOS values) and
> call check_peer_redir() on them.
> 
> Reported-by: Ivan Zahariev <famzah@icdsoft.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Flavio Leitner <fbl@redhat.com>
> ---
>  net/ipv4/route.c |  110 ++++++++++++++++++++++++---------------------
>  1 file changed, 59 insertions(+), 51 deletions(-)
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 511f4a7..0c74da8 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1304,16 +1304,42 @@ static void rt_del(unsigned hash, struct
> rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash));
>  }
>  
> +static int check_peer_redir(struct dst_entry *dst, struct inet_peer
> *peer) +{
> +	struct rtable *rt = (struct rtable *) dst;
> +	__be32 orig_gw = rt->rt_gateway;
> +	struct neighbour *n, *old_n;
> +
> +	dst_confirm(&rt->dst);
> +
> +	rt->rt_gateway = peer->redirect_learned.a4;
> +
> +	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
> +	if (IS_ERR(n))
> +		return PTR_ERR(n);
> +	old_n = xchg(&rt->dst._neighbour, n);
> +	if (old_n)
> +		neigh_release(old_n);
> +	if (!n || !(n->nud_state & NUD_VALID)) {
> +		if (n)
> +			neigh_event_send(n, NULL);
> +		rt->rt_gateway = orig_gw;
> +		return -EAGAIN;
> +	} else {
> +		rt->rt_flags |= RTCF_REDIRECTED;
> +		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
> +	}
> +	return 0;
> +}
> +
>  /* called in rcu_read_lock() section */
>  void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
>  		    __be32 saddr, struct net_device *dev)
>  {
>  	int s, i;
>  	struct in_device *in_dev = __in_dev_get_rcu(dev);
> -	struct rtable *rt;
>  	__be32 skeys[2] = { saddr, 0 };
>  	int    ikeys[2] = { dev->ifindex, 0 };
> -	struct flowi4 fl4;
>  	struct inet_peer *peer;
>  	struct net *net;
>  
> @@ -1336,33 +1362,42 @@ void ip_rt_redirect(__be32 old_gw, __be32
> daddr, __be32 new_gw, goto reject_redirect;
>  	}
>  
> -	memset(&fl4, 0, sizeof(fl4));
> -	fl4.daddr = daddr;
>  	for (s = 0; s < 2; s++) {
>  		for (i = 0; i < 2; i++) {
> -			fl4.flowi4_oif = ikeys[i];
> -			fl4.saddr = skeys[s];
> -			rt = __ip_route_output_key(net, &fl4);
> -			if (IS_ERR(rt))
> -				continue;
> -
> -			if (rt->dst.error || rt->dst.dev != dev ||
> -			    rt->rt_gateway != old_gw) {
> -				ip_rt_put(rt);
> -				continue;
> -			}
> +			unsigned int hash;
> +			struct rtable __rcu **rthp;
> +			struct rtable *rt;
> +
> +			hash = rt_hash(daddr, skeys[s], ikeys[i],
> rt_genid(net)); +
> +			rthp = &rt_hash_table[hash].chain;
> +
> +			while ((rt = rcu_dereference(*rthp)) !=
> NULL) {
> +				rthp = &rt->dst.rt_next;
> +
> +				if (rt->rt_key_dst != daddr ||
> +				    rt->rt_key_src != skeys[s] ||
> +				    rt->rt_oif != ikeys[i] ||
> +				    rt_is_input_route(rt) ||
> +				    rt_is_expired(rt) ||
> +				    !net_eq(dev_net(rt->dst.dev),
> net) ||
> +				    rt->dst.error ||
> +				    rt->dst.dev != dev ||
> +				    rt->rt_gateway != old_gw)
> +					continue;
>  

I know we are reverting to get it fixed, but this adds the routing
cache back, so what is the plan? Revert to get it working and then
think on new approach to remove the route cache again later?

I had one previous patch using the routing cache posted to the list,
but it won't fix the route flush problem.

thanks,
fbl
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Nov. 18, 2011, 4:34 p.m. UTC | #2
Le vendredi 18 novembre 2011 à 14:30 -0200, Flavio Leitner a écrit :

> I know we are reverting to get it fixed, but this adds the routing
> cache back, so what is the plan? Revert to get it working and then
> think on new approach to remove the route cache again later?
> 
> I had one previous patch using the routing cache posted to the list,
> but it won't fix the route flush problem.
> 

I don't "add the routing cache back".

Note I only fix existing route entries in the cache ;)

A "revert" is probably safe, since we should push a fix for 3.0/3.1/3.2
kernels...



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Flavio Leitner Nov. 18, 2011, 5:05 p.m. UTC | #3
On Fri, 18 Nov 2011 17:34:06 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Le vendredi 18 novembre 2011 à 14:30 -0200, Flavio Leitner a écrit :
> 
> > I know we are reverting to get it fixed, but this adds the routing
> > cache back, so what is the plan? Revert to get it working and then
> > think on new approach to remove the route cache again later?
> > 
> > I had one previous patch using the routing cache posted to the list,
> > but it won't fix the route flush problem.
> > 
> 
> I dont "add the routing cache back".

Sorry, I meant that we are trying to avoid doing this:
+			hash = rt_hash(daddr, skeys[s], ikeys[i],rt_genid(net));
+
+			rthp = &rt_hash_table[hash].chain;
+
+			while ((rt = rcu_dereference(*rthp)) != NULL) {
+				rthp = &rt->dst.rt_next;

anyway, see below.

> Note I only fix existing route entries in the cache ;)
Exactly.
 
> A "revert" is probably safe, since we should push a fix for
> 3.0/3.1/3.2 kernels...

I agree that reverting is probably safe.
fbl

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Nov. 18, 2011, 5:07 p.m. UTC | #4
Le vendredi 18 novembre 2011 à 15:05 -0200, Flavio Leitner a écrit :

> Sorry, I meant that we are trying to avoid doing this:
> +			hash = rt_hash(daddr, skeys[s], ikeys[i],rt_genid(net));
> +
> +			rthp = &rt_hash_table[hash].chain;
> +
> +			while ((rt = rcu_dereference(*rthp)) != NULL) {
> +				rthp = &rt->dst.rt_next;

Sure, but this is still needed right now.

Once the route cache is removed, this loop won't exist anymore ;)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Flavio Leitner Nov. 18, 2011, 5:21 p.m. UTC | #5
On Fri, 18 Nov 2011 18:07:53 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Le vendredi 18 novembre 2011 à 15:05 -0200, Flavio Leitner a écrit :
> 
> > Sorry, I meant that we are trying to avoid doing this:
> > +			hash = rt_hash(daddr, skeys[s],
> > ikeys[i],rt_genid(net)); +
> > +			rthp = &rt_hash_table[hash].chain;
> > +
> > +			while ((rt = rcu_dereference(*rthp)) !=
> > NULL) {
> > +				rthp = &rt->dst.rt_next;
> 
> Sure, but this is still needed right now.

Yes, David will not be happy, unfortunately :)

> Once route cache is removed, this loop wont exist anymore ;)

That's the problem, we need to get rid of it first. :)

fbl
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Nov. 18, 2011, 6:04 p.m. UTC | #6
From: Flavio Leitner <fbl@redhat.com>
Date: Fri, 18 Nov 2011 15:21:42 -0200

> On Fri, 18 Nov 2011 18:07:53 +0100
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
>> Le vendredi 18 novembre 2011 à 15:05 -0200, Flavio Leitner a écrit :
>> 
>> > Sorry, I meant that we are trying to avoid doing this:
>> > +			hash = rt_hash(daddr, skeys[s],
>> > ikeys[i],rt_genid(net)); +
>> > +			rthp = &rt_hash_table[hash].chain;
>> > +
>> > +			while ((rt = rcu_dereference(*rthp)) !=
>> > NULL) {
>> > +				rthp = &rt->dst.rt_next;
>> 
>> Sure, but this is still needed right now.
> 
> Yes, David will not be happy, unfortunately :)

He better be happy that someone is fixing all the bugs he added.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Nov. 18, 2011, 8:26 p.m. UTC | #7
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 18 Nov 2011 17:02:08 +0100

> David, unless I missed something, we should revert commit f39925dbde77
> ipv4: Cache learned redirect information in inetpeer.)
> 
> With following patch, redirects now work for me.

Yes, it doesn't work very well... sigh.

I've applied your patch and queued it up for stable.

Long term we need a different scheme for redirects.

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 511f4a7..0c74da8 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1304,16 +1304,42 @@  static void rt_del(unsigned hash, struct rtable *rt)
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
+static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	__be32 orig_gw = rt->rt_gateway;
+	struct neighbour *n, *old_n;
+
+	dst_confirm(&rt->dst);
+
+	rt->rt_gateway = peer->redirect_learned.a4;
+
+	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
+	if (IS_ERR(n))
+		return PTR_ERR(n);
+	old_n = xchg(&rt->dst._neighbour, n);
+	if (old_n)
+		neigh_release(old_n);
+	if (!n || !(n->nud_state & NUD_VALID)) {
+		if (n)
+			neigh_event_send(n, NULL);
+		rt->rt_gateway = orig_gw;
+		return -EAGAIN;
+	} else {
+		rt->rt_flags |= RTCF_REDIRECTED;
+		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
+	}
+	return 0;
+}
+
 /* called in rcu_read_lock() section */
 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		    __be32 saddr, struct net_device *dev)
 {
 	int s, i;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
-	struct rtable *rt;
 	__be32 skeys[2] = { saddr, 0 };
 	int    ikeys[2] = { dev->ifindex, 0 };
-	struct flowi4 fl4;
 	struct inet_peer *peer;
 	struct net *net;
 
@@ -1336,33 +1362,42 @@  void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 			goto reject_redirect;
 	}
 
-	memset(&fl4, 0, sizeof(fl4));
-	fl4.daddr = daddr;
 	for (s = 0; s < 2; s++) {
 		for (i = 0; i < 2; i++) {
-			fl4.flowi4_oif = ikeys[i];
-			fl4.saddr = skeys[s];
-			rt = __ip_route_output_key(net, &fl4);
-			if (IS_ERR(rt))
-				continue;
-
-			if (rt->dst.error || rt->dst.dev != dev ||
-			    rt->rt_gateway != old_gw) {
-				ip_rt_put(rt);
-				continue;
-			}
+			unsigned int hash;
+			struct rtable __rcu **rthp;
+			struct rtable *rt;
+
+			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
+
+			rthp = &rt_hash_table[hash].chain;
+
+			while ((rt = rcu_dereference(*rthp)) != NULL) {
+				rthp = &rt->dst.rt_next;
+
+				if (rt->rt_key_dst != daddr ||
+				    rt->rt_key_src != skeys[s] ||
+				    rt->rt_oif != ikeys[i] ||
+				    rt_is_input_route(rt) ||
+				    rt_is_expired(rt) ||
+				    !net_eq(dev_net(rt->dst.dev), net) ||
+				    rt->dst.error ||
+				    rt->dst.dev != dev ||
+				    rt->rt_gateway != old_gw)
+					continue;
 
-			if (!rt->peer)
-				rt_bind_peer(rt, rt->rt_dst, 1);
+				if (!rt->peer)
+					rt_bind_peer(rt, rt->rt_dst, 1);
 
-			peer = rt->peer;
-			if (peer) {
-				peer->redirect_learned.a4 = new_gw;
-				atomic_inc(&__rt_peer_genid);
+				peer = rt->peer;
+				if (peer) {
+					if (peer->redirect_learned.a4 != new_gw) {
+						peer->redirect_learned.a4 = new_gw;
+						atomic_inc(&__rt_peer_genid);
+					}
+					check_peer_redir(&rt->dst, peer);
+				}
 			}
-
-			ip_rt_put(rt);
-			return;
 		}
 	}
 	return;
@@ -1649,33 +1684,6 @@  static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 	}
 }
 
-static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
-{
-	struct rtable *rt = (struct rtable *) dst;
-	__be32 orig_gw = rt->rt_gateway;
-	struct neighbour *n, *old_n;
-
-	dst_confirm(&rt->dst);
-
-	rt->rt_gateway = peer->redirect_learned.a4;
-
-	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
-	if (IS_ERR(n))
-		return PTR_ERR(n);
-	old_n = xchg(&rt->dst._neighbour, n);
-	if (old_n)
-		neigh_release(old_n);
-	if (!n || !(n->nud_state & NUD_VALID)) {
-		if (n)
-			neigh_event_send(n, NULL);
-		rt->rt_gateway = orig_gw;
-		return -EAGAIN;
-	} else {
-		rt->rt_flags |= RTCF_REDIRECTED;
-		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
-	}
-	return 0;
-}
 
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {