diff mbox

[net-next,4/6] ipv6: Only create RTF_CACHE routes after encountering pmtu exception

Message ID 1430255273-3045254-5-git-send-email-kafai@fb.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Martin KaFai Lau April 28, 2015, 9:07 p.m. UTC
This patch creates a RTF_CACHE routes only after encountering a pmtu
exception.

After ip6_rt_update_pmtu() has inserted the RTF_CACHE route to the fib6
tree, the rt->rt6i_node->fn_sernum is bumped which will fail the
ip6_dst_check() and trigger a relookup.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/route.c | 96 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 48 insertions(+), 48 deletions(-)

Comments

Steffen Klassert April 29, 2015, 11:39 a.m. UTC | #1
On Tue, Apr 28, 2015 at 02:07:51PM -0700, Martin KaFai Lau wrote:
>  
> -static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
> -			       struct sk_buff *skb, u32 mtu)
> +static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
> +				 const struct ipv6hdr *iph, u32 mtu)
>  {
>  	struct rt6_info *rt6 = (struct rt6_info *)dst;
> +	struct net *net;
> +
> +	if (rt6->rt6i_flags & RTF_LOCAL)
> +		return;
>  
>  	dst_confirm(dst);
> -	if (mtu < dst_mtu(dst) && (rt6->rt6i_flags & RTF_CACHE)) {
> -		struct net *net = dev_net(dst->dev);
> +	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
> +	if (mtu >= dst_mtu(dst))
> +		return;
>  
> -		rt6->rt6i_flags |= RTF_MODIFIED;
> -		if (mtu < IPV6_MIN_MTU)
> -			mtu = IPV6_MIN_MTU;
> +	if (!(rt6->rt6i_flags & RTF_CACHE)) {
> +		const struct in6_addr *daddr, *saddr;
> +		struct rt6_info *nrt6;
>  
> -		rt6->rt6i_pmtu = mtu;
> -		rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
> +		if (iph) {
> +			daddr = &iph->daddr;
> +			saddr = &iph->saddr;
> +		} else if (sk) {
> +			daddr = &sk->sk_v6_daddr;
> +			saddr = &inet6_sk(sk)->saddr;
> +		} else {
> +			return;
> +		}
> +		nrt6 = ip6_pmtu_rt_cache_alloc(rt6, daddr, saddr);
> +		if (!nrt6)
> +			return;
> +		/* ip6_ins_rt(nrt6) will bump the rt6->rt6i_node->fn_sernum
> +		 * which will fail the next rt6_check() and invalidate the
> +		 * sk->sk_dst_cache.
> +		 */
> +		if (ip6_ins_rt(nrt6)) {
> +			dst_destroy(&nrt6->dst);

fib6_add() does a dst_free() on error, so calling dst_destroy()
here might result in a use after free.


> +			return;
> +		}
> +
> +		rt6 = nrt6;
> +		dst = &nrt6->dst;
>  	}
> +
> +	net = dev_net(dst->dev);
> +	rt6->rt6i_flags |= RTF_MODIFIED;
> +	rt6->rt6i_pmtu = mtu;
> +	rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);

The update of expires and the setting of rt6i_pmtu should
happen before the route is inserted with ip6_ins_rt().
This is because fib6_add_rt2node() tries to reuse old
expired routes if still in the fib tree, the necessary
informations are copied from the new route before it
returnes -EEXIST on the new route. If your new route
has no expires value set, fib6_add_rt2node() cleans
expires of the old route before it resues it.

Also rt6i_pmtu should be copied to the reused route in
fib6_add_rt2node(), this should be done already in your
first patchset. Otherwise we might use stale pmtu informations.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin KaFai Lau April 29, 2015, 6:31 p.m. UTC | #2
On Wed, Apr 29, 2015 at 01:39:18PM +0200, Steffen Klassert wrote:
> On Tue, Apr 28, 2015 at 02:07:51PM -0700, Martin KaFai Lau wrote:
> > +		if (ip6_ins_rt(nrt6)) {
> > +			dst_destroy(&nrt6->dst);
> 
> fib6_add() does a dst_free() on error, so calling dst_destroy()
> here might result in a use after free.
Good catch.

> 
> 
> > +			return;
> > +		}
> > +
> > +		rt6 = nrt6;
> > +		dst = &nrt6->dst;
> >  	}
> > +
> > +	net = dev_net(dst->dev);
> > +	rt6->rt6i_flags |= RTF_MODIFIED;
> > +	rt6->rt6i_pmtu = mtu;
> > +	rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
> 
> The update of expires and the setting of rt6i_pmtu should
> happen before the route is inserted with ip6_ins_rt().
>
> This is because fib6_add_rt2node() tries to reuse old
> expired routes if still in the fib tree, the necessary
> informations are copied from the new route before it
> returnes -EEXIST on the new route. If your new route
> has no expires value set, fib6_add_rt2node() cleans
> expires of the old route before it resues it.
>
> Also rt6i_pmtu should be copied to the reused route in
> fib6_add_rt2node(), this should be done already in your
> first patchset. Otherwise we might use stale pmtu informations.
Good catch.

A similar race may also happen in the current ip6_pol_route()
where it may clear the RTF_EXPIRES of the existing pmtu clone.

Hence, copying rt6i_pmtu (at fib6_add_rt2node()) in the last patchset will
not be right.

I will do the copying and early-set-expire in this patchset instead.

Thanks,
---Martin
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8bc83bc..09ab0f4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -873,16 +873,13 @@  static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 				      struct flowi6 *fl6, int flags)
 {
 	struct fib6_node *fn, *saved_fn;
-	struct rt6_info *rt, *nrt;
+	struct rt6_info *rt;
 	int strict = 0;
-	int attempts = 3;
-	int err;
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
 	if (net->ipv6.devconf_all->forwarding == 0)
 		strict |= RT6_LOOKUP_F_REACHABLE;
 
-redo_fib6_lookup_lock:
 	read_lock_bh(&table->tb6_lock);
 
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
@@ -901,46 +898,12 @@  redo_rt6_select:
 			strict &= ~RT6_LOOKUP_F_REACHABLE;
 			fn = saved_fn;
 			goto redo_rt6_select;
-		} else {
-			dst_hold(&rt->dst);
-			read_unlock_bh(&table->tb6_lock);
-			goto out2;
 		}
 	}
 
 	dst_hold(&rt->dst);
 	read_unlock_bh(&table->tb6_lock);
 
-	if (rt->rt6i_flags & RTF_CACHE)
-		goto out2;
-
-	if (!rt6_is_gw_or_nonexthop(rt) ||
-	    !(rt->dst.flags & DST_HOST) || !(rt->dst.flags & RTF_LOCAL))
-		nrt = ip6_pmtu_rt_cache_alloc(rt, &fl6->daddr, &fl6->saddr);
-	else
-		goto out2;
-
-	ip6_rt_put(rt);
-	rt = nrt ? : net->ipv6.ip6_null_entry;
-
-	dst_hold(&rt->dst);
-	if (nrt) {
-		err = ip6_ins_rt(nrt);
-		if (!err)
-			goto out2;
-	}
-
-	if (--attempts <= 0)
-		goto out2;
-
-	/*
-	 * Race condition! In the gap, when table->tb6_lock was
-	 * released someone could insert this route.  Relookup.
-	 */
-	ip6_rt_put(rt);
-	goto redo_fib6_lookup_lock;
-
-out2:
 	rt6_dst_from_metrics_check(rt);
 	rt->dst.lastuse = jiffies;
 	rt->dst.__use++;
@@ -1113,22 +1076,59 @@  static void ip6_link_failure(struct sk_buff *skb)
 	}
 }
 
-static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
-			       struct sk_buff *skb, u32 mtu)
+static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
+				 const struct ipv6hdr *iph, u32 mtu)
 {
 	struct rt6_info *rt6 = (struct rt6_info *)dst;
+	struct net *net;
+
+	if (rt6->rt6i_flags & RTF_LOCAL)
+		return;
 
 	dst_confirm(dst);
-	if (mtu < dst_mtu(dst) && (rt6->rt6i_flags & RTF_CACHE)) {
-		struct net *net = dev_net(dst->dev);
+	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+	if (mtu >= dst_mtu(dst))
+		return;
 
-		rt6->rt6i_flags |= RTF_MODIFIED;
-		if (mtu < IPV6_MIN_MTU)
-			mtu = IPV6_MIN_MTU;
+	if (!(rt6->rt6i_flags & RTF_CACHE)) {
+		const struct in6_addr *daddr, *saddr;
+		struct rt6_info *nrt6;
 
-		rt6->rt6i_pmtu = mtu;
-		rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
+		if (iph) {
+			daddr = &iph->daddr;
+			saddr = &iph->saddr;
+		} else if (sk) {
+			daddr = &sk->sk_v6_daddr;
+			saddr = &inet6_sk(sk)->saddr;
+		} else {
+			return;
+		}
+		nrt6 = ip6_pmtu_rt_cache_alloc(rt6, daddr, saddr);
+		if (!nrt6)
+			return;
+		/* ip6_ins_rt(nrt6) will bump the rt6->rt6i_node->fn_sernum
+		 * which will fail the next rt6_check() and invalidate the
+		 * sk->sk_dst_cache.
+		 */
+		if (ip6_ins_rt(nrt6)) {
+			dst_destroy(&nrt6->dst);
+			return;
+		}
+
+		rt6 = nrt6;
+		dst = &nrt6->dst;
 	}
+
+	net = dev_net(dst->dev);
+	rt6->rt6i_flags |= RTF_MODIFIED;
+	rt6->rt6i_pmtu = mtu;
+	rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
+}
+
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			       struct sk_buff *skb, u32 mtu)
+{
+	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
 }
 
 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
@@ -1147,7 +1147,7 @@  void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
 
 	dst = ip6_route_output(net, NULL, &fl6);
 	if (!dst->error)
-		ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
+		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
 	dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_update_pmtu);