diff mbox

[net-next,1/3] ipv4: properly refresh rtable entries on pmtu/redirect events

Message ID 1369723593-5307-1-git-send-email-timo.teras@iki.fi
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Timo Teras May 28, 2013, 6:46 a.m. UTC
This reverts commit 05ab86c5 (xfrm4: Invalidate all ipv4 routes on
IPsec pmtu events). Flushing all cached entries is not needed.

Instead, invalidate only the related next hop dsts to recheck for
the added next hop exception where needed. This also fixes a subtle
race due to bumping generation id's before updating the pmtu.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Timo Teräs <timo.teras@iki.fi>
---
 net/ipv4/ah4.c    |  7 ++-----
 net/ipv4/esp4.c   |  7 ++-----
 net/ipv4/ipcomp.c |  7 ++-----
 net/ipv4/route.c  | 63 ++++++++++++++++++++++++++++++++-----------------------
 4 files changed, 43 insertions(+), 41 deletions(-)

Comments

Julian Anastasov May 28, 2013, 8:25 a.m. UTC | #1
Hello,

On Tue, 28 May 2013, Timo Teräs wrote:

> This reverts commit 05ab86c5 (xfrm4: Invalidate all ipv4 routes on
> IPsec pmtu events). Flushing all cached entries is not needed.
> 
> Instead, invalidate only the related next hop dsts to recheck for
> the added next hop exception where needed. This also fixes a subtle
> race due to bumping generation id's before updating the pmtu.
> 
> Cc: Steffen Klassert <steffen.klassert@secunet.com>
> Signed-off-by: Timo Teräs <timo.teras@iki.fi>
> ---

> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 550781a..561a378 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -594,11 +594,25 @@ static inline u32 fnhe_hashfun(__be32 daddr)
>  	return hval & (FNHE_HASH_SIZE - 1);
>  }
>  
> +static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
> +{
> +	rt->rt_pmtu = fnhe->fnhe_pmtu;
> +	rt->dst.expires = fnhe->fnhe_expires;

	The 'if (time_before' ... dst_set_expires() logic from
rt_bind_exception() is removed, may be it should be moved here,
i.e. fnhe_pmtu should be ignored if expired.

> @@ -627,8 +641,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
>  			fnhe->fnhe_gw = gw;
>  		if (pmtu) {
>  			fnhe->fnhe_pmtu = pmtu;
> -			fnhe->fnhe_expires = expires;
> +			fnhe->fnhe_expires = max(1UL, expires);
>  		}
> +		/* Update all cached dsts too */
> +		rt = rcu_dereference(fnhe->fnhe_rth);

	rt = rcu_dereference_protected(fnhe->fnhe_rth, 1);

Regards

--
Julian Anastasov <ja@ssi.bg>
Timo Teras May 28, 2013, 8:53 a.m. UTC | #2
On Tue, 28 May 2013 11:25:55 +0300 (EEST)
Julian Anastasov <ja@ssi.bg> wrote:

> 
> 	Hello,
> 
> On Tue, 28 May 2013, Timo Teräs wrote:
> 
> > This reverts commit 05ab86c5 (xfrm4: Invalidate all ipv4 routes on
> > IPsec pmtu events). Flushing all cached entries is not needed.
> > 
> > Instead, invalidate only the related next hop dsts to recheck for
> > the added next hop exception where needed. This also fixes a subtle
> > race due to bumping generation id's before updating the pmtu.
> > 
> > Cc: Steffen Klassert <steffen.klassert@secunet.com>
> > Signed-off-by: Timo Teräs <timo.teras@iki.fi>
> > ---
> 
> > diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> > index 550781a..561a378 100644
> > --- a/net/ipv4/route.c
> > +++ b/net/ipv4/route.c
> > @@ -594,11 +594,25 @@ static inline u32 fnhe_hashfun(__be32 daddr)
> >  	return hval & (FNHE_HASH_SIZE - 1);
> >  }
> >  
> > +static void fill_route_from_fnhe(struct rtable *rt, struct
> > fib_nh_exception *fnhe) +{
> > +	rt->rt_pmtu = fnhe->fnhe_pmtu;
> > +	rt->dst.expires = fnhe->fnhe_expires;
> 
> 	The 'if (time_before' ... dst_set_expires() logic from
> rt_bind_exception() is removed, may be it should be moved here,
> i.e. fnhe_pmtu should be ignored if expired.

That code would not help much. The route's rt_pmtu is never reset to
zero after the fnhe expires, so this would not make much difference.
The old rt_pmtu and dst.expires would be left there anyway. All rt
accesses check for expires too.

This was actually intentional on the old code, as non-zero rt_pmtu
implied that we had "next hop exception route" instead of "next hop
route" and affected how the rt was invalidated in pmtu update.

If we want to clear out these fields, then it would make sense to have
rt_bind_exception() to reset these on expiry to the struct
fib_nh_exception directly and keep the direct assignments in
fill_route_from_fnhe().

- Timo
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Julian Anastasov May 28, 2013, 7:44 p.m. UTC | #3
Hello,

On Tue, 28 May 2013, Timo Teras wrote:

> > > +static void fill_route_from_fnhe(struct rtable *rt, struct
> > > fib_nh_exception *fnhe) +{
> > > +	rt->rt_pmtu = fnhe->fnhe_pmtu;
> > > +	rt->dst.expires = fnhe->fnhe_expires;
> > 
> > 	The 'if (time_before' ... dst_set_expires() logic from
> > rt_bind_exception() is removed, may be it should be moved here,
> > i.e. fnhe_pmtu should be ignored if expired.
> 
> That code would not help much. The route's rt_pmtu is never reset to
> zero after the fnhe expires, so this would not make much difference.
> The old rt_pmtu and dst.expires would be left there anyway. All rt
> accesses check for expires too.

	I see, ipv4_mtu validates rt_pmtu expiration, so
such checks are not needed in rt_bind_exception and
fill_route_from_fnhe.

> This was actually intentional on the old code, as non-zero rt_pmtu
> implied that we had "next hop exception route" instead of "next hop
> route" and affected how the rt was invalidated in pmtu update.
> 
> If we want to clear out these fields, then it would make sense to have
> rt_bind_exception() to reset these on expiry to the struct
> fib_nh_exception directly and keep the direct assignments in
> fill_route_from_fnhe().
> 
> - Timo

Regards

--
Julian Anastasov <ja@ssi.bg>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller June 3, 2013, 7:09 a.m. UTC | #4
From: Timo Teräs <timo.teras@iki.fi>
Date: Tue, 28 May 2013 09:46:31 +0300

> This reverts commit 05ab86c5 (xfrm4: Invalidate all ipv4 routes on
> IPsec pmtu events). Flushing all cached entries is not needed.
> 
> Instead, invalidate only the related next hop dsts to recheck for
> the added next hop exception where needed. This also fixes a subtle
> race due to bumping generation id's before updating the pmtu.
> 
> Cc: Steffen Klassert <steffen.klassert@secunet.com>
> Signed-off-by: Timo Teräs <timo.teras@iki.fi>

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 2e7f194..7179026 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -419,12 +419,9 @@  static void ah4_err(struct sk_buff *skb, u32 info)
 	if (!x)
 		return;
 
-	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) {
-		atomic_inc(&flow_cache_genid);
-		rt_genid_bump(net);
-
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
-	} else
+	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
 	xfrm_state_put(x);
 }
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 4cfe34d..ab3d814 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -502,12 +502,9 @@  static void esp4_err(struct sk_buff *skb, u32 info)
 	if (!x)
 		return;
 
-	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) {
-		atomic_inc(&flow_cache_genid);
-		rt_genid_bump(net);
-
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
-	} else
+	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
 	xfrm_state_put(x);
 }
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 59cb8c7..826be4c 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -47,12 +47,9 @@  static void ipcomp4_err(struct sk_buff *skb, u32 info)
 	if (!x)
 		return;
 
-	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) {
-		atomic_inc(&flow_cache_genid);
-		rt_genid_bump(net);
-
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
-	} else
+	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
 	xfrm_state_put(x);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 550781a..561a378 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -594,11 +594,25 @@  static inline u32 fnhe_hashfun(__be32 daddr)
 	return hval & (FNHE_HASH_SIZE - 1);
 }
 
+static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
+{
+	rt->rt_pmtu = fnhe->fnhe_pmtu;
+	rt->dst.expires = fnhe->fnhe_expires;
+
+	if (fnhe->fnhe_gw) {
+		rt->rt_flags |= RTCF_REDIRECTED;
+		rt->rt_gateway = fnhe->fnhe_gw;
+		rt->rt_uses_gateway = 1;
+	}
+}
+
 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 				  u32 pmtu, unsigned long expires)
 {
 	struct fnhe_hash_bucket *hash;
 	struct fib_nh_exception *fnhe;
+	struct rtable *rt;
+	unsigned int i;
 	int depth;
 	u32 hval = fnhe_hashfun(daddr);
 
@@ -627,8 +641,12 @@  static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 			fnhe->fnhe_gw = gw;
 		if (pmtu) {
 			fnhe->fnhe_pmtu = pmtu;
-			fnhe->fnhe_expires = expires;
+			fnhe->fnhe_expires = max(1UL, expires);
 		}
+		/* Update all cached dsts too */
+		rt = rcu_dereference(fnhe->fnhe_rth);
+		if (rt)
+			fill_route_from_fnhe(rt, fnhe);
 	} else {
 		if (depth > FNHE_RECLAIM_DEPTH)
 			fnhe = fnhe_oldest(hash);
@@ -644,6 +662,18 @@  static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 		fnhe->fnhe_gw = gw;
 		fnhe->fnhe_pmtu = pmtu;
 		fnhe->fnhe_expires = expires;
+
+		/* Exception created; mark the cached routes for the nexthop
+		 * stale, so anyone caching it rechecks if this exception
+		 * applies to them.
+		 */
+		for_each_possible_cpu(i) {
+			struct rtable __rcu **prt;
+			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
+			rt = rcu_dereference(*prt);
+			if (rt)
+				rt->dst.obsolete = DST_OBSOLETE_KILL;
+		}
 	}
 
 	fnhe->fnhe_stamp = jiffies;
@@ -917,13 +947,6 @@  static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (mtu < ip_rt_min_pmtu)
 		mtu = ip_rt_min_pmtu;
 
-	if (!rt->rt_pmtu) {
-		dst->obsolete = DST_OBSOLETE_KILL;
-	} else {
-		rt->rt_pmtu = mtu;
-		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
-	}
-
 	rcu_read_lock();
 	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
@@ -1063,11 +1086,11 @@  static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
 	 * into this function always.
 	 *
-	 * When a PMTU/redirect information update invalidates a
-	 * route, this is indicated by setting obsolete to
-	 * DST_OBSOLETE_KILL.
+	 * When a PMTU/redirect information update invalidates a route,
+	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
+	 * DST_OBSOLETE_DEAD by dst_free().
 	 */
-	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
+	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
 		return NULL;
 	return dst;
 }
@@ -1215,20 +1238,8 @@  static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 			fnhe->fnhe_pmtu = 0;
 			fnhe->fnhe_expires = 0;
 		}
-		if (fnhe->fnhe_pmtu) {
-			unsigned long expires = fnhe->fnhe_expires;
-			unsigned long diff = expires - jiffies;
-
-			if (time_before(jiffies, expires)) {
-				rt->rt_pmtu = fnhe->fnhe_pmtu;
-				dst_set_expires(&rt->dst, diff);
-			}
-		}
-		if (fnhe->fnhe_gw) {
-			rt->rt_flags |= RTCF_REDIRECTED;
-			rt->rt_gateway = fnhe->fnhe_gw;
-			rt->rt_uses_gateway = 1;
-		} else if (!rt->rt_gateway)
+		fill_route_from_fnhe(rt, fnhe);
+		if (!rt->rt_gateway)
 			rt->rt_gateway = daddr;
 
 		rcu_assign_pointer(fnhe->fnhe_rth, rt);