
[net-next] ipv4: use next hop exceptions also for input routes

Message ID: 1372318025-7634-1-git-send-email-timo.teras@iki.fi
State: Accepted, archived
Delegated to: David Miller

Commit Message

Timo Teras June 27, 2013, 7:27 a.m. UTC
Commit d2d68ba9 (ipv4: Cache input routes in fib_info nexthops)
assumed that "locally destined, and routed packets, never trigger
PMTU events or redirects that will be processed by us".

However, it seems that tunnel devices do trigger PMTU events in certain
cases. At least ip_gre, ip6_gre, sit, and ipip do use the inner flow's
skb_dst(skb)->ops->update_pmtu to propagate mtu information from the
outer flows. These can cause the inner flow mtu to be decreased. If
next hop exceptions are not consulted for pmtu, IP fragmentation will
not be done properly for these routes.
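
(For illustration only, not part of this patch; the function name below is
made up. Conceptually, the tunnel drivers listed above push a reduced
outer-path MTU to the inner flow's cached route through the dst's
update_pmtu hook, roughly like this:)

#include <linux/skbuff.h>
#include <net/dst.h>

/* Hypothetical sketch: when the outer path MTU shrinks, a tunnel
 * driver propagates the reduced value (minus the tunnel header
 * overhead) to the inner flow's cached route via the dst_ops
 * update_pmtu() hook.  For an input route this value ends up in a
 * next hop exception, which therefore has to be consulted on the
 * input path as well.
 */
static void example_tunnel_propagate_pmtu(struct sk_buff *skb,
					  unsigned int outer_mtu,
					  unsigned int tunnel_hlen)
{
	struct dst_entry *dst = skb_dst(skb);	/* the inner flow's route */

	if (dst && dst->ops->update_pmtu)
		dst->ops->update_pmtu(dst, NULL, skb, outer_mtu - tunnel_hlen);
}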

It also seems that we really need to always have the PMTU information
available for the netfilter TCPMSS clamp-to-pmtu feature to work properly.

So for the time being, cache separate copies of input routes for
each next hop exception.
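
(A minimal sketch of the idea, condensed from the hunks below; the helper
name fnhe_cache_slot() is hypothetical, the patch itself open-codes this
choice in rt_bind_exception() and __mkroute_input():)

#include <net/ip_fib.h>
#include <net/route.h>

/* Each next hop exception now caches the input and the output route
 * separately (fnhe_rth_input / fnhe_rth_output); the slot is picked
 * according to the direction of the route being bound.
 */
static struct rtable __rcu **fnhe_cache_slot(struct fib_nh_exception *fnhe,
					     struct rtable *rt)
{
	return rt_is_input_route(rt) ? &fnhe->fnhe_rth_input
				     : &fnhe->fnhe_rth_output;
}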

Signed-off-by: Timo Teräs <timo.teras@iki.fi>
---
It seems that the previous send was marked as RFC and got no review
comments. No alternate patches seem to have been committed either.

I'm resending now without the RFC tag, rebased against current
net-next. I've been using this locally for several weeks and have
not found any problems, and this does fix the two regression bugs
mentioned above.

 include/net/ip_fib.h     |  3 ++-
 net/ipv4/fib_semantics.c |  3 ++-
 net/ipv4/route.c         | 65 +++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 54 insertions(+), 17 deletions(-)

Comments

Julian Anastasov June 27, 2013, 8:52 a.m. UTC | #1
Hello,

On Thu, 27 Jun 2013, Timo Teräs wrote:

> Commit d2d68ba9 (ipv4: Cache input routes in fib_info nexthops)
> assumed that "locally destined, and routed packets, never trigger
> PMTU events or redirects that will be processed by us".
> 
> However, it seems that tunnel devices do trigger PMTU events in certain
> cases. At least ip_gre, ip6_gre, sit, and ipip do use the inner flow's
> skb_dst(skb)->ops->update_pmtu to propagate mtu information from the
> outer flows. These can cause the inner flow mtu to be decreased. If
> next hop exceptions are not consulted for pmtu, IP fragmentation will
> not be done properly for these routes.
> 
> It also seems that we really need to always have the PMTU information
> available for the netfilter TCPMSS clamp-to-pmtu feature to work properly.
> 
> So for the time being, cache separate copies of input routes for
> each next hop exception.
> 
> Signed-off-by: Timo Teräs <timo.teras@iki.fi>

	Looks good to me

Reviewed-by: Julian Anastasov <ja@ssi.bg>

> ---
> It seems that the previous send was marked as RFC and got no review
> comments. No alternate patches seem to have been committed either.
> 
> I'm resending now without the RFC tag, rebased against current
> net-next. I've been using this locally for several weeks and have
> not found any problems, and this does fix the two regression bugs
> mentioned above.

Regards

--
Julian Anastasov <ja@ssi.bg>
David Miller June 29, 2013, 4:28 a.m. UTC | #2
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 27 Jun 2013 11:52:14 +0300 (EEST)

> 
> 	Hello,
> 
> On Thu, 27 Jun 2013, Timo Teräs wrote:
> 
>> Commit d2d68ba9 (ipv4: Cache input routes in fib_info nexthops)
>> assumed that "locally destined, and routed packets, never trigger
>> PMTU events or redirects that will be processed by us".
>> 
>> However, it seems that tunnel devices do trigger PMTU events in certain
>> cases. At least ip_gre, ip6_gre, sit, and ipip do use the inner flow's
>> skb_dst(skb)->ops->update_pmtu to propagate mtu information from the
>> outer flows. These can cause the inner flow mtu to be decreased. If
>> next hop exceptions are not consulted for pmtu, IP fragmentation will
>> not be done properly for these routes.
>> 
>> It also seems that we really need to always have the PMTU information
>> available for the netfilter TCPMSS clamp-to-pmtu feature to work properly.
>> 
>> So for the time being, cache separate copies of input routes for
>> each next hop exception.
>> 
>> Signed-off-by: Timo Teräs <timo.teras@iki.fi>
> 
> 	Looks good to me
> 
> Reviewed-by: Julian Anastasov <ja@ssi.bg>

Applied to net-next, thanks everyone.

Patch

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 44424e9..aac8553 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -56,7 +56,8 @@  struct fib_nh_exception {
 	u32				fnhe_pmtu;
 	__be32				fnhe_gw;
 	unsigned long			fnhe_expires;
-	struct rtable __rcu		*fnhe_rth;
+	struct rtable __rcu		*fnhe_rth_input;
+	struct rtable __rcu		*fnhe_rth_output;
 	unsigned long			fnhe_stamp;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8f6cb7a..d5dbca5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -169,7 +169,8 @@  static void free_nh_exceptions(struct fib_nh *nh)
 			
 			next = rcu_dereference_protected(fnhe->fnhe_next, 1);
 
-			rt_fibinfo_free(&fnhe->fnhe_rth);
+			rt_fibinfo_free(&fnhe->fnhe_rth_input);
+			rt_fibinfo_free(&fnhe->fnhe_rth_output);
 
 			kfree(fnhe);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f3fa42e..a9a54a2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -565,10 +565,25 @@  static inline void rt_free(struct rtable *rt)
 
 static DEFINE_SPINLOCK(fnhe_lock);
 
+static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
+{
+	struct rtable *rt;
+
+	rt = rcu_dereference(fnhe->fnhe_rth_input);
+	if (rt) {
+		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
+		rt_free(rt);
+	}
+	rt = rcu_dereference(fnhe->fnhe_rth_output);
+	if (rt) {
+		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
+		rt_free(rt);
+	}
+}
+
 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 {
 	struct fib_nh_exception *fnhe, *oldest;
-	struct rtable *orig;
 
 	oldest = rcu_dereference(hash->chain);
 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
@@ -576,11 +591,7 @@  static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 			oldest = fnhe;
 	}
-	orig = rcu_dereference(oldest->fnhe_rth);
-	if (orig) {
-		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
-		rt_free(orig);
-	}
+	fnhe_flush_routes(oldest);
 	return oldest;
 }
 
@@ -644,7 +655,10 @@  static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 			fnhe->fnhe_expires = max(1UL, expires);
 		}
 		/* Update all cached dsts too */
-		rt = rcu_dereference(fnhe->fnhe_rth);
+		rt = rcu_dereference(fnhe->fnhe_rth_input);
+		if (rt)
+			fill_route_from_fnhe(rt, fnhe);
+		rt = rcu_dereference(fnhe->fnhe_rth_output);
 		if (rt)
 			fill_route_from_fnhe(rt, fnhe);
 	} else {
@@ -668,6 +682,10 @@  static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 		 * stale, so anyone caching it rechecks if this exception
 		 * applies to them.
 		 */
+		rt = rcu_dereference(nh->nh_rth_input);
+		if (rt)
+			rt->dst.obsolete = DST_OBSOLETE_KILL;
+
 		for_each_possible_cpu(i) {
 			struct rtable __rcu **prt;
 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
@@ -1242,25 +1260,36 @@  static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 	spin_lock_bh(&fnhe_lock);
 
 	if (daddr == fnhe->fnhe_daddr) {
+		struct rtable __rcu **porig;
+		struct rtable *orig;
 		int genid = fnhe_genid(dev_net(rt->dst.dev));
-		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
+
+		if (rt_is_input_route(rt))
+			porig = &fnhe->fnhe_rth_input;
+		else
+			porig = &fnhe->fnhe_rth_output;
+		orig = rcu_dereference(*porig);
 
 		if (fnhe->fnhe_genid != genid) {
 			fnhe->fnhe_genid = genid;
 			fnhe->fnhe_gw = 0;
 			fnhe->fnhe_pmtu = 0;
 			fnhe->fnhe_expires = 0;
+			fnhe_flush_routes(fnhe);
+			orig = NULL;
 		}
 		fill_route_from_fnhe(rt, fnhe);
 		if (!rt->rt_gateway)
 			rt->rt_gateway = daddr;
 
-		rcu_assign_pointer(fnhe->fnhe_rth, rt);
-		if (orig)
-			rt_free(orig);
+		if (!(rt->dst.flags & DST_NOCACHE)) {
+			rcu_assign_pointer(*porig, rt);
+			if (orig)
+				rt_free(orig);
+			ret = true;
+		}
 
 		fnhe->fnhe_stamp = jiffies;
-		ret = true;
 	}
 	spin_unlock_bh(&fnhe_lock);
 
@@ -1492,6 +1521,7 @@  static int __mkroute_input(struct sk_buff *skb,
 			   struct in_device *in_dev,
 			   __be32 daddr, __be32 saddr, u32 tos)
 {
+	struct fib_nh_exception *fnhe;
 	struct rtable *rth;
 	int err;
 	struct in_device *out_dev;
@@ -1538,8 +1568,13 @@  static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
+	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
 	if (do_cache) {
-		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+		if (fnhe != NULL)
+			rth = rcu_dereference(fnhe->fnhe_rth_input);
+		else
+			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+
 		if (rt_cache_valid(rth)) {
 			skb_dst_set_noref(skb, &rth->dst);
 			goto out;
@@ -1567,7 +1602,7 @@  static int __mkroute_input(struct sk_buff *skb,
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
 
-	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
+	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
 	skb_dst_set(skb, &rth->dst);
 out:
 	err = 0;
@@ -1882,7 +1917,7 @@  static struct rtable *__mkroute_output(const struct fib_result *res,
 
 		fnhe = find_exception(nh, fl4->daddr);
 		if (fnhe)
-			prth = &fnhe->fnhe_rth;
+			prth = &fnhe->fnhe_rth_output;
 		else {
 			if (unlikely(fl4->flowi4_flags &
 				     FLOWI_FLAG_KNOWN_NH &&