diff mbox

[v6,BONUS,4/3] ipv4: Store rtable entries directly in FIB

Message ID 20110505.163614.212671515.davem@davemloft.net
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

David Miller May 5, 2011, 11:36 p.m. UTC
Ok, here is the fun patch showing the scheme I'm working on.  Two
things going on here.

First, we store pre-constructed rtable entries, on demand, inside of
the routing table objects themselves.

Second, we get rid of RT_TABLE_LOCAL and load all routes equally
into RT_TABLE_MAIN.

Signed-off-by: David S. Miller <davem@davemloft.net>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Julian Anastasov May 6, 2011, 9:12 a.m. UTC | #1
Hello,

On Thu, 5 May 2011, David Miller wrote:

> Ok, here is the fun patch showing the scheme I'm working on.  Two
> things going on here.
> 
> First, we store pre-constructed rtable entries, on demand, inside of
> the routing table objects themselves.
> 
> Second, we get rid of RT_TABLE_LOCAL and load all routes equally
> into RT_TABLE_MAIN.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 10422ef..f3c9598 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -44,6 +44,7 @@ struct fib_config {
>   };
>  
>  struct fib_info;
> +struct rtable;
>  
>  struct fib_nh {
>  	struct net_device	*nh_dev;
> @@ -62,6 +63,7 @@ struct fib_nh {
>  	__be32			nh_gw;
>  	__be32			nh_saddr;
>  	int			nh_saddr_genid;
> +	struct rtable		*nh_rtable;

	Caching results of __mkroute_output in NH does
not work well for RTN_MULTICAST because ip_check_mc_rcu
wants to further restrict local delivery depending on
the source address and protocol. Even the routing cache
does not cache the protocol as a key. Maybe a received IGMP report
can create an input cache entry with the RTCF_LOCAL flag, and later the
UDP stack can see unwanted incoming traffic that should be
dropped by MCAST_INCLUDE/MCAST_EXCLUDE settings for the same
group. I.e. the routing code calls ip_check_mc_rcu for IGMP
but the cache prevents the next calls for UDP to drop these
sources.

	Until now, ip_rt_multicast_event was used to
notify about changes in group subscriptions so that
the routing cache could update its information (RTCF_LOCAL)
per indev+saddr+daddr (but not protocol). Without the routing cache
we cannot solve the ip_check_mc_rcu problem with
nh_mc_genid fields because an NH can be used for many
different saddr values.

	The same problem exists in ip_route_input_common:
we have to call ip_check_mc_rcu for every packet, and
this can be a problem with long lists. But I'm not
sure whether the stack can see many filters. If so, maybe
only using a hash table for in_dev->mc_list and its
"sources" can help here, because we have to call
ip_check_mc_rcu for every input and output packet
if dev+saddr+daddr+proto results are not cached for mcast.

Regards

--
Julian Anastasov <ja@ssi.bg>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller May 6, 2011, 5:57 p.m. UTC | #2
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 6 May 2011 12:12:26 +0300 (EEST)

> 	Caching results of __mkroute_output in NH does
> not work well for RTN_MULTICAST because ip_check_mc_rcu
> wants to further restrict local delivery depending on
> the source address and protocol.

I understand that multicast needs special handling.

I'm concentrating on unicast/broadcast at the moment because
there is a predominantly clear path for making that work.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 10422ef..f3c9598 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -44,6 +44,7 @@  struct fib_config {
  };
 
 struct fib_info;
+struct rtable;
 
 struct fib_nh {
 	struct net_device	*nh_dev;
@@ -62,6 +63,7 @@  struct fib_nh {
 	__be32			nh_gw;
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
+	struct rtable		*nh_rtable;
 };
 
 /*
@@ -200,10 +202,6 @@  static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
 {
 	struct fib_table *table;
 
-	table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
-		return 0;
-
 	table = fib_get_table(net, RT_TABLE_MAIN);
 	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
 		return 0;
diff --git a/include/net/route.h b/include/net/route.h
index 70155fb..04e7197 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -109,6 +109,7 @@  extern int		ip_rt_init(void);
 extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
 				       __be32 src, struct net_device *dev);
 extern void		rt_cache_flush(struct net *net, int how);
+extern struct rtable *ip_route_output_new(struct net *, struct flowi4 *flp);
 extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
 extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
 					   struct sock *sk);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 33bbbda..24e67d8 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -155,7 +155,7 @@  static inline unsigned __inet_dev_addr_type(struct net *net,
 	res.r = NULL;
 #endif
 
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
+	local_table = fib_get_table(net, RT_TABLE_MAIN);
 	if (local_table) {
 		ret = RTN_UNICAST;
 		rcu_read_lock();
@@ -662,11 +662,7 @@  static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
 		},
 	};
 
-	if (type == RTN_UNICAST)
-		tb = fib_new_table(net, RT_TABLE_MAIN);
-	else
-		tb = fib_new_table(net, RT_TABLE_LOCAL);
-
+	tb = fib_new_table(net, RT_TABLE_MAIN);
 	if (tb == NULL)
 		return;
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 641a5a2..c37ebd3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -148,6 +148,10 @@  static void free_fib_info_rcu(struct rcu_head *head)
 {
 	struct fib_info *fi = container_of(head, struct fib_info, rcu);
 
+	change_nexthops(fi) {
+		ip_rt_put(nexthop_nh->nh_rtable);
+		nexthop_nh->nh_rtable = NULL;
+	} endfor_nexthops(fi);
 	if (fi->fib_metrics != (u32 *) dst_default_metrics)
 		kfree(fi->fib_metrics);
 	kfree(fi);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1e67624..2f77d28 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1861,6 +1861,68 @@  out:
 }
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
+struct rtable *ip_route_output_new(struct net *net, struct flowi4 *fl4)
+{
+	struct net_device *dev_out = NULL;
+	u32 tos	= RT_FL_TOS(fl4);
+	unsigned int flags = 0;
+	struct fib_result res;
+	struct rtable *rth;
+	int orig_oif;
+
+	res.fi = NULL;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r = NULL;
+#endif
+
+	orig_oif = fl4->flowi4_oif;
+
+	fl4->flowi4_iif = net->loopback_dev->ifindex;
+	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
+	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
+			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
+
+	rcu_read_lock();
+	if (fib_lookup(net, fl4, &res)) {
+		rth = ERR_PTR(-ENETUNREACH);
+		goto out;
+	}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
+		fib_select_multipath(&res);
+	else
+#endif
+	if (!res.prefixlen && res.table->tb_num_default > 1 &&
+	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
+		fib_select_default(&res);
+
+	if (!fl4->saddr)
+		fl4->saddr = FIB_RES_PREFSRC(net, res);
+
+	dev_out = FIB_RES_DEV(res);
+	fl4->flowi4_oif = dev_out->ifindex;
+
+	rth = FIB_RES_NH(res).nh_rtable;
+	if (!rth) {
+		if (res.type == RTN_LOCAL)
+			flags |= RTCF_LOCAL;
+		rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
+		if (!IS_ERR(rth))
+			rth = rt_finalize(rth, NULL);
+		if (!IS_ERR(rth))
+			FIB_RES_NH(res).nh_rtable = rth;
+	}
+
+	if (!IS_ERR(rth))
+		atomic_inc(&rth->dst.__refcnt);
+
+out:
+	rcu_read_unlock();
+	return rth;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_new);
+
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
 {
 	return NULL;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 544f435..9bb827e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -929,7 +929,7 @@  int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 				   faddr, saddr, dport, inet->inet_sport);
 
 		security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
-		rt = ip_route_output_flow(net, &fl4, sk);
+		rt = ip_route_output_new(net, &fl4);
 		if (IS_ERR(rt)) {
 			err = PTR_ERR(rt);
 			rt = NULL;