Message ID | 20110505.163614.212671515.davem@davemloft.net |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
Hello, On Thu, 5 May 2011, David Miller wrote: > Ok, here is the fun patch showing the scheme I'm working on. Two > things going on here. > > First, we store pre-constructed rtable entries, on demand, inside of > the routing table objects themselves. > > Second, we get rid of RT_TABLE_LOCAL and load all routes equally > into RT_TABLE_MAIN. > > Signed-off-by: David S. Miller <davem@davemloft.net> > > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h > index 10422ef..f3c9598 100644 > --- a/include/net/ip_fib.h > +++ b/include/net/ip_fib.h > @@ -44,6 +44,7 @@ struct fib_config { > }; > > struct fib_info; > +struct rtable; > > struct fib_nh { > struct net_device *nh_dev; > @@ -62,6 +63,7 @@ struct fib_nh { > __be32 nh_gw; > __be32 nh_saddr; > int nh_saddr_genid; > + struct rtable *nh_rtable; Caching results of __mkroute_output in NH does not work well for RTN_MULTICAST because ip_check_mc_rcu wants to further restrict local delivery depending on the source address and protocol. Even the routing cache does not cache the protocol as key. Maybe a received IGMP report can create an input cache entry with the RTCF_LOCAL flag and later the UDP stack can see unwanted incoming traffic that should be dropped by MCAST_INCLUDE/MCAST_EXCLUDE settings for the same group. I.e. the routing code calls ip_check_mc_rcu for IGMP but the cache prevents the next calls for UDP to drop these sources. Before now ip_rt_multicast_event was used to notify about changes in subscriptions for groups and the routing cache can update its information (RTCF_LOCAL) per indev+saddr+daddr (but no protocol). Without the routing cache we cannot solve the ip_check_mc_rcu problem with nh_mc_genid fields because NH can be used for many different saddr addresses. The same problem is in ip_route_input_common, we have to call ip_check_mc_rcu for every packet and this can be a problem with long lists. But I'm not sure if the stack can see many filters. 
If yes, maybe only using a hash table for in_dev->mc_list and its "sources" can help here because we have to call ip_check_mc_rcu for every input and output packet if dev+saddr+daddr+proto results are not cached for mcast. Regards -- Julian Anastasov <ja@ssi.bg> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Julian Anastasov <ja@ssi.bg> Date: Fri, 6 May 2011 12:12:26 +0300 (EEST) > Caching results of __mkroute_output in NH does > not work well for RTN_MULTICAST because ip_check_mc_rcu > wants to further restrict local delivery depending on > the source address and protocol. I understand that multicast needs special handling. I'm concentrating on unicast/broadcast at the moment because there is a predominantly clear path for making that work. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 10422ef..f3c9598 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -44,6 +44,7 @@ struct fib_config { }; struct fib_info; +struct rtable; struct fib_nh { struct net_device *nh_dev; @@ -62,6 +63,7 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; + struct rtable *nh_rtable; }; /* @@ -200,10 +202,6 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, { struct fib_table *table; - table = fib_get_table(net, RT_TABLE_LOCAL); - if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF)) - return 0; - table = fib_get_table(net, RT_TABLE_MAIN); if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF)) return 0; diff --git a/include/net/route.h b/include/net/route.h index 70155fb..04e7197 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -109,6 +109,7 @@ extern int ip_rt_init(void); extern void ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw, __be32 src, struct net_device *dev); extern void rt_cache_flush(struct net *net, int how); +extern struct rtable *ip_route_output_new(struct net *, struct flowi4 *flp); extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 33bbbda..24e67d8 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -155,7 +155,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net, res.r = NULL; #endif - local_table = fib_get_table(net, RT_TABLE_LOCAL); + local_table = fib_get_table(net, RT_TABLE_MAIN); if (local_table) { ret = RTN_UNICAST; rcu_read_lock(); @@ -662,11 +662,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad }, }; - if (type == RTN_UNICAST) - tb = fib_new_table(net, RT_TABLE_MAIN); - else - tb = fib_new_table(net, RT_TABLE_LOCAL); - + tb = fib_new_table(net, 
RT_TABLE_MAIN); if (tb == NULL) return; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 641a5a2..c37ebd3 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -148,6 +148,10 @@ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + change_nexthops(fi) { + ip_rt_put(nexthop_nh->nh_rtable); + nexthop_nh->nh_rtable = NULL; + } endfor_nexthops(fi); if (fi->fib_metrics != (u32 *) dst_default_metrics) kfree(fi->fib_metrics); kfree(fi); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1e67624..2f77d28 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1861,6 +1861,68 @@ out: } EXPORT_SYMBOL_GPL(__ip_route_output_key); +struct rtable *ip_route_output_new(struct net *net, struct flowi4 *fl4) +{ + struct net_device *dev_out = NULL; + u32 tos = RT_FL_TOS(fl4); + unsigned int flags = 0; + struct fib_result res; + struct rtable *rth; + int orig_oif; + + res.fi = NULL; +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + orig_oif = fl4->flowi4_oif; + + fl4->flowi4_iif = net->loopback_dev->ifindex; + fl4->flowi4_tos = tos & IPTOS_RT_MASK; + fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 
+ RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); + + rcu_read_lock(); + if (fib_lookup(net, fl4, &res)) { + rth = ERR_PTR(-ENETUNREACH); + goto out; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) + fib_select_multipath(&res); + else +#endif + if (!res.prefixlen && res.table->tb_num_default > 1 && + res.type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(&res); + + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, res); + + dev_out = FIB_RES_DEV(res); + fl4->flowi4_oif = dev_out->ifindex; + + rth = FIB_RES_NH(res).nh_rtable; + if (!rth) { + if (res.type == RTN_LOCAL) + flags |= RTCF_LOCAL; + rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); + if (!IS_ERR(rth)) + rth = rt_finalize(rth, NULL); + if (!IS_ERR(rth)) + FIB_RES_NH(res).nh_rtable = rth; + } + + if (!IS_ERR(rth)) + atomic_inc(&rth->dst.__refcnt); + +out: + rcu_read_unlock(); + return rth; +} +EXPORT_SYMBOL_GPL(ip_route_output_new); + static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) { return NULL; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 544f435..9bb827e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -929,7 +929,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, faddr, saddr, dport, inet->inet_sport); security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(net, &fl4, sk); + rt = ip_route_output_new(net, &fl4); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL;
Ok, here is the fun patch showing the scheme I'm working on. Two things going on here. First, we store pre-constructed rtable entries, on demand, inside of the routing table objects themselves. Second, we get rid of RT_TABLE_LOCAL and load all routes equally into RT_TABLE_MAIN. Signed-off-by: David S. Miller <davem@davemloft.net> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html