Patchwork net: implement emergency route cache rebulds when gc_elasticity is exceeded

login
register
mail settings
Submitter Neil Horman
Date Oct. 16, 2008, 11:35 p.m.
Message ID <20081016233517.GA21243@localhost.localdomain>
Download mbox | patch
Permalink /patch/4784/
State Changes Requested
Delegated to: David Miller
Headers show

Comments

Neil Horman - Oct. 16, 2008, 11:35 p.m.
On Thu, Oct 16, 2008 at 12:36:44PM -0400, Neil Horman wrote:
> On Thu, Oct 16, 2008 at 02:25:47PM +0200, Eric Dumazet wrote:
> > Neil Horman a écrit :
> Yeah, that was quite stupid of me.  I rescind this, and I'll post a patch with the 
> missing chunk later tonight after I spin/test it.
> 


Ok, heres a new patch, same as before, but added in proper initalization for
stack variables, as well as the missing chunk that actually computed the
standard deviation and maximum chain length.  Built/tested by me successfully.
Sorry for the prior noise.  Please review/ack.

Regards
Neil

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>


 include/linux/sysctl.h     |    1 
 include/net/netns/ipv4.h   |    2 
 kernel/sysctl_check.c      |    1 
 net/ipv4/route.c           |  130 ++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv4/sysctl_net_ipv4.c |   12 ++++
 5 files changed, 144 insertions(+), 2 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet - Oct. 17, 2008, 4:53 a.m.
Neil Horman a écrit :
> On Thu, Oct 16, 2008 at 12:36:44PM -0400, Neil Horman wrote:
>> On Thu, Oct 16, 2008 at 02:25:47PM +0200, Eric Dumazet wrote:
>>> Neil Horman a écrit :
>> Yeah, that was quite stupid of me.  I rescind this, and I'll post a patch with the 
>> missing chunk later tonight after I spin/test it.
>>
> 
> 
> Ok, heres a new patch, same as before, but added in proper initalization for
> stack variables, as well as the missing chunk that actually computed the
> standard deviation and maximum chain length.  Built/tested by me successfully.
> Sorry for the prior noise.  Please review/ack.
> 
> Regards
> Neil

OK

First, please include the description, everytime you submit a patch.
An extensive description is probably the most important part of a patch.
Many people will only read description.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> 
> 
>  include/linux/sysctl.h     |    1 
>  include/net/netns/ipv4.h   |    2 
>  kernel/sysctl_check.c      |    1 
>  net/ipv4/route.c           |  130 ++++++++++++++++++++++++++++++++++++++++++++-
>  net/ipv4/sysctl_net_ipv4.c |   12 ++++
>  5 files changed, 144 insertions(+), 2 deletions(-)
> 
> 
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index d0437f3..481aa44 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -435,6 +435,7 @@ enum
>  	NET_TCP_ALLOWED_CONG_CONTROL=123,
>  	NET_TCP_MAX_SSTHRESH=124,
>  	NET_TCP_FRTO_RESPONSE=125,
> +	NET_IPV4_RT_CACHE_REBUILD_COUNT=126,
>  };
>  
>  enum {
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index a6ed838..4fef762 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -46,6 +46,8 @@ struct netns_ipv4 {
>  	int sysctl_icmp_ratelimit;
>  	int sysctl_icmp_ratemask;
>  	int sysctl_icmp_errors_use_inbound_ifaddr;
> +	int sysctl_rt_cache_rebuild_count;
> +	int current_rt_cache_rebuild_count;
>  
>  	struct timer_list rt_secret_timer;
>  	atomic_t rt_genid;
> diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
> index c35da23..eb9fb57 100644
> --- a/kernel/sysctl_check.c
> +++ b/kernel/sysctl_check.c
> @@ -389,6 +389,7 @@ static const struct trans_ctl_table trans_net_ipv4_table[] = {
>  	{ NET_TCP_ALLOWED_CONG_CONTROL,		"tcp_allowed_congestion_control" },
>  	{ NET_TCP_MAX_SSTHRESH,			"tcp_max_ssthresh" },
>  	{ NET_TCP_FRTO_RESPONSE,		"tcp_frto_response" },
> +	{ NET_IPV4_RT_CACHE_REBUILD_COUNT,	"rt_cache_rebuild_count" },
>  	{ 2088 /* NET_IPQ_QMAX */,		"ip_queue_maxlen" },
>  	{}
>  };
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 6ee5354..0ff28c4 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
>  static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
>  static int ip_rt_min_advmss __read_mostly	= 256;
>  static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
> +static int rt_chain_length_max __read_mostly	= 8;

Some machine come to life on hostile environments and receive lots of UDP messages
from many hosts and may hit this limit before rt_check_expire() has started its first
run. Please consider default route cache settings, that allow 16 elements per chain
in average. And because of the formula avg+4*sd, a default of 20 would be better.

>  
>  static void rt_worker_func(struct work_struct *work);
>  static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
> @@ -145,6 +146,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
>  static void		 ipv4_link_failure(struct sk_buff *skb);
>  static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
>  static int rt_garbage_collect(struct dst_ops *ops);
> +static void rt_emergency_hash_rebuild(struct net *net);
>  
>  
>  static struct dst_ops ipv4_dst_ops = {
> @@ -201,6 +203,7 @@ const __u8 ip_tos2prio[16] = {
>  struct rt_hash_bucket {
>  	struct rtable	*chain;
>  };
> +
>  #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
>  	defined(CONFIG_PROVE_LOCKING)
>  /*
> @@ -669,6 +672,19 @@ static inline u32 rt_score(struct rtable *rt)
>  	return score;
>  }
>  
> +static inline int rt_caching(struct net *net)
> +{
> +	return net->ipv4.current_rt_cache_rebuild_count <=
> +		net->ipv4.sysctl_rt_cache_rebuild_count;
> +}
> +
> +static inline int compare_hash_inputs(struct flowi *fl1, struct flowi *fl2)
> +{
> +	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
> +		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
> +		(fl1->iif ^ fl2->iif)) == 0);
> +}
> +
>  static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
>  {
>  	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
> @@ -748,11 +764,24 @@ static void rt_do_flush(int process_context)
>  	}
>  }
>  
> +/*
> + * While freeing expired entries, we compute average chain length
> + * and standard deviation, using fixed-point arithmetic.
> + * This to have an estimation of rt_chain_length_max
> + *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
> + * We use 3 bits for frational part, and 29 (or 61) for magnitude.
> + */
> +
> +#define FRACT_BITS 3
> +#define ONE (1UL << FRACT_BITS)
> +
>  static void rt_check_expire(void)
>  {
>  	static unsigned int rover;
>  	unsigned int i = rover, goal;
>  	struct rtable *rth, **rthp;
> +	unsigned long length = 0, samples = 0;
> +	unsigned long sum = 0, sum2 = 0;
>  	u64 mult;
>  
>  	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
> @@ -761,6 +790,7 @@ static void rt_check_expire(void)
>  	goal = (unsigned int)mult;
>  	if (goal > rt_hash_mask)
>  		goal = rt_hash_mask + 1;
> +	length = 0;
>  	for (; goal > 0; goal--) {
>  		unsigned long tmo = ip_rt_gc_timeout;
>  
> @@ -770,6 +800,8 @@ static void rt_check_expire(void)
>  		if (need_resched())
>  			cond_resched();
>  
> +		samples++;
> +
>  		if (*rthp == NULL)
>  			continue;
>  		spin_lock_bh(rt_hash_lock_addr(i));
> @@ -784,11 +816,29 @@ static void rt_check_expire(void)
>  				if (time_before_eq(jiffies, rth->u.dst.expires)) {
>  					tmo >>= 1;
>  					rthp = &rth->u.dst.rt_next;
> +					/*
> +					 * Only bump our length if the hash
> +					 * inputs on entries n and n+1 are not
> +					 * the same, we only count entries on
> +					 * a chain with equal hash inputs once
> +					 * so that entries for different QOS
> +					 * levels, and other non-hash input
> +					 * attributes don't unfairly skew
> +					 * the length computation
> +					 */
> +					if (*rthp &&
> +					    !compare_hash_inputs(&(*rthp)->fl,
> +								 &rth->fl))
> +						length += ONE;

Here you ignore chains with one elements. This introduce a bias in the average and sd computation.
A correct test would be :
if ((*rthp == NULL) ||
	!compare_hash_inputs(&(*rthp)->fl,&rth->fl))
	length += ONE;

>  					continue;
>  				}
>  			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
>  				tmo >>= 1;
>  				rthp = &rth->u.dst.rt_next;
> +				if (*rthp &&
> +				    !compare_hash_inputs(&(*rthp)->fl,
> +							 &rth->fl))
> +					length += ONE;

same remark here

>  				continue;
>  			}
>  
> @@ -797,6 +847,15 @@ static void rt_check_expire(void)
>  			rt_free(rth);
>  		}
>  		spin_unlock_bh(rt_hash_lock_addr(i));
> +		sum += length;
> +		sum2 += length*length;
> +	}
> +	if (samples) {
> +		unsigned long avg = sum / samples;
> +		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
> +		rt_chain_length_max = max_t(unsigned long,
> +					ip_rt_gc_elasticity,
> +					(avg + 4*sd) >> FRACT_BITS);
>  	}
>  	rover = i;
>  }
> @@ -846,6 +905,26 @@ static void rt_secret_rebuild(unsigned long __net)
>  	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
>  }
>  
> +static void rt_secret_rebuild_oneshot(struct net *net)
> +{
> +	del_timer_sync(&net->ipv4.rt_secret_timer);
> +	rt_cache_invalidate(net);
> +	if (ip_rt_secret_interval) {
> +		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
> +		add_timer(&net->ipv4.rt_secret_timer);
> +	}
> +}
> +
> +static void rt_emergency_hash_rebuild(struct net *net)
> +{
> +	if (net_ratelimit()) {
> +		printk(KERN_WARNING "Route hash chain too long!\n");
> +		printk(KERN_WARNING "Adjust your secret_interval!\n");
> +	}
> +
> +	rt_secret_rebuild_oneshot(net);
> +}
> +
>  /*
>     Short description of GC goals.
>  
> @@ -984,6 +1063,7 @@ out:	return 0;
>  static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
>  {
>  	struct rtable	*rth, **rthp;
> +	struct rtable	*rthi;
>  	unsigned long	now;
>  	struct rtable *cand, **candp;
>  	u32 		min_score;
> @@ -997,7 +1077,13 @@ restart:
>  	candp = NULL;
>  	now = jiffies;
>  
> +	if (!rt_caching(dev_net(rt->u.dst.dev))) {
> +		rt_drop(rt);

Would be nice to add a new counter that one can check in /proc/net/stat/rt_cache
RT_CACHE_STAT_INC(notcached)
> +		return 0;
> +	}
> +
>  	rthp = &rt_hash_table[hash].chain;
> +	rthi = NULL;
>  
>  	spin_lock_bh(rt_hash_lock_addr(hash));
>  	while ((rth = *rthp) != NULL) {
> @@ -1043,6 +1129,17 @@ restart:
>  		chain_length++;
>  
>  		rthp = &rth->u.dst.rt_next;
> +
> +		/*
> +		 * check to see if the next entry in the chain
> +		 * contains the same hash input values as rt.  If it does
> +		 * This is where we will insert into the list, instead of
> +		 * at the head.  This groups entries that differ by aspects not
> +		 * relvant to the hash function together, which we use to adjust
> +		 * our chain length
> +		 */
> +		if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
> +			rthi = rth;
>  	}
>  
>  	if (cand) {
> @@ -1056,6 +1153,16 @@ restart:
>  			*candp = cand->u.dst.rt_next;
>  			rt_free(cand);
>  		}
> +	} else {
> +		if (chain_length > rt_chain_length_max) {
> +			struct net *net = dev_net(rt->u.dst.dev);
> +			int num = ++net->ipv4.current_rt_cache_rebuild_count;
> +			if (!rt_caching(dev_net(rt->u.dst.dev))) {
> +				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
> +					rt->u.dst.dev->name, num);
> +			}
> +			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
> +		}
>  	}
>  
>  	/* Try to bind route to arp only if it is output
> @@ -1093,7 +1200,11 @@ restart:
>  		}
>  	}
>  
> -	rt->u.dst.rt_next = rt_hash_table[hash].chain;
> +	if (rthi)
> +		rt->u.dst.rt_next = rthi->u.dst.rt_next;
> +	else
> +		rt->u.dst.rt_next = rt_hash_table[hash].chain;
> +
>  #if RT_CACHE_DEBUG >= 2
>  	if (rt->u.dst.rt_next) {
>  		struct rtable *trt;
> @@ -1104,7 +1215,10 @@ restart:
>  		printk("\n");
>  	}
>  #endif
> -	rt_hash_table[hash].chain = rt;

Please respin your patch to last tree, this misses my last patch about
memory barrier...

Ah, maybe David did not committed it yet...


> +	if (rthi)
> +		rthi->u.dst.rt_next = rt;
> +	else
> +		rt_hash_table[hash].chain = rt;
>  	spin_unlock_bh(rt_hash_lock_addr(hash));
>  	*rp = rt;
>  	return 0;
> @@ -1207,6 +1321,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
>  	    || ipv4_is_zeronet(new_gw))
>  		goto reject_redirect;
>  
> +	if (!rt_caching(net))
> +		goto reject_redirect;
> +
>  	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
>  		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
>  			goto reject_redirect;
> @@ -2120,6 +2237,10 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
>  	struct net *net;
>  
>  	net = dev_net(dev);
> +
> +	if (!rt_caching(net))
> +		goto skip_cache;
> +
>  	tos &= IPTOS_RT_MASK;
>  	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
>  
> @@ -2144,6 +2265,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
>  	}
>  	rcu_read_unlock();
>  
> +skip_cache:
>  	/* Multicast recognition logic is moved from route cache to here.
>  	   The problem was that too many Ethernet cards have broken/missing
>  	   hardware multicast filters :-( As result the host on multicasting
> @@ -2523,6 +2645,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
>  	unsigned hash;
>  	struct rtable *rth;
>  
> +	if (!rt_caching(net))
> +		goto slow_output;
> +
>  	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
>  
>  	rcu_read_lock_bh();
> @@ -2547,6 +2672,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
>  	}
>  	rcu_read_unlock_bh();
>  
> +slow_output:
>  	return ip_route_output_slow(net, rp, flp);
>  }
>  
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index e0689fd..6d9ab73 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -798,6 +798,14 @@ static struct ctl_table ipv4_net_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= &proc_dointvec
>  	},
> +	{
> +		.ctl_name	= NET_IPV4_RT_CACHE_REBUILD_COUNT,
> +		.procname	= "rt_cache_rebuild_count",
> +		.data		= &init_net.ipv4.sysctl_rt_cache_rebuild_count,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= &proc_dointvec
> +	},

You should describe this in Documentation/networking/ip-sysctl.txt

>  	{ }
>  };
>  
> @@ -830,8 +838,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>  			&net->ipv4.sysctl_icmp_ratelimit;
>  		table[5].data =
>  			&net->ipv4.sysctl_icmp_ratemask;
> +		table[6].data =
> +			&net->ipv4.sysctl_rt_cache_rebuild_count;
>  	}
>  
> +	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
> +
>  	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>  			net_ipv4_ctl_path, table);
>  	if (net->ipv4.ipv4_hdr == NULL)
> 
> 

Thank you



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger - Oct. 17, 2008, 5:03 a.m.
On Thu, 16 Oct 2008 19:35:17 -0400
Neil Horman <nhorman@tuxdriver.com> wrote:

> On Thu, Oct 16, 2008 at 12:36:44PM -0400, Neil Horman wrote:
> > On Thu, Oct 16, 2008 at 02:25:47PM +0200, Eric Dumazet wrote:
> > > Neil Horman a écrit :
> > Yeah, that was quite stupid of me.  I rescind this, and I'll post a patch with the 
> > missing chunk later tonight after I spin/test it.
> > 
> 
> 
> Ok, heres a new patch, same as before, but added in proper initalization for
> stack variables, as well as the missing chunk that actually computed the
> standard deviation and maximum chain length.  Built/tested by me successfully.
> Sorry for the prior noise.  Please review/ack.
> 
> Regards
> Neil
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> 
> 
>  include/linux/sysctl.h     |    1 
>  include/net/netns/ipv4.h   |    2 
>  kernel/sysctl_check.c      |    1 
>  net/ipv4/route.c           |  130 ++++++++++++++++++++++++++++++++++++++++++++-
>  net/ipv4/sysctl_net_ipv4.c |   12 ++++
>  5 files changed, 144 insertions(+), 2 deletions(-)
> 
> 
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index d0437f3..481aa44 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -435,6 +435,7 @@ enum
>  	NET_TCP_ALLOWED_CONG_CONTROL=123,
>  	NET_TCP_MAX_SSTHRESH=124,
>  	NET_TCP_FRTO_RESPONSE=125,
> +	NET_IPV4_RT_CACHE_REBUILD_COUNT=126,
>  };
>  

Don't add new sysctl numbers use CTL_UNNUMBERED
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger - Oct. 17, 2008, 5:06 a.m.
O
> +static inline int rt_caching(struct net *net)
> +{
> +	return net->ipv4.current_rt_cache_rebuild_count <=
> +		net->ipv4.sysctl_rt_cache_rebuild_count;
> +}

Why not?

static inline  bool rt_caching(const struct net *)

 

> +static inline int compare_hash_inputs(struct flowi *fl1, struct flowi *fl2)
> +{
> +	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
> +		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
> +		(fl1->iif ^ fl2->iif)) == 0);
> +}

static inline bool compare_hash_inputs(const struct flowi *fl1, struct flowi *fl2)

...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller - Oct. 17, 2008, 5:23 a.m.
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 17 Oct 2008 06:53:54 +0200

> Please respin your patch to last tree, this misses my last patch about
> memory barrier...
> 
> Ah, maybe David did not committed it yet...

It's there in net-2.6
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman - Oct. 17, 2008, 10:39 a.m.
On Thu, Oct 16, 2008 at 10:06:24PM -0700, Stephen Hemminger wrote:
> O
> > +static inline int rt_caching(struct net *net)
> > +{
> > +	return net->ipv4.current_rt_cache_rebuild_count <=
> > +		net->ipv4.sysctl_rt_cache_rebuild_count;
> > +}
> 
> Why not?
> 
I assume that you're asking why we use these values to determine if this
namespace should use the route cache?  The answer is this thread.  Because
ideally we should never rebuild the route cache unless something is giving the
cache an uneven distribution.  The consensus is that if we rebuild too many
times, its an indicator that someone knows enough about our hashing algorithm to
maliginantly force the creation of routes in the cache on the same chain
regardless of how many times we rebuild it.  In that case the best thing to do
is skip the use of the cache entirely.

Neil

Patch

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index d0437f3..481aa44 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -435,6 +435,7 @@  enum
 	NET_TCP_ALLOWED_CONG_CONTROL=123,
 	NET_TCP_MAX_SSTHRESH=124,
 	NET_TCP_FRTO_RESPONSE=125,
+	NET_IPV4_RT_CACHE_REBUILD_COUNT=126,
 };
 
 enum {
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index a6ed838..4fef762 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -46,6 +46,8 @@  struct netns_ipv4 {
 	int sysctl_icmp_ratelimit;
 	int sysctl_icmp_ratemask;
 	int sysctl_icmp_errors_use_inbound_ifaddr;
+	int sysctl_rt_cache_rebuild_count;
+	int current_rt_cache_rebuild_count;
 
 	struct timer_list rt_secret_timer;
 	atomic_t rt_genid;
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c35da23..eb9fb57 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -389,6 +389,7 @@  static const struct trans_ctl_table trans_net_ipv4_table[] = {
 	{ NET_TCP_ALLOWED_CONG_CONTROL,		"tcp_allowed_congestion_control" },
 	{ NET_TCP_MAX_SSTHRESH,			"tcp_max_ssthresh" },
 	{ NET_TCP_FRTO_RESPONSE,		"tcp_frto_response" },
+	{ NET_IPV4_RT_CACHE_REBUILD_COUNT,	"rt_cache_rebuild_count" },
 	{ 2088 /* NET_IPQ_QMAX */,		"ip_queue_maxlen" },
 	{}
 };
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6ee5354..0ff28c4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -129,6 +129,7 @@  static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
+static int rt_chain_length_max __read_mostly	= 8;
 
 static void rt_worker_func(struct work_struct *work);
 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
@@ -145,6 +146,7 @@  static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void		 ipv4_link_failure(struct sk_buff *skb);
 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 static int rt_garbage_collect(struct dst_ops *ops);
+static void rt_emergency_hash_rebuild(struct net *net);
 
 
 static struct dst_ops ipv4_dst_ops = {
@@ -201,6 +203,7 @@  const __u8 ip_tos2prio[16] = {
 struct rt_hash_bucket {
 	struct rtable	*chain;
 };
+
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 	defined(CONFIG_PROVE_LOCKING)
 /*
@@ -669,6 +672,19 @@  static inline u32 rt_score(struct rtable *rt)
 	return score;
 }
 
+static inline int rt_caching(struct net *net)
+{
+	return net->ipv4.current_rt_cache_rebuild_count <=
+		net->ipv4.sysctl_rt_cache_rebuild_count;
+}
+
+static inline int compare_hash_inputs(struct flowi *fl1, struct flowi *fl2)
+{
+	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
+		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
+		(fl1->iif ^ fl2->iif)) == 0);
+}
+
 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 {
 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
@@ -748,11 +764,24 @@  static void rt_do_flush(int process_context)
 	}
 }
 
+/*
+ * While freeing expired entries, we compute average chain length
+ * and standard deviation, using fixed-point arithmetic.
+ * This to have an estimation of rt_chain_length_max
+ *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
+ * We use 3 bits for frational part, and 29 (or 61) for magnitude.
+ */
+
+#define FRACT_BITS 3
+#define ONE (1UL << FRACT_BITS)
+
 static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
 	struct rtable *rth, **rthp;
+	unsigned long length = 0, samples = 0;
+	unsigned long sum = 0, sum2 = 0;
 	u64 mult;
 
 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
@@ -761,6 +790,7 @@  static void rt_check_expire(void)
 	goal = (unsigned int)mult;
 	if (goal > rt_hash_mask)
 		goal = rt_hash_mask + 1;
+	length = 0;
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
 
@@ -770,6 +800,8 @@  static void rt_check_expire(void)
 		if (need_resched())
 			cond_resched();
 
+		samples++;
+
 		if (*rthp == NULL)
 			continue;
 		spin_lock_bh(rt_hash_lock_addr(i));
@@ -784,11 +816,29 @@  static void rt_check_expire(void)
 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
 					tmo >>= 1;
 					rthp = &rth->u.dst.rt_next;
+					/*
+					 * Only bump our length if the hash
+					 * inputs on entries n and n+1 are not
+					 * the same, we only count entries on
+					 * a chain with equal hash inputs once
+					 * so that entries for different QOS
+					 * levels, and other non-hash input
+					 * attributes don't unfairly skew
+					 * the length computation
+					 */
+					if (*rthp &&
+					    !compare_hash_inputs(&(*rthp)->fl,
+								 &rth->fl))
+						length += ONE;
 					continue;
 				}
 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 				tmo >>= 1;
 				rthp = &rth->u.dst.rt_next;
+				if (*rthp &&
+				    !compare_hash_inputs(&(*rthp)->fl,
+							 &rth->fl))
+					length += ONE;
 				continue;
 			}
 
@@ -797,6 +847,15 @@  static void rt_check_expire(void)
 			rt_free(rth);
 		}
 		spin_unlock_bh(rt_hash_lock_addr(i));
+		sum += length;
+		sum2 += length*length;
+	}
+	if (samples) {
+		unsigned long avg = sum / samples;
+		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+		rt_chain_length_max = max_t(unsigned long,
+					ip_rt_gc_elasticity,
+					(avg + 4*sd) >> FRACT_BITS);
 	}
 	rover = i;
 }
@@ -846,6 +905,26 @@  static void rt_secret_rebuild(unsigned long __net)
 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 }
 
+static void rt_secret_rebuild_oneshot(struct net *net)
+{
+	del_timer_sync(&net->ipv4.rt_secret_timer);
+	rt_cache_invalidate(net);
+	if (ip_rt_secret_interval) {
+		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
+		add_timer(&net->ipv4.rt_secret_timer);
+	}
+}
+
+static void rt_emergency_hash_rebuild(struct net *net)
+{
+	if (net_ratelimit()) {
+		printk(KERN_WARNING "Route hash chain too long!\n");
+		printk(KERN_WARNING "Adjust your secret_interval!\n");
+	}
+
+	rt_secret_rebuild_oneshot(net);
+}
+
 /*
    Short description of GC goals.
 
@@ -984,6 +1063,7 @@  out:	return 0;
 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 {
 	struct rtable	*rth, **rthp;
+	struct rtable	*rthi;
 	unsigned long	now;
 	struct rtable *cand, **candp;
 	u32 		min_score;
@@ -997,7 +1077,13 @@  restart:
 	candp = NULL;
 	now = jiffies;
 
+	if (!rt_caching(dev_net(rt->u.dst.dev))) {
+		rt_drop(rt);
+		return 0;
+	}
+
 	rthp = &rt_hash_table[hash].chain;
+	rthi = NULL;
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
@@ -1043,6 +1129,17 @@  restart:
 		chain_length++;
 
 		rthp = &rth->u.dst.rt_next;
+
+		/*
+		 * check to see if the next entry in the chain
+		 * contains the same hash input values as rt.  If it does
+		 * This is where we will insert into the list, instead of
+		 * at the head.  This groups entries that differ by aspects not
+		 * relvant to the hash function together, which we use to adjust
+		 * our chain length
+		 */
+		if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
+			rthi = rth;
 	}
 
 	if (cand) {
@@ -1056,6 +1153,16 @@  restart:
 			*candp = cand->u.dst.rt_next;
 			rt_free(cand);
 		}
+	} else {
+		if (chain_length > rt_chain_length_max) {
+			struct net *net = dev_net(rt->u.dst.dev);
+			int num = ++net->ipv4.current_rt_cache_rebuild_count;
+			if (!rt_caching(dev_net(rt->u.dst.dev))) {
+				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
+					rt->u.dst.dev->name, num);
+			}
+			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+		}
 	}
 
 	/* Try to bind route to arp only if it is output
@@ -1093,7 +1200,11 @@  restart:
 		}
 	}
 
-	rt->u.dst.rt_next = rt_hash_table[hash].chain;
+	if (rthi)
+		rt->u.dst.rt_next = rthi->u.dst.rt_next;
+	else
+		rt->u.dst.rt_next = rt_hash_table[hash].chain;
+
 #if RT_CACHE_DEBUG >= 2
 	if (rt->u.dst.rt_next) {
 		struct rtable *trt;
@@ -1104,7 +1215,10 @@  restart:
 		printk("\n");
 	}
 #endif
-	rt_hash_table[hash].chain = rt;
+	if (rthi)
+		rthi->u.dst.rt_next = rt;
+	else
+		rt_hash_table[hash].chain = rt;
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
@@ -1207,6 +1321,9 @@  void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 	    || ipv4_is_zeronet(new_gw))
 		goto reject_redirect;
 
+	if (!rt_caching(net))
+		goto reject_redirect;
+
 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 			goto reject_redirect;
@@ -2120,6 +2237,10 @@  int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	struct net *net;
 
 	net = dev_net(dev);
+
+	if (!rt_caching(net))
+		goto skip_cache;
+
 	tos &= IPTOS_RT_MASK;
 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
 
@@ -2144,6 +2265,7 @@  int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	}
 	rcu_read_unlock();
 
+skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
 	   hardware multicast filters :-( As result the host on multicasting
@@ -2523,6 +2645,9 @@  int __ip_route_output_key(struct net *net, struct rtable **rp,
 	unsigned hash;
 	struct rtable *rth;
 
+	if (!rt_caching(net))
+		goto slow_output;
+
 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
 
 	rcu_read_lock_bh();
@@ -2547,6 +2672,7 @@  int __ip_route_output_key(struct net *net, struct rtable **rp,
 	}
 	rcu_read_unlock_bh();
 
+slow_output:
 	return ip_route_output_slow(net, rp, flp);
 }
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e0689fd..6d9ab73 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -798,6 +798,14 @@  static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= NET_IPV4_RT_CACHE_REBUILD_COUNT,
+		.procname	= "rt_cache_rebuild_count",
+		.data		= &init_net.ipv4.sysctl_rt_cache_rebuild_count,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ }
 };
 
@@ -830,8 +838,12 @@  static __net_init int ipv4_sysctl_init_net(struct net *net)
 			&net->ipv4.sysctl_icmp_ratelimit;
 		table[5].data =
 			&net->ipv4.sysctl_icmp_ratemask;
+		table[6].data =
+			&net->ipv4.sysctl_rt_cache_rebuild_count;
 	}
 
+	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
+
 	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
 			net_ipv4_ctl_path, table);
 	if (net->ipv4.ipv4_hdr == NULL)