diff mbox

net: implement emergency route cache rebulds when gc_elasticity is exceeded

Message ID 20080929191254.GA20074@hmsreliant.think-freely.org
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Neil Horman Sept. 29, 2008, 7:12 p.m. UTC
Hey all-
	We currently have the ability to disable our route cache secret interval
rebuild timer (by setting it to zero), but if we do that its possible for an
attacker (if they guess our route cache hash secret, to fill our system with
routes that all hash to the same bucket, destroying our performance.  This patch
provides a backstop for that issues.  In the event that our rebuild interval is
disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
an emergency hash rebuild.  During the hash rebuild we:
1) warn the user of the emergency
2) disable the rebuild timer
3) invalidate the route caches
4) re-enable the rebuild timer with its old value

Regards
Neil

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>


 route.c |   36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

Comments

Eric Dumazet Sept. 29, 2008, 8:22 p.m. UTC | #1
Neil Horman a écrit :
> Hey all-
> 	We currently have the ability to disable our route cache secret interval
> rebuild timer (by setting it to zero), but if we do that its possible for an
> attacker (if they guess our route cache hash secret, to fill our system with
> routes that all hash to the same bucket, destroying our performance.  This patch
> provides a backstop for that issues.  In the event that our rebuild interval is
> disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
> an emergency hash rebuild.  During the hash rebuild we:
> 1) warn the user of the emergency
> 2) disable the rebuild timer
> 3) invalidate the route caches
> 4) re-enable the rebuild timer with its old value
> 
> Regards
> Neil

This sounds not good at all to me.

1) Dont set ip_rt_secret_interval to zero, this is plain silly, since
   you give attackers infinite time to break your machine.

To quote Herbert (who allowed to set this interval to 0)

    "Let me first state that disabling the route cache hash rebuild
     should not be done without extensive analysis on the risk profile
     and careful deliberation.

     However, there are times when this can be done safely or for
     testing.  For example, when you have mechanisms for ensuring
     that offending parties do not exist in your network."


2) Many machines have ip_rt_gc_elasticity set to 2,
   because they have a huge hash table, but low chain depths.


> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> 
> 
>  route.c |   36 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 35 insertions(+), 1 deletion(-)
> 
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 6ee5354..b95e02a 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -145,7 +145,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
>  static void		 ipv4_link_failure(struct sk_buff *skb);
>  static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
>  static int rt_garbage_collect(struct dst_ops *ops);
> -
> +static void rt_emergency_hash_rebuild(void);
>  
>  static struct dst_ops ipv4_dst_ops = {
>  	.family =		AF_INET,
> @@ -1056,6 +1056,15 @@ restart:
>  			*candp = cand->u.dst.rt_next;
>  			rt_free(cand);
>  		}
> +	} else if (chain_length > ip_rt_gc_elasticity) {
> +		/*
> +		 * We didn't find a route entry we could safely free,
> +		 * yet our chain length is over our elasticity value.
> +		 * Someone may have guessed our hash secret and is artifically
> +		 * filling up our route cache.  Lets do an emergency rebuild
> +		 * to be safe
> +		 */
> +		rt_emergency_hash_rebuild();
>  	}
>  
>  	/* Try to bind route to arp only if it is output
> @@ -2946,6 +2955,31 @@ static void rt_secret_reschedule(int old)
>  	rtnl_unlock();
>  }
>  
> +static void rt_secret_rebuild_oneshot(void) {
> +	struct net *net;
> +	int old_interval = ip_rt_secret_interval;
> +
> +	ip_rt_secret_interval = 0;
> +
> +	rt_secret_reschedule(old_interval);
> +
> +	for_each_net(net) 
> +		rt_cache_invalidate(net);
> +
> +	ip_rt_secret_interval = old_interval;
> +
> +	rt_secret_reschedule(0);
> +}
> +
> +static void rt_emergency_hash_rebuild(void) {
> +	if (net_ratelimit()) {
> +		printk(KERN_WARNING "Route hash chain too long!\n");
> +		printk(KERN_WARNING "Adjust your secret_interval!\n");
> +	}
> +
> +	rt_secret_rebuild_oneshot();
> +}
> +
>  static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
>  					  struct file *filp,
>  					  void __user *buffer, size_t *lenp,




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Sept. 29, 2008, 8:27 p.m. UTC | #2
On Mon, Sep 29, 2008 at 10:22:03PM +0200, Eric Dumazet wrote:
> Neil Horman a écrit :
>> Hey all-
>> 	We currently have the ability to disable our route cache secret interval
>> rebuild timer (by setting it to zero), but if we do that its possible for an
>> attacker (if they guess our route cache hash secret, to fill our system with
>> routes that all hash to the same bucket, destroying our performance.  This patch
>> provides a backstop for that issues.  In the event that our rebuild interval is
>> disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
>> an emergency hash rebuild.  During the hash rebuild we:
>> 1) warn the user of the emergency
>> 2) disable the rebuild timer
>> 3) invalidate the route caches
>> 4) re-enable the rebuild timer with its old value
>>
>> Regards
>> Neil
>
> This sounds not good at all to me.
>
> 1) Dont set ip_rt_secret_interval to zero, this is plain silly, since
>   you give attackers infinite time to break your machine.
>
> To quote Herbert (who allowed to set this interval to 0)
>
>    "Let me first state that disabling the route cache hash rebuild
>     should not be done without extensive analysis on the risk profile
>     and careful deliberation.
>
>     However, there are times when this can be done safely or for
>     testing.  For example, when you have mechanisms for ensuring
>     that offending parties do not exist in your network."
>
Thats really rather the motivation behind this.  The patch that Herbert
submitted with that commit explicitly lets one disable their rebuild timer.  I
agree its stupid to do that, but we added code to allow it.  This provides a
patch to help people who are victimized because they've done exactly this
(additionaly providing them a warning to stop doing it).


>
> 2) Many machines have ip_rt_gc_elasticity set to 2,
>   because they have a huge hash table, but low chain depths.
Ok, that seem reasonable, and this isn't going to disallow that.  By the same
resoning, people who have huge hash tables, and low chain depths won't
want their low chain length being violated, would they?  This patch will warn
them if their assumptions are being violated.

Neil
Eric Dumazet Sept. 29, 2008, 9 p.m. UTC | #3
Neil Horman a écrit :
> On Mon, Sep 29, 2008 at 10:22:03PM +0200, Eric Dumazet wrote:
>> Neil Horman a écrit :
>>> Hey all-
>>> 	We currently have the ability to disable our route cache secret interval
>>> rebuild timer (by setting it to zero), but if we do that its possible for an
>>> attacker (if they guess our route cache hash secret, to fill our system with
>>> routes that all hash to the same bucket, destroying our performance.  This patch
>>> provides a backstop for that issues.  In the event that our rebuild interval is
>>> disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
>>> an emergency hash rebuild.  During the hash rebuild we:
>>> 1) warn the user of the emergency
>>> 2) disable the rebuild timer
>>> 3) invalidate the route caches
>>> 4) re-enable the rebuild timer with its old value
>>>
>>> Regards
>>> Neil
>> This sounds not good at all to me.
>>
>> 1) Dont set ip_rt_secret_interval to zero, this is plain silly, since
>>   you give attackers infinite time to break your machine.
>>
>> To quote Herbert (who allowed to set this interval to 0)
>>
>>    "Let me first state that disabling the route cache hash rebuild
>>     should not be done without extensive analysis on the risk profile
>>     and careful deliberation.
>>
>>     However, there are times when this can be done safely or for
>>     testing.  For example, when you have mechanisms for ensuring
>>     that offending parties do not exist in your network."
>>
> Thats really rather the motivation behind this.  The patch that Herbert
> submitted with that commit explicitly lets one disable their rebuild timer.  I
> agree its stupid to do that, but we added code to allow it.  This provides a
> patch to help people who are victimized because they've done exactly this
> (additionaly providing them a warning to stop doing it).

Strange... many kernel parameters can be set to hazardous values that make machine unusable...

ip_rt_gc_interval can also be set to a very large value : No more route cache gc

> 
> 
>> 2) Many machines have ip_rt_gc_elasticity set to 2,
>>   because they have a huge hash table, but low chain depths.
> Ok, that seem reasonable, and this isn't going to disallow that.  By the same
> resoning, people who have huge hash tables, and low chain depths won't
> want their low chain length being violated, would they?  This patch will warn
> them if their assumptions are being violated.

Warn only ? If I read your patch, you not only warn in this case.
(you invalidate cache for each struct net, potentially wraping rt_genid)

When you have 2^20 slots in route cache hash table, you dont care if few slots have 3 or 4 elements.
And chance is very high that more than one slot has 3 or even 4 elements, no need for an attacker.

Now if you change your code to something like

 if (unlikely(chain_length > some_quite_big_number &&
     ip_rt_secret_interval == 0)) {
 	do_something();
 }

some_quite_big_number could be >= 30 or something...

then it might be OK (at least it wont break common setups)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Sept. 29, 2008, 10:38 p.m. UTC | #4
On Mon, Sep 29, 2008 at 11:00:35PM +0200, Eric Dumazet wrote:
> Neil Horman a écrit :
>> On Mon, Sep 29, 2008 at 10:22:03PM +0200, Eric Dumazet wrote:
>>> Neil Horman a écrit :
>>>> Hey all-
>>>> 	We currently have the ability to disable our route cache secret interval
>>>> rebuild timer (by setting it to zero), but if we do that its possible for an
>>>> attacker (if they guess our route cache hash secret, to fill our system with
>>>> routes that all hash to the same bucket, destroying our performance.  This patch
>>>> provides a backstop for that issues.  In the event that our rebuild interval is
>>>> disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
>>>> an emergency hash rebuild.  During the hash rebuild we:
>>>> 1) warn the user of the emergency
>>>> 2) disable the rebuild timer
>>>> 3) invalidate the route caches
>>>> 4) re-enable the rebuild timer with its old value
>>>>
>>>> Regards
>>>> Neil
>>> This sounds not good at all to me.
>>>
>>> 1) Dont set ip_rt_secret_interval to zero, this is plain silly, since
>>>   you give attackers infinite time to break your machine.
>>>
>>> To quote Herbert (who allowed to set this interval to 0)
>>>
>>>    "Let me first state that disabling the route cache hash rebuild
>>>     should not be done without extensive analysis on the risk profile
>>>     and careful deliberation.
>>>
>>>     However, there are times when this can be done safely or for
>>>     testing.  For example, when you have mechanisms for ensuring
>>>     that offending parties do not exist in your network."
>>>
>> Thats really rather the motivation behind this.  The patch that Herbert
>> submitted with that commit explicitly lets one disable their rebuild timer.  I
>> agree its stupid to do that, but we added code to allow it.  This provides a
>> patch to help people who are victimized because they've done exactly this
>> (additionaly providing them a warning to stop doing it).
>
> Strange... many kernel parameters can be set to hazardous values that make machine unusable...
>
> ip_rt_gc_interval can also be set to a very large value : No more route cache gc
>
If you want to talk philosophy, then I accept your premise: we can tune the
kernel in ways that make it unusable.  Does that mean we should avoid doing
things to prevent that and maintain usibility.  Ithink this patch accomplishes
that goal in its small way.
 
>>
>>
>>> 2) Many machines have ip_rt_gc_elasticity set to 2,
>>>   because they have a huge hash table, but low chain depths.
>> Ok, that seem reasonable, and this isn't going to disallow that.  By the same
>> resoning, people who have huge hash tables, and low chain depths won't
>> want their low chain length being violated, would they?  This patch will warn
>> them if their assumptions are being violated.
>
> Warn only ? If I read your patch, you not only warn in this case.
No, not warn only, Warn and correct is the clear intent here

> (you invalidate cache for each struct net, potentially wraping rt_genid)
>
If you overflow your elasticity 2^16 times, yes (I think rt_genid is a 16 bit
value, but I don't have the kernel in front of me).  I would hope that would be
enough warnings to make a sysadmin address the problem

> When you have 2^20 slots in route cache hash table, you dont care if few slots have 3 or 4 elements.
> And chance is very high that more than one slot has 3 or even 4 elements, no need for an attacker.
Ok, then I would assert if your ok with a few chains getting to be X entries in
length, then you should set your elasticity to be X.

>
> Now if you change your code to something like
>
> if (unlikely(chain_length > some_quite_big_number &&
>     ip_rt_secret_interval == 0)) {
> 	do_something();
> }
>
> some_quite_big_number could be >= 30 or something...
>
I'd be ok with that, if some others chimed in with consensus on the matter.  I
felt that we had a value definining what chain length should be
(ip_rt_gc_elasticity is already used in comparison on chain length in
rt_intern_hash, so I was keeping with that).  But if some others speak up and
support a new sysctl, I can get behind that

> then it might be OK (at least it wont break common setups)
>
I don't think it will break many setups, the default value is 8 for this sysctl.
If someone has set it lower, and sees a warning, they won't loose their system
altogether, they'll just see a momentary reduction in throughput, and a
warning to increase the value they have set.  Its going to far to say anything
will 'break'.

Thanks & Regards
Neil

>
>
>
Eric Dumazet Sept. 30, 2008, 6:02 a.m. UTC | #5
Neil Horman a écrit :
> On Mon, Sep 29, 2008 at 11:00:35PM +0200, Eric Dumazet wrote:
>> Neil Horman a écrit :
>>> On Mon, Sep 29, 2008 at 10:22:03PM +0200, Eric Dumazet wrote:
>>>> Neil Horman a écrit :
>>>>> Hey all-
>>>>> 	We currently have the ability to disable our route cache secret interval
>>>>> rebuild timer (by setting it to zero), but if we do that its possible for an
>>>>> attacker (if they guess our route cache hash secret, to fill our system with
>>>>> routes that all hash to the same bucket, destroying our performance.  This patch
>>>>> provides a backstop for that issues.  In the event that our rebuild interval is
>>>>> disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
>>>>> an emergency hash rebuild.  During the hash rebuild we:
>>>>> 1) warn the user of the emergency
>>>>> 2) disable the rebuild timer
>>>>> 3) invalidate the route caches
>>>>> 4) re-enable the rebuild timer with its old value
>>>>>
>>>>> Regards
>>>>> Neil
>>>> This sounds not good at all to me.
>>>>
>>>> 1) Dont set ip_rt_secret_interval to zero, this is plain silly, since
>>>>   you give attackers infinite time to break your machine.
>>>>
>>>> To quote Herbert (who allowed to set this interval to 0)
>>>>
>>>>    "Let me first state that disabling the route cache hash rebuild
>>>>     should not be done without extensive analysis on the risk profile
>>>>     and careful deliberation.
>>>>
>>>>     However, there are times when this can be done safely or for
>>>>     testing.  For example, when you have mechanisms for ensuring
>>>>     that offending parties do not exist in your network."
>>>>
>>> Thats really rather the motivation behind this.  The patch that Herbert
>>> submitted with that commit explicitly lets one disable their rebuild timer.  I
>>> agree its stupid to do that, but we added code to allow it.  This provides a
>>> patch to help people who are victimized because they've done exactly this
>>> (additionaly providing them a warning to stop doing it).
>> Strange... many kernel parameters can be set to hazardous values that make machine unusable...
>>
>> ip_rt_gc_interval can also be set to a very large value : No more route cache gc
>>
> If you want to talk philosophy, then I accept your premise: we can tune the
> kernel in ways that make it unusable.  Does that mean we should avoid doing
> things to prevent that and maintain usibility.  Ithink this patch accomplishes
> that goal in its small way.
>  
>>>
>>>> 2) Many machines have ip_rt_gc_elasticity set to 2,
>>>>   because they have a huge hash table, but low chain depths.
>>> Ok, that seem reasonable, and this isn't going to disallow that.  By the same
>>> resoning, people who have huge hash tables, and low chain depths won't
>>> want their low chain length being violated, would they?  This patch will warn
>>> them if their assumptions are being violated.
>> Warn only ? If I read your patch, you not only warn in this case.
> No, not warn only, Warn and correct is the clear intent here
> 
>> (you invalidate cache for each struct net, potentially wraping rt_genid)
>>
> If you overflow your elasticity 2^16 times, yes (I think rt_genid is a 16 bit
> value, but I don't have the kernel in front of me).  I would hope that would be
> enough warnings to make a sysadmin address the problem
> 
>> When you have 2^20 slots in route cache hash table, you dont care if few slots have 3 or 4 elements.
>> And chance is very high that more than one slot has 3 or even 4 elements, no need for an attacker.
> Ok, then I would assert if your ok with a few chains getting to be X entries in
> length, then you should set your elasticity to be X.

Nope. We set elasticity to 2 when we want to trigger code starting at line 1048

        if (cand) {
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.dst.rt_next;
                        rt_free(cand);
                }
        }

That is, freeing one entry if chain length is > 2 AND one entry is freeable.
This means that *some* chains eventually can have live entries and length > 2
without any problem. This is not an emergency case. I have seen on real 
machines some chains hitting length of 13 (elasticity=2), that was normal traffic,
and rt cache was on equilibrium.

Your patch adds to the "if (chain_length > elasticity)" case :

If no entry is freeable in this slot, invalidate *all* cache and put a warning.

Invalidating live entries is puting a high pressure on dst_garbage.list.
Having 1.000.000 entries in this list is not very cool, and should be done
only if really necessary.

When you know you have about 1.000.000 live entries in rt cache, you
can safely make your hash table sized to 2^20 slots, and elasticity set to 2
so that average chain length is <= 2, reducing cache misses at lookup time.

That is quite important to be able to handle 100.000 packets per second

> 
>> Now if you change your code to something like
>>
>> if (unlikely(chain_length > some_quite_big_number &&
>>     ip_rt_secret_interval == 0)) {
>> 	do_something();
>> }
>>
>> some_quite_big_number could be >= 30 or something...
>>
> I'd be ok with that, if some others chimed in with consensus on the matter.  I
> felt that we had a value definining what chain length should be
> (ip_rt_gc_elasticity is already used in comparison on chain length in
> rt_intern_hash, so I was keeping with that).  But if some others speak up and
> support a new sysctl, I can get behind that

I said 30, but could have said 100. No need for a sysctl.
If only one chain is really that long (and attacked), all its entries are hot.
If many chains are hit, then other rt...sysctl_params will control and limit cache grow.

> 
>> then it might be OK (at least it wont break common setups)
>>
> I don't think it will break many setups, the default value is 8 for this sysctl.
> If someone has set it lower, and sees a warning, they won't loose their system
> altogether, they'll just see a momentary reduction in throughput, and a
> warning to increase the value they have set.  Its going to far to say anything
> will 'break'.

I spent many days so that route cache doesnt crash some big machines I have around,
I have feelings your patch will make them crawl again, unless sysadmin is smart
enough to change once again rt sysctls.
So my replies are not just random things from me.

When a machine is targeted by a DDOS attack, about all slots of the hash table
are fully loaded (ie chain length >= elasticity). We dont need to invalidate 
the cache, but find an equilibrium, with small adjustements.

Thank you




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Sept. 30, 2008, 11:23 a.m. UTC | #6
On Tue, Sep 30, 2008 at 08:02:44AM +0200, Eric Dumazet wrote:
> Neil Horman a écrit :
>> On Mon, Sep 29, 2008 at 11:00:35PM +0200, Eric Dumazet wrote:
>>> Neil Horman a écrit :
>>>> On Mon, Sep 29, 2008 at 10:22:03PM +0200, Eric Dumazet wrote:
>>>>> Neil Horman a écrit :
>>>>>> Hey all-
>>>>>> 	We currently have the ability to disable our route cache secret interval
>>>>>> rebuild timer (by setting it to zero), but if we do that its possible for an
>>>>>> attacker (if they guess our route cache hash secret, to fill our system with
>>>>>> routes that all hash to the same bucket, destroying our performance.  This patch
>>>>>> provides a backstop for that issues.  In the event that our rebuild interval is
>>>>>> disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
>>>>>> an emergency hash rebuild.  During the hash rebuild we:
>>>>>> 1) warn the user of the emergency
>>>>>> 2) disable the rebuild timer
>>>>>> 3) invalidate the route caches
>>>>>> 4) re-enable the rebuild timer with its old value
>>>>>>
>>>>>> Regards
>>>>>> Neil
>>>>> This sounds not good at all to me.
>>>>>
>>>>> 1) Dont set ip_rt_secret_interval to zero, this is plain silly, since
>>>>>   you give attackers infinite time to break your machine.
>>>>>
>>>>> To quote Herbert (who allowed to set this interval to 0)
>>>>>
>>>>>    "Let me first state that disabling the route cache hash rebuild
>>>>>     should not be done without extensive analysis on the risk profile
>>>>>     and careful deliberation.
>>>>>
>>>>>     However, there are times when this can be done safely or for
>>>>>     testing.  For example, when you have mechanisms for ensuring
>>>>>     that offending parties do not exist in your network."
>>>>>
>>>> Thats really rather the motivation behind this.  The patch that Herbert
>>>> submitted with that commit explicitly lets one disable their rebuild timer.  I
>>>> agree its stupid to do that, but we added code to allow it.  This provides a
>>>> patch to help people who are victimized because they've done exactly this
>>>> (additionaly providing them a warning to stop doing it).
>>> Strange... many kernel parameters can be set to hazardous values that make machine unusable...
>>>
>>> ip_rt_gc_interval can also be set to a very large value : No more route cache gc
>>>
>> If you want to talk philosophy, then I accept your premise: we can tune the
>> kernel in ways that make it unusable.  Does that mean we should avoid doing
>> things to prevent that and maintain usibility.  Ithink this patch accomplishes
>> that goal in its small way.
>>  
>>>>
>>>>> 2) Many machines have ip_rt_gc_elasticity set to 2,
>>>>>   because they have a huge hash table, but low chain depths.
>>>> Ok, that seem reasonable, and this isn't going to disallow that.  By the same
>>>> resoning, people who have huge hash tables, and low chain depths won't
>>>> want their low chain length being violated, would they?  This patch will warn
>>>> them if their assumptions are being violated.
>>> Warn only ? If I read your patch, you not only warn in this case.
>> No, not warn only, Warn and correct is the clear intent here
>>
>>> (you invalidate cache for each struct net, potentially wraping rt_genid)
>>>
>> If you overflow your elasticity 2^16 times, yes (I think rt_genid is a 16 bit
>> value, but I don't have the kernel in front of me).  I would hope that would be
>> enough warnings to make a sysadmin address the problem
>>
>>> When you have 2^20 slots in route cache hash table, you dont care if few slots have 3 or 4 elements.
>>> And chance is very high that more than one slot has 3 or even 4 elements, no need for an attacker.
>> Ok, then I would assert if your ok with a few chains getting to be X entries in
>> length, then you should set your elasticity to be X.
>
> Nope. We set elasticity to 2 when we want to trigger code starting at line 1048
>
>        if (cand) {
>                if (chain_length > ip_rt_gc_elasticity) {
>                        *candp = cand->u.dst.rt_next;
>                        rt_free(cand);
>                }
>        }
>
> That is, freeing one entry if chain length is > 2 AND one entry is freeable.
> This means that *some* chains eventually can have live entries and length > 2
> without any problem. This is not an emergency case. I have seen on real  
> machines some chains hitting length of 13 (elasticity=2), that was normal 
> traffic,
> and rt cache was on equilibrium.
>
> Your patch adds to the "if (chain_length > elasticity)" case :
>
Not quite, the above is in en else clause to if (cand), that is to say if there
is no freeable entry, and our chain length is greater than our elasticity, which
I believe is an emergency case.

> If no entry is freeable in this slot, invalidate *all* cache and put a warning.
>
> Invalidating live entries is puting a high pressure on dst_garbage.list.
> Having 1.000.000 entries in this list is not very cool, and should be done
> only if really necessary.
>
> When you know you have about 1.000.000 live entries in rt cache, you
> can safely make your hash table sized to 2^20 slots, and elasticity set to 2
> so that average chain length is <= 2, reducing cache misses at lookup time.
>
> That is quite important to be able to handle 100.000 packets per second
>
I agree, but this should not affect that ability (although I don't have the
equipment to check such high throughput.  If you do, I would appreciate the
test.

>>
>>> Now if you change your code to something like
>>>
>>> if (unlikely(chain_length > some_quite_big_number &&
>>>     ip_rt_secret_interval == 0)) {
>>> 	do_something();
>>> }
>>>
>>> some_quite_big_number could be >= 30 or something...
>>>
>> I'd be ok with that, if some others chimed in with consensus on the matter.  I
>> felt that we had a value definining what chain length should be
>> (ip_rt_gc_elasticity is already used in comparison on chain length in
>> rt_intern_hash, so I was keeping with that).  But if some others speak up and
>> support a new sysctl, I can get behind that
>
> I said 30, but could have said 100. No need for a sysctl.
> If only one chain is really that long (and attacked), all its entries are hot.
> If many chains are hit, then other rt...sysctl_params will control and limit cache grow.
>
>>
>>> then it might be OK (at least it wont break common setups)
>>>
>> I don't think it will break many setups, the default value is 8 for this sysctl.
>> If someone has set it lower, and sees a warning, they won't loose their system
>> altogether, they'll just see a momentary reduction in throughput, and a
>> warning to increase the value they have set.  Its going to far to say anything
>> will 'break'.
>
> I spent many days so that route cache doesnt crash some big machines I have around,
> I have feelings your patch will make them crawl again, unless sysadmin is smart
> enough to change once again rt sysctls.
> So my replies are not just random things from me.
>


Again, if you have the ability to run such a high volume test and demonstrate
that this will happen, I'll gladly recind the patch and go back to the drawing
board, but I think you're wrong.

Regards
Neil
David Miller Sept. 30, 2008, 2:08 p.m. UTC | #7
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 29 Sep 2008 15:12:54 -0400

> 	We currently have the ability to disable our route cache secret interval
> rebuild timer (by setting it to zero), but if we do that its possible for an
> attacker (if they guess our route cache hash secret, to fill our system with
> routes that all hash to the same bucket, destroying our performance.  This patch
> provides a backstop for that issues.  In the event that our rebuild interval is
> disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
> an emergency hash rebuild.  During the hash rebuild we:
> 1) warn the user of the emergency
> 2) disable the rebuild timer
> 3) invalidate the route caches
> 4) re-enable the rebuild timer with its old value

I just want to clarify what my intentions were when I spoke
with Neil about this stuff last week.

The idea is that we can by default not rebuild the secret
at all.

And only when we notice that chains are growing larger than
"(NUM_RTCACHE_ENTRIES / NUM_HASH_CHAINS) * N", only then
do we do this secret rebuild and flush.  Where N is some
constant of configurable value, the GC elasticity is some
example.

Normally this whole hash secret business is totally unnecessary and
there is zero reason to do it until we notice there is actually some
kind of deep hash chain growth problem.

It's expensive, we flush the whole routing cache, so doing it
every so often by default makes no sense and it is causing
performance problems for people.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 30, 2008, 2:08 p.m. UTC | #8
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 29 Sep 2008 22:22:03 +0200

> This sounds not good at all to me.
> 
> 1) Dont set ip_rt_secret_interval to zero, this is plain silly, since
>    you give attackers infinite time to break your machine.
> 

It makes a ton of sense, even by default, if we set things up so that
we turn it back on when necessary.

And that's the final intended idea, to do exactly that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 30, 2008, 2:10 p.m. UTC | #9
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 30 Sep 2008 08:02:44 +0200

> When a machine is targeted by a DDOS attack, about all slots of the
> hash table are fully loaded (ie chain length >= elasticity). We dont
> need to invalidate the cache, but find an equilibrium, with small
> adjustements.

Sure, but it is possible to determine that some hash chains
are unevenly growing out of control compared to others,
and that is the algorithm that Neil is trying to discover.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Denis V. Lunev Sept. 30, 2008, 2:17 p.m. UTC | #10
The patch is wrong in respect to
  for_each_net
usage.

It should be used under rtnl. It is not held at the moment :(

Regards,
	Den

On Mon, 2008-09-29 at 15:12 -0400, Neil Horman wrote:
> Hey all-
> 	We currently have the ability to disable our route cache secret interval
> rebuild timer (by setting it to zero), but if we do that its possible for an
> attacker (if they guess our route cache hash secret, to fill our system with
> routes that all hash to the same bucket, destroying our performance.  This patch
> provides a backstop for that issues.  In the event that our rebuild interval is
> disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
> an emergency hash rebuild.  During the hash rebuild we:
> 1) warn the user of the emergency
> 2) disable the rebuild timer
> 3) invalidate the route caches
> 4) re-enable the rebuild timer with its old value
> 
> Regards
> Neil
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> 
> 
>  route.c |   36 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 35 insertions(+), 1 deletion(-)
> 
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 6ee5354..b95e02a 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -145,7 +145,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
>  static void		 ipv4_link_failure(struct sk_buff *skb);
>  static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
>  static int rt_garbage_collect(struct dst_ops *ops);
> -
> +static void rt_emergency_hash_rebuild(void);
>  
>  static struct dst_ops ipv4_dst_ops = {
>  	.family =		AF_INET,
> @@ -1056,6 +1056,15 @@ restart:
>  			*candp = cand->u.dst.rt_next;
>  			rt_free(cand);
>  		}
> +	} else if (chain_length > ip_rt_gc_elasticity) {
> +		/*
> +		 * We didn't find a route entry we could safely free,
> +		 * yet our chain length is over our elasticity value.
> +		 * Someone may have guessed our hash secret and is artifically
> +		 * filling up our route cache.  Lets do an emergency rebuild
> +		 * to be safe
> +		 */
> +		rt_emergency_hash_rebuild();
>  	}
>  
>  	/* Try to bind route to arp only if it is output
> @@ -2946,6 +2955,31 @@ static void rt_secret_reschedule(int old)
>  	rtnl_unlock();
>  }
>  
> +static void rt_secret_rebuild_oneshot(void) {
> +	struct net *net;
> +	int old_interval = ip_rt_secret_interval;
> +
> +	ip_rt_secret_interval = 0;
> +
> +	rt_secret_reschedule(old_interval);
> +
> +	for_each_net(net) 
> +		rt_cache_invalidate(net);
> +
> +	ip_rt_secret_interval = old_interval;
> +
> +	rt_secret_reschedule(0);
> +}
> +
> +static void rt_emergency_hash_rebuild(void) {
> +	if (net_ratelimit()) {
> +		printk(KERN_WARNING "Route hash chain too long!\n");
> +		printk(KERN_WARNING "Adjust your secret_interval!\n");
> +	}
> +
> +	rt_secret_rebuild_oneshot();
> +}
> +
>  static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
>  					  struct file *filp,
>  					  void __user *buffer, size_t *lenp,

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Sept. 30, 2008, 2:35 p.m. UTC | #11
On Tue, Sep 30, 2008 at 06:17:12PM +0400, Denis V. Lunev wrote:
> The patch is wrong in respect to
>   for_each_net
> usage.
> 
> It should be used under rtnl. It is not held at the moment :(
> 
Thank you, I'll be sure to take that into consideration when we determine what
the proper algorithm to implement here is.

Although, I used the rt_secret_rebuild timer function as my guide here, and it
doesn't hold the rtnl lock either, or is that because its running in a softirq
context?

Neil

> Regards,
> 	Den
> 
> On Mon, 2008-09-29 at 15:12 -0400, Neil Horman wrote:
> > Hey all-
> > 	We currently have the ability to disable our route cache secret interval
> > rebuild timer (by setting it to zero), but if we do that its possible for an
> > attacker (if they guess our route cache hash secret, to fill our system with
> > routes that all hash to the same bucket, destroying our performance.  This patch
> > provides a backstop for that issues.  In the event that our rebuild interval is
> > disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
> > an emergency hash rebuild.  During the hash rebuild we:
> > 1) warn the user of the emergency
> > 2) disable the rebuild timer
> > 3) invalidate the route caches
> > 4) re-enable the rebuild timer with its old value
> > 
> > Regards
> > Neil
> > 
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > 
> > 
> >  route.c |   36 +++++++++++++++++++++++++++++++++++-
> >  1 file changed, 35 insertions(+), 1 deletion(-)
> > 
> > 
> > diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> > index 6ee5354..b95e02a 100644
> > --- a/net/ipv4/route.c
> > +++ b/net/ipv4/route.c
> > @@ -145,7 +145,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
> >  static void		 ipv4_link_failure(struct sk_buff *skb);
> >  static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
> >  static int rt_garbage_collect(struct dst_ops *ops);
> > -
> > +static void rt_emergency_hash_rebuild(void);
> >  
> >  static struct dst_ops ipv4_dst_ops = {
> >  	.family =		AF_INET,
> > @@ -1056,6 +1056,15 @@ restart:
> >  			*candp = cand->u.dst.rt_next;
> >  			rt_free(cand);
> >  		}
> > +	} else if (chain_length > ip_rt_gc_elasticity) {
> > +		/*
> > +		 * We didn't find a route entry we could safely free,
> > +		 * yet our chain length is over our elasticity value.
> > +		 * Someone may have guessed our hash secret and is artifically
> > +		 * filling up our route cache.  Lets do an emergency rebuild
> > +		 * to be safe
> > +		 */
> > +		rt_emergency_hash_rebuild();
> >  	}
> >  
> >  	/* Try to bind route to arp only if it is output
> > @@ -2946,6 +2955,31 @@ static void rt_secret_reschedule(int old)
> >  	rtnl_unlock();
> >  }
> >  
> > +static void rt_secret_rebuild_oneshot(void) {
> > +	struct net *net;
> > +	int old_interval = ip_rt_secret_interval;
> > +
> > +	ip_rt_secret_interval = 0;
> > +
> > +	rt_secret_reschedule(old_interval);
> > +
> > +	for_each_net(net) 
> > +		rt_cache_invalidate(net);
> > +
> > +	ip_rt_secret_interval = old_interval;
> > +
> > +	rt_secret_reschedule(0);
> > +}
> > +
> > +static void rt_emergency_hash_rebuild(void) {
> > +	if (net_ratelimit()) {
> > +		printk(KERN_WARNING "Route hash chain too long!\n");
> > +		printk(KERN_WARNING "Adjust your secret_interval!\n");
> > +	}
> > +
> > +	rt_secret_rebuild_oneshot();
> > +}
> > +
> >  static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
> >  					  struct file *filp,
> >  					  void __user *buffer, size_t *lenp,
> 
>
Denis V. Lunev Sept. 30, 2008, 2:49 p.m. UTC | #12
On Tue, 2008-09-30 at 10:35 -0400, Neil Horman wrote:
> On Tue, Sep 30, 2008 at 06:17:12PM +0400, Denis V. Lunev wrote:
> > The patch is wrong in respect to
> >   for_each_net
> > usage.
> > 
> > It should be used under rtnl. It is not held at the moment :(
> > 
> Thank you, I'll be sure to take that into consideration when we determine what
> the proper algorithm to implement here is.
> 
> Although, I used the rt_secret_rebuild timer function as my guide here, and it
> doesn't hold the rtnl lock either, or is that because its running in a softirq
> context?
no. There is no lock as there is no iteration over net namespace list.
Each namespace has its own timer.

The simplest approach I see right now is to re-iterate over current dst
cache chain and mark only those namespaces. This is at least fair as
only one namespace can be attacked (not entire node).

All locking for this is in place.

Regards,
	Den

> Neil
> 
> > Regards,
> > 	Den
> > 
> > On Mon, 2008-09-29 at 15:12 -0400, Neil Horman wrote:
> > > Hey all-
> > > 	We currently have the ability to disable our route cache secret interval
> > > rebuild timer (by setting it to zero), but if we do that its possible for an
> > > attacker (if they guess our route cache hash secret, to fill our system with
> > > routes that all hash to the same bucket, destroying our performance.  This patch
> > > provides a backstop for that issues.  In the event that our rebuild interval is
> > > disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
> > > an emergency hash rebuild.  During the hash rebuild we:
> > > 1) warn the user of the emergency
> > > 2) disable the rebuild timer
> > > 3) invalidate the route caches
> > > 4) re-enable the rebuild timer with its old value
> > > 
> > > Regards
> > > Neil
> > > 
> > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > 
> > > 
> > >  route.c |   36 +++++++++++++++++++++++++++++++++++-
> > >  1 file changed, 35 insertions(+), 1 deletion(-)
> > > 
> > > 
> > > diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> > > index 6ee5354..b95e02a 100644
> > > --- a/net/ipv4/route.c
> > > +++ b/net/ipv4/route.c
> > > @@ -145,7 +145,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
> > >  static void		 ipv4_link_failure(struct sk_buff *skb);
> > >  static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
> > >  static int rt_garbage_collect(struct dst_ops *ops);
> > > -
> > > +static void rt_emergency_hash_rebuild(void);
> > >  
> > >  static struct dst_ops ipv4_dst_ops = {
> > >  	.family =		AF_INET,
> > > @@ -1056,6 +1056,15 @@ restart:
> > >  			*candp = cand->u.dst.rt_next;
> > >  			rt_free(cand);
> > >  		}
> > > +	} else if (chain_length > ip_rt_gc_elasticity) {
> > > +		/*
> > > +		 * We didn't find a route entry we could safely free,
> > > +		 * yet our chain length is over our elasticity value.
> > > +		 * Someone may have guessed our hash secret and is artifically
> > > +		 * filling up our route cache.  Lets do an emergency rebuild
> > > +		 * to be safe
> > > +		 */
> > > +		rt_emergency_hash_rebuild();
> > >  	}
> > >  
> > >  	/* Try to bind route to arp only if it is output
> > > @@ -2946,6 +2955,31 @@ static void rt_secret_reschedule(int old)
> > >  	rtnl_unlock();
> > >  }
> > >  
> > > +static void rt_secret_rebuild_oneshot(void) {
> > > +	struct net *net;
> > > +	int old_interval = ip_rt_secret_interval;
> > > +
> > > +	ip_rt_secret_interval = 0;
> > > +
> > > +	rt_secret_reschedule(old_interval);
> > > +
> > > +	for_each_net(net) 
> > > +		rt_cache_invalidate(net);
> > > +
> > > +	ip_rt_secret_interval = old_interval;
> > > +
> > > +	rt_secret_reschedule(0);
> > > +}
> > > +
> > > +static void rt_emergency_hash_rebuild(void) {
> > > +	if (net_ratelimit()) {
> > > +		printk(KERN_WARNING "Route hash chain too long!\n");
> > > +		printk(KERN_WARNING "Adjust your secret_interval!\n");
> > > +	}
> > > +
> > > +	rt_secret_rebuild_oneshot();
> > > +}
> > > +
> > >  static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
> > >  					  struct file *filp,
> > >  					  void __user *buffer, size_t *lenp,
> > 
> > 
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Sept. 30, 2008, 5:16 p.m. UTC | #13
David Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Tue, 30 Sep 2008 08:02:44 +0200
> 
>> When a machine is targeted by a DDOS attack, about all slots of the
>> hash table are fully loaded (ie chain length >= elasticity). We dont
>> need to invalidate the cache, but find an equilibrium, with small
>> adjustements.
> 
> Sure, but it is possible to determine that some hash chains
> are unevenly growing out of control compared to others,
> and that is the algorithm that Neil is trying to discover.
> 
> 

No problem, but my suggestion to use a separate threshold than elasticity
was apparently not taken into consideration.

I ran an experiment on a big stable machine with 2^19 rtcache slots,
 scanning all chains and found *many* of them having length > elasticity,
maximum being 13. I am sure its allowed by statistics laws.

(average chain length : 3.55)

In order to avoid unecessary cache invalidation, we need some 
calculation from a statistics expert.

Given rt_hash_size and elasticity (or rt_max_size), compute the "maximum reasonable" 
chain length, ie some X number where probability(chain_length < X) > 0.9999

(CCed Evgeniy Polyakov :) )



MemTotal:   32963064 kB
8 CPUS
/proc/sys/net/ipv4/route/max_size:8388608 (default at boot time)
/proc/sys/net/ipv4/route/gc_thresh:2000000
/proc/sys/net/ipv4/route/gc_elasticity:4
/proc/sys/net/ipv4/route/gc_interval:1

Linux version 2.6.24.5
cat /proc/net/sockstat
sockets: used 957514
TCP: inuse 963998 orphan 7100 tw 10393 alloc 964646 mem 376389

rtstat -c5 -i5 (minus first sample)
rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|
 entries|  in_hit|in_slow_|in_slow_|in_no_ro|  in_brd|in_marti|in_marti| out_hit|out_slow|out_slow|gc_total|gc_ignor|gc_goal_|gc_dst_o|in_hlist|out_hlis|
        |        |     tot|      mc|     ute|        |  an_dst|  an_src|        |    _tot|     _mc|        |      ed|    miss| verflow| _search|t_search|

 1862477|   23758|    4400|       0|       0|       0|       0|       0|    4142|    1134|       0|       0|       0|       0|       0|   45754|   11785|
 1863310|   24794|    4504|       0|       0|       0|       0|       0|    4089|    1183|       0|       0|       0|       0|       0|   47558|   11640|
 1863604|   24183|    4383|       0|       0|       0|       0|       0|    3910|    1061|       0|       0|       0|       0|       0|   46002|   10819|
 1864473|   23899|    4510|       0|       0|       0|       0|       0|    4113|    1193|       0|       0|       0|       0|       0|   46451|   11798|

grep ip_dst_cache /proc/slabinfo
ip_dst_cache      1863543 1868660    384   10    1 : tunables    0    0    0 : slabdata 186866 186866      0

On this machine, a single "ip route flush cache" is risky 
(commit 29e75252da20f3ab9e132c68c9aed156b87beae6 [IPV4] route cache: Introduce rt_genid for smooth cache invalidation)
not yet included in kernel)




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Sept. 30, 2008, 5:47 p.m. UTC | #14
David Miller a écrit :
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Mon, 29 Sep 2008 15:12:54 -0400
> 
>> 	We currently have the ability to disable our route cache secret interval
>> rebuild timer (by setting it to zero), but if we do that its possible for an
>> attacker (if they guess our route cache hash secret, to fill our system with
>> routes that all hash to the same bucket, destroying our performance.  This patch
>> provides a backstop for that issues.  In the event that our rebuild interval is
>> disabled (or very large), if any hash chain exceeds ip_rt_gc_elasticity, we do
>> an emergency hash rebuild.  During the hash rebuild we:
>> 1) warn the user of the emergency
>> 2) disable the rebuild timer
>> 3) invalidate the route caches
>> 4) re-enable the rebuild timer with its old value
> 
> I just want to clarify what my intentions were when I spoke
> with Neil about this stuff last week.
> 
> The idea is that we can by default not rebuild the secret
> at all.
> 
> And only when we notice that chains are growing larger than
> "(NUM_RTCACHE_ENTRIES / NUM_HASH_CHAINS) * N", only then
> do we do this secret rebuild and flush.  Where N is some
> constant of configurable value, the GC elasticity is some
> example.
> 
> Normally this whole hash secret business is totally unnecessary and
> there is zero reason to do it until we notice there is actually some
> kind of deep hash chain growth problem.
> 
> It's expensive, we flush the whole routing cache, so doing it
> every so often by default makes no sense and it is causing
> performance problems for people.

Intentions are very good, thanks for clarifying and letting us know.




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Sept. 30, 2008, 6:42 p.m. UTC | #15
On Tue, Sep 30, 2008 at 07:16:50PM +0200, Eric Dumazet wrote:
> David Miller a écrit :
>> From: Eric Dumazet <dada1@cosmosbay.com>
>> Date: Tue, 30 Sep 2008 08:02:44 +0200
>>
>>> When a machine is targeted by a DDOS attack, about all slots of the
>>> hash table are fully loaded (ie chain length >= elasticity). We dont
>>> need to invalidate the cache, but find an equilibrium, with small
>>> adjustements.
>>
>> Sure, but it is possible to determine that some hash chains
>> are unevenly growing out of control compared to others,
>> and that is the algorithm that Neil is trying to discover.
>>
>>
>
> No problem, but my suggestion to use a separate threshold than elasticity
> was apparently not taken into consideration.
>
> I ran an experiment on a big stable machine with 2^19 rtcache slots,
> scanning all chains and found *many* of them having length > elasticity,
> maximum being 13. I am sure its allowed by statistics laws.
>
> (average chain length : 3.55)
>
> In order to avoid unecessary cache invalidation, we need some  
> calculation from a statistics expert.
>
> Given rt_hash_size and elasticity (or rt_max_size), compute the "maximum 
> reasonable" chain length, ie some X number where probability(chain_length 
> < X) > 0.9999
>
I think what you're looking for here is a simple standard deviation, isn't it?
Compute the mean chain legnth, sum the squares of the deviations of each chain
and take the square root.  Any individual chain longer than the mean chain
length + 1 standard deviation can be considered an 'outlier' and therefore
trigger a rebuild of the table for that net namespace.

I full well realize that thats easier said than done, but does that seem about
right?  If so, I can start working on trying to build something to accomplish
that.

Regards
Neil
Evgeniy Polyakov Oct. 2, 2008, 7:13 a.m. UTC | #16
Hi.

On Tue, Sep 30, 2008 at 07:16:50PM +0200, Eric Dumazet (dada1@cosmosbay.com) wrote:
> No problem, but my suggestion to use a separate threshold than elasticity
> was apparently not taken into consideration.
> 
> I ran an experiment on a big stable machine with 2^19 rtcache slots,
> scanning all chains and found *many* of them having length > elasticity,
> maximum being 13. I am sure its allowed by statistics laws.
> 
> (average chain length : 3.55)
> 
> In order to avoid unecessary cache invalidation, we need some 
> calculation from a statistics expert.
> 
> Given rt_hash_size and elasticity (or rt_max_size), compute the "maximum 
> reasonable" chain length, ie some X number where probability(chain_length < 
> X) > 0.9999
> 
> (CCed Evgeniy Polyakov :) )
 
:)

If it is a hash table with fixed size, and hash is being produced like
some function result modulo size of the table, then there will be always
fluctuations with longer chain lenghts than 'usual', since, for example,
hash function produces result in 2^32 field, which hash table size is
only 2^20 or something smaller than maxiumum hash result. When I
analyzed Jenkin's hash I mistakenly concluded that such pikes are result
of the hash distribution itself and not final modulo operation.

Plus, it is always a gaussian distribution, so there will be (although
depending on parameters amount may be small enough) entries with longer
and smaller than average chains.

I tested list walking speed compared to rb-tree and trie on p4 machines
without prefetch and lists with upto 10 element are processed faster
than anything else, so if your average chain lengths are less than 10
elements, you are in a good condition.

Also I have a tricky algorithm to automatically scale RCU protected hash
table (with additional overhead at scale time though, but I believe it
is not a problem), which just waits to be implemented in netchannels, so
long chain length problem can be fixed, if algorithm works. Ok, I've
finished my advertisement rants :)
Evgeniy Polyakov Oct. 2, 2008, 7:16 a.m. UTC | #17
Hi.

On Tue, Sep 30, 2008 at 02:42:49PM -0400, Neil Horman (nhorman@tuxdriver.com) wrote:
> I think what you're looking for here is a simple standard deviation, isn't it?
> Compute the mean chain legnth, sum the squares of the deviations of each chain
> and take the square root.  Any individual chain longer than the mean chain
> length + 1 standard deviation can be considered an 'outlier' and therefore
> trigger a rebuild of the table for that net namespace.
> 
> I full well realize that thats easier said than done, but does that seem about
> right?  If so, I can start working on trying to build something to accomplish
> that.

You are right, but in kernel it may not that simple to get a square
root... We can probably play with logarithms though.
Neil Horman Oct. 2, 2008, 1:14 p.m. UTC | #18
On Thu, Oct 02, 2008 at 11:16:31AM +0400, Evgeniy Polyakov wrote:
> Hi.
> 
> On Tue, Sep 30, 2008 at 02:42:49PM -0400, Neil Horman (nhorman@tuxdriver.com) wrote:
> > I think what you're looking for here is a simple standard deviation, isn't it?
> > Compute the mean chain legnth, sum the squares of the deviations of each chain
> > and take the square root.  Any individual chain longer than the mean chain
> > length + 1 standard deviation can be considered an 'outlier' and therefore
> > trigger a rebuild of the table for that net namespace.
> > 
> > I full well realize that thats easier said than done, but does that seem about
> > right?  If so, I can start working on trying to build something to accomplish
> > that.
> 
> You are right, but in kernel it may not that simple to get a square
> root... We can probably play with logarithms though.
> 

I don't in truth, need a quare root operation (at least I don't think I do).
I'm abel to compute the variance rather easily, with a walk of the hash table,
then I approximate the square root by use of a for loop in which I square the
iterator and compare it to the variance.  The iterator when squared that is
closest to the variance is taken as my standard deviation.  As mentioned
previously 4*sd should cover almost all of my sample universe, identifying any
remaining outliers as the trigger for a rebuild.

Neil
Herbert Xu Oct. 5, 2008, 3:17 a.m. UTC | #19
Neil Horman <nhorman@tuxdriver.com> wrote:
>        We currently have the ability to disable our route cache secret interval
> rebuild timer (by setting it to zero), but if we do that its possible for an
> attacker (if they guess our route cache hash secret, to fill our system with
> routes that all hash to the same bucket, destroying our performance.  This patch

This is completely bogus.  We never allow any chain to grow beyond
the elasticity.  So in the worst case we just bypass the route cache.

Cheers,
Herbert Xu Oct. 5, 2008, 3:20 a.m. UTC | #20
On Sun, Oct 05, 2008 at 11:17:27AM +0800, Herbert Xu wrote:
> Neil Horman <nhorman@tuxdriver.com> wrote:
> >        We currently have the ability to disable our route cache secret interval
> > rebuild timer (by setting it to zero), but if we do that its possible for an
> > attacker (if they guess our route cache hash secret, to fill our system with
> > routes that all hash to the same bucket, destroying our performance.  This patch
> 
> This is completely bogus.  We never allow any chain to grow beyond
> the elasticity.  So in the worst case we just bypass the route cache.

OK we don't actually enforce that as it stands, but perhaps we
should have a maximum chain length that is enforced.

Cheers,
Herbert Xu Oct. 5, 2008, 3:26 a.m. UTC | #21
David Miller <davem@davemloft.net> wrote:
> 
> The idea is that we can by default not rebuild the secret
> at all.

Actually Andrew Dickson <whydna@whydna.net> came up with this idea
quite a while ago: Keep the rehash interval but do nothing until
some chain hits a specified length.  This is quite similar to
what is being discussed here.

Andrew, could you post the patch please?

In addition to this, we should probably enforce that limit as
well by simply not adding the newly created entry or deleting
one forcibly.

Thanks,
Neil Horman Oct. 6, 2008, 12:52 a.m. UTC | #22
On Sun, Oct 05, 2008 at 11:20:47AM +0800, Herbert Xu wrote:
> On Sun, Oct 05, 2008 at 11:17:27AM +0800, Herbert Xu wrote:
> > Neil Horman <nhorman@tuxdriver.com> wrote:
> > >        We currently have the ability to disable our route cache secret interval
> > > rebuild timer (by setting it to zero), but if we do that its possible for an
> > > attacker (if they guess our route cache hash secret, to fill our system with
> > > routes that all hash to the same bucket, destroying our performance.  This patch
> > 
> > This is completely bogus.  We never allow any chain to grow beyond
> > the elasticity.  So in the worst case we just bypass the route cache.
> 
> OK we don't actually enforce that as it stands, but perhaps we
> should have a maximum chain length that is enforced.
> 
Thank you :).  I was just about to respond with a note asking you for a
reference to your previous assertion.  We definately don't enforce that now.

I agree we should have a maximum chain legth that we enforce.  Unfortunately
according to eric, the gc_elasticity value can't be it, because we very
routinely in nominal systems have a limited number of chains that violate the
elasticity, and that should be expected.  Thats why my latest patch tries to
cover that by computing the standard deviation of the set of chains and allowing
chains within avg+4*SD to exist.  With that allowance, we should be able to
remove the periodic rehash entirely.

Best
Neil

> Cheers,
> -- 
> Visit Openswan at http://www.openswan.org/
> Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6ee5354..b95e02a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -145,7 +145,7 @@  static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void		 ipv4_link_failure(struct sk_buff *skb);
 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 static int rt_garbage_collect(struct dst_ops *ops);
-
+static void rt_emergency_hash_rebuild(void);
 
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
@@ -1056,6 +1056,15 @@  restart:
 			*candp = cand->u.dst.rt_next;
 			rt_free(cand);
 		}
+	} else if (chain_length > ip_rt_gc_elasticity) {
+		/*
+		 * We didn't find a route entry we could safely free,
+		 * yet our chain length is over our elasticity value.
+		 * Someone may have guessed our hash secret and is artifically
+		 * filling up our route cache.  Lets do an emergency rebuild
+		 * to be safe
+		 */
+		rt_emergency_hash_rebuild();
 	}
 
 	/* Try to bind route to arp only if it is output
@@ -2946,6 +2955,31 @@  static void rt_secret_reschedule(int old)
 	rtnl_unlock();
 }
 
+static void rt_secret_rebuild_oneshot(void) {
+	struct net *net;
+	int old_interval = ip_rt_secret_interval;
+
+	ip_rt_secret_interval = 0;
+
+	rt_secret_reschedule(old_interval);
+
+	for_each_net(net) 
+		rt_cache_invalidate(net);
+
+	ip_rt_secret_interval = old_interval;
+
+	rt_secret_reschedule(0);
+}
+
+static void rt_emergency_hash_rebuild(void) {
+	if (net_ratelimit()) {
+		printk(KERN_WARNING "Route hash chain too long!\n");
+		printk(KERN_WARNING "Adjust your secret_interval!\n");
+	}
+
+	rt_secret_rebuild_oneshot();
+}
+
 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
 					  struct file *filp,
 					  void __user *buffer, size_t *lenp,