diff mbox

[net-2.6] Re: rib_trie / Fix inflate_threshold_root. Now=15 size=11 bits

Message ID 20090701110407.GC12715@ff.dom.local
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Jarek Poplawski July 1, 2009, 11:04 a.m. UTC
On Wed, Jul 01, 2009 at 10:13:33AM +0000, Jarek Poplawski wrote:
> On Wed, Jul 01, 2009 at 11:43:04AM +0200, Paweł Staszewski wrote:
> ...
> > Yes on on previous patches there was / no warnings / no oopses or lockups
> >
> > But now i apply this patch and i make more testing.
> > First boot with start of bgpd and - traffic is not forwarded
> > So i start to search and make only some routes (static without bgpd)  
> > thru this host
> > And all is working for this host when i make all by static routes.
> >
> > So i change a little my bgp configuration and make default route to only  
> > one of my iBGP peers and start bgpd process
> > All is working and what is weird is number of routes in kernel table.
> > Kernel is learning routes from bgpd but very slowly - really very slowly.
> 
> Pawel, this is really very helpful! So, this is (probably) only about
> timing, not wrong memory freeing. On the other hand this test was only
> for inserts. Btw., if you didn't start the second test, you can skip
> it. I have to rethink this.

So, after your findings I'm about to recommend sending to -stable
3 patches from net-2.6, with additional lowering of threshold_root
settings, but it would be nice if you could give it a try with
CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break
your other apps!) It is expected to work this time...;-) Maybe a
bit slower.

Thanks,
Jarek P.
--------> (all-in-one preempt fixes to apply with vanilla 2.6.29.x)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Paweł Staszewski July 1, 2009, 10:17 p.m. UTC | #1
Jarek Poplawski pisze:
> On Wed, Jul 01, 2009 at 10:13:33AM +0000, Jarek Poplawski wrote:
>   
>> On Wed, Jul 01, 2009 at 11:43:04AM +0200, Paweł Staszewski wrote:
>> ...
>>     
>>> Yes on on previous patches there was / no warnings / no oopses or lockups
>>>
>>> But now i apply this patch and i make more testing.
>>> First boot with start of bgpd and - traffic is not forwarded
>>> So i start to search and make only some routes (static without bgpd)  
>>> thru this host
>>> And all is working for this host when i make all by static routes.
>>>
>>> So i change a little my bgp configuration and make default route to only  
>>> one of my iBGP peers and start bgpd process
>>> All is working and what is weird is number of routes in kernel table.
>>> Kernel is learning routes from bgpd but very slowly - really very slowly.
>>>       
>> Pawel, this is really very helpful! So, this is (probably) only about
>> timing, not wrong memory freeing. On the other hand this test was only
>> for inserts. Btw., if you didn't start the second test, you can skip
>> it. I have to rethink this.
>>     
>
> So, after your findings I'm about to recommend sending to -stable
> 3 patches from net-2.6, with additional lowering of threshold_root
> settings, but it would be nice if you could give it a try with
> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break
> your other apps!) It is expected to work this time...;-) Maybe a
> bit slower.
>
>   
Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE
And working :)

fib_triestats in attached file

I think I can test it with PREEMPT enabled but first i must make some 
other tests of my apps that are on server.

Regards
Paweł Staszewski

> Thanks,
> Jarek P.
> --------> (all-in-one preempt fixes to apply with vanilla 2.6.29.x)
>
> diff -Nurp a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> --- a/net/ipv4/fib_trie.c	2009-07-01 06:17:08.000000000 +0000
> +++ b/net/ipv4/fib_trie.c	2009-07-01 10:43:44.000000000 +0000
> @@ -123,6 +123,7 @@ struct tnode {
>  	union {
>  		struct rcu_head rcu;
>  		struct work_struct work;
> +		struct tnode *tnode_free;
>  	};
>  	struct node *child[0];
>  };
> @@ -161,6 +162,8 @@ static void tnode_put_child_reorg(struct
>  static struct node *resize(struct trie *t, struct tnode *tn);
>  static struct tnode *inflate(struct trie *t, struct tnode *tn);
>  static struct tnode *halve(struct trie *t, struct tnode *tn);
> +/* tnodes to free after resize(); protected by RTNL */
> +static struct tnode *tnode_free_head;
>  
>  static struct kmem_cache *fn_alias_kmem __read_mostly;
>  static struct kmem_cache *trie_leaf_kmem __read_mostly;
> @@ -313,8 +316,8 @@ static inline void check_tnode(const str
>  
>  static const int halve_threshold = 25;
>  static const int inflate_threshold = 50;
> -static const int halve_threshold_root = 8;
> -static const int inflate_threshold_root = 15;
> +static const int halve_threshold_root = 15;
> +static const int inflate_threshold_root = 25;
>  
>  
>  static void __alias_free_mem(struct rcu_head *head)
> @@ -385,6 +388,24 @@ static inline void tnode_free(struct tno
>  		call_rcu(&tn->rcu, __tnode_free_rcu);
>  }
>  
> +static void tnode_free_safe(struct tnode *tn)
> +{
> +	BUG_ON(IS_LEAF(tn));
> +	tn->tnode_free = tnode_free_head;
> +	tnode_free_head = tn;
> +}
> +
> +static void tnode_free_flush(void)
> +{
> +	struct tnode *tn;
> +
> +	while ((tn = tnode_free_head)) {
> +		tnode_free_head = tn->tnode_free;
> +		tn->tnode_free = NULL;
> +		tnode_free(tn);
> +	}
> +}
> +
>  static struct leaf *leaf_new(void)
>  {
>  	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
> @@ -495,7 +516,7 @@ static struct node *resize(struct trie *
>  
>  	/* No children */
>  	if (tn->empty_children == tnode_child_length(tn)) {
> -		tnode_free(tn);
> +		tnode_free_safe(tn);
>  		return NULL;
>  	}
>  	/* One child */
> @@ -509,7 +530,7 @@ static struct node *resize(struct trie *
>  
>  			/* compress one level */
>  			node_set_parent(n, NULL);
> -			tnode_free(tn);
> +			tnode_free_safe(tn);
>  			return n;
>  		}
>  	/*
> @@ -670,7 +691,7 @@ static struct node *resize(struct trie *
>  			/* compress one level */
>  
>  			node_set_parent(n, NULL);
> -			tnode_free(tn);
> +			tnode_free_safe(tn);
>  			return n;
>  		}
>  
> @@ -756,7 +777,7 @@ static struct tnode *inflate(struct trie
>  			put_child(t, tn, 2*i, inode->child[0]);
>  			put_child(t, tn, 2*i+1, inode->child[1]);
>  
> -			tnode_free(inode);
> +			tnode_free_safe(inode);
>  			continue;
>  		}
>  
> @@ -801,9 +822,9 @@ static struct tnode *inflate(struct trie
>  		put_child(t, tn, 2*i, resize(t, left));
>  		put_child(t, tn, 2*i+1, resize(t, right));
>  
> -		tnode_free(inode);
> +		tnode_free_safe(inode);
>  	}
> -	tnode_free(oldtnode);
> +	tnode_free_safe(oldtnode);
>  	return tn;
>  nomem:
>  	{
> @@ -885,7 +906,7 @@ static struct tnode *halve(struct trie *
>  		put_child(t, newBinNode, 1, right);
>  		put_child(t, tn, i/2, resize(t, newBinNode));
>  	}
> -	tnode_free(oldtnode);
> +	tnode_free_safe(oldtnode);
>  	return tn;
>  nomem:
>  	{
> @@ -983,12 +1004,14 @@ fib_find_node(struct trie *t, u32 key)
>  	return NULL;
>  }
>  
> -static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
> +static void trie_rebalance(struct trie *t, struct tnode *tn)
>  {
>  	int wasfull;
> -	t_key cindex, key = tn->key;
> +	t_key cindex, key;
>  	struct tnode *tp;
>  
> +	key = tn->key;
> +
>  	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
>  		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
>  		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
> @@ -999,6 +1022,10 @@ static struct node *trie_rebalance(struc
>  
>  		tp = node_parent((struct node *) tn);
>  		if (!tp)
> +			rcu_assign_pointer(t->trie, (struct node *)tn);
> +
> +		tnode_free_flush();
> +		if (!tp)
>  			break;
>  		tn = tp;
>  	}
> @@ -1007,7 +1034,10 @@ static struct node *trie_rebalance(struc
>  	if (IS_TNODE(tn))
>  		tn = (struct tnode *)resize(t, (struct tnode *)tn);
>  
> -	return (struct node *)tn;
> +	rcu_assign_pointer(t->trie, (struct node *)tn);
> +	tnode_free_flush();
> +
> +	return;
>  }
>  
>  /* only used from updater-side */
> @@ -1155,7 +1185,7 @@ static struct list_head *fib_insert_node
>  
>  	/* Rebalance the trie */
>  
> -	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
> +	trie_rebalance(t, tp);
>  done:
>  	return fa_head;
>  }
> @@ -1575,7 +1605,7 @@ static void trie_leaf_remove(struct trie
>  	if (tp) {
>  		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
>  		put_child(t, (struct tnode *)tp, cindex, NULL);
> -		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
> +		trie_rebalance(t, tp);
>  	} else
>  		rcu_assign_pointer(t->trie, NULL);
>  
>
>
>
cat /proc/net/fib_triestat
Basic info: size of leaf: 20 bytes, size of tnode: 36 bytes.
Main:
        Aver depth:     2.44
        Max depth:      6
        Leaves:         277395
        Prefixes:       290874
        Internal nodes: 66711
          1: 32915  2: 14668  3: 10752  4: 4913  5: 2197  6: 895  7: 367  8: 3  17: 1
        Pointers: 595526
Null ptrs: 251421
Total size: 18044  kB

Counters:
---------
gets = 2705388
backtracks = 137797
semantic match passed = 2658993
semantic match miss = 87
null node hit= 1980950
skipped node resize = 0

Local:
        Aver depth:     3.75
        Max depth:      5
        Leaves:         12
        Prefixes:       13
        Internal nodes: 10
          1: 9  2: 1
        Pointers: 22
Null ptrs: 1
Total size: 2  kB

Counters:
---------
gets = 2709741
backtracks = 1584810
semantic match passed = 4417
semantic match miss = 0
null node hit= 192688
skipped node resize = 0
Jarek Poplawski July 2, 2009, 5:32 a.m. UTC | #2
On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote:
> Jarek Poplawski pisze:
...
>> So, after your findings I'm about to recommend sending to -stable
>> 3 patches from net-2.6, with additional lowering of threshold_root
>> settings, but it would be nice if you could give it a try with
>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break
>> your other apps!) It is expected to work this time...;-) Maybe a
>> bit slower.
>>
>>   
> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE
> And working :)

Hmm... It should, because you tested very similar patch already;-)
Sorry if I didn't make it clear.

>
> fib_triestats in attached file
>
> I think I can test it with PREEMPT enabled but first i must make some  
> other tests of my apps that are on server.

It could probably matter only if you're using some broken out-of-tree
patches. Otherwise the kernel is expected to work OK.

Btw., it would be also interesting to check if there is any difference
wrt. these route cache problems while PREEMPT is enabled.

Thanks,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paweł Staszewski July 2, 2009, 5:43 a.m. UTC | #3
Jarek Poplawski pisze:
> On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote:
>   
>> Jarek Poplawski pisze:
>>     
> ...
>   
>>> So, after your findings I'm about to recommend sending to -stable
>>> 3 patches from net-2.6, with additional lowering of threshold_root
>>> settings, but it would be nice if you could give it a try with
>>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break
>>> your other apps!) It is expected to work this time...;-) Maybe a
>>> bit slower.
>>>
>>>   
>>>       
>> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE
>> And working :)
>>     
>
> Hmm... It should, because you tested very similar patch already;-)
> Sorry if I didn't make it clear.
>
>   
Yes i know there was almost identical one.
And i see this was without sync rcu :)

>> fib_triestats in attached file
>>
>> I think I can test it with PREEMPT enabled but first i must make some  
>> other tests of my apps that are on server.
>>     
>
> It could probably matter only if you're using some broken out-of-tree
> patches. Otherwise the kernel is expected to work OK.
>
>   
Im a little confused about using of PREEMPT kernel because of past
there was many oopses / lockups :) but yes that was a little long time ago.
I will try to make this test today.

> Btw., it would be also interesting to check if there is any difference
> wrt. these route cache problems while PREEMPT is enabled.
>
> Thanks,
> Jarek P.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
>   

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski July 2, 2009, 6 a.m. UTC | #4
On Thu, Jul 02, 2009 at 07:43:25AM +0200, Paweł Staszewski wrote:
> Jarek Poplawski pisze:
>> On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote:
>>   
>>> Jarek Poplawski pisze:
>>>     
>> ...
>>   
>>>> So, after your findings I'm about to recommend sending to -stable
>>>> 3 patches from net-2.6, with additional lowering of threshold_root
>>>> settings, but it would be nice if you could give it a try with
>>>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break
>>>> your other apps!) It is expected to work this time...;-) Maybe a
>>>> bit slower.
>>>>
>>>>         
>>> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE
>>> And working :)
>>>     
>>
>> Hmm... It should, because you tested very similar patch already;-)
>> Sorry if I didn't make it clear.
>>
>>   
> Yes i know there was almost identical one.
> And i see this was without sync rcu :)

Yes, it looks like we can't free memory so simple because of such huge
latencies.  

>
>>> fib_triestats in attached file
>>>
>>> I think I can test it with PREEMPT enabled but first i must make some 
>>>  other tests of my apps that are on server.
>>>     
>>
>> It could probably matter only if you're using some broken out-of-tree
>> patches. Otherwise the kernel is expected to work OK.
>>
>>   
> Im a little confused about using of PREEMPT kernel because of past
> there was many oopses / lockups :) but yes that was a little long time ago.
> I will try to make this test today.
>
>> Btw., it would be also interesting to check if there is any difference
>> wrt. these route cache problems while PREEMPT is enabled.

And you're very right! The place we're fixing is the best example. On
the other hand, I hope there is not many such places yet. But if we
test/fix it there will be one less...

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Robert Olsson July 2, 2009, 3:31 p.m. UTC | #5
Jarek Poplawski writes:

 > Yes, it looks like we can't free memory so simple because of such huge
 > latencies.  

 Controlling RCU seems crucial. Insertion of the full BGP table increased
 from 2 seconds to > 20 min with one synchronize_rcu patches.

 And fib_trie "worst case" wrt memory is the root node. So maybe we should 
 monitor changes in root node and use this to control synchronize_rcu.

 Didn't Paul suggest something like this?

 And with don't find any decent solution we have to add an option for 
 a fixed and pre-allocated root-nod typically for BGP-routers.

 Cheers
					--ro
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski July 2, 2009, 7:06 p.m. UTC | #6
On Thu, Jul 02, 2009 at 05:31:58PM +0200, Robert Olsson wrote:
> 
> Jarek Poplawski writes:
> 
>  > Yes, it looks like we can't free memory so simple because of such huge
>  > latencies.  
> 
>  Controlling RCU seems crucial. Insertion of the full BGP table increased
>  from 2 seconds to > 20 min with one synchronize_rcu patches.

I wish I knew this a few days before. I could imagine a slow down,
but it looked like it was stuck. Since these last changes weren't
tested on SMP + PREEMPT I thought there is still something broken.
(I was mainly interested in this synchronize_rcu at the moment as
a preemption test.)  

>  And fib_trie "worst case" wrt memory is the root node. So maybe we should 
>  monitor changes in root node and use this to control synchronize_rcu.
> 
>  Didn't Paul suggest something like this?

Sure, and it needs testing, but we should send some safe preemption
fix for -stable first, don't we?

>  And with don't find any decent solution we have to add an option for 
>  a fixed and pre-allocated root-nod typically for BGP-routers.

Probably you're right; I'd prefer to see the test results showing
a difference vs. simply less aggressive root thresholds. But of
course, even if not convinced, I'll respect your choice as the author
and maintainer, so feel free to NAK my proposals - I won't get it
personally.;-)

Cheers,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Robert Olsson July 2, 2009, 9:32 p.m. UTC | #7
Jarek Poplawski writes:

 > >  Controlling RCU seems crucial. Insertion of the full BGP table increased
 > >  from 2 seconds to > 20 min with one synchronize_rcu patches.
 > 
 > I wish I knew this a few days before. I could imagine a slow down,
 > but it looked like it was stuck. Since these last changes weren't
 > tested on SMP + PREEMPT I thought there is still something broken.
 > (I was mainly interested in this synchronize_rcu at the moment as
 > a preemption test.)  


 Honestly this huge slowdown was surprise for me too. I think I sent 
 you a script so you could insert the full table yourself.

 > >  And fib_trie "worst case" wrt memory is the root node. So maybe we should 
 > >  monitor changes in root node and use this to control synchronize_rcu.
 > > 
 > >  Didn't Paul suggest something like this?
 > 
 > Sure, and it needs testing, but we should send some safe preemption
 > fix for -stable first, don't we?
 
 Yes my hope was that we could combine them... personally I'll need 
 to understand who we can preeemted better in the different configs
 and most of that this can be handled by "standard" RCU.

 > >  And with don't find any decent solution we have to add an option for 
 > >  a fixed and pre-allocated root-nod typically for BGP-routers.
 > 
 > Probably you're right; I'd prefer to see the test results showing
 > a difference vs. simply less aggressive root thresholds. But of
 > course, even if not convinced, I'll respect your choice as the author
 > and maintainer, so feel free to NAK my proposals - I won't get it
 > personally.;-)

 Thresholds we can change no problem... but very soon I'll people 
 will start routing without the route cache this at least in close
 to Internet core ,we will need all fib_look performance we can get.

 fib_trie was designed for classical RCU and no preempt you see the
 names i file... so this new and very challenging work to all of us.
 
 First week of vacation and have to fix the roof of the house...
 it's hot and dirty. 

 Cheers.
					--ro
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski July 2, 2009, 10:13 p.m. UTC | #8
On Thu, Jul 02, 2009 at 11:32:26PM +0200, Robert Olsson wrote:
> 
> Jarek Poplawski writes:
> 
>  > >  Controlling RCU seems crucial. Insertion of the full BGP table increased
>  > >  from 2 seconds to > 20 min with one synchronize_rcu patches.
>  > 
>  > I wish I knew this a few days before. I could imagine a slow down,
>  > but it looked like it was stuck. Since these last changes weren't
>  > tested on SMP + PREEMPT I thought there is still something broken.
>  > (I was mainly interested in this synchronize_rcu at the moment as
>  > a preemption test.)  
> 
> 
>  Honestly this huge slowdown was surprise for me too. I think I sent 
>  you a script so you could insert the full table yourself.

I can't remember this script, but I guess my hardware should be
suitable for reading it.;-)

> 
>  > >  And fib_trie "worst case" wrt memory is the root node. So maybe we should 
>  > >  monitor changes in root node and use this to control synchronize_rcu.
>  > > 
>  > >  Didn't Paul suggest something like this?
>  > 
>  > Sure, and it needs testing, but we should send some safe preemption
>  > fix for -stable first, don't we?
>  
>  Yes my hope was that we could combine them... personally I'll need 
>  to understand who we can preeemted better in the different configs
>  and most of that this can be handled by "standard" RCU.
> 
>  > >  And with don't find any decent solution we have to add an option for 
>  > >  a fixed and pre-allocated root-nod typically for BGP-routers.
>  > 
>  > Probably you're right; I'd prefer to see the test results showing
>  > a difference vs. simply less aggressive root thresholds. But of
>  > course, even if not convinced, I'll respect your choice as the author
>  > and maintainer, so feel free to NAK my proposals - I won't get it
>  > personally.;-)
> 
>  Thresholds we can change no problem... but very soon I'll people 
>  will start routing without the route cache this at least in close
>  to Internet core ,we will need all fib_look performance we can get.

I mean changing thresholds as a temporary solution, until we can
control memory freeing; and it seems to me, even excluding the root
node, there could be a lot of temporary allocations during all those
cycles repeated 10 times.

> 
>  fib_trie was designed for classical RCU and no preempt you see the
>  names i file... so this new and very challenging work to all of us.

Then it should depend on CONFIG_PREEMPT_NONE, I guess.

>  
>  First week of vacation and have to fix the roof of the house...
>  it's hot and dirty. 

Have a nice time,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paweł Staszewski July 5, 2009, 12:26 a.m. UTC | #9
Jarek Poplawski pisze:
> On Thu, Jul 02, 2009 at 07:43:25AM +0200, Paweł Staszewski wrote:
>   
>> Jarek Poplawski pisze:
>>     
>>> On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote:
>>>   
>>>       
>>>> Jarek Poplawski pisze:
>>>>     
>>>>         
>>> ...
>>>   
>>>       
>>>>> So, after your findings I'm about to recommend sending to -stable
>>>>> 3 patches from net-2.6, with additional lowering of threshold_root
>>>>> settings, but it would be nice if you could give it a try with
>>>>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break
>>>>> your other apps!) It is expected to work this time...;-) Maybe a
>>>>> bit slower.
>>>>>
>>>>>   
Ok kernel configured with CONFIG_PREEMPT
and all this day work without any problems (with Jarek last patch).


So in attached file trere is fib_tirestats
I dont see any big change of (cpu load or faster/slower 
routing/propagating routes from bgpd or something else) - in avg there 
is from 2% to 3% more of CPU load i dont know why but it is - i change
from "preempt" to "no preempt" 3 times and check this my "mpstat -P ALL 
1 30"
always avg cpu load was from 2 to 3% more compared to "no preempt"

Regards
Paweł Staszewski

 
>>>>>       
>>>>>           
>>>> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE
>>>> And working :)
>>>>     
>>>>         
>>> Hmm... It should, because you tested very similar patch already;-)
>>> Sorry if I didn't make it clear.
>>>
>>>   
>>>       
>> Yes i know there was almost identical one.
>> And i see this was without sync rcu :)
>>     
>
> Yes, it looks like we can't free memory so simple because of such huge
> latencies.  
>
>   
>>>> fib_triestats in attached file
>>>>
>>>> I think I can test it with PREEMPT enabled but first i must make some 
>>>>  other tests of my apps that are on server.
>>>>     
>>>>         
>>> It could probably matter only if you're using some broken out-of-tree
>>> patches. Otherwise the kernel is expected to work OK.
>>>
>>>   
>>>       
>> Im a little confused about using of PREEMPT kernel because of past
>> there was many oopses / lockups :) but yes that was a little long time ago.
>> I will try to make this test today.
>>
>>     
>>> Btw., it would be also interesting to check if there is any difference
>>> wrt. these route cache problems while PREEMPT is enabled.
>>>       
>
> And you're very right! The place we're fixing is the best example. On
> the other hand, I hope there is not many such places yet. But if we
> test/fix it there will be one less...
>
> Jarek P.
>
>
>   

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paweł Staszewski July 5, 2009, 12:30 a.m. UTC | #10
Oh

I forgot - please Jarek give me patch with sync rcu and i will make test 
on preempt kernel

Thanks
Paweł Staszewski

Paweł Staszewski pisze:
> Jarek Poplawski pisze:
>> On Thu, Jul 02, 2009 at 07:43:25AM +0200, Paweł Staszewski wrote:
>>  
>>> Jarek Poplawski pisze:
>>>    
>>>> On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote:
>>>>        
>>>>> Jarek Poplawski pisze:
>>>>>             
>>>> ...
>>>>        
>>>>>> So, after your findings I'm about to recommend sending to -stable
>>>>>> 3 patches from net-2.6, with additional lowering of threshold_root
>>>>>> settings, but it would be nice if you could give it a try with
>>>>>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break
>>>>>> your other apps!) It is expected to work this time...;-) Maybe a
>>>>>> bit slower.
>>>>>>
>>>>>>   
> Ok kernel configured with CONFIG_PREEMPT
> and all this day work without any problems (with Jarek last patch).
>
>
> So in attached file trere is fib_tirestats
> I dont see any big change of (cpu load or faster/slower 
> routing/propagating routes from bgpd or something else) - in avg there 
> is from 2% to 3% more of CPU load i dont know why but it is - i change
> from "preempt" to "no preempt" 3 times and check this my "mpstat -P 
> ALL 1 30"
> always avg cpu load was from 2 to 3% more compared to "no preempt"
>
> Regards
> Paweł Staszewski
>
>
>>>>>>                 
>>>>> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE
>>>>> And working :)
>>>>>             
>>>> Hmm... It should, because you tested very similar patch already;-)
>>>> Sorry if I didn't make it clear.
>>>>
>>>>         
>>> Yes i know there was almost identical one.
>>> And i see this was without sync rcu :)
>>>     
>>
>> Yes, it looks like we can't free memory so simple because of such huge
>> latencies. 
>>  
>>>>> fib_triestats in attached file
>>>>>
>>>>> I think I can test it with PREEMPT enabled but first i must make 
>>>>> some  other tests of my apps that are on server.
>>>>>             
>>>> It could probably matter only if you're using some broken out-of-tree
>>>> patches. Otherwise the kernel is expected to work OK.
>>>>
>>>>         
>>> Im a little confused about using of PREEMPT kernel because of past
>>> there was many oopses / lockups :) but yes that was a little long 
>>> time ago.
>>> I will try to make this test today.
>>>
>>>    
>>>> Btw., it would be also interesting to check if there is any difference
>>>> wrt. these route cache problems while PREEMPT is enabled.
>>>>       
>>
>> And you're very right! The place we're fixing is the best example. On
>> the other hand, I hope there is not many such places yet. But if we
>> test/fix it there will be one less...
>>
>> Jarek P.
>>
>>
>>   
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paweł Staszewski July 5, 2009, 12:31 a.m. UTC | #11
Sorry again no attachement.




Paweł Staszewski pisze:
> Jarek Poplawski pisze:
>> On Thu, Jul 02, 2009 at 07:43:25AM +0200, Paweł Staszewski wrote:
>>  
>>> Jarek Poplawski pisze:
>>>    
>>>> On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote:
>>>>        
>>>>> Jarek Poplawski pisze:
>>>>>             
>>>> ...
>>>>        
>>>>>> So, after your findings I'm about to recommend sending to -stable
>>>>>> 3 patches from net-2.6, with additional lowering of threshold_root
>>>>>> settings, but it would be nice if you could give it a try with
>>>>>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break
>>>>>> your other apps!) It is expected to work this time...;-) Maybe a
>>>>>> bit slower.
>>>>>>
>>>>>>   
> Ok kernel configured with CONFIG_PREEMPT
> and all this day work without any problems (with Jarek last patch).
>
>
> So in attached file trere is fib_tirestats
> I dont see any big change of (cpu load or faster/slower 
> routing/propagating routes from bgpd or something else) - in avg there 
> is from 2% to 3% more of CPU load i dont know why but it is - i change
> from "preempt" to "no preempt" 3 times and check this my "mpstat -P 
> ALL 1 30"
> always avg cpu load was from 2 to 3% more compared to "no preempt"
>
> Regards
> Paweł Staszewski
>
>
>>>>>>                 
>>>>> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE
>>>>> And working :)
>>>>>             
>>>> Hmm... It should, because you tested very similar patch already;-)
>>>> Sorry if I didn't make it clear.
>>>>
>>>>         
>>> Yes i know there was almost identical one.
>>> And i see this was without sync rcu :)
>>>     
>>
>> Yes, it looks like we can't free memory so simple because of such huge
>> latencies. 
>>  
>>>>> fib_triestats in attached file
>>>>>
>>>>> I think I can test it with PREEMPT enabled but first i must make 
>>>>> some  other tests of my apps that are on server.
>>>>>             
>>>> It could probably matter only if you're using some broken out-of-tree
>>>> patches. Otherwise the kernel is expected to work OK.
>>>>
>>>>         
>>> Im a little confused about using of PREEMPT kernel because of past
>>> there was many oopses / lockups :) but yes that was a little long 
>>> time ago.
>>> I will try to make this test today.
>>>
>>>    
>>>> Btw., it would be also interesting to check if there is any difference
>>>> wrt. these route cache problems while PREEMPT is enabled.
>>>>       
>>
>> And you're very right! The place we're fixing is the best example. On
>> the other hand, I hope there is not many such places yet. But if we
>> test/fix it there will be one less...
>>
>> Jarek P.
>>
>>
>>   
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
cat /proc/net/fib_triestat
Basic info: size of leaf: 20 bytes, size of tnode: 36 bytes.
Main:
        Aver depth:     2.44
        Max depth:      6
        Leaves:         277814
        Prefixes:       291306
        Internal nodes: 66420
          1: 32737  2: 14850  3: 10332  4: 4871  5: 2313  6: 942  7: 371  8: 3  17: 1
        Pointers: 599098
Null ptrs: 254865
Total size: 18067  kB

Counters:
---------
gets = 2003686
backtracks = 78789
semantic match passed = 1977687
semantic match miss = 112
null node hit= 1470619
skipped node resize = 0

Local:
        Aver depth:     3.75
        Max depth:      5
        Leaves:         12
        Prefixes:       13
        Internal nodes: 10
          1: 9  2: 1
        Pointers: 22
Null ptrs: 1
Total size: 2  kB

Counters:
---------
gets = 2008497
backtracks = 1417179
semantic match passed = 4823
semantic match miss = 0
null node hit= 197044
skipped node resize = 0
diff mbox

Patch

diff -Nurp a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
--- a/net/ipv4/fib_trie.c	2009-07-01 06:17:08.000000000 +0000
+++ b/net/ipv4/fib_trie.c	2009-07-01 10:43:44.000000000 +0000
@@ -123,6 +123,7 @@  struct tnode {
 	union {
 		struct rcu_head rcu;
 		struct work_struct work;
+		struct tnode *tnode_free;
 	};
 	struct node *child[0];
 };
@@ -161,6 +162,8 @@  static void tnode_put_child_reorg(struct
 static struct node *resize(struct trie *t, struct tnode *tn);
 static struct tnode *inflate(struct trie *t, struct tnode *tn);
 static struct tnode *halve(struct trie *t, struct tnode *tn);
+/* tnodes to free after resize(); protected by RTNL */
+static struct tnode *tnode_free_head;
 
 static struct kmem_cache *fn_alias_kmem __read_mostly;
 static struct kmem_cache *trie_leaf_kmem __read_mostly;
@@ -313,8 +316,8 @@  static inline void check_tnode(const str
 
 static const int halve_threshold = 25;
 static const int inflate_threshold = 50;
-static const int halve_threshold_root = 8;
-static const int inflate_threshold_root = 15;
+static const int halve_threshold_root = 15;
+static const int inflate_threshold_root = 25;
 
 
 static void __alias_free_mem(struct rcu_head *head)
@@ -385,6 +388,24 @@  static inline void tnode_free(struct tno
 		call_rcu(&tn->rcu, __tnode_free_rcu);
 }
 
+static void tnode_free_safe(struct tnode *tn)
+{
+	BUG_ON(IS_LEAF(tn));
+	tn->tnode_free = tnode_free_head;
+	tnode_free_head = tn;
+}
+
+static void tnode_free_flush(void)
+{
+	struct tnode *tn;
+
+	while ((tn = tnode_free_head)) {
+		tnode_free_head = tn->tnode_free;
+		tn->tnode_free = NULL;
+		tnode_free(tn);
+	}
+}
+
 static struct leaf *leaf_new(void)
 {
 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
@@ -495,7 +516,7 @@  static struct node *resize(struct trie *
 
 	/* No children */
 	if (tn->empty_children == tnode_child_length(tn)) {
-		tnode_free(tn);
+		tnode_free_safe(tn);
 		return NULL;
 	}
 	/* One child */
@@ -509,7 +530,7 @@  static struct node *resize(struct trie *
 
 			/* compress one level */
 			node_set_parent(n, NULL);
-			tnode_free(tn);
+			tnode_free_safe(tn);
 			return n;
 		}
 	/*
@@ -670,7 +691,7 @@  static struct node *resize(struct trie *
 			/* compress one level */
 
 			node_set_parent(n, NULL);
-			tnode_free(tn);
+			tnode_free_safe(tn);
 			return n;
 		}
 
@@ -756,7 +777,7 @@  static struct tnode *inflate(struct trie
 			put_child(t, tn, 2*i, inode->child[0]);
 			put_child(t, tn, 2*i+1, inode->child[1]);
 
-			tnode_free(inode);
+			tnode_free_safe(inode);
 			continue;
 		}
 
@@ -801,9 +822,9 @@  static struct tnode *inflate(struct trie
 		put_child(t, tn, 2*i, resize(t, left));
 		put_child(t, tn, 2*i+1, resize(t, right));
 
-		tnode_free(inode);
+		tnode_free_safe(inode);
 	}
-	tnode_free(oldtnode);
+	tnode_free_safe(oldtnode);
 	return tn;
 nomem:
 	{
@@ -885,7 +906,7 @@  static struct tnode *halve(struct trie *
 		put_child(t, newBinNode, 1, right);
 		put_child(t, tn, i/2, resize(t, newBinNode));
 	}
-	tnode_free(oldtnode);
+	tnode_free_safe(oldtnode);
 	return tn;
 nomem:
 	{
@@ -983,12 +1004,14 @@  fib_find_node(struct trie *t, u32 key)
 	return NULL;
 }
 
-static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
+static void trie_rebalance(struct trie *t, struct tnode *tn)
 {
 	int wasfull;
-	t_key cindex, key = tn->key;
+	t_key cindex, key;
 	struct tnode *tp;
 
+	key = tn->key;
+
 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
@@ -999,6 +1022,10 @@  static struct node *trie_rebalance(struc
 
 		tp = node_parent((struct node *) tn);
 		if (!tp)
+			rcu_assign_pointer(t->trie, (struct node *)tn);
+
+		tnode_free_flush();
+		if (!tp)
 			break;
 		tn = tp;
 	}
@@ -1007,7 +1034,10 @@  static struct node *trie_rebalance(struc
 	if (IS_TNODE(tn))
 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
 
-	return (struct node *)tn;
+	rcu_assign_pointer(t->trie, (struct node *)tn);
+	tnode_free_flush();
+
+	return;
 }
 
 /* only used from updater-side */
@@ -1155,7 +1185,7 @@  static struct list_head *fib_insert_node
 
 	/* Rebalance the trie */
 
-	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+	trie_rebalance(t, tp);
 done:
 	return fa_head;
 }
@@ -1575,7 +1605,7 @@  static void trie_leaf_remove(struct trie
 	if (tp) {
 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
 		put_child(t, (struct tnode *)tp, cindex, NULL);
-		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+		trie_rebalance(t, tp);
 	} else
 		rcu_assign_pointer(t->trie, NULL);