Message ID | 20090701110407.GC12715@ff.dom.local |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
Jarek Poplawski pisze: > On Wed, Jul 01, 2009 at 10:13:33AM +0000, Jarek Poplawski wrote: > >> On Wed, Jul 01, 2009 at 11:43:04AM +0200, Paweł Staszewski wrote: >> ... >> >>> Yes on on previous patches there was / no warnings / no oopses or lockups >>> >>> But now i apply this patch and i make more testing. >>> First boot with start of bgpd and - traffic is not forwarded >>> So i start to search and make only some routes (static without bgpd) >>> thru this host >>> And all is working for this host when i make all by static routes. >>> >>> So i change a little my bgp configuration and make default route to only >>> one of my iBGP peers and start bgpd process >>> All is working and what is weird is number of routes in kernel table. >>> Kernel is learning routes from bgpd but very slowly - really very slowly. >>> >> Pawel, this is really very helpful! So, this is (probably) only about >> timing, not wrong memory freeing. On the other hand this test was only >> for inserts. Btw., if you didn't start the second test, you can skip >> it. I have to rethink this. >> > > So, after your findings I'm about to recommend sending to -stable > 3 patches from net-2.6, with additional lowering of threshold_root > settings, but it would be nice if you could give it a try with > CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break > your other apps!) It is expected to work this time...;-) Maybe a > bit slower. > > Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE And working :) fib_triestats in attached file I think I can test it with PREEMPT enabled but first i must make some other tests of my apps that are on server. Regards Paweł Staszewski > Thanks, > Jarek P. > --------> (all-in-one preempt fixes to apply with vanilla 2.6.29.x) > > diff -Nurp a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c > --- a/net/ipv4/fib_trie.c 2009-07-01 06:17:08.000000000 +0000 > +++ b/net/ipv4/fib_trie.c 2009-07-01 10:43:44.000000000 +0000 > @@ -123,6 +123,7 @@ struct tnode { > union { > struct rcu_head rcu; > struct work_struct work; > + struct tnode *tnode_free; > }; > struct node *child[0]; > }; > @@ -161,6 +162,8 @@ static void tnode_put_child_reorg(struct > static struct node *resize(struct trie *t, struct tnode *tn); > static struct tnode *inflate(struct trie *t, struct tnode *tn); > static struct tnode *halve(struct trie *t, struct tnode *tn); > +/* tnodes to free after resize(); protected by RTNL */ > +static struct tnode *tnode_free_head; > > static struct kmem_cache *fn_alias_kmem __read_mostly; > static struct kmem_cache *trie_leaf_kmem __read_mostly; > @@ -313,8 +316,8 @@ static inline void check_tnode(const str > > static const int halve_threshold = 25; > static const int inflate_threshold = 50; > -static const int halve_threshold_root = 8; > -static const int inflate_threshold_root = 15; > +static const int halve_threshold_root = 15; > +static const int inflate_threshold_root = 25; > > > static void __alias_free_mem(struct rcu_head *head) > @@ -385,6 +388,24 @@ static inline void tnode_free(struct tno > call_rcu(&tn->rcu, __tnode_free_rcu); > } > > +static void tnode_free_safe(struct tnode *tn) > +{ > + BUG_ON(IS_LEAF(tn)); > + tn->tnode_free = tnode_free_head; > + tnode_free_head = tn; > +} > + > +static void tnode_free_flush(void) > +{ > + struct tnode *tn; > + > + while ((tn = tnode_free_head)) { > + tnode_free_head = tn->tnode_free; > + tn->tnode_free = NULL; > + tnode_free(tn); > + } > +} > + > static struct leaf *leaf_new(void) > { > struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); > @@ -495,7 +516,7 @@ static struct node *resize(struct trie * > > /* No children */ > if (tn->empty_children == tnode_child_length(tn)) { > - tnode_free(tn); > + tnode_free_safe(tn); > return NULL; > } > /* One child */ > @@ -509,7 +530,7 @@ static struct node *resize(struct trie * > > /* compress one level */ > node_set_parent(n, NULL); > - tnode_free(tn); > + tnode_free_safe(tn); > return n; > } > /* > @@ -670,7 +691,7 @@ static struct node *resize(struct trie * > /* compress one level */ > > node_set_parent(n, NULL); > - tnode_free(tn); > + tnode_free_safe(tn); > return n; > } > > @@ -756,7 +777,7 @@ static struct tnode *inflate(struct trie > put_child(t, tn, 2*i, inode->child[0]); > put_child(t, tn, 2*i+1, inode->child[1]); > > - tnode_free(inode); > + tnode_free_safe(inode); > continue; > } > > @@ -801,9 +822,9 @@ static struct tnode *inflate(struct trie > put_child(t, tn, 2*i, resize(t, left)); > put_child(t, tn, 2*i+1, resize(t, right)); > > - tnode_free(inode); > + tnode_free_safe(inode); > } > - tnode_free(oldtnode); > + tnode_free_safe(oldtnode); > return tn; > nomem: > { > @@ -885,7 +906,7 @@ static struct tnode *halve(struct trie * > put_child(t, newBinNode, 1, right); > put_child(t, tn, i/2, resize(t, newBinNode)); > } > - tnode_free(oldtnode); > + tnode_free_safe(oldtnode); > return tn; > nomem: > { > @@ -983,12 +1004,14 @@ fib_find_node(struct trie *t, u32 key) > return NULL; > } > > -static struct node *trie_rebalance(struct trie *t, struct tnode *tn) > +static void trie_rebalance(struct trie *t, struct tnode *tn) > { > int wasfull; > - t_key cindex, key = tn->key; > + t_key cindex, key; > struct tnode *tp; > > + key = tn->key; > + > while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { > cindex = tkey_extract_bits(key, tp->pos, tp->bits); > wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); > @@ -999,6 +1022,10 @@ static struct node *trie_rebalance(struc > > tp = node_parent((struct node *) tn); > if (!tp) > + rcu_assign_pointer(t->trie, (struct node *)tn); > + > + tnode_free_flush(); > + if (!tp) > break; > tn = tp; > } > @@ -1007,7 +1034,10 @@ static struct node *trie_rebalance(struc > if (IS_TNODE(tn)) > tn = (struct tnode *)resize(t, (struct tnode *)tn); > > - return (struct node *)tn; > + rcu_assign_pointer(t->trie, (struct node *)tn); > + tnode_free_flush(); > + > + return; > } > > /* only used from updater-side */ > @@ -1155,7 +1185,7 @@ static struct list_head *fib_insert_node > > /* Rebalance the trie */ > > - rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); > + trie_rebalance(t, tp); > done: > return fa_head; > } > @@ -1575,7 +1605,7 @@ static void trie_leaf_remove(struct trie > if (tp) { > t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); > put_child(t, (struct tnode *)tp, cindex, NULL); > - rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); > + trie_rebalance(t, tp); > } else > rcu_assign_pointer(t->trie, NULL); > > > > cat /proc/net/fib_triestat Basic info: size of leaf: 20 bytes, size of tnode: 36 bytes. Main: Aver depth: 2.44 Max depth: 6 Leaves: 277395 Prefixes: 290874 Internal nodes: 66711 1: 32915 2: 14668 3: 10752 4: 4913 5: 2197 6: 895 7: 367 8: 3 17: 1 Pointers: 595526 Null ptrs: 251421 Total size: 18044 kB Counters: --------- gets = 2705388 backtracks = 137797 semantic match passed = 2658993 semantic match miss = 87 null node hit= 1980950 skipped node resize = 0 Local: Aver depth: 3.75 Max depth: 5 Leaves: 12 Prefixes: 13 Internal nodes: 10 1: 9 2: 1 Pointers: 22 Null ptrs: 1 Total size: 2 kB Counters: --------- gets = 2709741 backtracks = 1584810 semantic match passed = 4417 semantic match miss = 0 null node hit= 192688 skipped node resize = 0
On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote: > Jarek Poplawski pisze: ... >> So, after your findings I'm about to recommend sending to -stable >> 3 patches from net-2.6, with additional lowering of threshold_root >> settings, but it would be nice if you could give it a try with >> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break >> your other apps!) It is expected to work this time...;-) Maybe a >> bit slower. >> >> > Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE > And working :) Hmm... It should, because you tested very similar patch already;-) Sorry if I didn't make it clear. > > fib_triestats in attached file > > I think I can test it with PREEMPT enabled but first i must make some > other tests of my apps that are on server. It could probably matter only if you're using some broken out-of-tree patches. Otherwise the kernel is expected to work OK. Btw., it would be also interesting to check if there is any difference wrt. these route cache problems while PREEMPT is enabled. Thanks, Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Jarek Poplawski pisze: > On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote: > >> Jarek Poplawski pisze: >> > ... > >>> So, after your findings I'm about to recommend sending to -stable >>> 3 patches from net-2.6, with additional lowering of threshold_root >>> settings, but it would be nice if you could give it a try with >>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break >>> your other apps!) It is expected to work this time...;-) Maybe a >>> bit slower. >>> >>> >>> >> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE >> And working :) >> > > Hmm... It should, because you tested very similar patch already;-) > Sorry if I didn't make it clear. > > Yes i know there was almost identical one. And i see this was without sync rcu :) >> fib_triestats in attached file >> >> I think I can test it with PREEMPT enabled but first i must make some >> other tests of my apps that are on server. >> > > It could probably matter only if you're using some broken out-of-tree > patches. Otherwise the kernel is expected to work OK. > > Im a little confused about using of PREEMPT kernel because of past there was many oopses / lockups :) but yes that was a little long time ago. I will try to make this test today. > Btw., it would be also interesting to check if there is any difference > wrt. these route cache problems while PREEMPT is enabled. > > Thanks, > Jarek P. > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jul 02, 2009 at 07:43:25AM +0200, Paweł Staszewski wrote: > Jarek Poplawski pisze: >> On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote: >> >>> Jarek Poplawski pisze: >>> >> ... >> >>>> So, after your findings I'm about to recommend sending to -stable >>>> 3 patches from net-2.6, with additional lowering of threshold_root >>>> settings, but it would be nice if you could give it a try with >>>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break >>>> your other apps!) It is expected to work this time...;-) Maybe a >>>> bit slower. >>>> >>>> >>> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE >>> And working :) >>> >> >> Hmm... It should, because you tested very similar patch already;-) >> Sorry if I didn't make it clear. >> >> > Yes i know there was almost identical one. > And i see this was without sync rcu :) Yes, it looks like we can't free memory so simple because of such huge latencies. > >>> fib_triestats in attached file >>> >>> I think I can test it with PREEMPT enabled but first i must make some >>> other tests of my apps that are on server. >>> >> >> It could probably matter only if you're using some broken out-of-tree >> patches. Otherwise the kernel is expected to work OK. >> >> > Im a little confused about using of PREEMPT kernel because of past > there was many oopses / lockups :) but yes that was a little long time ago. > I will try to make this test today. > >> Btw., it would be also interesting to check if there is any difference >> wrt. these route cache problems while PREEMPT is enabled. And you're very right! The place we're fixing is the best example. On the other hand, I hope there is not many such places yet. But if we test/fix it there will be one less... Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Jarek Poplawski writes: > Yes, it looks like we can't free memory so simple because of such huge > latencies. Controlling RCU seems crucial. Insertion of the full BGP table increased from 2 seconds to > 20 min with one synchronize_rcu patches. And fib_trie "worst case" wrt memory is the root node. So maybe we should monitor changes in root node and use this to control synchronize_rcu. Didn't Paul suggest something like this? And with don't find any decent solution we have to add an option for a fixed and pre-allocated root-nod typically for BGP-routers. Cheers --ro -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jul 02, 2009 at 05:31:58PM +0200, Robert Olsson wrote: > > Jarek Poplawski writes: > > > Yes, it looks like we can't free memory so simple because of such huge > > latencies. > > Controlling RCU seems crucial. Insertion of the full BGP table increased > from 2 seconds to > 20 min with one synchronize_rcu patches. I wish I knew this a few days before. I could imagine a slow down, but it looked like it was stuck. Since these last changes weren't tested on SMP + PREEMPT I thought there is still something broken. (I was mainly interested in this synchronize_rcu at the moment as a preemption test.) > And fib_trie "worst case" wrt memory is the root node. So maybe we should > monitor changes in root node and use this to control synchronize_rcu. > > Didn't Paul suggest something like this? Sure, and it needs testing, but we should send some safe preemption fix for -stable first, don't we? > And with don't find any decent solution we have to add an option for > a fixed and pre-allocated root-nod typically for BGP-routers. Probably you're right; I'd prefer to see the test results showing a difference vs. simply less aggressive root thresholds. But of course, even if not convinced, I'll respect your choice as the author and maintainer, so feel free to NAK my proposals - I won't get it personally.;-) Cheers, Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Jarek Poplawski writes: > > Controlling RCU seems crucial. Insertion of the full BGP table increased > > from 2 seconds to > 20 min with one synchronize_rcu patches. > > I wish I knew this a few days before. I could imagine a slow down, > but it looked like it was stuck. Since these last changes weren't > tested on SMP + PREEMPT I thought there is still something broken. > (I was mainly interested in this synchronize_rcu at the moment as > a preemption test.) Honestly this huge slowdown was surprise for me too. I think I sent you a script so you could insert the full table yourself. > > And fib_trie "worst case" wrt memory is the root node. So maybe we should > > monitor changes in root node and use this to control synchronize_rcu. > > > > Didn't Paul suggest something like this? > > Sure, and it needs testing, but we should send some safe preemption > fix for -stable first, don't we? Yes my hope was that we could combine them... personally I'll need to understand who we can preeemted better in the different configs and most of that this can be handled by "standard" RCU. > > And with don't find any decent solution we have to add an option for > > a fixed and pre-allocated root-nod typically for BGP-routers. > > Probably you're right; I'd prefer to see the test results showing > a difference vs. simply less aggressive root thresholds. But of > course, even if not convinced, I'll respect your choice as the author > and maintainer, so feel free to NAK my proposals - I won't get it > personally.;-) Thresholds we can change no problem... but very soon I'll people will start routing without the route cache this at least in close to Internet core ,we will need all fib_look performance we can get. fib_trie was designed for classical RCU and no preempt you see the names i file... so this new and very challenging work to all of us. First week of vacation and have to fix the roof of the house... it's hot and dirty. Cheers. --ro -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jul 02, 2009 at 11:32:26PM +0200, Robert Olsson wrote: > > Jarek Poplawski writes: > > > > Controlling RCU seems crucial. Insertion of the full BGP table increased > > > from 2 seconds to > 20 min with one synchronize_rcu patches. > > > > I wish I knew this a few days before. I could imagine a slow down, > > but it looked like it was stuck. Since these last changes weren't > > tested on SMP + PREEMPT I thought there is still something broken. > > (I was mainly interested in this synchronize_rcu at the moment as > > a preemption test.) > > > Honestly this huge slowdown was surprise for me too. I think I sent > you a script so you could insert the full table yourself. I can't remember this script, but I guess my hardware should be suitable for reading it.;-) > > > > And fib_trie "worst case" wrt memory is the root node. So maybe we should > > > monitor changes in root node and use this to control synchronize_rcu. > > > > > > Didn't Paul suggest something like this? > > > > Sure, and it needs testing, but we should send some safe preemption > > fix for -stable first, don't we? > > Yes my hope was that we could combine them... personally I'll need > to understand who we can preeemted better in the different configs > and most of that this can be handled by "standard" RCU. > > > > And with don't find any decent solution we have to add an option for > > > a fixed and pre-allocated root-nod typically for BGP-routers. > > > > Probably you're right; I'd prefer to see the test results showing > > a difference vs. simply less aggressive root thresholds. But of > > course, even if not convinced, I'll respect your choice as the author > > and maintainer, so feel free to NAK my proposals - I won't get it > > personally.;-) > > Thresholds we can change no problem... but very soon I'll people > will start routing without the route cache this at least in close > to Internet core ,we will need all fib_look performance we can get. I mean changing thresholds as a temporary solution, until we can control memory freeing; and it seems to me, even excluding the root node, there could be a lot of temporary allocations during all those cycles repeated 10 times. > > fib_trie was designed for classical RCU and no preempt you see the > names i file... so this new and very challenging work to all of us. Then it should depend on CONFIG_PREEMPT_NONE, I guess. > > First week of vacation and have to fix the roof of the house... > it's hot and dirty. Have a nice time, Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Jarek Poplawski pisze: > On Thu, Jul 02, 2009 at 07:43:25AM +0200, Paweł Staszewski wrote: > >> Jarek Poplawski pisze: >> >>> On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote: >>> >>> >>>> Jarek Poplawski pisze: >>>> >>>> >>> ... >>> >>> >>>>> So, after your findings I'm about to recommend sending to -stable >>>>> 3 patches from net-2.6, with additional lowering of threshold_root >>>>> settings, but it would be nice if you could give it a try with >>>>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break >>>>> your other apps!) It is expected to work this time...;-) Maybe a >>>>> bit slower. >>>>> >>>>> Ok kernel configured with CONFIG_PREEMPT and all this day work without any problems (with Jarek last patch). So in attached file trere is fib_tirestats I dont see any big change of (cpu load or faster/slower routing/propagating routes from bgpd or something else) - in avg there is from 2% to 3% more of CPU load i dont know why but it is - i change from "preempt" to "no preempt" 3 times and check this my "mpstat -P ALL 1 30" always avg cpu load was from 2 to 3% more compared to "no preempt" Regards Paweł Staszewski >>>>> >>>>> >>>> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE >>>> And working :) >>>> >>>> >>> Hmm... It should, because you tested very similar patch already;-) >>> Sorry if I didn't make it clear. >>> >>> >>> >> Yes i know there was almost identical one. >> And i see this was without sync rcu :) >> > > Yes, it looks like we can't free memory so simple because of such huge > latencies. > > >>>> fib_triestats in attached file >>>> >>>> I think I can test it with PREEMPT enabled but first i must make some >>>> other tests of my apps that are on server. >>>> >>>> >>> It could probably matter only if you're using some broken out-of-tree >>> patches. Otherwise the kernel is expected to work OK. >>> >>> >>> >> Im a little confused about using of PREEMPT kernel because of past >> there was many oopses / lockups :) but yes that was a little long time ago. >> I will try to make this test today. >> >> >>> Btw., it would be also interesting to check if there is any difference >>> wrt. these route cache problems while PREEMPT is enabled. >>> > > And you're very right! The place we're fixing is the best example. On > the other hand, I hope there is not many such places yet. But if we > test/fix it there will be one less... > > Jarek P. > > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Oh I forgot - please Jarek give me patch with sync rcu and i will make test on preempt kernel Thanks Paweł Staszewski Paweł Staszewski pisze: > Jarek Poplawski pisze: >> On Thu, Jul 02, 2009 at 07:43:25AM +0200, Paweł Staszewski wrote: >> >>> Jarek Poplawski pisze: >>> >>>> On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote: >>>> >>>>> Jarek Poplawski pisze: >>>>> >>>> ... >>>> >>>>>> So, after your findings I'm about to recommend sending to -stable >>>>>> 3 patches from net-2.6, with additional lowering of threshold_root >>>>>> settings, but it would be nice if you could give it a try with >>>>>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break >>>>>> your other apps!) It is expected to work this time...;-) Maybe a >>>>>> bit slower. >>>>>> >>>>>> > Ok kernel configured with CONFIG_PREEMPT > and all this day work without any problems (with Jarek last patch). > > > So in attached file trere is fib_tirestats > I dont see any big change of (cpu load or faster/slower > routing/propagating routes from bgpd or something else) - in avg there > is from 2% to 3% more of CPU load i dont know why but it is - i change > from "preempt" to "no preempt" 3 times and check this my "mpstat -P > ALL 1 30" > always avg cpu load was from 2 to 3% more compared to "no preempt" > > Regards > Paweł Staszewski > > >>>>>> >>>>> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE >>>>> And working :) >>>>> >>>> Hmm... It should, because you tested very similar patch already;-) >>>> Sorry if I didn't make it clear. >>>> >>>> >>> Yes i know there was almost identical one. >>> And i see this was without sync rcu :) >>> >> >> Yes, it looks like we can't free memory so simple because of such huge >> latencies. >> >>>>> fib_triestats in attached file >>>>> >>>>> I think I can test it with PREEMPT enabled but first i must make >>>>> some other tests of my apps that are on server. >>>>> >>>> It could probably matter only if you're using some broken out-of-tree >>>> patches. Otherwise the kernel is expected to work OK. >>>> >>>> >>> Im a little confused about using of PREEMPT kernel because of past >>> there was many oopses / lockups :) but yes that was a little long >>> time ago. >>> I will try to make this test today. >>> >>> >>>> Btw., it would be also interesting to check if there is any difference >>>> wrt. these route cache problems while PREEMPT is enabled. >>>> >> >> And you're very right! The place we're fixing is the best example. On >> the other hand, I hope there is not many such places yet. But if we >> test/fix it there will be one less... >> >> Jarek P. >> >> >> > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Sorry again no attachement. Paweł Staszewski pisze: > Jarek Poplawski pisze: >> On Thu, Jul 02, 2009 at 07:43:25AM +0200, Paweł Staszewski wrote: >> >>> Jarek Poplawski pisze: >>> >>>> On Thu, Jul 02, 2009 at 12:17:19AM +0200, Paweł Staszewski wrote: >>>> >>>>> Jarek Poplawski pisze: >>>>> >>>> ... >>>> >>>>>> So, after your findings I'm about to recommend sending to -stable >>>>>> 3 patches from net-2.6, with additional lowering of threshold_root >>>>>> settings, but it would be nice if you could give it a try with >>>>>> CONFIG_PREEMPT instead of CONFIG_PREEMPT_NONE (if it doesn't break >>>>>> your other apps!) It is expected to work this time...;-) Maybe a >>>>>> bit slower. >>>>>> >>>>>> > Ok kernel configured with CONFIG_PREEMPT > and all this day work without any problems (with Jarek last patch). > > > So in attached file trere is fib_tirestats > I dont see any big change of (cpu load or faster/slower > routing/propagating routes from bgpd or something else) - in avg there > is from 2% to 3% more of CPU load i dont know why but it is - i change > from "preempt" to "no preempt" 3 times and check this my "mpstat -P > ALL 1 30" > always avg cpu load was from 2 to 3% more compared to "no preempt" > > Regards > Paweł Staszewski > > >>>>>> >>>>> Patch applied to 2.6.29.5 with CONFIG_PREEMPT_NONE >>>>> And working :) >>>>> >>>> Hmm... It should, because you tested very similar patch already;-) >>>> Sorry if I didn't make it clear. >>>> >>>> >>> Yes i know there was almost identical one. >>> And i see this was without sync rcu :) >>> >> >> Yes, it looks like we can't free memory so simple because of such huge >> latencies. >> >>>>> fib_triestats in attached file >>>>> >>>>> I think I can test it with PREEMPT enabled but first i must make >>>>> some other tests of my apps that are on server. >>>>> >>>> It could probably matter only if you're using some broken out-of-tree >>>> patches. Otherwise the kernel is expected to work OK. >>>> >>>> >>> Im a little confused about using of PREEMPT kernel because of past >>> there was many oopses / lockups :) but yes that was a little long >>> time ago. >>> I will try to make this test today. >>> >>> >>>> Btw., it would be also interesting to check if there is any difference >>>> wrt. these route cache problems while PREEMPT is enabled. >>>> >> >> And you're very right! The place we're fixing is the best example. On >> the other hand, I hope there is not many such places yet. But if we >> test/fix it there will be one less... >> >> Jarek P. >> >> >> > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > cat /proc/net/fib_triestat Basic info: size of leaf: 20 bytes, size of tnode: 36 bytes. Main: Aver depth: 2.44 Max depth: 6 Leaves: 277814 Prefixes: 291306 Internal nodes: 66420 1: 32737 2: 14850 3: 10332 4: 4871 5: 2313 6: 942 7: 371 8: 3 17: 1 Pointers: 599098 Null ptrs: 254865 Total size: 18067 kB Counters: --------- gets = 2003686 backtracks = 78789 semantic match passed = 1977687 semantic match miss = 112 null node hit= 1470619 skipped node resize = 0 Local: Aver depth: 3.75 Max depth: 5 Leaves: 12 Prefixes: 13 Internal nodes: 10 1: 9 2: 1 Pointers: 22 Null ptrs: 1 Total size: 2 kB Counters: --------- gets = 2008497 backtracks = 1417179 semantic match passed = 4823 semantic match miss = 0 null node hit= 197044 skipped node resize = 0
diff -Nurp a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c --- a/net/ipv4/fib_trie.c 2009-07-01 06:17:08.000000000 +0000 +++ b/net/ipv4/fib_trie.c 2009-07-01 10:43:44.000000000 +0000 @@ -123,6 +123,7 @@ struct tnode { union { struct rcu_head rcu; struct work_struct work; + struct tnode *tnode_free; }; struct node *child[0]; }; @@ -161,6 +162,8 @@ static void tnode_put_child_reorg(struct static struct node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); +/* tnodes to free after resize(); protected by RTNL */ +static struct tnode *tnode_free_head; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; @@ -313,8 +316,8 @@ static inline void check_tnode(const str static const int halve_threshold = 25; static const int inflate_threshold = 50; -static const int halve_threshold_root = 8; -static const int inflate_threshold_root = 15; +static const int halve_threshold_root = 15; +static const int inflate_threshold_root = 25; static void __alias_free_mem(struct rcu_head *head) @@ -385,6 +388,24 @@ static inline void tnode_free(struct tno call_rcu(&tn->rcu, __tnode_free_rcu); } +static void tnode_free_safe(struct tnode *tn) +{ + BUG_ON(IS_LEAF(tn)); + tn->tnode_free = tnode_free_head; + tnode_free_head = tn; +} + +static void tnode_free_flush(void) +{ + struct tnode *tn; + + while ((tn = tnode_free_head)) { + tnode_free_head = tn->tnode_free; + tn->tnode_free = NULL; + tnode_free(tn); + } +} + static struct leaf *leaf_new(void) { struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); @@ -495,7 +516,7 @@ static struct node *resize(struct trie * /* No children */ if (tn->empty_children == tnode_child_length(tn)) { - tnode_free(tn); + tnode_free_safe(tn); return NULL; } /* One child */ @@ -509,7 +530,7 @@ static struct node *resize(struct trie * /* compress one level */ node_set_parent(n, NULL); - tnode_free(tn); + tnode_free_safe(tn); return n; } /* @@ -670,7 +691,7 @@ static struct node *resize(struct trie * /* compress one level */ node_set_parent(n, NULL); - tnode_free(tn); + tnode_free_safe(tn); return n; } @@ -756,7 +777,7 @@ static struct tnode *inflate(struct trie put_child(t, tn, 2*i, inode->child[0]); put_child(t, tn, 2*i+1, inode->child[1]); - tnode_free(inode); + tnode_free_safe(inode); continue; } @@ -801,9 +822,9 @@ static struct tnode *inflate(struct trie put_child(t, tn, 2*i, resize(t, left)); put_child(t, tn, 2*i+1, resize(t, right)); - tnode_free(inode); + tnode_free_safe(inode); } - tnode_free(oldtnode); + tnode_free_safe(oldtnode); return tn; nomem: { @@ -885,7 +906,7 @@ static struct tnode *halve(struct trie * put_child(t, newBinNode, 1, right); put_child(t, tn, i/2, resize(t, newBinNode)); } - tnode_free(oldtnode); + tnode_free_safe(oldtnode); return tn; nomem: { @@ -983,12 +1004,14 @@ fib_find_node(struct trie *t, u32 key) return NULL; } -static struct node *trie_rebalance(struct trie *t, struct tnode *tn) +static void trie_rebalance(struct trie *t, struct tnode *tn) { int wasfull; - t_key cindex, key = tn->key; + t_key cindex, key; struct tnode *tp; + key = tn->key; + while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); @@ -999,6 +1022,10 @@ static struct node *trie_rebalance(struc tp = node_parent((struct node *) tn); if (!tp) + rcu_assign_pointer(t->trie, (struct node *)tn); + + tnode_free_flush(); + if (!tp) break; tn = tp; } @@ -1007,7 +1034,10 @@ static struct node *trie_rebalance(struc if (IS_TNODE(tn)) tn = (struct tnode *)resize(t, (struct tnode *)tn); - return (struct node *)tn; + rcu_assign_pointer(t->trie, (struct node *)tn); + tnode_free_flush(); + + return; } /* only used from updater-side */ @@ -1155,7 +1185,7 @@ static struct list_head *fib_insert_node /* Rebalance the trie */ - rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); + trie_rebalance(t, tp); done: return fa_head; } @@ -1575,7 +1605,7 @@ static void trie_leaf_remove(struct trie if (tp) { t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, NULL); - rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); + trie_rebalance(t, tp); } else rcu_assign_pointer(t->trie, NULL);