| Field | Value |
|---|---|
| Message ID | 20090708223459.GB3666@ami.dom.local |
| State | RFC, archived |
| Delegated to | David Miller |
| Headers | show |
Jarek Poplawski pisze: > Pawel Staszewski wrote, On 06/30/2009 10:36 AM: > ... > >>>>>> rt_worker_func() taking 13% of cpu0 is an alarm for me :) >>>>>> And 21% of cpu0 and 34% of cpu6 taken by oprofiled seems odd too... >>>>>> > > Pawel, here is a patch which changes this function (or what it calls) > back to 2.6.28 version; I'm not sure it's OK, so try it very > cautiously... > > Cheers, > Jarek P. > --- (for debugging only; apply to 2.6.29.5 or .6) > > I added this patch 30min ago - all is working but problem still exist. There is only one change - without this patch cpu load was rising from 40 to 50% With this patch there is 15 to 25% cpu load. > diff -Nurp a/net/ipv4/route.c b/net/ipv4/route.c > --- a/net/ipv4/route.c 2009-07-08 23:42:15.000000000 +0200 > +++ b/net/ipv4/route.c 2009-07-08 22:47:52.000000000 +0200 > @@ -769,24 +769,11 @@ static void rt_do_flush(int process_cont > } > } > > -/* > - * While freeing expired entries, we compute average chain length > - * and standard deviation, using fixed-point arithmetic. > - * This to have an estimation of rt_chain_length_max > - * rt_chain_length_max = max(elasticity, AVG + 4*SD) > - * We use 3 bits for frational part, and 29 (or 61) for magnitude. 
> - */ > - > -#define FRACT_BITS 3 > -#define ONE (1UL << FRACT_BITS) > - > static void rt_check_expire(void) > { > static unsigned int rover; > unsigned int i = rover, goal; > - struct rtable *rth, *aux, **rthp; > - unsigned long samples = 0; > - unsigned long sum = 0, sum2 = 0; > + struct rtable *rth, **rthp; > u64 mult; > > mult = ((u64)ip_rt_gc_interval) << rt_hash_log; > @@ -797,7 +784,6 @@ static void rt_check_expire(void) > goal = rt_hash_mask + 1; > for (; goal > 0; goal--) { > unsigned long tmo = ip_rt_gc_timeout; > - unsigned long length; > > i = (i + 1) & rt_hash_mask; > rthp = &rt_hash_table[i].chain; > @@ -805,14 +791,10 @@ static void rt_check_expire(void) > if (need_resched()) > cond_resched(); > > - samples++; > - > if (*rthp == NULL) > continue; > - length = 0; > spin_lock_bh(rt_hash_lock_addr(i)); > while ((rth = *rthp) != NULL) { > - prefetch(rth->u.dst.rt_next); > if (rt_is_expired(rth)) { > *rthp = rth->u.dst.rt_next; > rt_free(rth); > @@ -821,46 +803,23 @@ static void rt_check_expire(void) > if (rth->u.dst.expires) { > /* Entry is expired even if it is in use */ > if (time_before_eq(jiffies, rth->u.dst.expires)) { > -nofree: > tmo >>= 1; > rthp = &rth->u.dst.rt_next; > - /* > - * We only count entries on > - * a chain with equal hash inputs once > - * so that entries for different QOS > - * levels, and other non-hash input > - * attributes don't unfairly skew > - * the length computation > - */ > - for (aux = rt_hash_table[i].chain;;) { > - if (aux == rth) { > - length += ONE; > - break; > - } > - if (compare_hash_inputs(&aux->fl, &rth->fl)) > - break; > - aux = aux->u.dst.rt_next; > - } > continue; > } > - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) > - goto nofree; > + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { > + tmo >>= 1; > + rthp = &rth->u.dst.rt_next; > + continue; > + } > > /* Cleanup aged off entries. 
*/ > *rthp = rth->u.dst.rt_next; > rt_free(rth); > } > spin_unlock_bh(rt_hash_lock_addr(i)); > - sum += length; > - sum2 += length*length; > - } > - if (samples) { > - unsigned long avg = sum / samples; > - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); > - rt_chain_length_max = max_t(unsigned long, > - ip_rt_gc_elasticity, > - (avg + 4*sd) >> FRACT_BITS); > } > + rt_chain_length_max = ip_rt_gc_elasticity; > rover = i; > } > > > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Today i make other tests with change of /proc/sys/net/ipv4/rt_cache_rebuild_count and kernel 2.6.30.1 And when rt_cache_rebuild_count is set to "-1" i have always load on x86_64 machine approx 40-50% of each cpu where network card is binded by irq_aff when rt_cache_rebuild_count is set to more than "-1" i have 15 to 20 sec of 1 to 3% cpu and after 40-50% cpu I attach some oprofile output for rt_cache_rebuild_count = 4 - oprofile_rt_cache_rebuild_count_4.txt and rt_cache_rebuild_count = -1 - oprofile_rt_cache_rebuild_count_-1.txt Regards Pawel Staszewski Paweł Staszewski pisze: > Jarek Poplawski pisze: >> Pawel Staszewski wrote, On 06/30/2009 10:36 AM: >> ... >> >>>>>>> rt_worker_func() taking 13% of cpu0 is an alarm for me :) >>>>>>> And 21% of cpu0 and 34% of cpu6 taken by oprofiled seems odd too... >>>>>>> >> >> Pawel, here is a patch which changes this function (or what it calls) >> back to 2.6.28 version; I'm not sure it's OK, so try it very >> cautiously... >> >> Cheers, >> Jarek P. >> --- (for debugging only; apply to 2.6.29.5 or .6) >> >> > I added this patch 30min ago - all is working but problem still exist. > > There is only one change - without this patch cpu load was rising from > 40 to 50% > With this patch there is 15 to 25% cpu load. > > >> diff -Nurp a/net/ipv4/route.c b/net/ipv4/route.c >> --- a/net/ipv4/route.c 2009-07-08 23:42:15.000000000 +0200 >> +++ b/net/ipv4/route.c 2009-07-08 22:47:52.000000000 +0200 >> @@ -769,24 +769,11 @@ static void rt_do_flush(int process_cont >> } >> } >> >> -/* >> - * While freeing expired entries, we compute average chain length >> - * and standard deviation, using fixed-point arithmetic. >> - * This to have an estimation of rt_chain_length_max >> - * rt_chain_length_max = max(elasticity, AVG + 4*SD) >> - * We use 3 bits for frational part, and 29 (or 61) for magnitude. 
>> - */ >> - >> -#define FRACT_BITS 3 >> -#define ONE (1UL << FRACT_BITS) >> - >> static void rt_check_expire(void) >> { >> static unsigned int rover; >> unsigned int i = rover, goal; >> - struct rtable *rth, *aux, **rthp; >> - unsigned long samples = 0; >> - unsigned long sum = 0, sum2 = 0; >> + struct rtable *rth, **rthp; >> u64 mult; >> >> mult = ((u64)ip_rt_gc_interval) << rt_hash_log; >> @@ -797,7 +784,6 @@ static void rt_check_expire(void) >> goal = rt_hash_mask + 1; >> for (; goal > 0; goal--) { >> unsigned long tmo = ip_rt_gc_timeout; >> - unsigned long length; >> >> i = (i + 1) & rt_hash_mask; >> rthp = &rt_hash_table[i].chain; >> @@ -805,14 +791,10 @@ static void rt_check_expire(void) >> if (need_resched()) >> cond_resched(); >> >> - samples++; >> - >> if (*rthp == NULL) >> continue; >> - length = 0; >> spin_lock_bh(rt_hash_lock_addr(i)); >> while ((rth = *rthp) != NULL) { >> - prefetch(rth->u.dst.rt_next); >> if (rt_is_expired(rth)) { >> *rthp = rth->u.dst.rt_next; >> rt_free(rth); >> @@ -821,46 +803,23 @@ static void rt_check_expire(void) >> if (rth->u.dst.expires) { >> /* Entry is expired even if it is in use */ >> if (time_before_eq(jiffies, rth->u.dst.expires)) { >> -nofree: >> tmo >>= 1; >> rthp = &rth->u.dst.rt_next; >> - /* >> - * We only count entries on >> - * a chain with equal hash inputs once >> - * so that entries for different QOS >> - * levels, and other non-hash input >> - * attributes don't unfairly skew >> - * the length computation >> - */ >> - for (aux = rt_hash_table[i].chain;;) { >> - if (aux == rth) { >> - length += ONE; >> - break; >> - } >> - if (compare_hash_inputs(&aux->fl, &rth->fl)) >> - break; >> - aux = aux->u.dst.rt_next; >> - } >> continue; >> } >> - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) >> - goto nofree; >> + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { >> + tmo >>= 1; >> + rthp = &rth->u.dst.rt_next; >> + continue; >> + } >> >> /* Cleanup aged off entries. 
*/ >> *rthp = rth->u.dst.rt_next; >> rt_free(rth); >> } >> spin_unlock_bh(rt_hash_lock_addr(i)); >> - sum += length; >> - sum2 += length*length; >> - } >> - if (samples) { >> - unsigned long avg = sum / samples; >> - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); >> - rt_chain_length_max = max_t(unsigned long, >> - ip_rt_gc_elasticity, >> - (avg + 4*sd) >> FRACT_BITS); >> } >> + rt_chain_length_max = ip_rt_gc_elasticity; >> rover = i; >> } >> >> >> >> > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > CPU: Core 2, speed 2999.65 MHz (estimated) Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000 Samples on CPU 0 Samples on CPU 1 Samples on CPU 2 Samples on CPU 3 Samples on CPU 4 Samples on CPU 5 Samples on CPU 6 Samples on CPU 7 samples % samples % samples % samples % samples % samples % samples % samples % image name app name symbol name 2051 17.8488 0 0 0 0 0 0 0 0 0 0 0 0 0 0 oprofiled oprofiled (no symbols) 1845 16.0560 16362 4.7340 22049 4.2067 1949 16.5197 100 9.6618 43 10.2625 95 10.5205 49 2.3512 vmlinux vmlinux mwait_idle 1126 9.7990 0 0 0 0 0 0 0 0 0 0 0 0 0 0 vmlinux vmlinux rt_worker_func 751 6.5355 51 0.0148 53 0.0101 5 0.0424 3 0.2899 0 0 4 0.4430 16 0.7678 vmlinux vmlinux free_block 621 5.4042 0 0 4 7.6e-04 3766 31.9207 1 0.0966 0 0 0 0 686 32.9175 opreport opreport (no symbols) 568 4.9430 25 0.0072 38 0.0073 0 0 0 0 0 0 0 0 0 0 vmlinux vmlinux dst_destroy 303 2.6368 2 5.8e-04 6 0.0011 106 0.8985 0 0 0 0 5 0.5537 26 1.2476 vmlinux vmlinux copy_user_generic_string 296 2.5759 17175 4.9692 23050 4.3977 196 1.6613 59 5.7005 28 6.6826 71 7.8627 63 3.0230 vmlinux vmlinux _raw_spin_lock 267 2.3236 21 0.0061 3044 0.5808 1328 11.2561 3 0.2899 1 0.2387 26 2.8793 265 12.7159 libc-2.8.so libc-2.8.so (no symbols) 261 2.2713 0 0 1 
1.9e-04 36 0.3051 0 0 0 0 1 0.1107 6 0.2879 vmlinux vmlinux search_by_key 247 2.1495 9 0.0026 21 0.0040 3 0.0254 1 0.0966 0 0 0 0 0 0 vmlinux vmlinux __call_rcu 219 1.9058 0 0 0 0 1177 9.9763 0 0 0 0 0 0 247 11.8522 libstdc++.so.6 .0.10 libstdc++.so.6.0.10 (no symbols) 150 1.3054 1606 0.4647 2104 0.4014 5 0.0424 2 0.1932 3 0.7160 2 0.2215 1 0.0480 vmlinux vmlinux get_next_timer_interrupt 138 1.2009 15 0.0043 23 0.0044 2 0.0170 2 0.1932 0 0 3 0.3322 1 0.0480 vmlinux vmlinux __rcu_process_callbacks 105 0.9138 321 0.0929 368 0.0702 44 0.3729 271 26.1836 54 12.8878 63 6.9767 57 2.7351 vmlinux vmlinux tg_shares_up 100 0.8702 27 0.0078 16 0.0031 0 0 0 0 0 0 0 0 0 0 vmlinux vmlinux memmove 88 0.7658 1 2.9e-04 7 0.0013 488 4.1363 0 0 0 0 16 1.7719 121 5.8061 vmlinux vmlinux clear_page_c 86 0.7484 0 0 0 0 454 3.8481 0 0 0 0 0 0 96 4.6065 libbfd-2.18.so libbfd-2.18.so (no symbols) 65 0.5657 0 0 2 3.8e-04 2 0.0170 0 0 0 0 0 0 0 0 vmlinux vmlinux __find_get_block 62 0.5396 1536 0.4444 3313 0.6321 15 0.1271 4 0.3865 0 0 4 0.4430 5 0.2399 vmlinux vmlinux _raw_spin_unlock 57 0.4960 0 0 0 0 0 0 0 0 0 0 0 0 0 0 vmlinux vmlinux create_virtual_node 52 0.4525 6 0.0017 0 0 930 7.8827 50 4.8309 163 38.9021 95 10.5205 13 0.6238 vmlinux vmlinux mutex_spin_on_owner 50 0.4351 2 5.8e-04 2 3.8e-04 0 0 0 0 0 0 0 0 0 0 vmlinux vmlinux dst_rcu_free 49 0.4264 477 0.1380 871 0.1662 2 0.0170 1 0.0966 0 0 0 0 1 0.0480 vmlinux CPU: Core 2, speed 2999.65 MHz (estimated) Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000 Samples on CPU 0 Samples on CPU 1 Samples on CPU 2 Samples on CPU 3 Samples on CPU 4 Samples on CPU 5 Samples on CPU 6 Samples on CPU 7 samples % samples % samples % samples % samples % samples % samples % samples % image name app name symbol name 5233 15.7190 760 0.0795 259 0.0190 10 0.1857 0 0 1 0.1247 340 9.6454 20 1.4006 libc-2.8.so libc-2.8.so (no symbols) 5116 15.3675 0 0 0 0 0 0 0 0 0 0 0 0 0 0 bgpd bgpd 
bgp_best_selection 4499 13.5142 37423 3.9162 47399 3.4832 4615 85.7010 918 9.9804 172 21.4464 540 15.3191 191 13.3754 vmlinux vmlinux mwait_idle 3635 10.9189 0 0 2102 0.1545 0 0 0 0 0 0 0 0 0 0 oprofiled oprofiled (no symbols) 2855 8.5759 765 0.0801 0 0 0 0 0 0 0 0 0 0 0 0 bgpd bgpd bgp_route_next 1626 4.8842 372 0.0389 0 0 0 0 0 0 0 0 0 0 0 0 bgpd bgpd bgp_scan_timer 1611 4.8391 0 0 0 0 0 0 0 0 0 0 0 0 0 0 bgpd bgpd bgp_process_main 1133 3.4033 1 1.0e-04 0 0 0 0 0 0 0 0 0 0 0 0 bgpd bgpd bgp_process_announce_selected 950 2.8536 218 0.0228 0 0 0 0 0 0 0 0 0 0 0 0 bgpd bgpd bgp_process 486 1.4599 11 0.0012 240 0.0176 1 0.0186 0 0 0 0 31 0.8794 1 0.0700 vmlinux vmlinux copy_user_generic_string 442 1.3277 111 0.0116 0 0 0 0 0 0 0 0 0 0 0 0 bgpd bgpd bgp_nexthop_lookup 376 1.1294 1 1.0e-04 0 0 0 0 0 0 0 0 0 0 0 0 libzebra.so.0. 0.0 libzebra.so.0.0.0 work_queue_run 323 0.9702 82 0.0086 2 1.5e-04 0 0 0 0 0 0 0 0 0 0 libzebra.so.0. 0.0 libzebra.so.0.0.0 prefix_match 237 0.7119 64 0.0067 1 7.3e-05 0 0 0 0 0 0 0 0 0 0 libzebra.so.0. 0.0 libzebra.so.0.0.0 zcalloc 216 0.6488 0 0 0 0 0 0 0 0 0 0 0 0 0 0 vmlinux vmlinux rt_worker_func 203 0.6098 3070 0.3213 4038 0.2967 7 0.1300 32 0.3479 13 1.6209 3 0.0851 6 0.4202 vmlinux vmlinux get_next_timer_interrupt 192 0.5767 30 0.0031 0 0 0 0 0 0 0 0 0 0 0 0 libzebra.so.0. 0.0 libzebra.so.0.0.0 .plt 189 0.5677 0 0 0 0 0 0 0 0 0 0 0 0 0 0 bgpd bgpd bgp_adj_out_unset 187 0.5617 40562 4.2447 54422 3.9993 65 1.2071 409 4.4466 73 9.1022 126 3.5745 116 8.1232 vmlinux vmlinux _raw_spin_lock 144 0.4325 60 0.0063 3 2.2e-04 0 0 0 0 0 0 0 0 0 0 bgpd bgpd bgp_node_get 127 0.3815 40 0.0042 0 0 0 0 0 0 0 0 0 0 0 0 libzebra.so.0. 0.0 libzebra.so.0.0.0 listnode_add 124 0.3725 1183 0.1238 1623 0.1193 72 1.3370 2894 31.4634 90 11.2219 108 3.0638 191 13.3754 vmlinux vmlinux tg_shares_up 119 0.3575 0 0 13 9.6e-04 0 0 0 0 0 0 0 0 0 0 vmlinux vmlinux mutex_spin_on_owner 114 0.3424 14 0.0015 0 0 0 0 0 0 0 0 0 0 0 0 bgpd
diff -Nurp a/net/ipv4/route.c b/net/ipv4/route.c --- a/net/ipv4/route.c 2009-07-08 23:42:15.000000000 +0200 +++ b/net/ipv4/route.c 2009-07-08 22:47:52.000000000 +0200 @@ -769,24 +769,11 @@ static void rt_do_flush(int process_cont } } -/* - * While freeing expired entries, we compute average chain length - * and standard deviation, using fixed-point arithmetic. - * This to have an estimation of rt_chain_length_max - * rt_chain_length_max = max(elasticity, AVG + 4*SD) - * We use 3 bits for frational part, and 29 (or 61) for magnitude. - */ - -#define FRACT_BITS 3 -#define ONE (1UL << FRACT_BITS) - static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; - struct rtable *rth, *aux, **rthp; - unsigned long samples = 0; - unsigned long sum = 0, sum2 = 0; + struct rtable *rth, **rthp; u64 mult; mult = ((u64)ip_rt_gc_interval) << rt_hash_log; @@ -797,7 +784,6 @@ static void rt_check_expire(void) goal = rt_hash_mask + 1; for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; - unsigned long length; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; @@ -805,14 +791,10 @@ static void rt_check_expire(void) if (need_resched()) cond_resched(); - samples++; - if (*rthp == NULL) continue; - length = 0; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { - prefetch(rth->u.dst.rt_next); if (rt_is_expired(rth)) { *rthp = rth->u.dst.rt_next; rt_free(rth); @@ -821,46 +803,23 @@ static void rt_check_expire(void) if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ if (time_before_eq(jiffies, rth->u.dst.expires)) { -nofree: tmo >>= 1; rthp = &rth->u.dst.rt_next; - /* - * We only count entries on - * a chain with equal hash inputs once - * so that entries for different QOS - * levels, and other non-hash input - * attributes don't unfairly skew - * the length computation - */ - for (aux = rt_hash_table[i].chain;;) { - if (aux == rth) { - length += ONE; - break; - } - if (compare_hash_inputs(&aux->fl, 
&rth->fl)) - break; - aux = aux->u.dst.rt_next; - } continue; } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) - goto nofree; + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { + tmo >>= 1; + rthp = &rth->u.dst.rt_next; + continue; + } /* Cleanup aged off entries. */ *rthp = rth->u.dst.rt_next; rt_free(rth); } spin_unlock_bh(rt_hash_lock_addr(i)); - sum += length; - sum2 += length*length; - } - if (samples) { - unsigned long avg = sum / samples; - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); - rt_chain_length_max = max_t(unsigned long, - ip_rt_gc_elasticity, - (avg + 4*sd) >> FRACT_BITS); } + rt_chain_length_max = ip_rt_gc_elasticity; rover = i; }