Message ID | 1272010378-2955-1-git-send-email-xiaosuo@gmail.com |
---|---|
State | Accepted, archived |
Delegated to: | David Miller |
Headers | show |
Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit : > batch skb dequeueing from softnet input_pkt_queue. > > batch skb dequeueing from softnet input_pkt_queue to reduce potential lock > contention when RPS is enabled. > > Note: in the worst case, the number of packets in a softnet_data may be double > of netdev_max_backlog. > > Signed-off-by: Changli Gao <xiaosuo@gmail.com> Very good patch Changli, thanks ! Lets see how it improves thing for Jamal benchs ;) Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> > ---- > include/linux/netdevice.h | 6 +++-- > net/core/dev.c | 50 +++++++++++++++++++++++++++++++--------------- > 2 files changed, 38 insertions(+), 18 deletions(-) > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 3c5ed5f..6ae9f2b 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -1387,6 +1387,7 @@ struct softnet_data { > struct Qdisc *output_queue; > struct list_head poll_list; > struct sk_buff *completion_queue; > + struct sk_buff_head process_queue; > > #ifdef CONFIG_RPS > struct softnet_data *rps_ipi_list; > @@ -1401,10 +1402,11 @@ struct softnet_data { > struct napi_struct backlog; > }; > > -static inline void input_queue_head_incr(struct softnet_data *sd) > +static inline void input_queue_head_add(struct softnet_data *sd, > + unsigned int len) > { > #ifdef CONFIG_RPS > - sd->input_queue_head++; > + sd->input_queue_head += len; > #endif > } > > diff --git a/net/core/dev.c b/net/core/dev.c > index a4a7c36..c1585f9 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -2409,12 +2409,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, > __get_cpu_var(netdev_rx_stat).total++; > > rps_lock(sd); > - if (sd->input_pkt_queue.qlen <= netdev_max_backlog) { > - if (sd->input_pkt_queue.qlen) { > + if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { > + if (skb_queue_len(&sd->input_pkt_queue)) { > enqueue: > __skb_queue_tail(&sd->input_pkt_queue, skb); > #ifdef 
CONFIG_RPS > - *qtail = sd->input_queue_head + sd->input_pkt_queue.qlen; > + *qtail = sd->input_queue_head + > + skb_queue_len(&sd->input_pkt_queue); > #endif > rps_unlock(sd); > local_irq_restore(flags); > @@ -2934,13 +2935,21 @@ static void flush_backlog(void *arg) > struct sk_buff *skb, *tmp; > > rps_lock(sd); > - skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) > + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { > if (skb->dev == dev) { > __skb_unlink(skb, &sd->input_pkt_queue); > kfree_skb(skb); > - input_queue_head_incr(sd); > + input_queue_head_add(sd, 1); > } > + } > rps_unlock(sd); > + > + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { > + if (skb->dev == dev) { > + __skb_unlink(skb, &sd->process_queue); > + kfree_skb(skb); > + } > + } > } > > static int napi_gro_complete(struct sk_buff *skb) > @@ -3286,24 +3295,30 @@ static int process_backlog(struct napi_struct *napi, int quota) > } > #endif > napi->weight = weight_p; > - do { > + local_irq_disable(); > + while (1) { > struct sk_buff *skb; > > - local_irq_disable(); > + while ((skb = __skb_dequeue(&sd->process_queue))) { > + local_irq_enable(); > + __netif_receive_skb(skb); > + if (++work >= quota) > + return work; > + local_irq_disable(); > + } > + > rps_lock(sd); > - skb = __skb_dequeue(&sd->input_pkt_queue); > - if (!skb) { > + input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue)); > + skb_queue_splice_tail_init(&sd->input_pkt_queue, > + &sd->process_queue); > + if (skb_queue_empty(&sd->process_queue)) { > __napi_complete(napi); > rps_unlock(sd); > - local_irq_enable(); > break; > } > - input_queue_head_incr(sd); > rps_unlock(sd); > - local_irq_enable(); > - > - __netif_receive_skb(skb); > - } while (++work < quota); > + } > + local_irq_enable(); > > return work; > } > @@ -5631,8 +5646,10 @@ static int dev_cpu_callback(struct notifier_block *nfb, > /* Process offline CPU's input_pkt_queue */ > while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { > netif_rx(skb); > - 
input_queue_head_incr(oldsd); > + input_queue_head_add(oldsd, 1); > } > + while ((skb = __skb_dequeue(&oldsd->process_queue))) > + netif_rx(skb); > > return NOTIFY_OK; > } > @@ -5851,6 +5868,7 @@ static int __init net_dev_init(void) > struct softnet_data *sd = &per_cpu(softnet_data, i); > > skb_queue_head_init(&sd->input_pkt_queue); > + skb_queue_head_init(&sd->process_queue); > sd->completion_queue = NULL; > INIT_LIST_HEAD(&sd->poll_list); > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2010-04-23 at 11:27 +0200, Eric Dumazet wrote: > > Lets see how it improves thing for Jamal benchs ;) Ive done a setup with the last patch from Changli + net-next - I will post test results tomorrow AM. cheers, jamal -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2010-04-23 at 18:02 -0400, jamal wrote: > Ive done a setup with the last patch from Changli + net-next - I will > post test results tomorrow AM. ok, annotated results attached. cheers, jamal sink cpu all cpuint cpuapp nn-standalone 93.95% 84.5% 99.8% 79.8% nn-rps 96.41% 85.4% 95.5% 82.5% nn-cl 97.29% 84.0% 99.9% 79.6% nn-cl-rps 97.76% 86.5% 96.5% 84.8% nn-standalone: Basic net-next from Apr23 nn-rps: Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0 nn-cl: Basic net-next from Apr23 + Changli patch nn-cl-rps: Basic net-next from Apr23 + Changli patch + rps mask ee,irq aff cpu0 sink: the amount of traffic the system was able to sink in. cpu all: avg % system cpu consumed in test cpuint: avg %cpu consumed by the cpu where interrupts happened cpuapp: avg %cpu consumed by a sample cpu which did app processing Testing was as previously explained.. I repeated each test 4-5 times and took averages.. It seems the non-rps case has improved dramatically since the last net-next i tested. The rps case has also improved but the gap between rps and non-rps is smaller. [There are just too many variables for me to pinpoint to one item as being the contributor. For example sky2 driver may have become worse (consumes more cycles) but i cant quantify it yet (i just see sky2_rx_submit showing up higher in profiles than before). Also call_function_single_interrupt shows up prominently on application processing CPUs but improved by Changli's changes]. After doing the math, I dont trust my results after applying Changlis patch. It seems both the rps and non-rps case have gotten better (and i dont see Changlis contribution to non-rps). It also seems that the gap between rps and non-rps is non-existent now. In other words, there is no benefit to using rps (it consumes more cpu for the same throughput). So it is likely that i need to repeat these tests; maybe i did something wrong in my setup... 
And here are the profiles: -------------------------- cpu0 always received all the interrupts regardless of the tests. cpu1, 7 etc were processing apps.. I could not spot much difference between before and after Changli's I: Test setup : nn-standalone: Basic net-next from Apr23 All cpus ------------------------------------------------------------------------------- PerfTop: 3784 irqs/sec kernel:84.2% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 3254.00 10.3% sky2_poll [sky2] 1853.00 5.9% _raw_spin_lock_irqsave [kernel] 872.00 2.8% fget [kernel] 870.00 2.8% copy_user_generic_string [kernel] 819.00 2.6% _raw_spin_unlock_irqrestore [kernel] 729.00 2.3% sys_epoll_ctl [kernel] 701.00 2.2% datagram_poll [kernel] 615.00 2.0% udp_recvmsg [kernel] 602.00 1.9% _raw_spin_lock_bh [kernel] 595.00 1.9% system_call [kernel] 592.00 1.9% kmem_cache_free [kernel] 574.00 1.8% schedule [kernel] 568.00 1.8% _raw_spin_lock [kernel] ------------------------------------------------------------------------------- PerfTop: 3574 irqs/sec kernel:85.1% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 5023.00 10.9% sky2_poll [sky2] 2762.00 6.0% _raw_spin_lock_irqsave [kernel] 1319.00 2.9% copy_user_generic_string [kernel] 1306.00 2.8% fget [kernel] 1198.00 2.6% _raw_spin_unlock_irqrestore [kernel] 1071.00 2.3% datagram_poll [kernel] 1061.00 2.3% sys_epoll_ctl [kernel] 927.00 2.0% _raw_spin_lock_bh [kernel] 917.00 2.0% system_call [kernel] 901.00 1.9% udp_recvmsg [kernel] 895.00 1.9% kmem_cache_free [kernel] 819.00 1.8% _raw_spin_lock [kernel] 802.00 1.7% schedule [kernel] 774.00 1.7% sys_epoll_wait [kernel] 720.00 1.6% kmem_cache_alloc [kernel] 
------------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 751.00 36.1% sky2_poll [sky2] 108.00 5.2% __udp4_lib_lookup [kernel] 95.00 4.6% ip_route_input [kernel] 83.00 4.0% _raw_spin_lock [kernel] 79.00 3.8% _raw_spin_lock_irqsave [kernel] 77.00 3.7% __netif_receive_skb [kernel] 77.00 3.7% __alloc_skb [kernel] 66.00 3.2% ip_rcv [kernel] 60.00 2.9% __udp4_lib_rcv [kernel] 54.00 2.6% sock_queue_rcv_skb [kernel] 45.00 2.2% sky2_rx_submit [sky2] 42.00 2.0% __wake_up_common [kernel] 40.00 1.9% __kmalloc [kernel] 39.00 1.9% sock_def_readable [kernel] 30.00 1.4% ep_poll_callback [kernel] ------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:99.8% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 3511.00 36.7% sky2_poll [sky2] 519.00 5.4% __udp4_lib_lookup [kernel] 431.00 4.5% ip_route_input [kernel] 353.00 3.7% _raw_spin_lock_irqsave [kernel] 351.00 3.7% __alloc_skb [kernel] 338.00 3.5% __netif_receive_skb [kernel] 337.00 3.5% _raw_spin_lock [kernel] 307.00 3.2% ip_rcv [kernel] 264.00 2.8% sky2_rx_submit [sky2] 254.00 2.7% sock_queue_rcv_skb [kernel] 246.00 2.6% __udp4_lib_rcv [kernel] 206.00 2.2% sock_def_readable [kernel] 177.00 1.9% __wake_up_common [kernel] 168.00 1.8% __kmalloc [kernel] ------------------------------------------------------------------------------- PerfTop: 908 irqs/sec kernel:80.0% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 177.00 6.7% _raw_spin_lock_irqsave [kernel] 120.00 4.5% 
copy_user_generic_string [kernel] 110.00 4.2% fget [kernel] 108.00 4.1% datagram_poll [kernel] 98.00 3.7% _raw_spin_lock_bh [kernel] 91.00 3.4% sys_epoll_ctl [kernel] 89.00 3.4% kmem_cache_free [kernel] 77.00 2.9% system_call [kernel] 76.00 2.9% schedule [kernel] 76.00 2.9% _raw_spin_unlock_irqrestore [kernel] 63.00 2.4% fput [kernel] 61.00 2.3% sys_epoll_wait [kernel] 61.00 2.3% udp_recvmsg [kernel] 49.00 1.8% process_recv mcpudp ------------------------------------------------------------------------------- PerfTop: 815 irqs/sec kernel:79.8% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ _________________ 491.00 8.0% _raw_spin_lock_irqsave [kernel.kallsyms] 285.00 4.7% copy_user_generic_string [kernel.kallsyms] 252.00 4.1% fget [kernel.kallsyms] 215.00 3.5% datagram_poll [kernel.kallsyms] 206.00 3.4% _raw_spin_unlock_irqrestore [kernel.kallsyms] 204.00 3.3% sys_epoll_ctl [kernel.kallsyms] 196.00 3.2% _raw_spin_lock_bh [kernel.kallsyms] 184.00 3.0% udp_recvmsg [kernel.kallsyms] 184.00 3.0% kmem_cache_free [kernel.kallsyms] 180.00 2.9% system_call [kernel.kallsyms] 168.00 2.7% sys_epoll_wait [kernel.kallsyms] 159.00 2.6% schedule [kernel.kallsyms] 144.00 2.4% fput [kernel.kallsyms] II: Test setup nn-rps: Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0 ------------------------------------------------------------------------------- PerfTop: 3558 irqs/sec kernel:85.0% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ ________ 3519.00 15.9% sky2_poll [sky2] 865.00 3.9% _raw_spin_lock_irqsave [kernel] 568.00 2.6% _raw_spin_unlock_irqrestore [kernel] 526.00 2.4% sky2_intr [sky2] 493.00 2.2% __netif_receive_skb [kernel] 477.00 2.2% _raw_spin_lock [kernel] 470.00 2.1% ip_rcv [kernel] 456.00 
2.1% fget [kernel] 447.00 2.0% sys_epoll_ctl [kernel] 420.00 1.9% copy_user_generic_string [kernel] 387.00 1.8% ip_route_input [kernel] 359.00 1.6% system_call [kernel] 334.00 1.5% kmem_cache_free [kernel] 310.00 1.4% kmem_cache_alloc [kernel] 302.00 1.4% call_function_single_interrupt [kernel] ------------------------------------------------------------------------------- PerfTop: 3546 irqs/sec kernel:85.8% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ ________ 6592.00 16.2% sky2_poll [sky2] 1540.00 3.8% _raw_spin_lock_irqsave [kernel] 1014.00 2.5% _raw_spin_unlock_irqrestore [kernel] 885.00 2.2% fget [kernel] 881.00 2.2% _raw_spin_lock [kernel] 880.00 2.2% sky2_intr [sky2] 872.00 2.1% __netif_receive_skb [kernel] 858.00 2.1% ip_rcv [kernel] 802.00 2.0% sys_epoll_ctl [kernel] 710.00 1.7% copy_user_generic_string [kernel] 696.00 1.7% system_call [kernel] 692.00 1.7% ip_route_input [kernel] 634.00 1.6% schedule [kernel] 618.00 1.5% kmem_cache_free [kernel] 605.00 1.5% call_function_single_interrupt [kernel] cpu0 ------------------------------------------------------------------------------- PerfTop: 971 irqs/sec kernel:96.5% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 4222.00 58.2% sky2_poll [sky2] 668.00 9.2% sky2_intr [sky2] 228.00 3.1% __alloc_skb [kernel] 183.00 2.5% get_rps_cpu [kernel] 138.00 1.9% sky2_rx_submit [sky2] 124.00 1.7% enqueue_to_backlog [kernel] 119.00 1.6% __kmalloc [kernel] 103.00 1.4% kmem_cache_alloc [kernel] 91.00 1.3% _raw_spin_lock [kernel] 90.00 1.2% _raw_spin_lock_irqsave [kernel] 73.00 1.0% swiotlb_sync_single [kernel] 72.00 1.0% irq_entries_start [kernel] 55.00 0.8% copy_user_generic_string [kernel] 53.00 0.7% _raw_spin_unlock_irqrestore [kernel] 48.00 0.7% fget 
[kernel] ------------------------------------------------------------------------------- PerfTop: 998 irqs/sec kernel:94.8% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 6745.00 58.5% sky2_poll [sky2] 831.00 7.2% sky2_intr [sky2] 352.00 3.1% __alloc_skb [kernel] 281.00 2.4% get_rps_cpu [kernel] 226.00 2.0% sky2_rx_submit [sky2] 186.00 1.6% __kmalloc [kernel] 181.00 1.6% enqueue_to_backlog [kernel] 173.00 1.5% _raw_spin_lock_irqsave [kernel] 166.00 1.4% kmem_cache_alloc [kernel] 162.00 1.4% _raw_spin_lock [kernel] 99.00 0.9% swiotlb_sync_single [kernel] 98.00 0.9% irq_entries_start [kernel] 94.00 0.8% fget [kernel] 92.00 0.8% _raw_spin_unlock_irqrestore [kernel] 80.00 0.7% system_call [kernel] cpu1 ------------------------------------------------------------------------------- PerfTop: 724 irqs/sec kernel:82.0% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _________________ 204.00 5.3% _raw_spin_lock_irqsave [kernel.kallsyms] 153.00 4.0% _raw_spin_unlock_irqrestore [kernel.kallsyms] 147.00 3.8% call_function_single_interrupt [kernel.kallsyms] 139.00 3.6% __netif_receive_skb [kernel.kallsyms] 135.00 3.5% sys_epoll_ctl [kernel.kallsyms] 132.00 3.4% ip_rcv [kernel.kallsyms] 129.00 3.3% fget [kernel.kallsyms] 128.00 3.3% _raw_spin_lock [kernel.kallsyms] 122.00 3.2% system_call [kernel.kallsyms] 118.00 3.1% ip_route_input [kernel.kallsyms] 109.00 2.8% kmem_cache_free [kernel.kallsyms] 108.00 2.8% copy_user_generic_string [kernel.kallsyms] 90.00 2.3% schedule [kernel.kallsyms] 85.00 2.2% fput [kernel.kallsyms] ------------------------------------------------------------------------------- PerfTop: 763 irqs/sec kernel:83.0% [1000Hz cycles], (all, cpu: 1) 
------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _________________ 428.00 6.2% _raw_spin_lock_irqsave [kernel.kallsyms] 302.00 4.4% _raw_spin_unlock_irqrestore [kernel.kallsyms] 269.00 3.9% __netif_receive_skb [kernel.kallsyms] 258.00 3.7% call_function_single_interrupt [kernel.kallsyms] 254.00 3.7% fget [kernel.kallsyms] 238.00 3.4% ip_rcv [kernel.kallsyms] 230.00 3.3% sys_epoll_ctl [kernel.kallsyms] 222.00 3.2% _raw_spin_lock [kernel.kallsyms] 220.00 3.2% ip_route_input [kernel.kallsyms] 197.00 2.9% system_call [kernel.kallsyms] 189.00 2.7% kmem_cache_free [kernel.kallsyms] 184.00 2.7% copy_user_generic_string [kernel.kallsyms] 144.00 2.1% ep_remove [kernel.kallsyms] 140.00 2.0% schedule [kernel.kallsyms] ------------------------------------------------------------------------------- PerfTop: 546 irqs/sec kernel:83.3% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________________ _________________ 346.00 5.7% _raw_spin_lock_irqsave [kernel.kallsyms] 275.00 4.6% _raw_spin_unlock_irqrestore [kernel.kallsyms] 238.00 3.9% call_function_single_interrupt [kernel.kallsyms] 228.00 3.8% fget [kernel.kallsyms] 222.00 3.7% __netif_receive_skb [kernel.kallsyms] 219.00 3.6% sys_epoll_ctl [kernel.kallsyms] 209.00 3.5% _raw_spin_lock [kernel.kallsyms] 205.00 3.4% ip_rcv [kernel.kallsyms] 199.00 3.3% ip_route_input [kernel.kallsyms] 173.00 2.9% system_call [kernel.kallsyms] 170.00 2.8% copy_user_generic_string [kernel.kallsyms] 167.00 2.8% kmem_cache_free [kernel.kallsyms] 127.00 2.1% ep_remove [kernel.kallsyms] 123.00 2.0% dst_release [kernel.kalls III: Test setup nn-cl: Basic net-next from Apr23 + Changli patch ------------------------------------------------------------------------------- PerfTop: 3789 irqs/sec kernel:84.1% [1000Hz cycles], (all, 8 CPUs) 
------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 3514.00 10.2% sky2_poll [sky2] 1862.00 5.4% _raw_spin_lock_irqsave [kernel] 1274.00 3.7% system_call [kernel] 926.00 2.7% fget [kernel] 872.00 2.5% _raw_spin_unlock_irqrestore [kernel] 862.00 2.5% copy_user_generic_string [kernel] 766.00 2.2% sys_epoll_ctl [kernel] 765.00 2.2% datagram_poll [kernel] 671.00 2.0% _raw_spin_lock_bh [kernel] 668.00 1.9% kmem_cache_free [kernel] 602.00 1.8% udp_recvmsg [kernel] 586.00 1.7% _raw_spin_lock [kernel] 585.00 1.7% vread_tsc [kernel].vsyscall_fn ------------------------------------------------------------------------------- PerfTop: 3794 irqs/sec kernel:83.6% [1000Hz cycles], (all, 8 CPUs) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 4756.00 9.8% sky2_poll [sky2] 2742.00 5.7% _raw_spin_lock_irqsave [kernel] 1826.00 3.8% system_call [kernel] 1285.00 2.7% fget [kernel] 1284.00 2.7% copy_user_generic_string [kernel] 1235.00 2.6% _raw_spin_unlock_irqrestore [kernel] 1096.00 2.3% sys_epoll_ctl [kernel] 1071.00 2.2% datagram_poll [kernel] 954.00 2.0% kmem_cache_free [kernel] 925.00 1.9% _raw_spin_lock_bh [kernel] 888.00 1.8% vread_tsc [kernel].vsyscall_fn 880.00 1.8% udp_recvmsg [kernel] 793.00 1.6% _raw_spin_lock [kernel] 790.00 1.6% schedule [kernel] ------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:99.9% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 675.00 32.6% sky2_poll [sky2] 116.00 5.6% __udp4_lib_lookup [kernel] 111.00 5.4% ip_route_input [kernel] 81.00 3.9% _raw_spin_lock_irqsave [kernel] 81.00 3.9% _raw_spin_lock [kernel] 70.00 3.4% 
__alloc_skb [kernel] 67.00 3.2% ip_rcv [kernel] 66.00 3.2% __netif_receive_skb [kernel] 61.00 2.9% __udp4_lib_rcv [kernel] 57.00 2.8% sock_queue_rcv_skb [kernel] 47.00 2.3% sock_def_readable [kernel] 42.00 2.0% __kmalloc [kernel] 42.00 2.0% __wake_up_common [kernel] 38.00 1.8% sky2_rx_submit [sky2] ------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 2526.00 32.8% sky2_poll [sky2] 406.00 5.3% ip_route_input [kernel] 399.00 5.2% __udp4_lib_lookup [kernel] 328.00 4.3% _raw_spin_lock_irqsave [kernel] 307.00 4.0% _raw_spin_lock [kernel] 296.00 3.8% ip_rcv [kernel] 287.00 3.7% __alloc_skb [kernel] 272.00 3.5% sock_queue_rcv_skb [kernel] 224.00 2.9% __udp4_lib_rcv [kernel] 224.00 2.9% __netif_receive_skb [kernel] 182.00 2.4% sock_def_readable [kernel] 163.00 2.1% __wake_up_common [kernel] 140.00 1.8% sky2_rx_submit [sky2] ------------------------------------------------------------------------------- PerfTop: 1001 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 0) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ______________________ ________ 4445.00 33.4% sky2_poll [sky2] 707.00 5.3% __udp4_lib_lookup [kernel] 662.00 5.0% ip_route_input [kernel] 567.00 4.3% _raw_spin_lock_irqsave [kernel] 512.00 3.8% __alloc_skb [kernel] 506.00 3.8% ip_rcv [kernel] 476.00 3.6% sock_queue_rcv_skb [kernel] 473.00 3.6% _raw_spin_lock [kernel] 415.00 3.1% __udp4_lib_rcv [kernel] 408.00 3.1% __netif_receive_skb [kernel] 306.00 2.3% sock_def_readable [kernel] 272.00 2.0% __wake_up_common [kernel] 260.00 2.0% __kmalloc [kernel] 216.00 1.6% _raw_read_lock [kernel] 214.00 1.6% sky2_rx_submit [sky2] ------------------------------------------------------------------------------- 
PerfTop: 748 irqs/sec kernel:80.9% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 244.00 7.4% _raw_spin_lock_irqsave [kernel] 207.00 6.2% system_call [kernel] 127.00 3.8% _raw_spin_unlock_irqrestore [kernel] 124.00 3.7% copy_user_generic_string [kernel] 122.00 3.7% sys_epoll_ctl [kernel] 120.00 3.6% fget [kernel] 118.00 3.6% datagram_poll [kernel] 96.00 2.9% schedule [kernel] 94.00 2.8% _raw_spin_lock_bh [kernel] 86.00 2.6% vread_tsc [kernel].vsyscall_fn 82.00 2.5% udp_recvmsg [kernel] 76.00 2.3% fput [kernel] 73.00 2.2% kmem_cache_free [kernel] 67.00 2.0% sys_epoll_wait [kernel] ------------------------------------------------------------------------------- PerfTop: 625 irqs/sec kernel:78.6% [1000Hz cycles], (all, cpu: 1) ------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ____________________ 488.00 7.5% _raw_spin_lock_irqsave [kernel] 380.00 5.9% system_call [kernel] 274.00 4.2% copy_user_generic_string [kernel] 252.00 3.9% fget [kernel] 244.00 3.8% datagram_poll [kernel] 217.00 3.3% _raw_spin_unlock_irqrestore [kernel] 211.00 3.3% sys_epoll_ctl [kernel] 186.00 2.9% schedule [kernel] 185.00 2.9% _raw_spin_lock_bh [kernel] 173.00 2.7% udp_recvmsg [kernel] 169.00 2.6% vread_tsc [kernel].vsyscall_fn 164.00 2.5% kmem_cache_free [kernel] 143.00 2.2% fput [kernel] 133.00 2.1% sys_epoll_wait [kernel] IV: Test setup nn-cl-rps: Basic net-next from Apr23 + Changli patch + rps mask ee,irq aff -------------------------------------------------------------------------- PerfTop: 3043 irqs/sec kernel:87.5% [1000Hz cycles], (all, 8 CPUs) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 2240.00 20.4% sky2_poll [sky2] 375.00 
3.4% _raw_spin_lock_irqsave [kernel] 335.00 3.0% sky2_intr [sky2] 326.00 3.0% system_call [kernel] 239.00 2.2% _raw_spin_unlock_irqrestor [kernel] 224.00 2.0% ip_rcv [kernel] 201.00 1.8% __netif_receive_skb [kernel] 198.00 1.8% sys_epoll_ctl [kernel] 190.00 1.7% _raw_spin_lock [kernel] 182.00 1.7% fget [kernel] 169.00 1.5% copy_user_generic_string [kernel] 165.00 1.5% kmem_cache_free [kernel] 149.00 1.4% load_balance [kernel] 146.00 1.3% ip_route_input [kernel] -------------------------------------------------------------------------- PerfTop: 3210 irqs/sec kernel:85.8% [1000Hz cycles], (all, 8 CPUs) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 6539.00 20.4% sky2_poll [sky2] 1106.00 3.4% _raw_spin_lock_irqsave [kernel] 1014.00 3.2% sky2_intr [sky2] 976.00 3.0% system_call [kernel] 684.00 2.1% _raw_spin_unlock_irqrestor [kernel] 611.00 1.9% ip_rcv [kernel] 601.00 1.9% fget [kernel] 593.00 1.8% _raw_spin_lock [kernel] 592.00 1.8% sys_epoll_ctl [kernel] 574.00 1.8% __netif_receive_skb [kernel] 526.00 1.6% copy_user_generic_string [kernel] 482.00 1.5% kmem_cache_free [kernel] 480.00 1.5% ip_route_input [kernel] 425.00 1.3% vread_tsc [kernel].vsyscall_fn 410.00 1.3% kmem_cache_alloc [kernel] -------------------------------------------------------------------------- PerfTop: 999 irqs/sec kernel:97.2% [1000Hz cycles], (all, cpu: 0) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 2035.00 60.5% sky2_poll [sky2] 302.00 9.0% sky2_intr [sky2] 109.00 3.2% __alloc_skb [kernel] 57.00 1.7% _raw_spin_lock [kernel] 57.00 1.7% get_rps_cpu [kernel] 52.00 1.5% __kmalloc [kernel] 51.00 1.5% enqueue_to_backlog [kernel] 49.00 1.5% _raw_spin_lock_irqsave [kernel] 44.00 1.3% kmem_cache_alloc [kernel] 34.00 1.0% sky2_rx_submit [sky2] 33.00 1.0% swiotlb_sync_single [kernel] 
31.00 0.9% system_call [kernel] 28.00 0.8% irq_entries_start [kernel] 22.00 0.7% _raw_spin_unlock_irqrestore [kernel] 21.00 0.6% sky2_remove [sky2] -------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:96.2% [1000Hz cycles], (all, cpu: 0) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 5493.00 60.1% sky2_poll [sky2] 803.00 8.8% sky2_intr [sky2] 281.00 3.1% __alloc_skb [kernel] 233.00 2.6% get_rps_cpu [kernel] 136.00 1.5% enqueue_to_backlog [kernel] 132.00 1.4% __kmalloc [kernel] 126.00 1.4% _raw_spin_lock [kernel] 122.00 1.3% kmem_cache_alloc [kernel] 122.00 1.3% _raw_spin_lock_irqsave [kernel] 102.00 1.1% swiotlb_sync_single [kernel] 88.00 1.0% sky2_rx_submit [sky2] 77.00 0.8% system_call [kernel] 69.00 0.8% irq_entries_start [kernel] 55.00 0.6% _raw_spin_unlock_irqrestore [kernel] 54.00 0.6% copy_user_generic_string [kernel] -------------------------------------------------------------------------- PerfTop: 999 irqs/sec kernel:97.5% [1000Hz cycles], (all, cpu: 0) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________ ________ 6699.00 60.1% sky2_poll [sky2] 988.00 8.9% sky2_intr [sky2] 327.00 2.9% __alloc_skb [kernel] 261.00 2.3% get_rps_cpu [kernel] 168.00 1.5% __kmalloc [kernel] 161.00 1.4% kmem_cache_alloc [kernel] 160.00 1.4% enqueue_to_backlog [kernel] 157.00 1.4% _raw_spin_lock [kernel] 125.00 1.1% _raw_spin_lock_irqsave [kernel] 122.00 1.1% swiotlb_sync_single [kernel] 114.00 1.0% sky2_rx_submit [sky2] 96.00 0.9% system_call [kernel] 85.00 0.8% irq_entries_start [kernel] 66.00 0.6% sky2_remove [sky2] 64.00 0.6% _raw_spin_unlock_irqrestore [kernel] -------------------------------------------------------------------------- PerfTop: 420 irqs/sec kernel:84.8% [1000Hz cycles], (all, cpu: 2) 
-------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 188.00 4.8% _raw_spin_lock_irqsave [kernel] 175.00 4.5% system_call [kernel] 155.00 4.0% _raw_spin_unlock_irqrestor [kernel] 143.00 3.7% __netif_receive_skb [kernel] 124.00 3.2% ip_route_input [kernel] 122.00 3.1% fget [kernel] 118.00 3.0% ip_rcv [kernel] 115.00 2.9% sys_epoll_ctl [kernel] 107.00 2.7% call_function_single_inter [kernel] 98.00 2.5% vread_tsc [kernel].vsyscall_fn 97.00 2.5% _raw_spin_lock [kernel] 89.00 2.3% copy_user_generic_string [kernel] -------------------------------------------------------------------------- PerfTop: 372 irqs/sec kernel:87.9% [1000Hz cycles], (all, cpu: 2) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 212.00 4.6% _raw_spin_lock_irqsave [kernel] 192.00 4.2% system_call [kernel] 187.00 4.1% __netif_receive_skb [kernel] 184.00 4.0% ip_rcv [kernel] 174.00 3.8% ip_route_input [kernel] 165.00 3.6% _raw_spin_unlock_irqrestor [kernel] 143.00 3.1% call_function_single_inter [kernel] 135.00 3.0% fget [kernel] 133.00 2.9% sys_epoll_ctl [kernel] 122.00 2.7% _raw_spin_lock [kernel] 112.00 2.5% __udp4_lib_lookup [kernel] 99.00 2.2% copy_user_generic_string [kernel] 93.00 2.0% vread_tsc [kernel].vsyscall_fn 90.00 2.0% kmem_cache_free [kernel] 89.00 1.9% ep_remove [kernel] o -------------------------------------------------------------------------- PerfTop: 269 irqs/sec kernel:85.1% [1000Hz cycles], (all, cpu: 7) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 23.00 4.6% _raw_spin_lock_irqsave [kernel] 21.00 4.2% system_call [kernel] 19.00 3.8% _raw_spin_unlock_irqrestor [kernel] 17.00 3.4% fget [kernel] 15.00 3.0% __netif_receive_skb [kernel] 14.00 2.8% 
dst_release [kernel] 13.00 2.6% call_function_single_inter [kernel] 11.00 2.2% kmem_cache_free [kernel] 10.00 2.0% vread_tsc [kernel].vsyscall_fn 10.00 2.0% copy_user_generic_string [kernel] 10.00 2.0% ktime_get [kernel] 10.00 2.0% ip_route_input [kernel] 10.00 2.0% schedule [kernel] -------------------------------------------------------------------------- PerfTop: 253 irqs/sec kernel:84.6% [1000Hz cycles], (all, cpu: 7) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 109.00 4.9% system_call [kernel] 104.00 4.6% _raw_spin_lock_irqsave [kernel] 79.00 3.5% ip_rcv [kernel] 74.00 3.3% _raw_spin_unlock_irqrestor [kernel] 71.00 3.2% fget [kernel] 68.00 3.0% sys_epoll_ctl [kernel] 66.00 2.9% ip_route_input [kernel] 58.00 2.6% call_function_single_inter [kernel] 55.00 2.4% _raw_spin_lock [kernel] 54.00 2.4% copy_user_generic_string [kernel] 53.00 2.4% __netif_receive_skb [kernel] 51.00 2.3% schedule [kernel] 51.00 2.3% kmem_cache_free [kernel] 43.00 1.9% vread_tsc [kernel].vsyscall_fn 38.00 1.7% __udp4_lib_lookup [kernel] -------------------------------------------------------------------------- PerfTop: 236 irqs/sec kernel:84.3% [1000Hz cycles], (all, cpu: 7) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 131.00 4.9% _raw_spin_lock_irqsave [kernel] 128.00 4.8% system_call [kernel] 101.00 3.8% _raw_spin_unlock_irqrestor [kernel] 89.00 3.3% fget [kernel] 85.00 3.2% sys_epoll_ctl [kernel] 81.00 3.0% ip_rcv [kernel] 76.00 2.8% ip_route_input [kernel] 66.00 2.5% call_function_single_inter [kernel] 65.00 2.4% _raw_spin_lock [kernel] 65.00 2.4% kmem_cache_free [kernel] 64.00 2.4% copy_user_generic_string [kernel] 57.00 2.1% __netif_receive_skb [kernel] 47.00 1.8% schedule [kernel] 45.00 1.7% vread_tsc [kernel].vsyscall_fn 
-------------------------------------------------------------------------- PerfTop: 478 irqs/sec kernel:82.2% [1000Hz cycles], (all, cpu: 2) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 319.00 5.2% _raw_spin_lock_irqsave [kernel] 289.00 4.7% system_call [kernel] 246.00 4.0% _raw_spin_unlock_irqrestor [kernel] 199.00 3.2% ip_route_input [kernel] 198.00 3.2% __netif_receive_skb [kernel] 197.00 3.2% sys_epoll_ctl [kernel] 183.00 3.0% ip_rcv [kernel] 182.00 2.9% fget [kernel] 166.00 2.7% call_function_single_inter [kernel] 157.00 2.5% copy_user_generic_string [kernel] 149.00 2.4% kmem_cache_free [kernel] 146.00 2.4% vread_tsc [kernel].vsyscall_fn 133.00 2.1% _raw_spin_lock [kernel] 118.00 1.9% schedule [kernel] 112.00 1.8% __udp4_lib_lookup [kernel] -------------------------------------------------------------------------- PerfTop: 535 irqs/sec kernel:83.0% [1000Hz cycles], (all, cpu: 2) -------------------------------------------------------------------------- samples pcnt function DSO _______ _____ __________________________ ____________________ 345.00 5.2% _raw_spin_lock_irqsave [kernel] 291.00 4.4% system_call [kernel] 255.00 3.9% _raw_spin_unlock_irqrestor [kernel] 218.00 3.3% fget [kernel] 201.00 3.0% ip_route_input [kernel] 193.00 2.9% __netif_receive_skb [kernel] 193.00 2.9% sys_epoll_ctl [kernel] 180.00 2.7% ip_rcv [kernel] 173.00 2.6% call_function_single_inter [kernel] 163.00 2.5% copy_user_generic_string [kernel] 152.00 2.3% kmem_cache_free [kernel] 151.00 2.3% vread_tsc [kernel].vsyscall_fn 142.00 2.1% _raw_spin_lock [kernel] 131.00 2.0% schedule [kernel]
Le samedi 24 avril 2010 à 10:10 -0400, jamal a écrit : > On Fri, 2010-04-23 at 18:02 -0400, jamal wrote: > > > Ive done a setup with the last patch from Changli + net-next - I will > > post test results tomorrow AM. > > ok, annotated results attached. > > cheers, > jamal Jamal, I have a Nehalem setup now, and I can see _raw_spin_lock_irqsave() abuse is not coming from network tree, but from clockevents_notify() My pktgen sends 1040989pps : # Samples: 389707198131 # # Overhead Command Shared Object Symbol # ........ .............. ............................ ...... # 23.52% init [kernel.kallsyms] [k] _raw_spin_lock_irqsave | --- _raw_spin_lock_irqsave | |--94.74%-- clockevents_notify | lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary | |--4.10%-- tick_broadcast_oneshot_control | tick_notify | notifier_call_chain | __raw_notifier_call_chain | raw_notifier_call_chain | clockevents_do_notify | clockevents_notify | lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary | |--0.58%-- lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary --0.58%-- [...] 8.94% init [kernel.kallsyms] [k] acpi_os_read_port | --- acpi_os_read_port | |--99.55%-- acpi_hw_read_port | acpi_hw_read | acpi_hw_read_multiple | acpi_hw_register_read | acpi_read_bit_register # Samples: 389233082962 # # Overhead Command Shared Object Symbol # ........ .............. ............................ ...... 
# 23.25% init [kernel.kallsyms] [k] _raw_spin_lock_irqsave 8.90% init [kernel.kallsyms] [k] acpi_os_read_port 2.93% init [kernel.kallsyms] [k] mwait_idle_with_hints 1.99% init [kernel.kallsyms] [k] schedule 1.94% udpsink [kernel.kallsyms] [k] schedule 1.73% swapper [kernel.kallsyms] [k] _raw_spin_lock_irqsave 1.48% init [kernel.kallsyms] [k] bnx2x_rx_int 1.47% init [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 1.44% init [kernel.kallsyms] [k] _raw_spin_lock 1.36% udpsink [kernel.kallsyms] [k] udp_recvmsg 1.05% udpsink [kernel.kallsyms] [k] __skb_recv_datagram 1.05% init [kernel.kallsyms] [k] __udp4_lib_lookup 1.04% udpsink [kernel.kallsyms] [k] copy_user_generic_string 1.04% udpsink [kernel.kallsyms] [k] __slab_free 0.99% init [kernel.kallsyms] [k] select_task_rq_fair 0.99% init [kernel.kallsyms] [k] try_to_wake_up 0.98% init [kernel.kallsyms] [k] task_rq_lock 0.93% init [kernel.kallsyms] [k] tick_broadcast_oneshot_control 0.89% init [kernel.kallsyms] [k] sock_queue_rcv_skb 0.89% udpsink [kernel.kallsyms] [k] sock_recv_ts_and_drops 0.88% udpsink [kernel.kallsyms] [k] kfree 0.79% swapper [kernel.kallsyms] [k] acpi_os_read_port 0.76% udpsink [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.73% udpsink [kernel.kallsyms] [k] inet_recvmsg 0.71% udpsink [vdso] [.] 0x000000ffffe431 0.65% udpsink [kernel.kallsyms] [k] sock_recvmsg 0.62% init [kernel.kallsyms] [k] gs_change 0.61% init [kernel.kallsyms] [k] enqueue_task_fair 0.61% init [kernel.kallsyms] [k] eth_type_trans 0.61% init [kernel.kallsyms] [k] sock_def_readable 0.60% udpsink [kernel.kallsyms] [k] _raw_spin_lock_bh 0.59% init [kernel.kallsyms] [k] ip_route_input 0.59% udpsink libpthread-2.3.4.so [.] __pthread_disable_asynccancel 0.56% init [kernel.kallsyms] [k] bnx2x_poll 0.56% udpsink [kernel.kallsyms] [k] __get_user_4 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le lundi 26 avril 2010 à 16:03 +0200, Eric Dumazet a écrit : > Le samedi 24 avril 2010 à 10:10 -0400, jamal a écrit : > > On Fri, 2010-04-23 at 18:02 -0400, jamal wrote: > > > > > Ive done a setup with the last patch from Changli + net-next - I will > > > post test results tomorrow AM. > > > > ok, annotated results attached. > > > > cheers, > > jamal > > Jamal, I have a Nehalem setup now, and I can see > _raw_spin_lock_irqsave() abuse is not coming from network tree, but from > clockevents_notify() > Another interesting finding: - if all packets are received on a single queue, max speed seems to be 1.200.000 packets per second on my machine :-( And on profile of receiving cpu (RPS enabled, pakets sent to 15 other cpus), we can see default_send_IPI_mask_sequence_phys() is the slow thing... Andi, what do you think of this one ? Dont we have a function to send an IPI to an individual cpu instead ? void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) { unsigned long query_cpu; unsigned long flags; /* * Hack. The clustered APIC addressing mode doesn't allow us to send * to an arbitrary mask, so I do a unicast to each CPU instead. 
* - mbligh */ local_irq_save(flags); for_each_cpu(query_cpu, mask) { __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } ----------------------------------------------------------------------------------------------------------------------------------------- PerfTop: 1000 irqs/sec kernel:100.0% [1000Hz cycles], (all, cpu: 7) ----------------------------------------------------------------------------------------------------------------------------------------- samples pcnt function DSO _______ _____ ___________________________________ _______ 668.00 17.7% default_send_IPI_mask_sequence_phys vmlinux 363.00 9.6% bnx2x_rx_int vmlinux 354.00 9.4% eth_type_trans vmlinux 332.00 8.8% kmem_cache_alloc_node vmlinux 285.00 7.6% __kmalloc_node_track_caller vmlinux 278.00 7.4% _raw_spin_lock vmlinux 166.00 4.4% __slab_alloc vmlinux 147.00 3.9% __memset vmlinux 136.00 3.6% list_del vmlinux 132.00 3.5% get_partial_node vmlinux 131.00 3.5% get_rps_cpu vmlinux 102.00 2.7% enqueue_to_backlog vmlinux 95.00 2.5% unmap_single vmlinux 94.00 2.5% __alloc_skb vmlinux 74.00 2.0% vlan_gro_common vmlinux 52.00 1.4% __phys_addr vmlinux 48.00 1.3% dev_gro_receive vmlinux 39.00 1.0% swiotlb_dma_mapping_error vmlinux 36.00 1.0% swiotlb_map_page vmlinux 34.00 0.9% skb_put vmlinux 27.00 0.7% is_swiotlb_buffer vmlinux 23.00 0.6% deactivate_slab vmlinux 20.00 0.5% vlan_gro_receive vmlinux 17.00 0.5% __skb_bond_should_drop vmlinux 14.00 0.4% netif_receive_skb vmlinux 14.00 0.4% __netdev_alloc_skb vmlinux 12.00 0.3% skb_gro_reset_offset vmlinux 12.00 0.3% get_slab vmlinux 11.00 0.3% napi_skb_finish vmlinux -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2010-04-26 at 16:03 +0200, Eric Dumazet wrote: > > Jamal, I have a Nehalem setup now, and I can see > _raw_spin_lock_irqsave() abuse is not coming from network tree, but from > clockevents_notify() yikes. Thanks Eric - I should've been able to figure that one out. But why is this thing expensive? I will run the test tomorrow and see if I see the same thing. cheers, jamal -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2010-04-26 at 16:55 +0200, Eric Dumazet wrote: > Another interesting finding: > > - if all packets are received on a single queue, max speed seems to be > 1.200.000 packets per second on my machine :-( Well, if it's any consolation, it is not as bad as sky2 hardware;-> I can't do more than 750Kpps. Also, it seems you use VLANS - max pps will be lower than without VLANs by probably maybe 6-70Kpps (doesn't explain the 1.2Mpps of course). cheers, jamal -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit : > > Andi, what do you think of this one ? > > Dont we have a function to send an IPI to an individual cpu instead ? > > That's what this function already does. You only set a single CPU > in the target mask, right? > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC > which is a bit faster for this, but that's not available in the lower > end Nehalems. But even then it's not exactly fast. > > I don't think the IPI primitive can be optimized much. It's not a cheap > operation. > > If it's a problem do it less often and batch IPIs. > > It's essentially the same problem as interrupt mitigation or NAPI > are solving for NICs. I guess just need a suitable mitigation mechanism. > > Of course that would move more work to the sending CPU again, but > perhaps there's no alternative. I guess you could make it cheaper it by > minimizing access to packet data. > > -Andi Well, IPI are already batched, and rate is auto adaptative. After various changes, it seems things are going better, maybe there is something related to cache line trashing. I 'solved' it by using idle=poll, but you might take a look at clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly contended spinlock... 23.52% init [kernel.kallsyms] [k] _raw_spin_lock_irqsave | --- _raw_spin_lock_irqsave | |--94.74%-- clockevents_notify | lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary | |--4.10%-- tick_broadcast_oneshot_control | tick_notify | notifier_call_chain | __raw_notifier_call_chain | raw_notifier_call_chain | clockevents_do_notify | clockevents_notify | lapic_timer_state_broadcast | acpi_idle_enter_bm | cpuidle_idle_call | cpu_idle | start_secondary | -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit : > > > Andi, what do you think of this one ? > > > Dont we have a function to send an IPI to an individual cpu instead ? > > > > That's what this function already does. You only set a single CPU > > in the target mask, right? > > > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC > > which is a bit faster for this, but that's not available in the lower > > end Nehalems. But even then it's not exactly fast. > > > > I don't think the IPI primitive can be optimized much. It's not a cheap > > operation. > > > > If it's a problem do it less often and batch IPIs. > > > > It's essentially the same problem as interrupt mitigation or NAPI > > are solving for NICs. I guess just need a suitable mitigation mechanism. > > > > Of course that would move more work to the sending CPU again, but > > perhaps there's no alternative. I guess you could make it cheaper it by > > minimizing access to packet data. > > > > -Andi > > Well, IPI are already batched, and rate is auto adaptative. > > After various changes, it seems things are going better, maybe there is > something related to cache line trashing. > > I 'solved' it by using idle=poll, but you might take a look at > clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly > contended spinlock... 
> > > > > 23.52% init [kernel.kallsyms] [k] _raw_spin_lock_irqsave > | > --- _raw_spin_lock_irqsave > | > |--94.74%-- clockevents_notify > | lapic_timer_state_broadcast > | acpi_idle_enter_bm > | cpuidle_idle_call > | cpu_idle > | start_secondary > | > |--4.10%-- tick_broadcast_oneshot_control > | tick_notify > | notifier_call_chain > | __raw_notifier_call_chain > | raw_notifier_call_chain > | clockevents_do_notify > | clockevents_notify > | lapic_timer_state_broadcast > | acpi_idle_enter_bm > | cpuidle_idle_call > | cpu_idle > | start_secondary > | > I keep getting asked about taking some core's away from clock and scheduler to be reserved just for network processing. Seeing this kind of stuff makes me wonder if maybe that isn't a half bad idea.
Le jeudi 29 avril 2010 à 20:23 +0200, Andi Kleen a écrit : > On Thu, Apr 29, 2010 at 07:56:12PM +0200, Eric Dumazet wrote: > > Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit : > > > > Andi, what do you think of this one ? > > > > Dont we have a function to send an IPI to an individual cpu instead ? > > > > > > That's what this function already does. You only set a single CPU > > > in the target mask, right? > > > > > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC > > > which is a bit faster for this, but that's not available in the lower > > > end Nehalems. But even then it's not exactly fast. > > > > > > I don't think the IPI primitive can be optimized much. It's not a cheap > > > operation. > > > > > > If it's a problem do it less often and batch IPIs. > > > > > > It's essentially the same problem as interrupt mitigation or NAPI > > > are solving for NICs. I guess just need a suitable mitigation mechanism. > > > > > > Of course that would move more work to the sending CPU again, but > > > perhaps there's no alternative. I guess you could make it cheaper it by > > > minimizing access to packet data. > > > > > > -Andi > > > > Well, IPI are already batched, and rate is auto adaptative. > > > > After various changes, it seems things are going better, maybe there is > > something related to cache line trashing. > > > > I 'solved' it by using idle=poll, but you might take a look at > > clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly > > contended spinlock... > > acpi_idle_enter_bm should not be executed on a Nehalem, it's obsolete. > If it does on your system something is wrong. > > Ahh, that triggers a bell. There's one issue that if the remote CPU is in a very > deep idle state it could take a long time to wake it up. Nehalem has deeper > sleep states than earlier CPUs. When this happens the IPI sender will be slow > too I believe. > > Are the target CPUs idle? 
> Yes, mostly, but about 200.000 wakeups per second I would say... If a cpu in deep state receives an IPI, processes a softirq, should it come back to deep state immediately, or should it wait for some milliseconds ? > Perhaps need to feed some information to cpuidle's governour to prevent this problem. > > idle=poll is very drastic, better to limit to C1 > How can I do this ? Thanks ! -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 29 Apr 2010, Stephen Hemminger wrote: > > Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit : > > > > Andi, what do you think of this one ? > > > > Dont we have a function to send an IPI to an individual cpu instead ? > > > > > > That's what this function already does. You only set a single CPU > > > in the target mask, right? > > > > > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC > > > which is a bit faster for this, but that's not available in the lower > > > end Nehalems. But even then it's not exactly fast. > > > > > > I don't think the IPI primitive can be optimized much. It's not a cheap > > > operation. > > > > > > If it's a problem do it less often and batch IPIs. > > > > > > It's essentially the same problem as interrupt mitigation or NAPI > > > are solving for NICs. I guess just need a suitable mitigation mechanism. > > > > > > Of course that would move more work to the sending CPU again, but > > > perhaps there's no alternative. I guess you could make it cheaper it by > > > minimizing access to packet data. > > > > > > -Andi > > > > Well, IPI are already batched, and rate is auto adaptative. > > > > After various changes, it seems things are going better, maybe there is > > something related to cache line trashing. > > > > I 'solved' it by using idle=poll, but you might take a look at > > clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly > > contended spinlock... Say thanks to Intel/AMD for providing us timers which stop in lower c-states. Not much we can do about the broadcast lock when several cores are going idle and we need to setup a global timer to work around the lapic timer stops in C2/C3 issue. Simply the C-state timer broadcasting does not scale. And it was never meant to scale. It's a workaround for laptops to have functional NOHZ. 
There are several ways to work around that on larger machines: - Restrict c-states - Disable NOHZ and highres timers - idle=poll is definitely the worst of all possible solutions > I keep getting asked about taking some core's away from clock and scheduler > to be reserved just for network processing. Seeing this kind of stuff > makes me wonder if maybe that isn't a half bad idea. This comes up every few month and we pointed out several times what needs to be done to make this work w/o these weird hacks which put a core offline and then start some magic undebugable binary blob on it. We have not seen anyone working on this, but the "set cores aside and let them do X" idea seems to stick in peoples heads. Seriously, that's not a solution. It's going to be some hacked up nightmare which is completely unmaintainable. Aside of that I seriously doubt that you can do networking w/o time and timers. Thanks, tglx
Le jeudi 29 avril 2010 à 21:19 +0200, Thomas Gleixner a écrit : > Say thanks to Intel/AMD for providing us timers which stop in lower > c-states. > > Not much we can do about the broadcast lock when several cores are > going idle and we need to setup a global timer to work around the > lapic timer stops in C2/C3 issue. > > Simply the C-state timer broadcasting does not scale. And it was never > meant to scale. It's a workaround for laptops to have functional NOHZ. > > There are several ways to work around that on larger machines: > > - Restrict c-states > - Disable NOHZ and highres timers > - idle=poll is definitely the worst of all possible solutions > > > I keep getting asked about taking some core's away from clock and scheduler > > to be reserved just for network processing. Seeing this kind of stuff > > makes me wonder if maybe that isn't a half bad idea. > > This comes up every few month and we pointed out several times what > needs to be done to make this work w/o these weird hacks which put a > core offline and then start some magic undebugable binary blob on it. > We have not seen anyone working on this, but the "set cores aside and > let them do X" idea seems to stick in peoples heads. > > Seriously, that's not a solution. It's going to be some hacked up > nightmare which is completely unmaintainable. > > Aside of that I seriously doubt that you can do networking w/o time > and timers. > Thanks a lot ! booting with processor.max_cstate=1 solves the problem (I already had a CONFIG_NO_HZ=no conf, but highres timer enabled) Even with _carefuly_ chosen crazy configuration (receiving a packet on a cpu, then transfert it to another cpu, with a full 16x16 matrix involved), generating 700.000 IPI per second on the machine seems fine now. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le jeudi 29 avril 2010 à 23:41 +0200, Andi Kleen a écrit : > On Thu, Apr 29, 2010 at 09:12:27PM +0200, Eric Dumazet wrote: > > Yes, mostly, but about 200.000 wakeups per second I would say... > > > > If a cpu in deep state receives an IPI, process a softirq, should it > > come back to deep state immediately, or should it wait for some > > milliseconds ? > > In principle the cpuidle governour should detect this and not put the target into > the slow deep c states. One change that was done recently to fix a similar > problem for disk IO was to take processes that wait for IO into account > (see 69d25870). But it doesn't work for networking. > > Here's a untested patch that might help: tell the cpuidle governour > networking is waiting for IO. This will tell it to not go down the deeply. > > I might have missed some schedule() paths, feel free to add more. > > Actually it's probably too aggressive because it will avoid C states even for > a closed window on the other side which might be hours. Better would > be some heuristic to only do this when you're really expected IO shortly. > > Also does your workload even sleep at all? If not we would need to increase > the iowait counters in recvmsg() itself. > My workload yes, uses blocking recvmsg() calls, but Jamal one uses epoll() so I guess problem is more generic than that. We should have an estimate of the number of wakeups (IO or not...) per second (or sub-second) so that cpuidle can avoid these deep states ? > Anyways might be still worth a try. > > For routing we probably need some other solution though, there are no > schedules there. > > > > > > Perhaps need to feed some information to cpuidle's governour to prevent this problem. > > > > > > idle=poll is very drastic, better to limit to C1 > > > > > > > How can I do this ? > > processor.max_cstate=1 or using /dev/network_latency > (see Documentation/power/pm_qos_interface.txt) > > -Andi > Thanks, I'll play with this today ! 
> > > commit 810227a7c24ecae2bb4aac320490a7115ac33be8 > Author: Andi Kleen <ak@linux.intel.com> > Date: Thu Apr 29 23:33:18 2010 +0200 > > Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies > > XXX: probably too aggressive, some of these sleeps are not under high load. > > Based on a bug report from Eric Dumazet. > > Signed-off-by: Andi Kleen <ak@linux.intel.com> > > diff --git a/net/core/sock.c b/net/core/sock.c > index c5812bb..c246d6c 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -1402,7 +1402,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) > break; > if (sk->sk_err) > break; > - timeo = schedule_timeout(timeo); > + timeo = io_schedule_timeout(timeo); > } > finish_wait(sk->sk_sleep, &wait); > return timeo; > @@ -1512,7 +1512,7 @@ static void __lock_sock(struct sock *sk) > prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, > TASK_UNINTERRUPTIBLE); > spin_unlock_bh(&sk->sk_lock.slock); > - schedule(); > + io_schedule(); > spin_lock_bh(&sk->sk_lock.slock); > if (!sock_owned_by_user(sk)) > break; > > > > > Thanks ! > > > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Eric Dumazet wrote: > Le jeudi 29 avril 2010 à 21:19 +0200, Thomas Gleixner a écrit : > >> Say thanks to Intel/AMD for providing us timers which stop in lower >> c-states. >> >> Not much we can do about the broadcast lock when several cores are >> going idle and we need to setup a global timer to work around the >> lapic timer stops in C2/C3 issue. >> >> Simply the C-state timer broadcasting does not scale. And it was never >> meant to scale. It's a workaround for laptops to have functional NOHZ. >> >> There are several ways to work around that on larger machines: >> >> - Restrict c-states >> - Disable NOHZ and highres timers >> - idle=poll is definitely the worst of all possible solutions >> >>> I keep getting asked about taking some core's away from clock and scheduler >>> to be reserved just for network processing. Seeing this kind of stuff >>> makes me wonder if maybe that isn't a half bad idea. >> This comes up every few month and we pointed out several times what >> needs to be done to make this work w/o these weird hacks which put a >> core offline and then start some magic undebugable binary blob on it. >> We have not seen anyone working on this, but the "set cores aside and >> let them do X" idea seems to stick in peoples heads. >> >> Seriously, that's not a solution. It's going to be some hacked up >> nightmare which is completely unmaintainable. >> >> Aside of that I seriously doubt that you can do networking w/o time >> and timers. >> > > Thanks a lot ! > > booting with processor.max_cstate=1 solves the problem > > (I already had a CONFIG_NO_HZ=no conf, but highres timer enabled) > > Even with _carefuly_ chosen crazy configuration (receiving a packet on a > cpu, then transfert it to another cpu, with a full 16x16 matrix > involved), generating 700.000 IPI per second on the machine seems fine > now. 
FYI you can also restrict c=states at runtime with PM QoS: Documentation/power/pm_qos_interface.txt On my machine, /sys/devices/system/cpu/cpu0/cpuidle/state2/latency is 205usec, so configuring a PM QoS request for <= 205usec latency should prevent it being entered: #!/usr/bin/python import os; import struct; import signal; latency_rec_usec = 100 f = os.open("/dev/cpu_dma_latency", os.O_WRONLY); os.write(f, struct.pack("=i", latency_rec_usec)); signal.pause(); -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Thomas Gleixner <tglx@linutronix.de> Date: Thu, 29 Apr 2010 21:19:36 +0200 (CEST) > Aside of that I seriously doubt that you can do networking w/o time > and timers. You're right that we need timestamps and the like. But only if we actually process the packets on these restricted cpus :-) If we use RPS and farm out all packets to other cpus, ie. just doing the driver work and the remote cpu dispatch on these "offline" cpus, it is doable. Then we can do cool tricks like having the cpu spin on a mwait() on the network device's status descriptor in memory. In any event I agree with you, it's a cool idea at best, and likely not really practical. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Dave, On Fri, 30 Apr 2010, David Miller wrote: > From: Thomas Gleixner <tglx@linutronix.de> > Date: Thu, 29 Apr 2010 21:19:36 +0200 (CEST) > > > Aside of that I seriously doubt that you can do networking w/o time > > and timers. > > You're right that we need timestamps and the like. > > But only if we actually process the packets on these restricted cpus :-) > > If we use RPS and farm out all packets to other cpus, ie. just doing > the driver work and the remote cpu dispatch on these "offline" cpus, > it is doable. > > Then we can do cool tricks like having the cpu spin on a mwait() on the > network device's status descriptor in memory. > > In any event I agree with you, it's a cool idea at best, and likely > not really practical. Well, it might be worth to experiment with that once we get the basic infrastructure in place to "isolate" cores under full kernel control. It's not too hard to solve the problems, but it seems nobody has a free time slot to tackle them. Thanks tglx -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> Then we can do cool tricks like having the cpu spin on a mwait() on the > network device's status descriptor in memory. When you specify a deep C state in that mwait then it will also have the long wakeup latency in the idle case. When you don't then you just killed higher Turbo mode on that socket and give away a lot of performance on the other cores. So you have to solve the idle state governour issue anyways, and then you likely don't need it anymore. Besides it seems to me that dispatching is something the NIC should just do directly. "RPS only CPU" would be essentially just an interrupt mitigation/flow redirection scheme that a lot of NICs do anyways. > In any event I agree with you, it's a cool idea at best, and likely > not really practical. s/cool// -Andi -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Andi Kleen <andi@firstfloor.org> Date: Fri, 30 Apr 2010 23:01:31 +0200 > Besides it seems to me that dispatching is something the NIC should > just do directly. "RPS only CPU" would be essentially just an > interrupt mitigation/flow redirection scheme that a lot of NICs > do anyways. We've already established that the NIC can't do a complete job in all important cases, that's why we've integrated the RPS/RFS patches in the first place. And we don't want it to, because the decision mechanisms for steering that we using now are starting to get into the stateful territory and that's verbotton for NIC offload as far as we're concerned. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Andi Kleen <ak@gargoyle.fritz.box> Date: Thu, 29 Apr 2010 23:41:44 +0200 > Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies > > XXX: probably too aggressive, some of these sleeps are not under high load. > > Based on a bug report from Eric Dumazet. > > Signed-off-by: Andi Kleen <ak@linux.intel.com> I like this, except that we probably don't want the delayacct_blkio_*() calls these things do. Probably the rest of what these things do should remain in the io_schedule*() functions and the block layer can call it's own versions which add in the delayacct_blkio_*() bits. Or, if the delacct stuff is useful for socket I/O too, then it's interfaces names should have the "blk" stripped from them :-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> And we don't want it to, because the decision mechanisms for steering > that we using now are starting to get into the stateful territory and > that's verbotton for NIC offload as far as we're concerned. Huh? I thought full TCP offload was forbidden?[1] Statefull as in NIC (or someone else like netfilter) tracking flows is quite common and very far from full offload. AFAIK it doesn't have near all the problems full offload has. -Andi [1] although it seems to leak in more and more through the RDMA backdoor. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Apr 30, 2010 at 04:38:57PM -0700, David Miller wrote: > From: Andi Kleen <ak@gargoyle.fritz.box> > Date: Thu, 29 Apr 2010 23:41:44 +0200 > > > Use io_schedule() in network stack to tell cpuidle governor to guarantee lower latencies > > > > XXX: probably too aggressive, some of these sleeps are not under high load. > > > > Based on a bug report from Eric Dumazet. > > > > Signed-off-by: Andi Kleen <ak@linux.intel.com> > > I like this, except that we probably don't want the delayacct_blkio_*() calls > these things do. Yes. It needs more work, please don't apply it yet, to handle the "long sleep" case. Still curious if it fixes Eric's test case. > > Probably the rest of what these things do should remain in the io_schedule*() > functions and the block layer can call its own versions which add in the > delayacct_blkio_*() bits. Good point. > > Or, if the delayacct stuff is useful for socket I/O too, then its interface > names should have the "blk" stripped from them :-) Good question. I suspect it's actually useful for some cases, but just adding sockets might confuse some users. -Andi -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 30 Apr 2010, David Miller wrote: > Then we can do cool tricks like having the cpu spin on a mwait() on the > network device's status descriptor in memory. Can you have mwait monitor multiple cachelines for stores? If not then it might be hard to do that when you have multiple nics and you actually need to use the status descriptors, otherwise you could possibly have them all written to the same cacheline. Or if the nic doesn't support updating a status descriptor in memory. If you just want to wake up quickly without using interrupts it might be possible to abuse MSI to wake up without actually using interrupts, set the address to the cacheline that is being monitored. /Martin -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Andi Kleen <andi@firstfloor.org> Date: Sat, 1 May 2010 12:53:04 +0200 >> And we don't want it to, because the decision mechanisms for steering >> that we are using now are starting to get into the stateful territory and >> that's verboten for NIC offload as far as we're concerned. > > Huh? I thought full TCP offload was forbidden?[1] Stateful as in NIC > (or someone else like netfilter) tracking flows is quite common and very far > from full offload. AFAIK it doesn't have near all the problems full > offload has. We're tracking flow cpu location state at the socket operations, like recvmsg() and sendmsg(), where it belongs. Would you like us to call into the card drivers and firmware at these spots instead? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Martin Josefsson <gandalf@mjufs.se> Date: Sat, 1 May 2010 22:31:05 +0200 (CEST) > On Fri, 30 Apr 2010, David Miller wrote: > >> Then we can do cool tricks like having the cpu spin on a mwait() on >> the >> network device's status descriptor in memory. > > Can you have mwait monitor multiple cachelines for stores? The idea is that if you have hundreds of cpu threads (several of my machines do, and it's not too long before these kinds of boxes will be common) in your machine you can spare one for each NIC. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> We're tracking flow cpu location state at the socket operations, like > recvmsg() and sendmsg(), where it belongs. > > Would you like us to call into the card drivers and firmware at these > spots instead? No, that's not needed for lazy flow tracking like in netfilter or some NICs, it doesn't need exact updates. It just works with seen network packets. -Andi -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Andi Kleen <andi@firstfloor.org> Date: Sun, 2 May 2010 00:58:15 +0200 >> We're tracking flow cpu location state at the socket operations, like >> recvmsg() and sendmsg(), where it belongs. >> >> Would you like us to call into the card drivers and firmware at these >> spots instead? > > No, that's not needed for lazy flow tracking like in netfilter or > some NICs, it doesn't need exact updates. It just works with seen network > packets. Well what we need is exact flow updates so that we steer packets to where the applications actually are. Andi, this discussion is going in circles, can I just say "yeah you're right Andi" and this will satisfy your desire to be correct and we can be done with this? Thanks. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, 2010-05-01 at 15:03 -0700, David Miller wrote: > From: Andi Kleen <andi@firstfloor.org> > Date: Sat, 1 May 2010 12:53:04 +0200 > > >> And we don't want it to, because the decision mechanisms for steering > >> that we are using now are starting to get into the stateful territory and > >> that's verboten for NIC offload as far as we're concerned. > > > > Huh? I thought full TCP offload was forbidden?[1] Stateful as in NIC > > (or someone else like netfilter) tracking flows is quite common and very far > > from full offload. AFAIK it doesn't have near all the problems full > > offload has. > > We're tracking flow cpu location state at the socket operations, like > recvmsg() and sendmsg(), where it belongs. > > Would you like us to call into the card drivers and firmware at these > spots instead? I'm interested in experimenting with this at some point, since our hardware supports a fairly large number of filters that could be used for it. Ben.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3c5ed5f..6ae9f2b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1387,6 +1387,7 @@ struct softnet_data { struct Qdisc *output_queue; struct list_head poll_list; struct sk_buff *completion_queue; + struct sk_buff_head process_queue; #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; @@ -1401,10 +1402,11 @@ struct softnet_data { struct napi_struct backlog; }; -static inline void input_queue_head_incr(struct softnet_data *sd) +static inline void input_queue_head_add(struct softnet_data *sd, + unsigned int len) { #ifdef CONFIG_RPS - sd->input_queue_head++; + sd->input_queue_head += len; #endif } diff --git a/net/core/dev.c b/net/core/dev.c index a4a7c36..c1585f9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2409,12 +2409,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, __get_cpu_var(netdev_rx_stat).total++; rps_lock(sd); - if (sd->input_pkt_queue.qlen <= netdev_max_backlog) { - if (sd->input_pkt_queue.qlen) { + if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + if (skb_queue_len(&sd->input_pkt_queue)) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); #ifdef CONFIG_RPS - *qtail = sd->input_queue_head + sd->input_pkt_queue.qlen; + *qtail = sd->input_queue_head + + skb_queue_len(&sd->input_pkt_queue); #endif rps_unlock(sd); local_irq_restore(flags); @@ -2934,13 +2935,21 @@ static void flush_backlog(void *arg) struct sk_buff *skb, *tmp; rps_lock(sd); - skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev == dev) { __skb_unlink(skb, &sd->input_pkt_queue); kfree_skb(skb); - input_queue_head_incr(sd); + input_queue_head_add(sd, 1); } + } rps_unlock(sd); + + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { + if (skb->dev == dev) { + __skb_unlink(skb, &sd->process_queue); + kfree_skb(skb); + } + } } static int napi_gro_complete(struct sk_buff *skb) @@ -3286,24 
+3295,30 @@ static int process_backlog(struct napi_struct *napi, int quota) } #endif napi->weight = weight_p; - do { + local_irq_disable(); + while (1) { struct sk_buff *skb; - local_irq_disable(); + while ((skb = __skb_dequeue(&sd->process_queue))) { + local_irq_enable(); + __netif_receive_skb(skb); + if (++work >= quota) + return work; + local_irq_disable(); + } + rps_lock(sd); - skb = __skb_dequeue(&sd->input_pkt_queue); - if (!skb) { + input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue)); + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + if (skb_queue_empty(&sd->process_queue)) { __napi_complete(napi); rps_unlock(sd); - local_irq_enable(); break; } - input_queue_head_incr(sd); rps_unlock(sd); - local_irq_enable(); - - __netif_receive_skb(skb); - } while (++work < quota); + } + local_irq_enable(); return work; } @@ -5631,8 +5646,10 @@ static int dev_cpu_callback(struct notifier_block *nfb, /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + input_queue_head_add(oldsd, 1); } + while ((skb = __skb_dequeue(&oldsd->process_queue))) + netif_rx(skb); return NOTIFY_OK; } @@ -5851,6 +5868,7 @@ static int __init net_dev_init(void) struct softnet_data *sd = &per_cpu(softnet_data, i); skb_queue_head_init(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); sd->completion_queue = NULL; INIT_LIST_HEAD(&sd->poll_list);
batch skb dequeueing from softnet input_pkt_queue. batch skb dequeueing from softnet input_pkt_queue to reduce potential lock contention when RPS is enabled. Note: in the worst case, the number of packets in a softnet_data may be double of netdev_max_backlog. Signed-off-by: Changli Gao <xiaosuo@gmail.com> ---- include/linux/netdevice.h | 6 +++-- net/core/dev.c | 50 +++++++++++++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 18 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html