diff mbox

[v6] net: batch skb dequeueing from softnet input_pkt_queue

Message ID 1272010378-2955-1-git-send-email-xiaosuo@gmail.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Changli Gao April 23, 2010, 8:12 a.m. UTC
batch skb dequeueing from softnet input_pkt_queue.

batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
contention when RPS is enabled.

Note: in the worst case, the number of packets in a softnet_data may be double
of netdev_max_backlog.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 include/linux/netdevice.h |    6 +++--
 net/core/dev.c            |   50 +++++++++++++++++++++++++++++++---------------
 2 files changed, 38 insertions(+), 18 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Eric Dumazet April 23, 2010, 9:27 a.m. UTC | #1
Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit :
> batch skb dequeueing from softnet input_pkt_queue.
> 
> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> contention when RPS is enabled.
> 
> Note: in the worst case, the number of packets in a softnet_data may be double
> of netdev_max_backlog.
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>

Very good patch Changli, thanks !

Lets see how it improves thing for Jamal benchs ;)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

> ----
>  include/linux/netdevice.h |    6 +++--
>  net/core/dev.c            |   50 +++++++++++++++++++++++++++++++---------------
>  2 files changed, 38 insertions(+), 18 deletions(-)
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3c5ed5f..6ae9f2b 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1387,6 +1387,7 @@ struct softnet_data {
>  	struct Qdisc		*output_queue;
>  	struct list_head	poll_list;
>  	struct sk_buff		*completion_queue;
> +	struct sk_buff_head	process_queue;
>  
>  #ifdef CONFIG_RPS
>  	struct softnet_data	*rps_ipi_list;
> @@ -1401,10 +1402,11 @@ struct softnet_data {
>  	struct napi_struct	backlog;
>  };
>  
> -static inline void input_queue_head_incr(struct softnet_data *sd)
> +static inline void input_queue_head_add(struct softnet_data *sd,
> +					unsigned int len)
>  {
>  #ifdef CONFIG_RPS
> -	sd->input_queue_head++;
> +	sd->input_queue_head += len;
>  #endif
>  }
>  
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a4a7c36..c1585f9 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2409,12 +2409,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
>  	__get_cpu_var(netdev_rx_stat).total++;
>  
>  	rps_lock(sd);
> -	if (sd->input_pkt_queue.qlen <= netdev_max_backlog) {
> -		if (sd->input_pkt_queue.qlen) {
> +	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
> +		if (skb_queue_len(&sd->input_pkt_queue)) {
>  enqueue:
>  			__skb_queue_tail(&sd->input_pkt_queue, skb);
>  #ifdef CONFIG_RPS
> -			*qtail = sd->input_queue_head + sd->input_pkt_queue.qlen;
> +			*qtail = sd->input_queue_head +
> +					skb_queue_len(&sd->input_pkt_queue);
>  #endif
>  			rps_unlock(sd);
>  			local_irq_restore(flags);
> @@ -2934,13 +2935,21 @@ static void flush_backlog(void *arg)
>  	struct sk_buff *skb, *tmp;
>  
>  	rps_lock(sd);
> -	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp)
> +	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
>  		if (skb->dev == dev) {
>  			__skb_unlink(skb, &sd->input_pkt_queue);
>  			kfree_skb(skb);
> -			input_queue_head_incr(sd);
> +			input_queue_head_add(sd, 1);
>  		}
> +	}
>  	rps_unlock(sd);
> +
> +	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
> +		if (skb->dev == dev) {
> +			__skb_unlink(skb, &sd->process_queue);
> +			kfree_skb(skb);
> +		}
> +	}
>  }
>  
>  static int napi_gro_complete(struct sk_buff *skb)
> @@ -3286,24 +3295,30 @@ static int process_backlog(struct napi_struct *napi, int quota)
>  	}
>  #endif
>  	napi->weight = weight_p;
> -	do {
> +	local_irq_disable();
> +	while (1) {
>  		struct sk_buff *skb;
>  
> -		local_irq_disable();
> +		while ((skb = __skb_dequeue(&sd->process_queue))) {
> +			local_irq_enable();
> +			__netif_receive_skb(skb);
> +			if (++work >= quota)
> +				return work;
> +			local_irq_disable();
> +		}
> +
>  		rps_lock(sd);
> -		skb = __skb_dequeue(&sd->input_pkt_queue);
> -		if (!skb) {
> +		input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue));
> +		skb_queue_splice_tail_init(&sd->input_pkt_queue,
> +					   &sd->process_queue);
> +		if (skb_queue_empty(&sd->process_queue)) {
>  			__napi_complete(napi);
>  			rps_unlock(sd);
> -			local_irq_enable();
>  			break;
>  		}
> -		input_queue_head_incr(sd);
>  		rps_unlock(sd);
> -		local_irq_enable();
> -
> -		__netif_receive_skb(skb);
> -	} while (++work < quota);
> +	}
> +	local_irq_enable();
>  
>  	return work;
>  }
> @@ -5631,8 +5646,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
>  	/* Process offline CPU's input_pkt_queue */
>  	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
>  		netif_rx(skb);
> -		input_queue_head_incr(oldsd);
> +		input_queue_head_add(oldsd, 1);
>  	}
> +	while ((skb = __skb_dequeue(&oldsd->process_queue)))
> +		netif_rx(skb);
>  
>  	return NOTIFY_OK;
>  }
> @@ -5851,6 +5868,7 @@ static int __init net_dev_init(void)
>  		struct softnet_data *sd = &per_cpu(softnet_data, i);
>  
>  		skb_queue_head_init(&sd->input_pkt_queue);
> +		skb_queue_head_init(&sd->process_queue);
>  		sd->completion_queue = NULL;
>  		INIT_LIST_HEAD(&sd->poll_list);
>  
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal April 23, 2010, 10:02 p.m. UTC | #2
On Fri, 2010-04-23 at 11:27 +0200, Eric Dumazet wrote:

> 
> Lets see how it improves thing for Jamal benchs ;)


I've done a setup with the last patch from Changli + net-next - I will
post test results tomorrow AM.

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal April 24, 2010, 2:10 p.m. UTC | #3
On Fri, 2010-04-23 at 18:02 -0400, jamal wrote:

> I've done a setup with the last patch from Changli + net-next - I will
> post test results tomorrow AM.

ok, annotated results attached. 

cheers,
jamal
sink    cpu all     cpuint       cpuapp
nn-standalone 	93.95%   84.5%        99.8%        79.8%
nn-rps          96.41%   85.4%        95.5%        82.5%
nn-cl           97.29%   84.0%        99.9%        79.6%
nn-cl-rps       97.76%   86.5%        96.5%        84.8%

nn-standalone: Basic net-next from Apr23
nn-rps: Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0
nn-cl: Basic net-next from Apr23 + Changli patch
nn-cl-rps: Basic net-next from Apr23 + Changli patch + rps mask ee,irq aff cpu0
sink: the amount of traffic the system was able to sink in.
cpu all: avg % system cpu consumed in test
cpuint: avg %cpu consumed by the cpu where interrupts happened
cpuapp: avg %cpu consumed by a sample cpu which did app processing

Testing was as previously explained.
I repeated each test 4-5 times and took averages.

It seems the non-rps case has improved dramatically since the last 
net-next I tested. The rps case has also improved but the gap between 
rps and non-rps is smaller.
[There are just too many variables for me to pinpoint
to one item as being the contributor. For example, the sky2 driver may
have become worse (consumes more cycles) but I can't quantify it yet
(I just see sky2_rx_submit showing up higher in profiles than before).
Also call_function_single_interrupt shows up prominently on application
processing CPUs but is improved by Changli's changes].
After doing the math, I don't trust my results after applying Changli's patch. 
It seems both the rps and non-rps cases have gotten better (and I don't 
see Changli's contribution to non-rps). It also seems that the gap between 
rps and non-rps is non-existent now. In other words, there is no benefit to
using rps (it consumes more cpu for the same throughput). So it is likely 
that I need to repeat these tests; maybe I did something wrong in my setup...

And here are the profiles:
--------------------------

cpu0 always received all the interrupts regardless of the tests.
cpu1, 7 etc were processing apps..
I could not spot much difference between before and after Changli's patch.


I: Test setup : nn-standalone: Basic net-next from Apr23

All cpus

-------------------------------------------------------------------------------
   PerfTop:    3784 irqs/sec  kernel:84.2% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             3254.00 10.3% sky2_poll                   [sky2]  
             1853.00  5.9% _raw_spin_lock_irqsave      [kernel]
              872.00  2.8% fget                        [kernel]
              870.00  2.8% copy_user_generic_string    [kernel]
              819.00  2.6% _raw_spin_unlock_irqrestore [kernel]
              729.00  2.3% sys_epoll_ctl               [kernel]
              701.00  2.2% datagram_poll               [kernel]
              615.00  2.0% udp_recvmsg                 [kernel]
              602.00  1.9% _raw_spin_lock_bh           [kernel]
              595.00  1.9% system_call                 [kernel]
              592.00  1.9% kmem_cache_free             [kernel]
              574.00  1.8% schedule                    [kernel]
              568.00  1.8% _raw_spin_lock              [kernel]


-------------------------------------------------------------------------------
   PerfTop:    3574 irqs/sec  kernel:85.1% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             5023.00 10.9% sky2_poll                   [sky2]  
             2762.00  6.0% _raw_spin_lock_irqsave      [kernel]
             1319.00  2.9% copy_user_generic_string    [kernel]
             1306.00  2.8% fget                        [kernel]
             1198.00  2.6% _raw_spin_unlock_irqrestore [kernel]
             1071.00  2.3% datagram_poll               [kernel]
             1061.00  2.3% sys_epoll_ctl               [kernel]
              927.00  2.0% _raw_spin_lock_bh           [kernel]
              917.00  2.0% system_call                 [kernel]
              901.00  1.9% udp_recvmsg                 [kernel]
              895.00  1.9% kmem_cache_free             [kernel]
              819.00  1.8% _raw_spin_lock              [kernel]
              802.00  1.7% schedule                    [kernel]
              774.00  1.7% sys_epoll_wait              [kernel]
              720.00  1.6% kmem_cache_alloc            [kernel]


-------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

              751.00 36.1% sky2_poll              [sky2]  
              108.00  5.2% __udp4_lib_lookup      [kernel]
               95.00  4.6% ip_route_input         [kernel]
               83.00  4.0% _raw_spin_lock         [kernel]
               79.00  3.8% _raw_spin_lock_irqsave [kernel]
               77.00  3.7% __netif_receive_skb    [kernel]
               77.00  3.7% __alloc_skb            [kernel]
               66.00  3.2% ip_rcv                 [kernel]
               60.00  2.9% __udp4_lib_rcv         [kernel]
               54.00  2.6% sock_queue_rcv_skb     [kernel]
               45.00  2.2% sky2_rx_submit         [sky2]  
               42.00  2.0% __wake_up_common       [kernel]
               40.00  1.9% __kmalloc              [kernel]
               39.00  1.9% sock_def_readable      [kernel]
               30.00  1.4% ep_poll_callback       [kernel]


-------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:99.8% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

             3511.00 36.7% sky2_poll              [sky2]  
              519.00  5.4% __udp4_lib_lookup      [kernel]
              431.00  4.5% ip_route_input         [kernel]
              353.00  3.7% _raw_spin_lock_irqsave [kernel]
              351.00  3.7% __alloc_skb            [kernel]
              338.00  3.5% __netif_receive_skb    [kernel]
              337.00  3.5% _raw_spin_lock         [kernel]
              307.00  3.2% ip_rcv                 [kernel]
              264.00  2.8% sky2_rx_submit         [sky2]  
              254.00  2.7% sock_queue_rcv_skb     [kernel]
              246.00  2.6% __udp4_lib_rcv         [kernel]
              206.00  2.2% sock_def_readable      [kernel]
              177.00  1.9% __wake_up_common       [kernel]
              168.00  1.8% __kmalloc              [kernel]


-------------------------------------------------------------------------------
   PerfTop:     908 irqs/sec  kernel:80.0% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

              177.00  6.7% _raw_spin_lock_irqsave      [kernel]
              120.00  4.5% copy_user_generic_string    [kernel]
              110.00  4.2% fget                        [kernel]
              108.00  4.1% datagram_poll               [kernel]
               98.00  3.7% _raw_spin_lock_bh           [kernel]
               91.00  3.4% sys_epoll_ctl               [kernel]
               89.00  3.4% kmem_cache_free             [kernel]
               77.00  2.9% system_call                 [kernel]
               76.00  2.9% schedule                    [kernel]
               76.00  2.9% _raw_spin_unlock_irqrestore [kernel]
               63.00  2.4% fput                        [kernel]
               61.00  2.3% sys_epoll_wait              [kernel]
               61.00  2.3% udp_recvmsg                 [kernel]
               49.00  1.8% process_recv                mcpudp  


-------------------------------------------------------------------------------
   PerfTop:     815 irqs/sec  kernel:79.8% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ _________________

              491.00  8.0% _raw_spin_lock_irqsave      [kernel.kallsyms]
              285.00  4.7% copy_user_generic_string    [kernel.kallsyms]
              252.00  4.1% fget                        [kernel.kallsyms]
              215.00  3.5% datagram_poll               [kernel.kallsyms]
              206.00  3.4% _raw_spin_unlock_irqrestore [kernel.kallsyms]
              204.00  3.3% sys_epoll_ctl               [kernel.kallsyms]
              196.00  3.2% _raw_spin_lock_bh           [kernel.kallsyms]
              184.00  3.0% udp_recvmsg                 [kernel.kallsyms]
              184.00  3.0% kmem_cache_free             [kernel.kallsyms]
              180.00  2.9% system_call                 [kernel.kallsyms]
              168.00  2.7% sys_epoll_wait              [kernel.kallsyms]
              159.00  2.6% schedule                    [kernel.kallsyms]
              144.00  2.4% fput                        [kernel.kallsyms]


II: Test setup 
nn-rps: Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0

-------------------------------------------------------------------------------
   PerfTop:    3558 irqs/sec  kernel:85.0% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ________

             3519.00 15.9% sky2_poll                      [sky2]  
              865.00  3.9% _raw_spin_lock_irqsave         [kernel]
              568.00  2.6% _raw_spin_unlock_irqrestore    [kernel]
              526.00  2.4% sky2_intr                      [sky2]  
              493.00  2.2% __netif_receive_skb            [kernel]
              477.00  2.2% _raw_spin_lock                 [kernel]
              470.00  2.1% ip_rcv                         [kernel]
              456.00  2.1% fget                           [kernel]
              447.00  2.0% sys_epoll_ctl                  [kernel]
              420.00  1.9% copy_user_generic_string       [kernel]
              387.00  1.8% ip_route_input                 [kernel]
              359.00  1.6% system_call                    [kernel]
              334.00  1.5% kmem_cache_free                [kernel]
              310.00  1.4% kmem_cache_alloc               [kernel]
              302.00  1.4% call_function_single_interrupt [kernel]


-------------------------------------------------------------------------------
   PerfTop:    3546 irqs/sec  kernel:85.8% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ________

             6592.00 16.2% sky2_poll                      [sky2]  
             1540.00  3.8% _raw_spin_lock_irqsave         [kernel]
             1014.00  2.5% _raw_spin_unlock_irqrestore    [kernel]
              885.00  2.2% fget                           [kernel]
              881.00  2.2% _raw_spin_lock                 [kernel]
              880.00  2.2% sky2_intr                      [sky2]  
              872.00  2.1% __netif_receive_skb            [kernel]
              858.00  2.1% ip_rcv                         [kernel]
              802.00  2.0% sys_epoll_ctl                  [kernel]
              710.00  1.7% copy_user_generic_string       [kernel]
              696.00  1.7% system_call                    [kernel]
              692.00  1.7% ip_route_input                 [kernel]
              634.00  1.6% schedule                       [kernel]
              618.00  1.5% kmem_cache_free                [kernel]
              605.00  1.5% call_function_single_interrupt [kernel]


cpu0

-------------------------------------------------------------------------------
   PerfTop:     971 irqs/sec  kernel:96.5% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             4222.00 58.2% sky2_poll                   [sky2]  
              668.00  9.2% sky2_intr                   [sky2]  
              228.00  3.1% __alloc_skb                 [kernel]
              183.00  2.5% get_rps_cpu                 [kernel]
              138.00  1.9% sky2_rx_submit              [sky2]  
              124.00  1.7% enqueue_to_backlog          [kernel]
              119.00  1.6% __kmalloc                   [kernel]
              103.00  1.4% kmem_cache_alloc            [kernel]
               91.00  1.3% _raw_spin_lock              [kernel]
               90.00  1.2% _raw_spin_lock_irqsave      [kernel]
               73.00  1.0% swiotlb_sync_single         [kernel]
               72.00  1.0% irq_entries_start           [kernel]
               55.00  0.8% copy_user_generic_string    [kernel]
               53.00  0.7% _raw_spin_unlock_irqrestore [kernel]
               48.00  0.7% fget                        [kernel]


-------------------------------------------------------------------------------
   PerfTop:     998 irqs/sec  kernel:94.8% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             6745.00 58.5% sky2_poll                   [sky2]  
              831.00  7.2% sky2_intr                   [sky2]  
              352.00  3.1% __alloc_skb                 [kernel]
              281.00  2.4% get_rps_cpu                 [kernel]
              226.00  2.0% sky2_rx_submit              [sky2]  
              186.00  1.6% __kmalloc                   [kernel]
              181.00  1.6% enqueue_to_backlog          [kernel]
              173.00  1.5% _raw_spin_lock_irqsave      [kernel]
              166.00  1.4% kmem_cache_alloc            [kernel]
              162.00  1.4% _raw_spin_lock              [kernel]
               99.00  0.9% swiotlb_sync_single         [kernel]
               98.00  0.9% irq_entries_start           [kernel]
               94.00  0.8% fget                        [kernel]
               92.00  0.8% _raw_spin_unlock_irqrestore [kernel]
               80.00  0.7% system_call                 [kernel]


cpu1


-------------------------------------------------------------------------------
   PerfTop:     724 irqs/sec  kernel:82.0% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _________________

              204.00  5.3% _raw_spin_lock_irqsave         [kernel.kallsyms]
              153.00  4.0% _raw_spin_unlock_irqrestore    [kernel.kallsyms]
              147.00  3.8% call_function_single_interrupt [kernel.kallsyms]
              139.00  3.6% __netif_receive_skb            [kernel.kallsyms]
              135.00  3.5% sys_epoll_ctl                  [kernel.kallsyms]
              132.00  3.4% ip_rcv                         [kernel.kallsyms]
              129.00  3.3% fget                           [kernel.kallsyms]
              128.00  3.3% _raw_spin_lock                 [kernel.kallsyms]
              122.00  3.2% system_call                    [kernel.kallsyms]
              118.00  3.1% ip_route_input                 [kernel.kallsyms]
              109.00  2.8% kmem_cache_free                [kernel.kallsyms]
              108.00  2.8% copy_user_generic_string       [kernel.kallsyms]
               90.00  2.3% schedule                       [kernel.kallsyms]
               85.00  2.2% fput                           [kernel.kallsyms]



-------------------------------------------------------------------------------
   PerfTop:     763 irqs/sec  kernel:83.0% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _________________

              428.00  6.2% _raw_spin_lock_irqsave         [kernel.kallsyms]
              302.00  4.4% _raw_spin_unlock_irqrestore    [kernel.kallsyms]
              269.00  3.9% __netif_receive_skb            [kernel.kallsyms]
              258.00  3.7% call_function_single_interrupt [kernel.kallsyms]
              254.00  3.7% fget                           [kernel.kallsyms]
              238.00  3.4% ip_rcv                         [kernel.kallsyms]
              230.00  3.3% sys_epoll_ctl                  [kernel.kallsyms]
              222.00  3.2% _raw_spin_lock                 [kernel.kallsyms]
              220.00  3.2% ip_route_input                 [kernel.kallsyms]
              197.00  2.9% system_call                    [kernel.kallsyms]
              189.00  2.7% kmem_cache_free                [kernel.kallsyms]
              184.00  2.7% copy_user_generic_string       [kernel.kallsyms]
              144.00  2.1% ep_remove                      [kernel.kallsyms]
              140.00  2.0% schedule                       [kernel.kallsyms]


-------------------------------------------------------------------------------
   PerfTop:     546 irqs/sec  kernel:83.3% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _________________

              346.00  5.7% _raw_spin_lock_irqsave         [kernel.kallsyms]
              275.00  4.6% _raw_spin_unlock_irqrestore    [kernel.kallsyms]
              238.00  3.9% call_function_single_interrupt [kernel.kallsyms]
              228.00  3.8% fget                           [kernel.kallsyms]
              222.00  3.7% __netif_receive_skb            [kernel.kallsyms]
              219.00  3.6% sys_epoll_ctl                  [kernel.kallsyms]
              209.00  3.5% _raw_spin_lock                 [kernel.kallsyms]
              205.00  3.4% ip_rcv                         [kernel.kallsyms]
              199.00  3.3% ip_route_input                 [kernel.kallsyms]
              173.00  2.9% system_call                    [kernel.kallsyms]
              170.00  2.8% copy_user_generic_string       [kernel.kallsyms]
              167.00  2.8% kmem_cache_free                [kernel.kallsyms]
              127.00  2.1% ep_remove                      [kernel.kallsyms]
               123.00  2.0% dst_release                    [kernel.kallsyms]



III: Test setup 
nn-cl: Basic net-next from Apr23 + Changli patch

-------------------------------------------------------------------------------
   PerfTop:    3789 irqs/sec  kernel:84.1% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

             3514.00 10.2% sky2_poll                   [sky2]              
             1862.00  5.4% _raw_spin_lock_irqsave      [kernel]            
             1274.00  3.7% system_call                 [kernel]            
              926.00  2.7% fget                        [kernel]            
              872.00  2.5% _raw_spin_unlock_irqrestore [kernel]            
              862.00  2.5% copy_user_generic_string    [kernel]            
              766.00  2.2% sys_epoll_ctl               [kernel]            
              765.00  2.2% datagram_poll               [kernel]            
              671.00  2.0% _raw_spin_lock_bh           [kernel]            
              668.00  1.9% kmem_cache_free             [kernel]            
              602.00  1.8% udp_recvmsg                 [kernel]            
              586.00  1.7% _raw_spin_lock              [kernel]            
              585.00  1.7% vread_tsc                   [kernel].vsyscall_fn



-------------------------------------------------------------------------------
   PerfTop:    3794 irqs/sec  kernel:83.6% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

             4756.00  9.8% sky2_poll                   [sky2]              
             2742.00  5.7% _raw_spin_lock_irqsave      [kernel]            
             1826.00  3.8% system_call                 [kernel]            
             1285.00  2.7% fget                        [kernel]            
             1284.00  2.7% copy_user_generic_string    [kernel]            
             1235.00  2.6% _raw_spin_unlock_irqrestore [kernel]            
             1096.00  2.3% sys_epoll_ctl               [kernel]            
             1071.00  2.2% datagram_poll               [kernel]            
              954.00  2.0% kmem_cache_free             [kernel]            
              925.00  1.9% _raw_spin_lock_bh           [kernel]            
              888.00  1.8% vread_tsc                   [kernel].vsyscall_fn
              880.00  1.8% udp_recvmsg                 [kernel]            
              793.00  1.6% _raw_spin_lock              [kernel]            
              790.00  1.6% schedule                    [kernel]   

-------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:99.9% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

              675.00 32.6% sky2_poll              [sky2]  
              116.00  5.6% __udp4_lib_lookup      [kernel]
              111.00  5.4% ip_route_input         [kernel]
               81.00  3.9% _raw_spin_lock_irqsave [kernel]
               81.00  3.9% _raw_spin_lock         [kernel]
               70.00  3.4% __alloc_skb            [kernel]
               67.00  3.2% ip_rcv                 [kernel]
               66.00  3.2% __netif_receive_skb    [kernel]
               61.00  2.9% __udp4_lib_rcv         [kernel]
               57.00  2.8% sock_queue_rcv_skb     [kernel]
               47.00  2.3% sock_def_readable      [kernel]
               42.00  2.0% __kmalloc              [kernel]
               42.00  2.0% __wake_up_common       [kernel]
               38.00  1.8% sky2_rx_submit         [sky2]  

-------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

             2526.00 32.8% sky2_poll              [sky2]  
              406.00  5.3% ip_route_input         [kernel]
              399.00  5.2% __udp4_lib_lookup      [kernel]
              328.00  4.3% _raw_spin_lock_irqsave [kernel]
              307.00  4.0% _raw_spin_lock         [kernel]
              296.00  3.8% ip_rcv                 [kernel]
              287.00  3.7% __alloc_skb            [kernel]
              272.00  3.5% sock_queue_rcv_skb     [kernel]
              224.00  2.9% __udp4_lib_rcv         [kernel]
              224.00  2.9% __netif_receive_skb    [kernel]
              182.00  2.4% sock_def_readable      [kernel]
              163.00  2.1% __wake_up_common       [kernel]
              140.00  1.8% sky2_rx_submit         [sky2]  

-------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

             4445.00 33.4% sky2_poll              [sky2]  
              707.00  5.3% __udp4_lib_lookup      [kernel]
              662.00  5.0% ip_route_input         [kernel]
              567.00  4.3% _raw_spin_lock_irqsave [kernel]
              512.00  3.8% __alloc_skb            [kernel]
              506.00  3.8% ip_rcv                 [kernel]
              476.00  3.6% sock_queue_rcv_skb     [kernel]
              473.00  3.6% _raw_spin_lock         [kernel]
              415.00  3.1% __udp4_lib_rcv         [kernel]
              408.00  3.1% __netif_receive_skb    [kernel]
              306.00  2.3% sock_def_readable      [kernel]
              272.00  2.0% __wake_up_common       [kernel]
              260.00  2.0% __kmalloc              [kernel]
              216.00  1.6% _raw_read_lock         [kernel]
              214.00  1.6% sky2_rx_submit         [sky2]  


-------------------------------------------------------------------------------
   PerfTop:     748 irqs/sec  kernel:80.9% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

              244.00  7.4% _raw_spin_lock_irqsave      [kernel]            
              207.00  6.2% system_call                 [kernel]            
              127.00  3.8% _raw_spin_unlock_irqrestore [kernel]            
              124.00  3.7% copy_user_generic_string    [kernel]            
              122.00  3.7% sys_epoll_ctl               [kernel]            
              120.00  3.6% fget                        [kernel]            
              118.00  3.6% datagram_poll               [kernel]            
               96.00  2.9% schedule                    [kernel]            
               94.00  2.8% _raw_spin_lock_bh           [kernel]            
               86.00  2.6% vread_tsc                   [kernel].vsyscall_fn
               82.00  2.5% udp_recvmsg                 [kernel]            
               76.00  2.3% fput                        [kernel]            
               73.00  2.2% kmem_cache_free             [kernel]            
               67.00  2.0% sys_epoll_wait              [kernel]         

-------------------------------------------------------------------------------
   PerfTop:     625 irqs/sec  kernel:78.6% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

              488.00  7.5% _raw_spin_lock_irqsave      [kernel]            
              380.00  5.9% system_call                 [kernel]            
              274.00  4.2% copy_user_generic_string    [kernel]            
              252.00  3.9% fget                        [kernel]            
              244.00  3.8% datagram_poll               [kernel]            
              217.00  3.3% _raw_spin_unlock_irqrestore [kernel]            
              211.00  3.3% sys_epoll_ctl               [kernel]            
              186.00  2.9% schedule                    [kernel]            
              185.00  2.9% _raw_spin_lock_bh           [kernel]            
              173.00  2.7% udp_recvmsg                 [kernel]            
              169.00  2.6% vread_tsc                   [kernel].vsyscall_fn
              164.00  2.5% kmem_cache_free             [kernel]            
              143.00  2.2% fput                        [kernel]            
              133.00  2.1% sys_epoll_wait              [kernel]        


IV: Test setup 
nn-cl-rps: Basic net-next from Apr23 + Changli patch + rps mask ee,irq aff

--------------------------------------------------------------------------
   PerfTop:    3043 irqs/sec  kernel:87.5% [1000Hz cycles],  (all, 8 CPUs)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

             2240.00 20.4% sky2_poll                  [sky2]              
              375.00  3.4% _raw_spin_lock_irqsave     [kernel]            
              335.00  3.0% sky2_intr                  [sky2]              
              326.00  3.0% system_call                [kernel]            
              239.00  2.2% _raw_spin_unlock_irqrestor [kernel]            
              224.00  2.0% ip_rcv                     [kernel]            
              201.00  1.8% __netif_receive_skb        [kernel]            
              198.00  1.8% sys_epoll_ctl              [kernel]            
              190.00  1.7% _raw_spin_lock             [kernel]            
              182.00  1.7% fget                       [kernel]            
              169.00  1.5% copy_user_generic_string   [kernel]            
              165.00  1.5% kmem_cache_free            [kernel]            
              149.00  1.4% load_balance               [kernel]            
              146.00  1.3% ip_route_input             [kernel]           


--------------------------------------------------------------------------
   PerfTop:    3210 irqs/sec  kernel:85.8% [1000Hz cycles],  (all, 8 CPUs)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

             6539.00 20.4% sky2_poll                  [sky2]              
             1106.00  3.4% _raw_spin_lock_irqsave     [kernel]            
             1014.00  3.2% sky2_intr                  [sky2]              
              976.00  3.0% system_call                [kernel]            
              684.00  2.1% _raw_spin_unlock_irqrestor [kernel]            
              611.00  1.9% ip_rcv                     [kernel]            
              601.00  1.9% fget                       [kernel]            
              593.00  1.8% _raw_spin_lock             [kernel]            
              592.00  1.8% sys_epoll_ctl              [kernel]            
              574.00  1.8% __netif_receive_skb        [kernel]            
              526.00  1.6% copy_user_generic_string   [kernel]            
              482.00  1.5% kmem_cache_free            [kernel]            
              480.00  1.5% ip_route_input             [kernel]            
              425.00  1.3% vread_tsc                  [kernel].vsyscall_fn
              410.00  1.3% kmem_cache_alloc           [kernel]            


--------------------------------------------------------------------------
   PerfTop:     999 irqs/sec  kernel:97.2% [1000Hz cycles],  (all, cpu: 0)
--------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             2035.00 60.5% sky2_poll                   [sky2]  
              302.00  9.0% sky2_intr                   [sky2]  
              109.00  3.2% __alloc_skb                 [kernel]
               57.00  1.7% _raw_spin_lock              [kernel]
               57.00  1.7% get_rps_cpu                 [kernel]
               52.00  1.5% __kmalloc                   [kernel]
               51.00  1.5% enqueue_to_backlog          [kernel]
               49.00  1.5% _raw_spin_lock_irqsave      [kernel]
               44.00  1.3% kmem_cache_alloc            [kernel]
               34.00  1.0% sky2_rx_submit              [sky2]  
               33.00  1.0% swiotlb_sync_single         [kernel]
               31.00  0.9% system_call                 [kernel]
               28.00  0.8% irq_entries_start           [kernel]
               22.00  0.7% _raw_spin_unlock_irqrestore [kernel]
               21.00  0.6% sky2_remove                 [sky2]  

--------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:96.2% [1000Hz cycles],  (all, cpu: 0)
--------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             5493.00 60.1% sky2_poll                   [sky2]  
              803.00  8.8% sky2_intr                   [sky2]  
              281.00  3.1% __alloc_skb                 [kernel]
              233.00  2.6% get_rps_cpu                 [kernel]
              136.00  1.5% enqueue_to_backlog          [kernel]
              132.00  1.4% __kmalloc                   [kernel]
              126.00  1.4% _raw_spin_lock              [kernel]
              122.00  1.3% kmem_cache_alloc            [kernel]
              122.00  1.3% _raw_spin_lock_irqsave      [kernel]
              102.00  1.1% swiotlb_sync_single         [kernel]
               88.00  1.0% sky2_rx_submit              [sky2]  
               77.00  0.8% system_call                 [kernel]
               69.00  0.8% irq_entries_start           [kernel]
               55.00  0.6% _raw_spin_unlock_irqrestore [kernel]
               54.00  0.6% copy_user_generic_string    [kernel]

--------------------------------------------------------------------------
   PerfTop:     999 irqs/sec  kernel:97.5% [1000Hz cycles],  (all, cpu: 0)
--------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             6699.00 60.1% sky2_poll                   [sky2]  
              988.00  8.9% sky2_intr                   [sky2]  
              327.00  2.9% __alloc_skb                 [kernel]
              261.00  2.3% get_rps_cpu                 [kernel]
              168.00  1.5% __kmalloc                   [kernel]
              161.00  1.4% kmem_cache_alloc            [kernel]
              160.00  1.4% enqueue_to_backlog          [kernel]
              157.00  1.4% _raw_spin_lock              [kernel]
              125.00  1.1% _raw_spin_lock_irqsave      [kernel]
              122.00  1.1% swiotlb_sync_single         [kernel]
              114.00  1.0% sky2_rx_submit              [sky2]  
               96.00  0.9% system_call                 [kernel]
               85.00  0.8% irq_entries_start           [kernel]
               66.00  0.6% sky2_remove                 [sky2]  
               64.00  0.6% _raw_spin_unlock_irqrestore [kernel]

--------------------------------------------------------------------------
   PerfTop:     420 irqs/sec  kernel:84.8% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              188.00  4.8% _raw_spin_lock_irqsave     [kernel]            
              175.00  4.5% system_call                [kernel]            
              155.00  4.0% _raw_spin_unlock_irqrestor [kernel]            
              143.00  3.7% __netif_receive_skb        [kernel]            
              124.00  3.2% ip_route_input             [kernel]            
              122.00  3.1% fget                       [kernel]            
              118.00  3.0% ip_rcv                     [kernel]            
              115.00  2.9% sys_epoll_ctl              [kernel]            
              107.00  2.7% call_function_single_inter [kernel]            
               98.00  2.5% vread_tsc                  [kernel].vsyscall_fn
               97.00  2.5% _raw_spin_lock             [kernel]            
               89.00  2.3% copy_user_generic_string   [kernel]        

--------------------------------------------------------------------------
   PerfTop:     372 irqs/sec  kernel:87.9% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              212.00  4.6% _raw_spin_lock_irqsave     [kernel]            
              192.00  4.2% system_call                [kernel]            
              187.00  4.1% __netif_receive_skb        [kernel]            
              184.00  4.0% ip_rcv                     [kernel]            
              174.00  3.8% ip_route_input             [kernel]            
              165.00  3.6% _raw_spin_unlock_irqrestor [kernel]            
              143.00  3.1% call_function_single_inter [kernel]            
              135.00  3.0% fget                       [kernel]            
              133.00  2.9% sys_epoll_ctl              [kernel]            
              122.00  2.7% _raw_spin_lock             [kernel]            
              112.00  2.5% __udp4_lib_lookup          [kernel]            
               99.00  2.2% copy_user_generic_string   [kernel]            
               93.00  2.0% vread_tsc                  [kernel].vsyscall_fn
               90.00  2.0% kmem_cache_free            [kernel]            
               89.00  1.9% ep_remove                  [kernel]        
--------------------------------------------------------------------------
   PerfTop:     269 irqs/sec  kernel:85.1% [1000Hz cycles],  (all, cpu: 7)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

               23.00  4.6% _raw_spin_lock_irqsave     [kernel]            
               21.00  4.2% system_call                [kernel]            
               19.00  3.8% _raw_spin_unlock_irqrestor [kernel]            
               17.00  3.4% fget                       [kernel]            
               15.00  3.0% __netif_receive_skb        [kernel]            
               14.00  2.8% dst_release                [kernel]            
               13.00  2.6% call_function_single_inter [kernel]            
               11.00  2.2% kmem_cache_free            [kernel]            
               10.00  2.0% vread_tsc                  [kernel].vsyscall_fn
               10.00  2.0% copy_user_generic_string   [kernel]            
               10.00  2.0% ktime_get                  [kernel]            
               10.00  2.0% ip_route_input             [kernel]            
               10.00  2.0% schedule                   [kernel]            


--------------------------------------------------------------------------
   PerfTop:     253 irqs/sec  kernel:84.6% [1000Hz cycles],  (all, cpu: 7)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              109.00  4.9% system_call                [kernel]            
              104.00  4.6% _raw_spin_lock_irqsave     [kernel]            
               79.00  3.5% ip_rcv                     [kernel]            
               74.00  3.3% _raw_spin_unlock_irqrestor [kernel]            
               71.00  3.2% fget                       [kernel]            
               68.00  3.0% sys_epoll_ctl              [kernel]            
               66.00  2.9% ip_route_input             [kernel]            
               58.00  2.6% call_function_single_inter [kernel]            
               55.00  2.4% _raw_spin_lock             [kernel]            
               54.00  2.4% copy_user_generic_string   [kernel]            
               53.00  2.4% __netif_receive_skb        [kernel]            
               51.00  2.3% schedule                   [kernel]            
               51.00  2.3% kmem_cache_free            [kernel]            
               43.00  1.9% vread_tsc                  [kernel].vsyscall_fn
               38.00  1.7% __udp4_lib_lookup          [kernel]  

--------------------------------------------------------------------------
   PerfTop:     236 irqs/sec  kernel:84.3% [1000Hz cycles],  (all, cpu: 7)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              131.00  4.9% _raw_spin_lock_irqsave     [kernel]            
              128.00  4.8% system_call                [kernel]            
              101.00  3.8% _raw_spin_unlock_irqrestor [kernel]            
               89.00  3.3% fget                       [kernel]            
               85.00  3.2% sys_epoll_ctl              [kernel]            
               81.00  3.0% ip_rcv                     [kernel]            
               76.00  2.8% ip_route_input             [kernel]            
               66.00  2.5% call_function_single_inter [kernel]            
               65.00  2.4% _raw_spin_lock             [kernel]            
               65.00  2.4% kmem_cache_free            [kernel]            
               64.00  2.4% copy_user_generic_string   [kernel]            
               57.00  2.1% __netif_receive_skb        [kernel]            
               47.00  1.8% schedule                   [kernel]            
               45.00  1.7% vread_tsc                  [kernel].vsyscall_fn


--------------------------------------------------------------------------
   PerfTop:     478 irqs/sec  kernel:82.2% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              319.00  5.2% _raw_spin_lock_irqsave     [kernel]            
              289.00  4.7% system_call                [kernel]            
              246.00  4.0% _raw_spin_unlock_irqrestor [kernel]            
              199.00  3.2% ip_route_input             [kernel]            
              198.00  3.2% __netif_receive_skb        [kernel]            
              197.00  3.2% sys_epoll_ctl              [kernel]            
              183.00  3.0% ip_rcv                     [kernel]            
              182.00  2.9% fget                       [kernel]            
              166.00  2.7% call_function_single_inter [kernel]            
              157.00  2.5% copy_user_generic_string   [kernel]            
              149.00  2.4% kmem_cache_free            [kernel]            
              146.00  2.4% vread_tsc                  [kernel].vsyscall_fn
              133.00  2.1% _raw_spin_lock             [kernel]            
              118.00  1.9% schedule                   [kernel]            
              112.00  1.8% __udp4_lib_lookup          [kernel]            



--------------------------------------------------------------------------
   PerfTop:     535 irqs/sec  kernel:83.0% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              345.00  5.2% _raw_spin_lock_irqsave     [kernel]            
              291.00  4.4% system_call                [kernel]            
              255.00  3.9% _raw_spin_unlock_irqrestor [kernel]            
              218.00  3.3% fget                       [kernel]            
              201.00  3.0% ip_route_input             [kernel]            
              193.00  2.9% __netif_receive_skb        [kernel]            
              193.00  2.9% sys_epoll_ctl              [kernel]            
              180.00  2.7% ip_rcv                     [kernel]            
              173.00  2.6% call_function_single_inter [kernel]            
              163.00  2.5% copy_user_generic_string   [kernel]            
              152.00  2.3% kmem_cache_free            [kernel]            
              151.00  2.3% vread_tsc                  [kernel].vsyscall_fn
              142.00  2.1% _raw_spin_lock             [kernel]            
              131.00  2.0% schedule                   [kernel]
Eric Dumazet April 26, 2010, 2:03 p.m. UTC | #4
Le samedi 24 avril 2010 à 10:10 -0400, jamal a écrit :
> On Fri, 2010-04-23 at 18:02 -0400, jamal wrote:
> 
> > Ive done a setup with the last patch from Changli + net-next - I will
> > post test results tomorrow AM.
> 
> ok, annotated results attached. 
> 
> cheers,
> jamal

Jamal, I have a Nehalem setup now, and I can see
_raw_spin_lock_irqsave() abuse is not coming from network tree, but from
clockevents_notify()

My pktgen sends 1040989pps :

# Samples: 389707198131
#
# Overhead         Command                 Shared Object  Symbol
# ........  ..............  ............................  ......
#
    23.52%            init  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
                      |
                      --- _raw_spin_lock_irqsave
                         |          
                         |--94.74%-- clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          
                         |--4.10%-- tick_broadcast_oneshot_control
                         |          tick_notify
                         |          notifier_call_chain
                         |          __raw_notifier_call_chain
                         |          raw_notifier_call_chain
                         |          clockevents_do_notify
                         |          clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          
                         |--0.58%-- lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                          --0.58%-- [...]

     8.94%            init  [kernel.kallsyms]             [k] acpi_os_read_port
                      |
                      --- acpi_os_read_port
                         |          
                         |--99.55%-- acpi_hw_read_port
                         |          acpi_hw_read
                         |          acpi_hw_read_multiple
                         |          acpi_hw_register_read
                         |          acpi_read_bit_register



# Samples: 389233082962
#
# Overhead         Command                 Shared Object  Symbol
# ........  ..............  ............................  ......
#
    23.25%            init  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
     8.90%            init  [kernel.kallsyms]             [k] acpi_os_read_port
     2.93%            init  [kernel.kallsyms]             [k] mwait_idle_with_hints
     1.99%            init  [kernel.kallsyms]             [k] schedule
     1.94%         udpsink  [kernel.kallsyms]             [k] schedule
     1.73%         swapper  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
     1.48%            init  [kernel.kallsyms]             [k] bnx2x_rx_int
     1.47%            init  [kernel.kallsyms]             [k] _raw_spin_unlock_irqrestore
     1.44%            init  [kernel.kallsyms]             [k] _raw_spin_lock
     1.36%         udpsink  [kernel.kallsyms]             [k] udp_recvmsg
     1.05%         udpsink  [kernel.kallsyms]             [k] __skb_recv_datagram
     1.05%            init  [kernel.kallsyms]             [k] __udp4_lib_lookup
     1.04%         udpsink  [kernel.kallsyms]             [k] copy_user_generic_string
     1.04%         udpsink  [kernel.kallsyms]             [k] __slab_free
     0.99%            init  [kernel.kallsyms]             [k] select_task_rq_fair
     0.99%            init  [kernel.kallsyms]             [k] try_to_wake_up
     0.98%            init  [kernel.kallsyms]             [k] task_rq_lock
     0.93%            init  [kernel.kallsyms]             [k] tick_broadcast_oneshot_control
     0.89%            init  [kernel.kallsyms]             [k] sock_queue_rcv_skb
     0.89%         udpsink  [kernel.kallsyms]             [k] sock_recv_ts_and_drops
     0.88%         udpsink  [kernel.kallsyms]             [k] kfree
     0.79%         swapper  [kernel.kallsyms]             [k] acpi_os_read_port
     0.76%         udpsink  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
     0.73%         udpsink  [kernel.kallsyms]             [k] inet_recvmsg
     0.71%         udpsink  [vdso]                        [.] 0x000000ffffe431
     0.65%         udpsink  [kernel.kallsyms]             [k] sock_recvmsg
     0.62%            init  [kernel.kallsyms]             [k] gs_change
     0.61%            init  [kernel.kallsyms]             [k] enqueue_task_fair
     0.61%            init  [kernel.kallsyms]             [k] eth_type_trans
     0.61%            init  [kernel.kallsyms]             [k] sock_def_readable
     0.60%         udpsink  [kernel.kallsyms]             [k] _raw_spin_lock_bh
     0.59%            init  [kernel.kallsyms]             [k] ip_route_input
     0.59%         udpsink  libpthread-2.3.4.so           [.] __pthread_disable_asynccancel
     0.56%            init  [kernel.kallsyms]             [k] bnx2x_poll
     0.56%         udpsink  [kernel.kallsyms]             [k] __get_user_4


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 26, 2010, 2:55 p.m. UTC | #5
Le lundi 26 avril 2010 à 16:03 +0200, Eric Dumazet a écrit :
> Le samedi 24 avril 2010 à 10:10 -0400, jamal a écrit :
> > On Fri, 2010-04-23 at 18:02 -0400, jamal wrote:
> > 
> > > Ive done a setup with the last patch from Changli + net-next - I will
> > > post test results tomorrow AM.
> > 
> > ok, annotated results attached. 
> > 
> > cheers,
> > jamal
> 
> Jamal, I have a Nehalem setup now, and I can see
> _raw_spin_lock_irqsave() abuse is not coming from network tree, but from
> clockevents_notify()
> 

Another interesting finding:

- if all packets are received on a single queue, max speed seems to be
1.200.000 packets per second on my machine :-(

And on profile of receiving cpu (RPS enabled, pakets sent to 15 other
cpus), we can see default_send_IPI_mask_sequence_phys() is the slow
thing...

Andi, what do you think of this one ?
Dont we have a function to send an IPI to an individual cpu instead ?

void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int
vector)
{
        unsigned long query_cpu;
        unsigned long flags;

        /*
         * Hack. The clustered APIC addressing mode doesn't allow us to
send
         * to an arbitrary mask, so I do a unicast to each CPU instead.
         * - mbligh
         */
        local_irq_save(flags);
        for_each_cpu(query_cpu, mask) {
                __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
                                query_cpu), vector, APIC_DEST_PHYSICAL);
        }
        local_irq_restore(flags);
}


-----------------------------------------------------------------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu:
7)
-----------------------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function                            DSO
             _______ _____ ___________________________________ _______

              668.00 17.7% default_send_IPI_mask_sequence_phys vmlinux
              363.00  9.6% bnx2x_rx_int                        vmlinux
              354.00  9.4% eth_type_trans                      vmlinux
              332.00  8.8% kmem_cache_alloc_node               vmlinux
              285.00  7.6% __kmalloc_node_track_caller         vmlinux
              278.00  7.4% _raw_spin_lock                      vmlinux
              166.00  4.4% __slab_alloc                        vmlinux
              147.00  3.9% __memset                            vmlinux
              136.00  3.6% list_del                            vmlinux
              132.00  3.5% get_partial_node                    vmlinux
              131.00  3.5% get_rps_cpu                         vmlinux
              102.00  2.7% enqueue_to_backlog                  vmlinux
               95.00  2.5% unmap_single                        vmlinux
               94.00  2.5% __alloc_skb                         vmlinux
               74.00  2.0% vlan_gro_common                     vmlinux
               52.00  1.4% __phys_addr                         vmlinux
               48.00  1.3% dev_gro_receive                     vmlinux
               39.00  1.0% swiotlb_dma_mapping_error           vmlinux
               36.00  1.0% swiotlb_map_page                    vmlinux
               34.00  0.9% skb_put                             vmlinux
               27.00  0.7% is_swiotlb_buffer                   vmlinux
               23.00  0.6% deactivate_slab                     vmlinux
               20.00  0.5% vlan_gro_receive                    vmlinux
               17.00  0.5% __skb_bond_should_drop              vmlinux
               14.00  0.4% netif_receive_skb                   vmlinux
               14.00  0.4% __netdev_alloc_skb                  vmlinux
               12.00  0.3% skb_gro_reset_offset                vmlinux
               12.00  0.3% get_slab                            vmlinux
               11.00  0.3% napi_skb_finish                     vmlinux


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal April 26, 2010, 9:03 p.m. UTC | #6
On Mon, 2010-04-26 at 16:03 +0200, Eric Dumazet wrote:

> 
> Jamal, I have a Nehalem setup now, and I can see
> _raw_spin_lock_irqsave() abuse is not coming from network tree, but from
> clockevents_notify()

yikes. Thanks Eric - I shouldve been able to figure that one out. But
why is this thing expensive? I will run the test tommorow and see if i
see the same thing. 

cheers,
jamal



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal April 26, 2010, 9:06 p.m. UTC | #7
On Mon, 2010-04-26 at 16:55 +0200, Eric Dumazet wrote:

> Another interesting finding:
> 
> - if all packets are received on a single queue, max speed seems to be
> 1.200.000 packets per second on my machine :-(

Well, if it's any consolation, it is not as bad as sky2 hardware;-> I can't do
more than 750Kpps.
Also, it seems you use VLANS - max pps will be lower than without VLANs
by probably maybe 6-70Kpps (doesnt explain the 1.2Mpps of course).

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 29, 2010, 5:56 p.m. UTC | #8
Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit :
> > Andi, what do you think of this one ?
> > Dont we have a function to send an IPI to an individual cpu instead ?
> 
> That's what this function already does. You only set a single CPU 
> in the target mask, right?
> 
> IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC
> which is a bit faster for this, but that's not available in the lower
> end Nehalems. But even then it's not exactly fast.
> 
> I don't think the IPI primitive can be optimized much. It's not a cheap 
> operation.
> 
> If it's a problem do it less often and batch IPIs.
> 
> It's essentially the same problem as interrupt mitigation or NAPI 
> are solving for NICs. I guess just need a suitable mitigation mechanism.
> 
> Of course that would move more work to the sending CPU again, but 
> perhaps there's no alternative. I guess you could make it cheaper it by
> minimizing access to packet data.
> 
> -Andi

Well, IPI are already batched, and rate is auto adaptative.

After various changes, it seems things are going better, maybe there is
something related to cache line thrashing.

I 'solved' it by using idle=poll, but you might take a look at
clockevents_notify (acpi_idle_enter_bm) abuse of a shared and highly
contended spinlock...




    23.52%            init  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
                      |
                      --- _raw_spin_lock_irqsave
                         |          
                         |--94.74%-- clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          
                         |--4.10%-- tick_broadcast_oneshot_control
                         |          tick_notify
                         |          notifier_call_chain
                         |          __raw_notifier_call_chain
                         |          raw_notifier_call_chain
                         |          clockevents_do_notify
                         |          clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger April 29, 2010, 6:10 p.m. UTC | #9
> Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit :
> > > Andi, what do you think of this one ?
> > > Dont we have a function to send an IPI to an individual cpu instead ?  
> > 
> > That's what this function already does. You only set a single CPU 
> > in the target mask, right?
> > 
> > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC
> > which is a bit faster for this, but that's not available in the lower
> > end Nehalems. But even then it's not exactly fast.
> > 
> > I don't think the IPI primitive can be optimized much. It's not a cheap 
> > operation.
> > 
> > If it's a problem do it less often and batch IPIs.
> > 
> > It's essentially the same problem as interrupt mitigation or NAPI 
> > are solving for NICs. I guess just need a suitable mitigation mechanism.
> > 
> > Of course that would move more work to the sending CPU again, but 
> > perhaps there's no alternative. I guess you could make it cheaper it by
> > minimizing access to packet data.
> > 
> > -Andi  
> 
> Well, IPI are already batched, and rate is auto adaptative.
> 
> After various changes, it seems things are going better, maybe there is
> something related to cache line trashing.
> 
> I 'solved' it by using idle=poll, but you might take a look at
> clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly
> contended spinlock...
> 
> 
> 
> 
>     23.52%            init  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
>                       |
>                       --- _raw_spin_lock_irqsave
>                          |          
>                          |--94.74%-- clockevents_notify
>                          |          lapic_timer_state_broadcast
>                          |          acpi_idle_enter_bm
>                          |          cpuidle_idle_call
>                          |          cpu_idle
>                          |          start_secondary
>                          |          
>                          |--4.10%-- tick_broadcast_oneshot_control
>                          |          tick_notify
>                          |          notifier_call_chain
>                          |          __raw_notifier_call_chain
>                          |          raw_notifier_call_chain
>                          |          clockevents_do_notify
>                          |          clockevents_notify
>                          |          lapic_timer_state_broadcast
>                          |          acpi_idle_enter_bm
>                          |          cpuidle_idle_call
>                          |          cpu_idle
>                          |          start_secondary
>                          |          
> 


I keep getting asked about taking some cores away from clock and scheduler
to be reserved just for network processing. Seeing this kind of stuff
makes me wonder if maybe that isn't a half bad idea.
Eric Dumazet April 29, 2010, 7:12 p.m. UTC | #10
Le jeudi 29 avril 2010 à 20:23 +0200, Andi Kleen a écrit :
> On Thu, Apr 29, 2010 at 07:56:12PM +0200, Eric Dumazet wrote:
> > Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit :
> > > > Andi, what do you think of this one ?
> > > > Dont we have a function to send an IPI to an individual cpu instead ?
> > > 
> > > That's what this function already does. You only set a single CPU 
> > > in the target mask, right?
> > > 
> > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC
> > > which is a bit faster for this, but that's not available in the lower
> > > end Nehalems. But even then it's not exactly fast.
> > > 
> > > I don't think the IPI primitive can be optimized much. It's not a cheap 
> > > operation.
> > > 
> > > If it's a problem do it less often and batch IPIs.
> > > 
> > > It's essentially the same problem as interrupt mitigation or NAPI 
> > > are solving for NICs. I guess just need a suitable mitigation mechanism.
> > > 
> > > Of course that would move more work to the sending CPU again, but 
> > > perhaps there's no alternative. I guess you could make it cheaper it by
> > > minimizing access to packet data.
> > > 
> > > -Andi
> > 
> > Well, IPI are already batched, and rate is auto adaptative.
> > 
> > After various changes, it seems things are going better, maybe there is
> > something related to cache line trashing.
> > 
> > I 'solved' it by using idle=poll, but you might take a look at
> > clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly
> > contended spinlock...
> 
> acpi_idle_enter_bm should not be executed on a Nehalem, it's obsolete.
> If it does on your system something is wrong.
> 
> Ahh, that triggers a bell. There's one issue that if the remote CPU is in a very
> deep idle state it could take a long time to wake it up. Nehalem has deeper
> sleep states than earlier CPUs. When this happens the IPI sender will be slow
> too I believe.
> 
> Are the target CPUs idle? 
> 

Yes, mostly, but about 200.000 wakeups per second I would say...

If a cpu in deep state receives an IPI, process a softirq, should it
come back to deep state immediately, or should it wait for some
milliseconds ?

> Perhaps need to feed some information to cpuidle's governour to prevent this problem.
> 
> idle=poll is very drastic, better to limit to C1 
> 

How can I do this ?

Thanks !


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Gleixner April 29, 2010, 7:19 p.m. UTC | #11
On Thu, 29 Apr 2010, Stephen Hemminger wrote:
> > Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit :
> > > > Andi, what do you think of this one ?
> > > > Dont we have a function to send an IPI to an individual cpu instead ?  
> > > 
> > > That's what this function already does. You only set a single CPU 
> > > in the target mask, right?
> > > 
> > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC
> > > which is a bit faster for this, but that's not available in the lower
> > > end Nehalems. But even then it's not exactly fast.
> > > 
> > > I don't think the IPI primitive can be optimized much. It's not a cheap 
> > > operation.
> > > 
> > > If it's a problem do it less often and batch IPIs.
> > > 
> > > It's essentially the same problem as interrupt mitigation or NAPI 
> > > are solving for NICs. I guess just need a suitable mitigation mechanism.
> > > 
> > > Of course that would move more work to the sending CPU again, but 
> > > perhaps there's no alternative. I guess you could make it cheaper it by
> > > minimizing access to packet data.
> > > 
> > > -Andi  
> > 
> > Well, IPI are already batched, and rate is auto adaptative.
> > 
> > After various changes, it seems things are going better, maybe there is
> > something related to cache line trashing.
> > 
> > I 'solved' it by using idle=poll, but you might take a look at
> > clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly
> > contended spinlock...

Say thanks to Intel/AMD for providing us timers which stop in lower
c-states.

Not much we can do about the broadcast lock when several cores are
going idle and we need to setup a global timer to work around the
lapic timer stops in C2/C3 issue.

Simply the C-state timer broadcasting does not scale. And it was never
meant to scale. It's a workaround for laptops to have functional NOHZ.

There are several ways to work around that on larger machines:

 - Restrict c-states
 - Disable NOHZ and highres timers
 - idle=poll is definitely the worst of all possible solutions

> I keep getting asked about taking some core's away from clock and scheduler
> to be reserved just for network processing. Seeing this kind of stuff
> makes me wonder if maybe that isn't a half bad idea.

This comes up every few month and we pointed out several times what
needs to be done to make this work w/o these weird hacks which put a
core offline and then start some magic undebuggable binary blob on it.
We have not seen anyone working on this, but the "set cores aside and
let them do X" idea seems to stick in people's heads.

Seriously, that's not a solution. It's going to be some hacked up
nightmare which is completely unmaintainable.

Aside of that I seriously doubt that you can do networking w/o time
and timers.

Thanks,

	tglx
Eric Dumazet April 29, 2010, 8:02 p.m. UTC | #12
Le jeudi 29 avril 2010 à 21:19 +0200, Thomas Gleixner a écrit :

> Say thanks to Intel/AMD for providing us timers which stop in lower
> c-states.
> 
> Not much we can do about the broadcast lock when several cores are
> going idle and we need to setup a global timer to work around the
> lapic timer stops in C2/C3 issue.
> 
> Simply the C-state timer broadcasting does not scale. And it was never
> meant to scale. It's a workaround for laptops to have functional NOHZ.
> 
> There are several ways to work around that on larger machines:
> 
>  - Restrict c-states
>  - Disable NOHZ and highres timers
>  - idle=poll is definitely the worst of all possible solutions
> 
> > I keep getting asked about taking some core's away from clock and scheduler
> > to be reserved just for network processing. Seeing this kind of stuff
> > makes me wonder if maybe that isn't a half bad idea.
> 
> This comes up every few month and we pointed out several times what
> needs to be done to make this work w/o these weird hacks which put a
> core offline and then start some magic undebugable binary blob on it.
> We have not seen anyone working on this, but the "set cores aside and
> let them do X" idea seems to stick in peoples heads.
> 
> Seriously, that's not a solution. It's going to be some hacked up
> nightmare which is completely unmaintainable.
> 
> Aside of that I seriously doubt that you can do networking w/o time
> and timers.
> 

Thanks a lot !

booting with processor.max_cstate=1 solves the problem

(I already had a CONFIG_NO_HZ=no conf, but highres timer enabled)

Even with _carefuly_ chosen crazy configuration (receiving a packet on a
cpu, then transfert it to another cpu, with a full 16x16 matrix
involved), generating 700.000 IPI per second on the machine seems fine
now.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 30, 2010, 5:25 a.m. UTC | #13
Le jeudi 29 avril 2010 à 23:41 +0200, Andi Kleen a écrit :
> On Thu, Apr 29, 2010 at 09:12:27PM +0200, Eric Dumazet wrote:
> > Yes, mostly, but about 200.000 wakeups per second I would say...
> > 
> > If a cpu in deep state receives an IPI, process a softirq, should it
> > come back to deep state immediately, or should it wait for some
> > milliseconds ?
> 
> In principle the cpuidle governour should detect this and not put the target into
> the slow deep c states. One change that was done recently to fix a similar 
> problem for disk IO was to take processes that wait for IO into account 
> (see 69d25870). But it doesn't work for networking.
> 
> Here's a untested patch that might help: tell the cpuidle governour 
> networking is waiting for IO. This will tell it to not go down the deeply.
> 
> I might have missed some schedule() paths, feel free to add more.
> 
> Actually it's probably too aggressive because it will avoid C states even for
> a closed window on the other side which might be hours. Better would
> be some heuristic to only do this when you're really expected IO shortly.
> 
> Also does your workload even sleep at all? If not we would need to increase
> the iowait counters in recvmsg() itself.
> 

My workload yes, uses blocking recvmsg() calls, but Jamal one uses
epoll() so I guess problem is more generic than that. We should have an
estimate of the number of wakeups (IO or not...) per second (or
sub-second) so that cpuidle can avoid these deep states ?

> Anyways might be still worth a try.
> 
> For routing we probably need some other solution though, there are no 
> schedules there.
> 
> > 
> > > Perhaps need to feed some information to cpuidle's governour to prevent this problem.
> > > 
> > > idle=poll is very drastic, better to limit to C1 
> > > 
> > 
> > How can I do this ?
> 
> processor.max_cstate=1 or using /dev/network_latency 
> (see Documentation/power/pm_qos_interface.txt)
> 
> -Andi
> 

Thanks, I'll play with this today !

> 
> 
> commit 810227a7c24ecae2bb4aac320490a7115ac33be8
> Author: Andi Kleen <ak@linux.intel.com>
> Date:   Thu Apr 29 23:33:18 2010 +0200
> 
>     Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies
> 
>     XXX: probably too aggressive, some of these sleeps are not under high load.
> 
>     Based on a bug report from Eric Dumazet.
>     
>     Signed-off-by: Andi Kleen <ak@linux.intel.com>
> 
> diff --git a/net/core/sock.c b/net/core/sock.c
> index c5812bb..c246d6c 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1402,7 +1402,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
>  			break;
>  		if (sk->sk_err)
>  			break;
> -		timeo = schedule_timeout(timeo);
> +		timeo = io_schedule_timeout(timeo);
>  	}
>  	finish_wait(sk->sk_sleep, &wait);
>  	return timeo;
> @@ -1512,7 +1512,7 @@ static void __lock_sock(struct sock *sk)
>  		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
>  					TASK_UNINTERRUPTIBLE);
>  		spin_unlock_bh(&sk->sk_lock.slock);
> -		schedule();
> +		io_schedule();
>  		spin_lock_bh(&sk->sk_lock.slock);
>  		if (!sock_owned_by_user(sk))
>  			break;
> 
> > 
> > Thanks !
> > 
> > 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Bloniarz April 30, 2010, 6:15 p.m. UTC | #14
Eric Dumazet wrote:
> Le jeudi 29 avril 2010 à 21:19 +0200, Thomas Gleixner a écrit :
> 
>> Say thanks to Intel/AMD for providing us timers which stop in lower
>> c-states.
>>
>> Not much we can do about the broadcast lock when several cores are
>> going idle and we need to setup a global timer to work around the
>> lapic timer stops in C2/C3 issue.
>>
>> Simply the C-state timer broadcasting does not scale. And it was never
>> meant to scale. It's a workaround for laptops to have functional NOHZ.
>>
>> There are several ways to work around that on larger machines:
>>
>>  - Restrict c-states
>>  - Disable NOHZ and highres timers
>>  - idle=poll is definitely the worst of all possible solutions
>>
>>> I keep getting asked about taking some core's away from clock and scheduler
>>> to be reserved just for network processing. Seeing this kind of stuff
>>> makes me wonder if maybe that isn't a half bad idea.
>> This comes up every few month and we pointed out several times what
>> needs to be done to make this work w/o these weird hacks which put a
>> core offline and then start some magic undebugable binary blob on it.
>> We have not seen anyone working on this, but the "set cores aside and
>> let them do X" idea seems to stick in peoples heads.
>>
>> Seriously, that's not a solution. It's going to be some hacked up
>> nightmare which is completely unmaintainable.
>>
>> Aside of that I seriously doubt that you can do networking w/o time
>> and timers.
>>
> 
> Thanks a lot !
> 
> booting with processor.max_cstate=1 solves the problem
> 
> (I already had a CONFIG_NO_HZ=no conf, but highres timer enabled)
> 
> Even with _carefuly_ chosen crazy configuration (receiving a packet on a
> cpu, then transfert it to another cpu, with a full 16x16 matrix
> involved), generating 700.000 IPI per second on the machine seems fine
> now.

FYI you can also restrict c=states at runtime with PM QoS:
Documentation/power/pm_qos_interface.txt

On my machine, /sys/devices/system/cpu/cpu0/cpuidle/state2/latency
is 205usec, so configuring a PM QoS request for <= 205usec latency
should prevent it being entered:

#!/usr/bin/python
# Hold a PM QoS CPU wakeup-latency request via /dev/cpu_dma_latency so the
# kernel's cpuidle governor avoids deep C-states whose exit latency exceeds
# the requested bound (see Documentation/power/pm_qos_interface.txt).
import os;
import struct;
import signal;

# Requested worst-case CPU wakeup latency, in microseconds.
latency_rec_usec = 100
f = os.open("/dev/cpu_dma_latency", os.O_WRONLY);
# The interface takes a single native-endian 32-bit integer ("=i").
os.write(f, struct.pack("=i", latency_rec_usec));
# NOTE: per pm_qos_interface.txt, the request is dropped when the fd is
# closed, so block forever (until a signal) to keep the constraint active.
signal.pause();
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 30, 2010, 6:57 p.m. UTC | #15
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Apr 2010 21:19:36 +0200 (CEST)

> Aside of that I seriously doubt that you can do networking w/o time
> and timers.

You're right that we need timestamps and the like.

But only if we actually process the packets on these restricted cpus :-)

If we use RPS and farm out all packets to other cpus, ie. just doing
the driver work and the remote cpu dispatch on these "offline" cpus,
it is doable.

Then we can do cool tricks like having the cpu spin on a mwait() on the
network device's status descriptor in memory.

In any event I agree with you, it's a cool idea at best, and likely
not really practical.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Gleixner April 30, 2010, 7:58 p.m. UTC | #16
Dave,

On Fri, 30 Apr 2010, David Miller wrote:

> From: Thomas Gleixner <tglx@linutronix.de>
> Date: Thu, 29 Apr 2010 21:19:36 +0200 (CEST)
> 
> > Aside of that I seriously doubt that you can do networking w/o time
> > and timers.
> 
> You're right that we need timestamps and the like.
> 
> But only if we actually process the packets on these restricted cpus :-)
> 
> If we use RPS and farm out all packets to other cpus, ie. just doing
> the driver work and the remote cpu dispatch on these "offline" cpus,
> it is doable.
> 
> Then we can do cool tricks like having the cpu spin on a mwait() on the
> network device's status descriptor in memory.
> 
> In any event I agree with you, it's a cool idea at best, and likely
> not really practical.

Well, it might be worth to experiment with that once we get the basic
infrastructure in place to "isolate" cores under full kernel control. 

It's not too hard to solve the problems, but it seems nobody has a
free time slot to tackle them.

Thanks

	tglx
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andi Kleen April 30, 2010, 9:01 p.m. UTC | #17
> Then we can do cool tricks like having the cpu spin on a mwait() on the
> network device's status descriptor in memory.

When you specify a deep C state in that mwait then it will also have the long 
wakeup latency in the idle case.  When you don't then you just killed higher
Turbo mode on that socket and give away a lot of performance on the other
cores.

So you have to solve the idle state governour issue anyways, and then
you likely don't need it anymore.

Besides it seems to me that dispatching is something the NIC should
just do directly. "RPS only CPU" would be essentially just an 
interrupt mitigation/flow redirection scheme that a lot of NICs
do anyways.

> In any event I agree with you, it's a cool idea at best, and likely
> not really practical.

s/cool//

-Andi
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 30, 2010, 10:30 p.m. UTC | #18
From: Andi Kleen <andi@firstfloor.org>
Date: Fri, 30 Apr 2010 23:01:31 +0200

> Besides it seems to me that dispatching is something the NIC should
> just do directly. "RPS only CPU" would be essentially just an 
> interrupt mitigation/flow redirection scheme that a lot of NICs
> do anyways.

We've already established that the NIC can't do a complete job in all
important cases, that's why we've integrated the RPS/RFS patches in
the first place.

And we don't want it to, because the decision mechanisms for steering
that we using now are starting to get into the stateful territory and
that's verboten for NIC offload as far as we're concerned.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 30, 2010, 11:38 p.m. UTC | #19
From: Andi Kleen <ak@gargoyle.fritz.box>
Date: Thu, 29 Apr 2010 23:41:44 +0200

>     Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies
> 
>     XXX: probably too aggressive, some of these sleeps are not under high load.
> 
>     Based on a bug report from Eric Dumazet.
>     
>     Signed-off-by: Andi Kleen <ak@linux.intel.com>

I like this, except that we probably don't want the delayacct_blkio_*() calls
these things do.

Probably the rest of what these things do should remain in the io_schedule*()
functions and the block layer can call it's own versions which add in the
delayacct_blkio_*() bits.

Or, if the delacct stuff is useful for socket I/O too, then it's interfaces
names should have the "blk" stripped from them :-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andi Kleen May 1, 2010, 10:53 a.m. UTC | #20
> And we don't want it to, because the decision mechanisms for steering
> that we using now are starting to get into the stateful territory and
> that's verbotton for NIC offload as far as we're concerned.

Huh? I thought full TCP offload was forbidden?[1] Statefull as in NIC 
(or someone else like netfilter) tracking flows is quite common and very far 
from full offload. AFAIK it doesn't have near all the problems full
offload has.

-Andi

[1] although it seems to leak in more and more through the RDMA backdoor.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andi Kleen May 1, 2010, 11 a.m. UTC | #21
On Fri, Apr 30, 2010 at 04:38:57PM -0700, David Miller wrote:
> From: Andi Kleen <ak@gargoyle.fritz.box>
> Date: Thu, 29 Apr 2010 23:41:44 +0200
> 
> >     Use io_schedule() in network stack to tell cpuidle governor to guarantee lower latencies
> > 
> >     XXX: probably too aggressive, some of these sleeps are not under high load.
> > 
> >     Based on a bug report from Eric Dumazet.
> >     
> >     Signed-off-by: Andi Kleen <ak@linux.intel.com>
> 
> I like this, except that we probably don't want the delayacct_blkio_*() calls
> these things do.

Yes.

It needs more work, please don't apply it yet, to handle the "long sleep" case.

Still curious if it fixes Eric's test case.

> 
> Probably the rest of what these things do should remain in the io_schedule*()
> functions and the block layer can call its own versions which add in the
> delayacct_blkio_*() bits.

Good point.

> 
> Or, if the delayacct stuff is useful for socket I/O too, then its interface
> names should have the "blk" stripped from them :-)

Good question. I suspect it's actually useful for some cases, but just adding
sockets might confuse some users.

-Andi
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin Josefsson May 1, 2010, 8:31 p.m. UTC | #22
On Fri, 30 Apr 2010, David Miller wrote:

> Then we can do cool tricks like having the cpu spin on a mwait() on the
> network device's status descriptor in memory.

Can you have mwait monitor multiple cachelines for stores? If not then it 
might be hard to do that when you have multiple nics and you actually 
need to use the status descriptors, otherwise you could possibly have them 
all written to the same cacheline. 
Or if the nic doesn't support updating a status descriptor in memory.

If you just want to wake up quickly without using interrupts it might be 
possible to abuse MSI to wake up without actually using interrupts, set 
the address to the cacheline that is being monitored.

/Martin
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller May 1, 2010, 10:03 p.m. UTC | #23
From: Andi Kleen <andi@firstfloor.org>
Date: Sat, 1 May 2010 12:53:04 +0200

>> And we don't want it to, because the decision mechanisms for steering
>> that we are using now are starting to get into the stateful territory and
>> that's verboten for NIC offload as far as we're concerned.
> 
> Huh? I thought full TCP offload was forbidden?[1] Stateful as in NIC 
> (or someone else like netfilter) tracking flows is quite common and very far 
> from full offload. AFAIK it doesn't have near all the problems full
> offload has.

We're tracking flow cpu location state at the socket operations, like
recvmsg() and sendmsg(), where it belongs.

Would you like us to call into the card drivers and firmware at these
spots instead?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller May 1, 2010, 10:13 p.m. UTC | #24
From: Martin Josefsson <gandalf@mjufs.se>
Date: Sat, 1 May 2010 22:31:05 +0200 (CEST)

> On Fri, 30 Apr 2010, David Miller wrote:
> 
>> Then we can do cool tricks like having the cpu spin on a mwait() on
>> the
>> network device's status descriptor in memory.
> 
> Can you have mwait monitor multiple cachelines for stores?

The idea is that if you have hundreds of cpu threads (several of my
machines do, and it's not too long before these kinds of boxes will be
common) in your machine you can spare one for each NIC.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andi Kleen May 1, 2010, 10:58 p.m. UTC | #25
> We're tracking flow cpu location state at the socket operations, like
> recvmsg() and sendmsg(), where it belongs.
> 
> Would you like us to call into the card drivers and firmware at these
> spots instead?

No, that's not needed for lazy flow tracking like in netfilter or 
some NICs, it doesn't need exact updates. It just works with seen network 
packets. 

-Andi
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller May 1, 2010, 11:29 p.m. UTC | #26
From: Andi Kleen <andi@firstfloor.org>
Date: Sun, 2 May 2010 00:58:15 +0200

>> We're tracking flow cpu location state at the socket operations, like
>> recvmsg() and sendmsg(), where it belongs.
>> 
>> Would you like us to call into the card drivers and firmware at these
>> spots instead?
> 
> No, that's not needed for lazy flow tracking like in netfilter or 
> some NICs, it doesn't need exact updates. It just works with seen network 
> packets. 

Well what we need is exact flow updates so that we steer packets
to where the applications actually are.

Andi, this discussion is going in circles, can I just say "yeah you're
right Andi" and this will satisfy your desire to be correct and we can
be done with this?

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ben Hutchings May 1, 2010, 11:44 p.m. UTC | #27
On Sat, 2010-05-01 at 15:03 -0700, David Miller wrote:
> From: Andi Kleen <andi@firstfloor.org>
> Date: Sat, 1 May 2010 12:53:04 +0200
> 
> >> And we don't want it to, because the decision mechanisms for steering
> >> that we are using now are starting to get into the stateful territory and
> >> that's verboten for NIC offload as far as we're concerned.
> > 
> > Huh? I thought full TCP offload was forbidden?[1] Stateful as in NIC 
> > (or someone else like netfilter) tracking flows is quite common and very far 
> > from full offload. AFAIK it doesn't have near all the problems full
> > offload has.
> 
> We're tracking flow cpu location state at the socket operations, like
> recvmsg() and sendmsg(), where it belongs.
> 
> Would you like us to call into the card drivers and firmware at these
> spots instead?

I'm interested in experimenting with this at some point, since our
hardware supports a fairly large number of filters that could be used
for it.

Ben.
diff mbox

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3c5ed5f..6ae9f2b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1387,6 +1387,7 @@  struct softnet_data {
 	struct Qdisc		*output_queue;
 	struct list_head	poll_list;
 	struct sk_buff		*completion_queue;
+	struct sk_buff_head	process_queue;
 
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
@@ -1401,10 +1402,11 @@  struct softnet_data {
 	struct napi_struct	backlog;
 };
 
-static inline void input_queue_head_incr(struct softnet_data *sd)
+static inline void input_queue_head_add(struct softnet_data *sd,
+					unsigned int len)
 {
 #ifdef CONFIG_RPS
-	sd->input_queue_head++;
+	sd->input_queue_head += len;
 #endif
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index a4a7c36..c1585f9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2409,12 +2409,13 @@  static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	__get_cpu_var(netdev_rx_stat).total++;
 
 	rps_lock(sd);
-	if (sd->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (sd->input_pkt_queue.qlen) {
+	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
 #ifdef CONFIG_RPS
-			*qtail = sd->input_queue_head + sd->input_pkt_queue.qlen;
+			*qtail = sd->input_queue_head +
+					skb_queue_len(&sd->input_pkt_queue);
 #endif
 			rps_unlock(sd);
 			local_irq_restore(flags);
@@ -2934,13 +2935,21 @@  static void flush_backlog(void *arg)
 	struct sk_buff *skb, *tmp;
 
 	rps_lock(sd);
-	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp)
+	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 		if (skb->dev == dev) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
 			kfree_skb(skb);
-			input_queue_head_incr(sd);
+			input_queue_head_add(sd, 1);
 		}
+	}
 	rps_unlock(sd);
+
+	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+		if (skb->dev == dev) {
+			__skb_unlink(skb, &sd->process_queue);
+			kfree_skb(skb);
+		}
+	}
 }
 
 static int napi_gro_complete(struct sk_buff *skb)
@@ -3286,24 +3295,30 @@  static int process_backlog(struct napi_struct *napi, int quota)
 	}
 #endif
 	napi->weight = weight_p;
-	do {
+	local_irq_disable();
+	while (1) {
 		struct sk_buff *skb;
 
-		local_irq_disable();
+		while ((skb = __skb_dequeue(&sd->process_queue))) {
+			local_irq_enable();
+			__netif_receive_skb(skb);
+			if (++work >= quota)
+				return work;
+			local_irq_disable();
+		}
+
 		rps_lock(sd);
-		skb = __skb_dequeue(&sd->input_pkt_queue);
-		if (!skb) {
+		input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue));
+		skb_queue_splice_tail_init(&sd->input_pkt_queue,
+					   &sd->process_queue);
+		if (skb_queue_empty(&sd->process_queue)) {
 			__napi_complete(napi);
 			rps_unlock(sd);
-			local_irq_enable();
 			break;
 		}
-		input_queue_head_incr(sd);
 		rps_unlock(sd);
-		local_irq_enable();
-
-		__netif_receive_skb(skb);
-	} while (++work < quota);
+	}
+	local_irq_enable();
 
 	return work;
 }
@@ -5631,8 +5646,10 @@  static int dev_cpu_callback(struct notifier_block *nfb,
 	/* Process offline CPU's input_pkt_queue */
 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
-		input_queue_head_incr(oldsd);
+		input_queue_head_add(oldsd, 1);
 	}
+	while ((skb = __skb_dequeue(&oldsd->process_queue)))
+		netif_rx(skb);
 
 	return NOTIFY_OK;
 }
@@ -5851,6 +5868,7 @@  static int __init net_dev_init(void)
 		struct softnet_data *sd = &per_cpu(softnet_data, i);
 
 		skb_queue_head_init(&sd->input_pkt_queue);
+		skb_queue_head_init(&sd->process_queue);
 		sd->completion_queue = NULL;
 		INIT_LIST_HEAD(&sd->poll_list);