
[net,3/4] tcp: add tcp_tx_skb_cache sysctl

Message ID 20190614232221.248392-4-edumazet@google.com
State Accepted
Delegated to: David Miller
Series tcp: add three static keys

Commit Message

Eric Dumazet June 14, 2019, 11:22 p.m. UTC
Feng Tang reported a performance regression after the introduction
of per-TCP-socket tx/rx caches, for TCP over loopback (netperf).

There is a high chance the regression is caused by a change in how
well the 32 KB per-thread page (current->task_frag) can be recycled,
and by the lack of pcp caches for order-3 pages.

I could not reproduce the regression myself; the cpus were all
spinning on the mm spinlocks for page allocation/freeing, regardless
of whether the per-TCP-socket caches were enabled or disabled.

It seems best to disable the feature by default, and let
admins enable it explicitly.
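
With this patch applied, an admin who wants the cache can turn it
back on at runtime, e.g. with "sysctl -w net.ipv4.tcp_tx_skb_cache=1"
or by writing 1 to /proc/sys/net/ipv4/tcp_tx_skb_cache (the path
follows from the ipv4_table entry added below).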

The MM layer either needs to provide scalable order-3 page
allocations, or could attempt a trylock on zone->lock if the caller
only opportunistically requests a high-order page and is able to
fall back to order-0 pages under pressure.
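
For reference, the allocation pattern in play looks roughly like the
sketch below (simplified from skb_page_frag_refill(), not the literal
kernel code; the order value is an assumption of 4 KB base pages):
the order-3 attempt is made without direct reclaim, and an order-0
page is the fallback under pressure.

#include <linux/gfp.h>

#define SKB_FRAG_PAGE_ORDER	3	/* 32 KB with 4 KB base pages */

static struct page *frag_alloc(gfp_t gfp)
{
	struct page *page;

	/* Opportunistic order-3 attempt: no direct reclaim, no retries,
	 * no failure warnings.  This is the path that ends up contending
	 * on zone->lock when there are no pcp caches for order-3 pages.
	 */
	page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
			   __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY,
			   SKB_FRAG_PAGE_ORDER);
	if (page)
		return page;

	/* Fall back to a single order-0 page under memory pressure. */
	return alloc_pages(gfp, 0);
}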

Tests were run on a 56-core host (112 hyper threads):

-	35.49%	netperf 		 [kernel.vmlinux]	  [k] queued_spin_lock_slowpath
   - 35.49% queued_spin_lock_slowpath
	  - 18.18% get_page_from_freelist
		 - __alloc_pages_nodemask
			- 18.18% alloc_pages_current
				 skb_page_frag_refill
				 sk_page_frag_refill
				 tcp_sendmsg_locked
				 tcp_sendmsg
				 inet_sendmsg
				 sock_sendmsg
				 __sys_sendto
				 __x64_sys_sendto
				 do_syscall_64
				 entry_SYSCALL_64_after_hwframe
				 __libc_send
	  + 17.31% __free_pages_ok
+	31.43%	swapper 		 [kernel.vmlinux]	  [k] intel_idle
+	 9.12%	netperf 		 [kernel.vmlinux]	  [k] copy_user_enhanced_fast_string
+	 6.53%	netserver		 [kernel.vmlinux]	  [k] copy_user_enhanced_fast_string
+	 0.69%	netserver		 [kernel.vmlinux]	  [k] queued_spin_lock_slowpath
+	 0.68%	netperf 		 [kernel.vmlinux]	  [k] skb_release_data
+	 0.52%	netperf 		 [kernel.vmlinux]	  [k] tcp_sendmsg_locked
	 0.46%	netperf 		 [kernel.vmlinux]	  [k] _raw_spin_lock_irqsave

Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Feng Tang <feng.tang@intel.com>
---
 include/net/sock.h         | 4 +++-
 net/ipv4/sysctl_net_ipv4.c | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

Comments

Feng Tang June 16, 2019, 7:42 a.m. UTC | #1
Hi Eric,

On Fri, Jun 14, 2019 at 04:22:20PM -0700, Eric Dumazet wrote:
> Feng Tang reported a performance regression after the introduction
> of per-TCP-socket tx/rx caches, for TCP over loopback (netperf).
> 
> There is a high chance the regression is caused by a change in how
> well the 32 KB per-thread page (current->task_frag) can be recycled,
> and by the lack of pcp caches for order-3 pages.

Exactly! When I checked the regression, I ran several experiments
and thought of a similar idea: adding a per-CPU order-X pcp list.
The other idea was to add an order-3 list to the per-cpu
softnet_data as a local cache.
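
To make the first idea concrete, a rough, purely illustrative sketch
of a per-CPU order-3 page cache could look like the below (names,
depth and locking choice are made up; this is not code from the patch
or from mainline):

#include <linux/bottom_half.h>
#include <linux/gfp.h>
#include <linux/percpu.h>

#define FRAG_PAGE_ORDER		3	/* 32 KB with 4 KB base pages */
#define FRAG_CACHE_DEPTH	8	/* arbitrary per-CPU depth */

struct frag_page_cache {
	struct page	*pages[FRAG_CACHE_DEPTH];
	unsigned int	count;
};
static DEFINE_PER_CPU(struct frag_page_cache, frag_page_cache);

static struct page *frag_cache_get(gfp_t gfp)
{
	struct frag_page_cache *c;
	struct page *page = NULL;

	local_bh_disable();		/* serialize with softirq users */
	c = this_cpu_ptr(&frag_page_cache);
	if (c->count)
		page = c->pages[--c->count];
	local_bh_enable();

	if (page)
		return page;

	/* Cache miss: fall back to the buddy allocator (zone->lock). */
	return alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
			   __GFP_NOWARN | __GFP_NORETRY, FRAG_PAGE_ORDER);
}

static void frag_cache_put(struct page *page)
{
	struct frag_page_cache *c;
	bool cached = false;

	local_bh_disable();
	c = this_cpu_ptr(&frag_page_cache);
	if (c->count < FRAG_CACHE_DEPTH) {
		c->pages[c->count++] = page;
		cached = true;
	}
	local_bh_enable();

	if (!cached)
		__free_pages(page, FRAG_PAGE_ORDER);
}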

Thanks,
Feng


Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index b02645e2dfad722769c1455bcde76e46da9fc5ac..7d7f4ce63bb2aae7c87a9445d11339b6e6b19724 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1463,12 +1463,14 @@  static inline void sk_mem_uncharge(struct sock *sk, int size)
 		__sk_mem_reclaim(sk, 1 << 20);
 }
 
+DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
 	sk->sk_wmem_queued -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
-	if (!sk->sk_tx_skb_cache && !skb_cloned(skb)) {
+	if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
+	    !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
 		skb_zcopy_clear(skb, true);
 		sk->sk_tx_skb_cache = skb;
 		return;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 886b58d31351df44725bdc34081e798bcb89ecf0..08a428a7b2749c4f2a03aa6352e44c053596ef75 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -54,6 +54,8 @@  static int one_day_secs = 24 * 3600;
 DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
 EXPORT_SYMBOL(tcp_rx_skb_cache_key);
 
+DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
+
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
 
@@ -568,6 +570,12 @@  static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_do_static_key,
 	},
+	{
+		.procname	= "tcp_tx_skb_cache",
+		.data		= &tcp_tx_skb_cache_key.key,
+		.mode		= 0644,
+		.proc_handler	= proc_do_static_key,
+	},
 	{ }
 };