[RFC] csum experts, csum_replace2() is too expensive

Message ID 1395406250.9114.142.camel@edumazet-glaptop2.roam.corp.google.com
State RFC, archived
Delegated to: David Miller

Commit Message

Eric Dumazet March 21, 2014, 12:50 p.m. UTC
On Thu, 2014-03-20 at 18:56 -0700, Andi Kleen wrote:
> Eric Dumazet <eric.dumazet@gmail.com> writes:
> >
> > I saw csum_partial() consuming 1% of cpu cycles in a GRO workload, that
> > is insane...
> 
> 
> Couldn't it just be the cache miss?

Or the fact that we mix 16 bit stores and 32bit loads ?

iph->tot_len = newlen;
iph->check = 0;
iph->check = ip_fast_csum(iph, 5);

-> following perf annotation:

         :      ffffffff81538e70 <inet_gro_complete>:
    0.49 :      ffffffff81538e70:       callq  ffffffff81578c80 <__entry_text_start>
    0.49 :      ffffffff81538e75:       push   %rbp
    1.46 :      ffffffff81538e76:       movzwl 0x60(%rdi),%edx
    0.00 :      ffffffff81538e7a:       movslq %esi,%rax
    0.00 :      ffffffff81538e7d:       add    0xc8(%rdi),%rax
    1.46 :      ffffffff81538e84:       mov    %rsp,%rbp
    0.00 :      ffffffff81538e87:       sub    %esi,%edx
    0.00 :      ffffffff81538e89:       rol    $0x8,%dx
    0.00 :      ffffffff81538e8d:       movzbl 0x9(%rax),%ecx
    0.98 :      ffffffff81538e91:       mov    %dx,0x2(%rax)     iph->tot_len = newlen;
    0.49 :      ffffffff81538e95:       xor    %edx,%edx
    0.00 :      ffffffff81538e97:       mov    %dx,0xa(%rax)     iph->check = 0;
    0.00 :      ffffffff81538e9b:       mov    (%rax),%edx       // 32bit load  -> stall 
   86.34 :      ffffffff81538e9d:       add    0x4(%rax),%edx
    0.49 :      ffffffff81538ea0:       adc    0x8(%rax),%edx
    0.49 :      ffffffff81538ea3:       adc    0xc(%rax),%edx
    0.98 :      ffffffff81538ea6:       adc    0x10(%rax),%edx
    0.49 :      ffffffff81538ea9:       adc    $0x0,%edx
    0.00 :      ffffffff81538eac:       mov    %edx,%r8d
    0.49 :      ffffffff81538eaf:       xor    %dx,%dx
    0.00 :      ffffffff81538eb2:       shl    $0x10,%r8d
    0.98 :      ffffffff81538eb6:       add    %r8d,%edx
    0.98 :      ffffffff81538eb9:       adc    $0xffff,%edx
    0.98 :      ffffffff81538ebf:       not    %edx
    0.00 :      ffffffff81538ec1:       shr    $0x10,%edx
    0.49 :      ffffffff81538ec4:       mov    %dx,0xa(%rax)
    0.00 :      ffffffff81538ec8:       movslq %ecx,%rax
    0.00 :      ffffffff81538ecb:       mov    -0x7e734f40(,%rax,8),%rax
    0.00 :      ffffffff81538ed3:       test   %rax,%rax
    0.00 :      ffffffff81538ed6:       je     ffffffff81538ef0 <inet_gro_complete+0x80>

BTW, any idea why ip_fast_csum() on x86 contains a "memory" constraint ?
It looks like a barrier() would be more appropriate.

The following patch seems to help, but the stall comes from writing
iph->check (16 bits) and then immediately reading the header back with
32-bit loads.

Note that we could share the same ip_fast_csum() implementation between
x86 32-bit and 64-bit...
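
For illustration, a minimal sketch of what a shared, architecture-neutral
version could look like, written here as plain C with 32-bit word loads and
one's-complement folding (illustrative names, not actual kernel code):

	#include <stdint.h>

	/* Sum 'ihl' 32-bit words of the IP header and fold the result
	 * down to the 16-bit one's-complement checksum. */
	static uint16_t ip_fast_csum_sketch(const void *iph, unsigned int ihl)
	{
		const uint32_t *p = iph;
		uint64_t sum = 0;
		unsigned int i;

		for (i = 0; i < ihl; i++)
			sum += p[i];			/* carries collect in the upper half */

		sum = (sum & 0xffffffff) + (sum >> 32);	/* fold 64 -> 32 */
		sum = (sum & 0xffff) + (sum >> 16);	/* fold 32 -> 16 */
		sum = (sum & 0xffff) + (sum >> 16);	/* absorb the last carry */
		return (uint16_t)~sum;
	}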




Comments

Andi Kleen March 21, 2014, 1:28 p.m. UTC | #1
On Fri, Mar 21, 2014 at 05:50:50AM -0700, Eric Dumazet wrote:
> On Thu, 2014-03-20 at 18:56 -0700, Andi Kleen wrote:
> > Eric Dumazet <eric.dumazet@gmail.com> writes:
> > >
> > > I saw csum_partial() consuming 1% of cpu cycles in a GRO workload, that
> > > is insane...
> > 
> > 
> > Couldn't it just be the cache miss?
> 
> Or the fact that we mix 16 bit stores and 32bit loads ?

It should cause a small stall from the missed store-to-load
forwarding, but 1% of a serious workload would be surprising.

Are you sure it's not some skid effect?

-Andi
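
One way to check outside the kernel is a userspace loop that reproduces the
narrow-store/wide-load pattern. A rough micro-benchmark sketch (assumed
struct layout and names; absolute numbers will vary by microarchitecture;
build with "gcc -O2 bench.c"):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <time.h>

	struct hdr {				/* 20-byte IPv4 header layout */
		uint8_t  vihl, tos;
		uint16_t tot_len;
		uint16_t id, frag_off;
		uint8_t  ttl, protocol;
		uint16_t check;
		uint32_t saddr, daddr;
	};

	int main(void)
	{
		static struct hdr h;
		uint32_t sink = 0;
		struct timespec t0, t1;
		long i, iters = 100 * 1000 * 1000L;

		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (i = 0; i < iters; i++) {
			uint32_t w[5];

			h.check = 0;			/* 16-bit store...            */
			memcpy(w, &h, sizeof(w));	/* ...then wide loads over it */
			sink += w[0] + w[1] + w[2] + w[3] + w[4];
			asm volatile("" : "+m" (h), "+r" (sink)); /* keep the loop honest */
		}
		clock_gettime(CLOCK_MONOTONIC, &t1);
		printf("%.2f ns/iter (sink %u)\n",
		       ((t1.tv_sec - t0.tv_sec) * 1e9 +
			(t1.tv_nsec - t0.tv_nsec)) / iters, sink);
		return 0;
	}

Moving the h.check store after the loads (or widening it to 32 bits) should
make the per-iteration cost drop if store-to-load forwarding is the culprit.
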
Eric Dumazet March 21, 2014, 1:32 p.m. UTC | #2
On Fri, 2014-03-21 at 05:50 -0700, Eric Dumazet wrote:

> Or the fact that we mix 16 bit stores and 32bit loads ?
> 
> iph->tot_len = newlen;
> iph->check = 0;
> iph->check = ip_fast_csum(iph, 5);

Yep definitely. Using 16 bit loads in ip_fast_csum() totally removes the
stall. I no longer see inet_gro_complete() in perf top...

+       if (__builtin_constant_p(ihl) && ihl == 5) {
+               asm("  movzwl (%[iph]), %[sum]\n"       /* ihl/version/tos; zero-extend */
+                   "  addw 2(%[iph]), %w[sum]\n"       /* tot_len      */
+                   "  adcw 8(%[iph]), %w[sum]\n"       /* ttl/protocol */
+                   "  adcw 10(%[iph]), %w[sum]\n"      /* check        */
+                   "  adcl 4(%[iph]), %[sum]\n"        /* id/frag_off  */
+                   "  adcl 12(%[iph]), %[sum]\n"       /* saddr        */
+                   "  adcl 16(%[iph]), %[sum]\n"       /* daddr        */
+                   "  adcl $0, %[sum]\n"
+                   : [sum] "=r" (sum)
+                   : [iph] "r" (iph)
+                   );
+               return csum_fold(sum);


David Laight March 21, 2014, 2:14 p.m. UTC | #3
From: Eric Dumazet
> On Thu, 2014-03-20 at 18:56 -0700, Andi Kleen wrote:
> > Eric Dumazet <eric.dumazet@gmail.com> writes:
> > >
> > > I saw csum_partial() consuming 1% of cpu cycles in a GRO workload, that
> > > is insane...
> >
> > Couldn't it just be the cache miss?
>
> Or the fact that we mix 16 bit stores and 32bit loads ?
>
> BTW, any idea why ip_fast_csum() on x86 contains a "memory" constraint ?

The correct constraint would be one that tells gcc that the asm
accesses the 20 bytes at the source pointer.

Without it gcc won't necessarily write out the values before
the asm instructions execute.

	David
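
One way to express that without the blanket clobber is a sized memory input,
using the documented gcc array-pointer cast idiom. A sketch against the
ihl == 5 fast path (function name assumed, not from the patch):

	static inline unsigned int csum_ihl5(const void *iph)
	{
		unsigned int sum;

		asm("  movl (%[iph]), %[sum]\n"
		    "  addl 4(%[iph]), %[sum]\n"
		    "  adcl 8(%[iph]), %[sum]\n"
		    "  adcl 12(%[iph]), %[sum]\n"
		    "  adcl 16(%[iph]), %[sum]\n"
		    "  adcl $0, %[sum]\n"
		    : [sum] "=r" (sum)
		    : [iph] "r" (iph),
		      "m" (*(const char (*)[20])iph));	/* exactly the 20 bytes read */
		return sum;
	}

Telling gcc the asm reads those 20 bytes forces pending stores such as
iph->check = 0 to be emitted first, without invalidating every other
cached memory value the way "memory" (or barrier()) does.
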
David Miller March 21, 2014, 6:52 p.m. UTC | #4
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 21 Mar 2014 05:50:50 -0700

> It looks like a barrier() would be more appropriate.

barrier() == __asm__ __volatile__(:::"memory")

Patch

diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index e6fd8a026c7b..a81cc3a5facc 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -46,6 +46,24 @@  static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 {
 	unsigned int sum;
 
+	/*
+	 * Callers might clear iph->check before calling us, make sure
+	 * the compiler won't mess things up...
+	 */
+	barrier();
+
+	if (__builtin_constant_p(ihl) && ihl == 5) {
+		asm("  movl (%[iph]), %[sum]\n"
+		    "  addl 4(%[iph]), %[sum]\n"
+		    "  adcl 8(%[iph]), %[sum]\n"
+		    "  adcl 12(%[iph]), %[sum]\n"
+		    "  adcl 16(%[iph]), %[sum]\n"
+		    "  adcl $0, %[sum]\n"
+		    : [sum] "=r" (sum)
+		    : [iph] "r" (iph)
+		    );
+		return csum_fold(sum);
+	}
 	asm("  movl (%1), %0\n"
 	    "  subl $4, %2\n"
 	    "  jbe 2f\n"
@@ -68,7 +86,7 @@  static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 	   will assume they contain their original values. */
 	    : "=r" (sum), "=r" (iph), "=r" (ihl)
 	    : "1" (iph), "2" (ihl)
-	    : "memory");
+	    );
 	return (__force __sum16)sum;
 }
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8c54870db792..90aa562dfbf5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1434,8 +1434,9 @@  static int inet_gro_complete(struct sk_buff *skb, int nhoff)
 	int proto = iph->protocol;
 	int err = -ENOSYS;
 
-	csum_replace2(&iph->check, iph->tot_len, newlen);
 	iph->tot_len = newlen;
+	iph->check = 0;
+	iph->check = ip_fast_csum((u8 *)iph, 5);
 
 	rcu_read_lock();
 	ops = rcu_dereference(inet_offloads[proto]);
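
For reference, the csum_replace2() call removed above performs an RFC 1624
style incremental update: fold the old 16-bit value out of the checksum and
the new one in. A hedged sketch of that logic (not the kernel's helper; the
helper of that era built a two-halfword diff buffer and fed it to
csum_partial(), which is how csum_partial() ends up in the GRO profile):

	static void csum_replace2_sketch(uint16_t *check, uint16_t from, uint16_t to)
	{
		uint32_t sum = (uint16_t)~*check;	/* recover the 1's-complement sum */

		sum += (uint16_t)~from;			/* fold the old value out */
		sum += to;				/* fold the new value in */
		sum = (sum & 0xffff) + (sum >> 16);	/* re-fold the carries */
		sum = (sum & 0xffff) + (sum >> 16);
		*check = (uint16_t)~sum;
	}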