diff mbox series

powerpc: Force inlining of csum_add()

Message ID f7f4d4e364de6e473da874468b903da6e5d97adc.1620713272.git.christophe.leroy@csgroup.eu
State New
Headers show
Series powerpc: Force inlining of csum_add() | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (fcc98c6d0289241dded10b74f8198fc4ecb22bd1)
snowpatch_ozlabs/build-ppc64le success Build succeeded
snowpatch_ozlabs/build-ppc64be success Build succeeded
snowpatch_ozlabs/build-ppc64e success Build succeeded
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/checkpatch warning total: 0 errors, 1 warnings, 0 checks, 8 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Christophe Leroy May 11, 2021, 6:08 a.m. UTC
Commit 328e7e487a46 ("powerpc: force inlining of csum_partial() to
avoid multiple csum_partial() with GCC10") inlined csum_partial().

Now that csum_partial() is inlined, GCC outlines csum_add() when
called by csum_partial().

c064fb28 <csum_add>:
c064fb28:	7c 63 20 14 	addc    r3,r3,r4
c064fb2c:	7c 63 01 94 	addze   r3,r3
c064fb30:	4e 80 00 20 	blr

c0665fb8 <csum_add>:
c0665fb8:	7c 63 20 14 	addc    r3,r3,r4
c0665fbc:	7c 63 01 94 	addze   r3,r3
c0665fc0:	4e 80 00 20 	blr

c066719c:	7c 9a c0 2e 	lwzx    r4,r26,r24
c06671a0:	38 60 00 00 	li      r3,0
c06671a4:	7f 1a c2 14 	add     r24,r26,r24
c06671a8:	4b ff ee 11 	bl      c0665fb8 <csum_add>
c06671ac:	80 98 00 04 	lwz     r4,4(r24)
c06671b0:	4b ff ee 09 	bl      c0665fb8 <csum_add>
c06671b4:	80 98 00 08 	lwz     r4,8(r24)
c06671b8:	4b ff ee 01 	bl      c0665fb8 <csum_add>
c06671bc:	a0 98 00 0c 	lhz     r4,12(r24)
c06671c0:	4b ff ed f9 	bl      c0665fb8 <csum_add>
c06671c4:	7c 63 18 f8 	not     r3,r3
c06671c8:	81 3f 00 68 	lwz     r9,104(r31)
c06671cc:	81 5f 00 a0 	lwz     r10,160(r31)
c06671d0:	7d 29 18 14 	addc    r9,r9,r3
c06671d4:	7d 29 01 94 	addze   r9,r9
c06671d8:	91 3f 00 68 	stw     r9,104(r31)
c06671dc:	7d 1a 50 50 	subf    r8,r26,r10
c06671e0:	83 01 00 10 	lwz     r24,16(r1)
c06671e4:	83 41 00 18 	lwz     r26,24(r1)

The sum with 0 is useless, should have been skipped.
And there is even one completely unused instance of csum_add().

In file included from ./include/net/checksum.h:22,
                 from ./include/linux/skbuff.h:28,
                 from ./include/linux/icmp.h:16,
                 from net/ipv6/ip6_tunnel.c:23:
./arch/powerpc/include/asm/checksum.h: In function '__ip6_tnl_rcv':
./arch/powerpc/include/asm/checksum.h:94:22: warning: inlining failed in call to 'csum_add': call is unlikely and code size would grow [-Winline]
   94 | static inline __wsum csum_add(__wsum csum, __wsum addend)
      |                      ^~~~~~~~
./arch/powerpc/include/asm/checksum.h:172:31: note: called from here
  172 |                         sum = csum_add(sum, (__force __wsum)*(const u32 *)buff);
      |                               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
./arch/powerpc/include/asm/checksum.h:94:22: warning: inlining failed in call to 'csum_add': call is unlikely and code size would grow [-Winline]
   94 | static inline __wsum csum_add(__wsum csum, __wsum addend)
      |                      ^~~~~~~~
./arch/powerpc/include/asm/checksum.h:177:31: note: called from here
  177 |                         sum = csum_add(sum, (__force __wsum)
      |                               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  178 |                                             *(const u32 *)(buff + 4));
      |                                             ~~~~~~~~~~~~~~~~~~~~~~~~~
./arch/powerpc/include/asm/checksum.h:94:22: warning: inlining failed in call to 'csum_add': call is unlikely and code size would grow [-Winline]
   94 | static inline __wsum csum_add(__wsum csum, __wsum addend)
      |                      ^~~~~~~~
./arch/powerpc/include/asm/checksum.h:183:31: note: called from here
  183 |                         sum = csum_add(sum, (__force __wsum)
      |                               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  184 |                                             *(const u32 *)(buff + 8));
      |                                             ~~~~~~~~~~~~~~~~~~~~~~~~~
./arch/powerpc/include/asm/checksum.h:94:22: warning: inlining failed in call to 'csum_add': call is unlikely and code size would grow [-Winline]
   94 | static inline __wsum csum_add(__wsum csum, __wsum addend)
      |                      ^~~~~~~~
./arch/powerpc/include/asm/checksum.h:186:31: note: called from here
  186 |                         sum = csum_add(sum, (__force __wsum)
      |                               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  187 |                                             *(const u16 *)(buff + 12));
      |                                             ~~~~~~~~~~~~~~~~~~~~~~~~~~

Force inlining of csum_add().

     94c:	80 df 00 a0 	lwz     r6,160(r31)
     950:	7d 28 50 2e 	lwzx    r9,r8,r10
     954:	7d 48 52 14 	add     r10,r8,r10
     958:	80 aa 00 04 	lwz     r5,4(r10)
     95c:	80 ff 00 68 	lwz     r7,104(r31)
     960:	7d 29 28 14 	addc    r9,r9,r5
     964:	7d 29 01 94 	addze   r9,r9
     968:	7d 08 30 50 	subf    r8,r8,r6
     96c:	80 aa 00 08 	lwz     r5,8(r10)
     970:	a1 4a 00 0c 	lhz     r10,12(r10)
     974:	7d 29 28 14 	addc    r9,r9,r5
     978:	7d 29 01 94 	addze   r9,r9
     97c:	7d 29 50 14 	addc    r9,r9,r10
     980:	7d 29 01 94 	addze   r9,r9
     984:	7d 29 48 f8 	not     r9,r9
     988:	7c e7 48 14 	addc    r7,r7,r9
     98c:	7c e7 01 94 	addze   r7,r7
     990:	90 ff 00 68 	stw     r7,104(r31)

In the non-inlined version, the first sum with 0 was performed.
Here it is skipped.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/checksum.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Comments

Segher Boessenkool May 11, 2021, 10:51 a.m. UTC | #1
Hi!

On Tue, May 11, 2021 at 06:08:06AM +0000, Christophe Leroy wrote:
> Commit 328e7e487a46 ("powerpc: force inlining of csum_partial() to
> avoid multiple csum_partial() with GCC10") inlined csum_partial().
> 
> Now that csum_partial() is inlined, GCC outlines csum_add() when
> called by csum_partial().

> c064fb28 <csum_add>:
> c064fb28:	7c 63 20 14 	addc    r3,r3,r4
> c064fb2c:	7c 63 01 94 	addze   r3,r3
> c064fb30:	4e 80 00 20 	blr

Could you build this with -fdump-tree-einline-all and send me the
results?  Or open a GCC PR yourself :-)

Something seems to have decided this asm is more expensive than it is.
That isn't always avoidable -- the compiler cannot look inside asms --
but it seems it could be improved here.

Do you have (or can make) a self-contained testcase?

> The sum with 0 is useless, should have been skipped.

That isn't something the compiler can do anything about (not sure if you
were suggesting that); it has to be done in the user code (and it tries
to already, see below).

> And there is even one completely unused instance of csum_add().

That is strange, that should never happen.

> ./arch/powerpc/include/asm/checksum.h: In function '__ip6_tnl_rcv':
> ./arch/powerpc/include/asm/checksum.h:94:22: warning: inlining failed in call to 'csum_add': call is unlikely and code size would grow [-Winline]
>    94 | static inline __wsum csum_add(__wsum csum, __wsum addend)
>       |                      ^~~~~~~~
> ./arch/powerpc/include/asm/checksum.h:172:31: note: called from here
>   172 |                         sum = csum_add(sum, (__force __wsum)*(const u32 *)buff);
>       |                               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

At least we say what happened.  Progress!  :-)

> In the non-inlined version, the first sum with 0 was performed.
> Here it is skipped.

That is because of how __builtin_constant_p works, most likely.  As we
discussed elsewhere it is evaluated before all forms of loop unrolling.

The patch looks perfect of course :-)

Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org>


Segher


> --- a/arch/powerpc/include/asm/checksum.h
> +++ b/arch/powerpc/include/asm/checksum.h
> @@ -91,7 +91,7 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
>  }
>  
>  #define HAVE_ARCH_CSUM_ADD
> -static inline __wsum csum_add(__wsum csum, __wsum addend)
> +static __always_inline __wsum csum_add(__wsum csum, __wsum addend)
>  {
>  #ifdef __powerpc64__
>  	u64 res = (__force u64)csum;
Christophe Leroy May 12, 2021, 12:56 p.m. UTC | #2
Hi,

Le 11/05/2021 à 12:51, Segher Boessenkool a écrit :
> Hi!
> 
> On Tue, May 11, 2021 at 06:08:06AM +0000, Christophe Leroy wrote:
>> Commit 328e7e487a46 ("powerpc: force inlining of csum_partial() to
>> avoid multiple csum_partial() with GCC10") inlined csum_partial().
>>
>> Now that csum_partial() is inlined, GCC outlines csum_add() when
>> called by csum_partial().
> 
>> c064fb28 <csum_add>:
>> c064fb28:	7c 63 20 14 	addc    r3,r3,r4
>> c064fb2c:	7c 63 01 94 	addze   r3,r3
>> c064fb30:	4e 80 00 20 	blr
> 
> Could you build this with -fdump-tree-einline-all and send me the
> results?  Or open a GCC PR yourself :-)

Ok, I'll forward it to you in a minute.

> 
> Something seems to have decided this asm is more expensive than it is.
> That isn't always avoidable -- the compiler cannot look inside asms --
> but it seems it could be improved here.
> 
> Do you have (or can make) a self-contained testcase?

I have not tried, and I fear it might be difficult, because on a kernel build with dozens of calls 
to csum_add(), only ip6_tunnel.o exhibits such an issue.

> 
>> The sum with 0 is useless, should have been skipped.
> 
> That isn't something the compiler can do anything about (not sure if you
> were suggesting that); it has to be done in the user code (and it tries
> to already, see below).

I was not suggesting that, only that when properly inlined the sum with 0 is skipped (because we put 
the necessary stuff in csum_add() of course).

> 
>> And there is even one completely unused instance of csum_add().
> 
> That is strange, that should never happen.

It seems that several .o include unused versions of csum_add. After the final link, one remains (in 
addition to the used one) in vmlinux.

> 
>> ./arch/powerpc/include/asm/checksum.h: In function '__ip6_tnl_rcv':
>> ./arch/powerpc/include/asm/checksum.h:94:22: warning: inlining failed in call to 'csum_add': call is unlikely and code size would grow [-Winline]
>>     94 | static inline __wsum csum_add(__wsum csum, __wsum addend)
>>        |                      ^~~~~~~~
>> ./arch/powerpc/include/asm/checksum.h:172:31: note: called from here
>>    172 |                         sum = csum_add(sum, (__force __wsum)*(const u32 *)buff);
>>        |                               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 
> At least we say what happened.  Progress!  :-)

Lol. I've seen this warning for long, that's not something new I guess.

> 
>> In the non-inlined version, the first sum with 0 was performed.
>> Here it is skipped.
> 
> That is because of how __builtin_constant_p works, most likely.  As we
> discussed elsewhere it is evaluated before all forms of loop unrolling.

But we are not talking about loop unrolling here, are we ?

It seems that the reason here is that __builtin_constant_p() is evaluated long after GCC decided to 
not inline that call to csum_add().

Christophe
Segher Boessenkool May 12, 2021, 2:31 p.m. UTC | #3
On Wed, May 12, 2021 at 02:56:56PM +0200, Christophe Leroy wrote:
> Le 11/05/2021 à 12:51, Segher Boessenkool a écrit :
> >Something seems to have decided this asm is more expensive than it is.
> >That isn't always avoidable -- the compiler cannot look inside asms --
> >but it seems it could be improved here.
> >
> >Do you have (or can make) a self-contained testcase?
> 
> I have not tried, and I fear it might be difficult, because on a kernel 
> build with dozens of calls to csum_add(), only ip6_tunnel.o exhibits such 
> an issue.

Yeah.  Sometimes you can force some of the decisions, but that usually
requires knowing too many GCC internals :-/

> >>And there is even one completely unused instance of csum_add().
> >
> >That is strange, that should never happen.
> 
> It seems that several .o include unused versions of csum_add. After the 
> final link, one remains (in addition to the used one) in vmlinux.

But it is a static function, so it should not end up in any object file
where it isn't used.

> >>In the non-inlined version, the first sum with 0 was performed.
> >>Here it is skipped.
> >
> >That is because of how __builtin_constant_p works, most likely.  As we
> >discussed elsewhere it is evaluated before all forms of loop unrolling.
> 
> But we are not talking about loop unrolling here, are we ?

Oh, right you are, but that doesn't change much.  The
_builtin_constant_p(len) is evaluated long before the compiler sees len
is a constant here.

> It seems that the reason here is that __builtin_constant_p() is evaluated 
> long after GCC decided to not inline that call to csum_add().

Yes, it seems we do not currently do even trivial inlining except very
early in the compiler.

Thanks,


Segher
Christophe Leroy May 12, 2021, 2:43 p.m. UTC | #4
Le 12/05/2021 à 16:31, Segher Boessenkool a écrit :
> On Wed, May 12, 2021 at 02:56:56PM +0200, Christophe Leroy wrote:
>> Le 11/05/2021 à 12:51, Segher Boessenkool a écrit :
>>> Something seems to have decided this asm is more expensive than it is.
>>> That isn't always avoidable -- the compiler cannot look inside asms --
>>> but it seems it could be improved here.
>>>
>>> Do you have (or can make) a self-contained testcase?
>>
>> I have not tried, and I fear it might be difficult, because on a kernel
>> build with dozens of calls to csum_add(), only ip6_tunnel.o exhibits such
>> an issue.
> 
> Yeah.  Sometimes you can force some of the decisions, but that usually
> requires knowing too many GCC internals :-/
> 
>>>> And there is even one completely unused instance of csum_add().
>>>
>>> That is strange, that should never happen.
>>
>> It seems that several .o include unused versions of csum_add. After the
>> final link, one remains (in addition to the used one) in vmlinux.
> 
> But it is a static function, so it should not end up in any object file
> where it isn't used.

Well .... did I dream ?

Now I only find one extra .o with unused csum_add() : That's net/ipv6/exthdrs.o
It matches the one found in vmlinux.

Are you interested in -fdump-tree-einline-all for that one as well ?

Christophe
Segher Boessenkool May 12, 2021, 6:21 p.m. UTC | #5
On Wed, May 12, 2021 at 04:43:33PM +0200, Christophe Leroy wrote:
> Le 12/05/2021 à 16:31, Segher Boessenkool a écrit :
> >On Wed, May 12, 2021 at 02:56:56PM +0200, Christophe Leroy wrote:
> >>Le 11/05/2021 à 12:51, Segher Boessenkool a écrit :
> >>>Something seems to have decided this asm is more expensive than it is.
> >>>That isn't always avoidable -- the compiler cannot look inside asms --
> >>>but it seems it could be improved here.
> >>>
> >>>Do you have (or can make) a self-contained testcase?
> >>
> >>I have not tried, and I fear it might be difficult, because on a kernel
> >>build with dozens of calls to csum_add(), only ip6_tunnel.o exhibits such
> >>an issue.
> >
> >Yeah.  Sometimes you can force some of the decisions, but that usually
> >requires knowing too many GCC internals :-/
> >
> >>>>And there is even one completely unused instance of csum_add().
> >>>
> >>>That is strange, that should never happen.
> >>
> >>It seems that several .o include unused versions of csum_add. After the
> >>final link, one remains (in addition to the used one) in vmlinux.
> >
> >But it is a static function, so it should not end up in any object file
> >where it isn't used.
> 
> Well .... did I dream ?
> 
> Now I only find one extra .o with unused csum_add() : That's 
> net/ipv6/exthdrs.o
> It matches the one found in vmlinux.
> 
> Are you interested in -fdump-tree-einline-all for that one as well ?

Sure.  Hopefully it will show more :-)


Segher
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index d5da7ddbf0fc..350de8f90250 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -91,7 +91,7 @@  static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
 }
 
 #define HAVE_ARCH_CSUM_ADD
-static inline __wsum csum_add(__wsum csum, __wsum addend)
+static __always_inline __wsum csum_add(__wsum csum, __wsum addend)
 {
 #ifdef __powerpc64__
 	u64 res = (__force u64)csum;