
Enable inter-unit-moves for -mtune=generic

Message ID 20181231101151.xyjpvczxlf562qoy@kam.mff.cuni.cz
State New
Series Enable inter-unit-moves for -mtune=generic

Commit Message

Jan Hubicka Dec. 31, 2018, 10:11 a.m. UTC
Hi,
while working on Firefox performance
http://hubicka.blogspot.com/2018/12/even-more-fun-with-building-and.html
I noticed that we still disable inter-unit moves because of Bulldozer
CPUs.  Last year I left that in the generic tuning because Zen was new and
the setting did not seem to hurt other CPUs much in tests.  I think it is
time to tune for more recent CPUs here, because it also turns out that
disabling inter-unit moves quite noticeably penalizes hand-written
vectorized code.  In particular

#include <emmintrin.h>
__m128i test (short a,short b,short c,short d,short e,short f,short g)
{
  return _mm_set_epi16(a,b,c,d,e,f,g,255);
}

translates to 
        movzwl  %r8w, %r8d
        movzwl  %di, %edi
        movzwl  %r9w, %r9d
        movzwl  %si, %esi
        salq    $16, %r8
        salq    $16, %rdi
        movzwl  8(%rsp), %eax
        movzwl  %dx, %edx
        orq     %r9, %r8
        orq     %rsi, %rdi
        movzwl  %cx, %ecx
        salq    $16, %r8
        salq    $16, %rdi
        orq     %rax, %r8
        orq     %rdx, %rdi
        salq    $16, %r8
        salq    $16, %rdi
        orb     $-1, %r8b
        orq     %rcx, %rdi
        movq    %r8, -24(%rsp)
        movq    %rdi, -16(%rsp)
        movdqa  -24(%rsp), %xmm0
        ret

This has a mismatched store: the two 8-byte stores are reloaded by a single
16-byte load, which causes a store-forwarding stall.
Now we produce:

        movd    %r9d, %xmm3
        movd    %ecx, %xmm1
        movd    %esi, %xmm2
        movl    $255, %eax
        movd    %eax, %xmm0
        pinsrw  $1, %r8d, %xmm3
        pinsrw  $1, %edx, %xmm1
        pinsrw  $1, 8(%rsp), %xmm0
        pinsrw  $1, %edi, %xmm2
        punpckldq       %xmm2, %xmm1
        punpckldq       %xmm3, %xmm0
        punpcklqdq      %xmm1, %xmm0

which is a lot better.
Clang does:

        movd    %edi, %xmm0
        movd    %esi, %xmm1
        punpcklwd       %xmm0, %xmm1    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
        movd    %edx, %xmm0
        movd    %ecx, %xmm2
        punpcklwd       %xmm0, %xmm2    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
        punpckldq       %xmm1, %xmm2    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
        movd    %r8d, %xmm0
        movd    %r9d, %xmm1
        punpcklwd       %xmm0, %xmm1    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
        movl    $255, %eax
        movd    %eax, %xmm0
        movd    8(%rsp), %xmm3          # xmm3 = mem[0],zero,zero,zero
        punpcklwd       %xmm3, %xmm0    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
        punpckldq       %xmm1, %xmm0    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
        punpcklqdq      %xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0]

This looks worse, but I like the comments explaining the semantics.

Bootstrapped/regtested x86_64-linux.  Committed.

	* config/i386/x86-tune.def: Enable inter_unit_moves_to_vec for generic.
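
For reference, the simplest kind of code the flag affects is a plain
GPR-to-XMM transfer.  The sketch below is illustrative only (the function
name and the expected instruction are assumptions, not captured compiler
output); the individual knob can also be flipped for testing with GCC's
developer option -mtune-ctrl (e.g. -mtune-ctrl=^inter_unit_moves_to_vec).

#include <emmintrin.h>

/* With inter_unit_moves_to_vec enabled this can become a single
   movd %edi, %xmm0; with the flag disabled the value would instead be
   stored to the stack and reloaded into the SSE unit.  */
__m128i gpr_to_xmm (int x)
{
  return _mm_cvtsi32_si128 (x);
}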

Comments

Richard Biener Dec. 31, 2018, 12:56 p.m. UTC | #1
On December 31, 2018 11:11:51 AM GMT+01:00, Jan Hubicka <hubicka@ucw.cz> wrote:
>Hi,
>while working on Firefox performance
>http://hubicka.blogspot.com/2018/12/even-more-fun-with-building-and.html
>I noticed that we still disable inter-unit moves because of Bulldozer
>CPUs.
>[...]
>This looks worse, but I like the comments explaining the semantics.

Do we use inter-unit moves for size optimization independent of the tuning setting? 

>Bootstrapped/regtested x86_64-linux.  Committed.
>
>	* config/i386/x86-tune.def: Enable inter_unit_moves_to_vec for generic.

Patch

Index: config/i386/x86-tune.def
===================================================================
--- config/i386/x86-tune.def	(revision 267477)
+++ config/i386/x86-tune.def	(working copy)
@@ -379,9 +379,13 @@  DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "s
 
 /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer
    to SSE registers.  If disabled, the moves will be done by storing
-   the value to memory and reloading.  */
+   the value to memory and reloading.
+   Enable this flag for generic - the only relevant architecture preferring
+   no inter-unit moves is Bulldozer.  While this causes a small regression in
+   SPECfp scores (under 0.3%), disabling inter-unit moves noticeably penalizes
+   hand-written vectorized code which uses e.g. _mm_set_epi16.  */
 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
-          ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC))
+          ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER))
 
 /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from SSE
    to integer registers.  If disabled, the moves will be done by storing