Message ID | 20181130211441.GJ12380@tucnak |
---|---|
State | New |
Headers | show |
Series | Add *vec_concatv4sf_0 and *vec_concatv4si_0 insns (PR target/88278) | expand |
On Fri, Nov 30, 2018 at 10:14 PM Jakub Jelinek <jakub@redhat.com> wrote: > > Hi! > > As mentioned in the PR, while in vec_concatv2df or vec_concatv2di we have > alternatives where the lower half is low 64-bit part of a xmm reg or 64-bit > memory and upper half is zero using movq/vmovq (or movq2dq), for > vec_concatv4sf and vec_concatv4si we don't have anything similar, although > the operations do pretty much the same thing. I'm not advocating to put > in alternatives with GPR operands as having V2SF or V2SI in a GPR is too > weird, but for the cases where a simple movq or vmovq instruction can move > 64-bits and clear upper 64-bits these patterns look useful to me. > > I had to tweak the pr53759.c testcase because it started FAILing, but only > because it changed: > - vxorps %xmm0, %xmm0, %xmm0 > - vmovlps (%rsi), %xmm0, %xmm0 > + vmovq (%rsi), %xmm0 > vaddps %xmm0, %xmm0, %xmm0 > vmovaps %xmm0, (%rdi) > ret > I've added a variant of that testcase that tests its original purpose by > using a register there not known to be all zeros. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2018-11-30 Jakub Jelinek <jakub@redhat.com> > > PR target/88278 > * config/i386/sse.md (*vec_concatv4sf_0, *vec_concatv4si_0): New insns. > > * gcc.target/i386/pr88278.c: New test. > * gcc.target/i386/pr53759.c: Don't expect vmovlps insn, expect vmovq > instead. > * gcc.target/i386/pr53759-2.c: New test. OK with a constraint adjustment below. Thanks, Uros. > > --- gcc/config/i386/sse.md.jj 2018-11-30 21:36:22.277762263 +0100 > +++ gcc/config/i386/sse.md 2018-11-30 21:38:02.261120768 +0100 > @@ -7248,6 +7248,17 @@ (define_insn "*vec_concatv4sf" > (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") > (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")]) > > +(define_insn "*vec_concatv4sf_0" > + [(set (match_operand:V4SF 0 "register_operand" "=v") > + (vec_concat:V4SF > + (match_operand:V2SF 1 "nonimmediate_operand" "xm") The constraint here can be "vm". There is no limitation on register set for vmovq insn. > + (match_operand:V2SF 2 "const0_operand" " C")))] > + "TARGET_SSE2" > + "%vmovq\t{%1, %0|%0, %1}" > + [(set_attr "type" "ssemov") > + (set_attr "prefix" "maybe_vex") > + (set_attr "mode" "DF")]) > + > ;; Avoid combining registers from different units in a single alternative, > ;; see comment above inline_secondary_memory_needed function in i386.c > (define_insn "vec_set<mode>_0" > @@ -14409,6 +14420,23 @@ (define_insn "*vec_concatv4si" > (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") > (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) > > +(define_insn "*vec_concatv4si_0" > + [(set (match_operand:V4SI 0 "register_operand" "=v,x") > + (vec_concat:V4SI > + (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") > + (match_operand:V2SI 2 "const0_operand" " C,C")))] > + "TARGET_SSE2" > + "@ > + %vmovq\t{%1, %0|%0, %1} > + movq2dq\t{%1, %0|%0, %1}" > + [(set_attr "type" "ssemov") > + (set_attr "prefix" "maybe_vex,orig") > + (set_attr "mode" "TI") > + (set (attr "preferred_for_speed") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC") > + (symbol_ref "true")))]) > + > ;; movd instead of movq is required to handle broken assemblers. > (define_insn "vec_concatv2di" > [(set (match_operand:V2DI 0 "register_operand" > --- gcc/testsuite/gcc.target/i386/pr88278.c.jj 2018-11-30 21:38:02.261120768 +0100 > +++ gcc/testsuite/gcc.target/i386/pr88278.c 2018-11-30 21:38:02.261120768 +0100 > @@ -0,0 +1,34 @@ > +/* PR target/88278 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2 -mno-sse3 -fgimple -masm=att" } */ > +/* { dg-final { scan-assembler-times "movq\[ \t]+\\(" 2 } } */ > +/* { dg-final { scan-assembler-not "punpcklqdq\[ \t]+" } } */ > +/* { dg-final { scan-assembler-not "pxor\[ \t]+" } } */ > + > +typedef unsigned char v16qi __attribute__((vector_size(16))); > +typedef unsigned char v8qi __attribute__((vector_size(8))); > +typedef unsigned int v4si __attribute__((vector_size(16))); > +typedef unsigned int v2si __attribute__((vector_size(8))); > + > +v16qi __GIMPLE foo (unsigned char *p) > +{ > + v8qi _2; > + v16qi _3; > + > +bb_2: > + _2 = __MEM <v8qi, 8> (p_1(D)); > + _3 = _Literal (v16qi) { _2, _Literal (v8qi) { _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0 } }; > + return _3; > +} > + > + > +v4si __GIMPLE bar (unsigned int *p) > +{ > + v2si _2; > + v4si _3; > + > +bb_2: > + _2 = __MEM <v2si, 32> (p_1(D)); > + _3 = _Literal (v4si) { _2, _Literal (v2si) { 0u, 0u } }; > + return _3; > +} > --- gcc/testsuite/gcc.target/i386/pr53759.c.jj 2016-05-22 12:20:04.184034591 +0200 > +++ gcc/testsuite/gcc.target/i386/pr53759.c 2018-11-30 22:04:56.432806470 +0100 > @@ -12,5 +12,6 @@ foo (__m128 *x, __m64 *y) > *x = _mm_add_ps (b, b); > } > > -/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */ > +/* { dg-final { scan-assembler "vmovq\[ \\t\]" } } */ > +/* { dg-final { scan-assembler-not "vmovlps\[ \\t\]" } } */ > /* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */ > --- gcc/testsuite/gcc.target/i386/pr53759-2.c.jj 2018-11-30 22:05:06.932657374 +0100 > +++ gcc/testsuite/gcc.target/i386/pr53759-2.c 2018-11-30 22:05:42.568151361 +0100 > @@ -0,0 +1,16 @@ > +/* PR target/53759 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > + > +#include <xmmintrin.h> > + > +void > +foo (__m128 *x, __m64 *y) > +{ > + __m128 a = _mm_add_ps (x[1], x[2]); > + __m128 b = _mm_loadl_pi (a, y); > + *x = _mm_add_ps (b, b); > +} > + > +/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */ > +/* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */ > > Jakub
On Sun, Dec 02, 2018 at 08:23:21PM +0100, Uros Bizjak wrote: > OK with a constraint adjustment below. > > > +(define_insn "*vec_concatv4sf_0" > > + [(set (match_operand:V4SF 0 "register_operand" "=v") > > + (vec_concat:V4SF > > + (match_operand:V2SF 1 "nonimmediate_operand" "xm") > > The constraint here can be "vm". There is no limitation on register > set for vmovq insn. That is what vec_concatv2df has also: (define_insn "vec_concatv2df" [(set (match_operand:V2DF 0 "register_operand" "=x,x,v,x,v,x,x, v,x,x") (vec_concat:V2DF (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,xm,0,0") (match_operand:DF 2 "nonimm_or_0_operand" " x,x,v,1,1,m,m, C,x,m")))] I believe ix86_hard_regno_mode_ok just doesn't allow V2SFmode in xmm16+ regs: if (SSE_REGNO_P (regno)) { ... /* xmm16-xmm31 are only available for AVX-512. */ if (EXT_REX_SSE_REGNO_P (regno)) return false; and V2SFmode is a VALID_MMX_REG_MODE, which is not mentioned anywhere before the the above conditional, only after it. While vmovq doesn't have this limitation, I believe RA will not try to put anything V2SFmode in those regs if lucky, otherwise it could be upset about it. Though, DFmode in the vec_concatv2df is different, because ix86_hard_regno_mode_ok should allow that in those registers, as DFmode is VALID_AVX512F_SCALAR_MODE. Jakub
On Sun, Dec 2, 2018 at 8:48 PM Jakub Jelinek <jakub@redhat.com> wrote: > > On Sun, Dec 02, 2018 at 08:23:21PM +0100, Uros Bizjak wrote: > > OK with a constraint adjustment below. > > > > > +(define_insn "*vec_concatv4sf_0" > > > + [(set (match_operand:V4SF 0 "register_operand" "=v") > > > + (vec_concat:V4SF > > > + (match_operand:V2SF 1 "nonimmediate_operand" "xm") > > > > The constraint here can be "vm". There is no limitation on register > > set for vmovq insn. > > That is what vec_concatv2df has also: > (define_insn "vec_concatv2df" > [(set (match_operand:V2DF 0 "register_operand" "=x,x,v,x,v,x,x, v,x,x") > (vec_concat:V2DF > (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,xm,0,0") > (match_operand:DF 2 "nonimm_or_0_operand" " x,x,v,1,1,m,m, C,x,m")))] > > I believe ix86_hard_regno_mode_ok just doesn't allow V2SFmode in xmm16+ regs: > if (SSE_REGNO_P (regno)) > { > ... > /* xmm16-xmm31 are only available for AVX-512. */ > if (EXT_REX_SSE_REGNO_P (regno)) > return false; IIRC, RA just won't allocate a register with a regno that won't satisfy hard_regno_mode_ok. The quoted vec_concatv2df looks like an unintended oversight (DFmode is supported in AVX512 regs through VALID_AVX512F_SCALAR_MODE). But we can stay on the safe side and leave MMX modes with "x" constraint. So, patch is OK as is. Thanks, Uros. > and V2SFmode is a VALID_MMX_REG_MODE, which is not mentioned anywhere before > the the above conditional, only after it. > While vmovq doesn't have this limitation, I believe RA will not try to put > anything V2SFmode in those regs if lucky, otherwise it could be upset about > it. Though, DFmode in the vec_concatv2df is different, because > ix86_hard_regno_mode_ok should allow that in those registers, as DFmode is > VALID_AVX512F_SCALAR_MODE. > > Jakub
--- gcc/config/i386/sse.md.jj 2018-11-30 21:36:22.277762263 +0100 +++ gcc/config/i386/sse.md 2018-11-30 21:38:02.261120768 +0100 @@ -7248,6 +7248,17 @@ (define_insn "*vec_concatv4sf" (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")]) +(define_insn "*vec_concatv4sf_0" + [(set (match_operand:V4SF 0 "register_operand" "=v") + (vec_concat:V4SF + (match_operand:V2SF 1 "nonimmediate_operand" "xm") + (match_operand:V2SF 2 "const0_operand" " C")))] + "TARGET_SSE2" + "%vmovq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DF")]) + ;; Avoid combining registers from different units in a single alternative, ;; see comment above inline_secondary_memory_needed function in i386.c (define_insn "vec_set<mode>_0" @@ -14409,6 +14420,23 @@ (define_insn "*vec_concatv4si" (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) +(define_insn "*vec_concatv4si_0" + [(set (match_operand:V4SI 0 "register_operand" "=v,x") + (vec_concat:V4SI + (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") + (match_operand:V2SI 2 "const0_operand" " C,C")))] + "TARGET_SSE2" + "@ + %vmovq\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex,orig") + (set_attr "mode" "TI") + (set (attr "preferred_for_speed") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC") + (symbol_ref "true")))]) + ;; movd instead of movq is required to handle broken assemblers. (define_insn "vec_concatv2di" [(set (match_operand:V2DI 0 "register_operand" --- gcc/testsuite/gcc.target/i386/pr88278.c.jj 2018-11-30 21:38:02.261120768 +0100 +++ gcc/testsuite/gcc.target/i386/pr88278.c 2018-11-30 21:38:02.261120768 +0100 @@ -0,0 +1,34 @@ +/* PR target/88278 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -mno-sse3 -fgimple -masm=att" } */ +/* { dg-final { scan-assembler-times "movq\[ \t]+\\(" 2 } } */ +/* { dg-final { scan-assembler-not "punpcklqdq\[ \t]+" } } */ +/* { dg-final { scan-assembler-not "pxor\[ \t]+" } } */ + +typedef unsigned char v16qi __attribute__((vector_size(16))); +typedef unsigned char v8qi __attribute__((vector_size(8))); +typedef unsigned int v4si __attribute__((vector_size(16))); +typedef unsigned int v2si __attribute__((vector_size(8))); + +v16qi __GIMPLE foo (unsigned char *p) +{ + v8qi _2; + v16qi _3; + +bb_2: + _2 = __MEM <v8qi, 8> (p_1(D)); + _3 = _Literal (v16qi) { _2, _Literal (v8qi) { _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0 } }; + return _3; +} + + +v4si __GIMPLE bar (unsigned int *p) +{ + v2si _2; + v4si _3; + +bb_2: + _2 = __MEM <v2si, 32> (p_1(D)); + _3 = _Literal (v4si) { _2, _Literal (v2si) { 0u, 0u } }; + return _3; +} --- gcc/testsuite/gcc.target/i386/pr53759.c.jj 2016-05-22 12:20:04.184034591 +0200 +++ gcc/testsuite/gcc.target/i386/pr53759.c 2018-11-30 22:04:56.432806470 +0100 @@ -12,5 +12,6 @@ foo (__m128 *x, __m64 *y) *x = _mm_add_ps (b, b); } -/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */ +/* { dg-final { scan-assembler "vmovq\[ \\t\]" } } */ +/* { dg-final { scan-assembler-not "vmovlps\[ \\t\]" } } */ /* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */ --- gcc/testsuite/gcc.target/i386/pr53759-2.c.jj 2018-11-30 22:05:06.932657374 +0100 +++ gcc/testsuite/gcc.target/i386/pr53759-2.c 2018-11-30 22:05:42.568151361 +0100 @@ -0,0 +1,16 @@ +/* PR target/53759 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ + +#include <xmmintrin.h> + +void +foo (__m128 *x, __m64 *y) +{ + __m128 a = _mm_add_ps (x[1], x[2]); + __m128 b = _mm_loadl_pi (a, y); + *x = _mm_add_ps (b, b); +} + +/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */ +/* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */