| Message ID | 20181129075956.GE12380@tucnak |
|---|---|
| State | New |
| Headers | show |
| Series | Optimize integral lt + blend into just blend (PR target/54700) — expand |
On Thu, Nov 29, 2018 at 9:00 AM Jakub Jelinek <jakub@redhat.com> wrote: > > Hi! > > The following patch optimizes > - pxor %xmm3, %xmm3 > - pcmpgtb %xmm0, %xmm3 > - movdqa %xmm3, %xmm0 > pblendvb %xmm0, %xmm1, %xmm2 > movdqa %xmm2, %xmm0 > ret > > - vpxor %xmm3, %xmm3, %xmm3 > - vpcmpgtq %ymm0, %ymm3, %ymm0 > - vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 > + vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 > ret > > etc. As the *blendv* instructions only look at the most significant > bit, we don't really need to perform pcmpgt* or vpcmpgt* instructions; > while they set also the other bits based on the most significant one, > the only consumer doesn't care about those other bits. > > I believe we can't do this for floating point comparisons even with > -ffast-math, because -fno-signed-zeros isn't a guarantee that -0.0 won't > appear, just that it will appear randomly when 0.0 is wanted and vice versa, > and having x < 0.0 be suddenly false if x is -0.0 would IMHO break too much > code. I agree with the above. This would mean that a comparison x < 0.0 would be substituted with an equivalent to a signbit (). We don't do this even for -ffast-math or -funsafe-math-optimizations. > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2018-11-28 Jakub Jelinek <jakub@redhat.com> > > PR target/54700 > * config/i386/sse.md (ssebytemode): Add V16SI, V8SI and V4SI entries. > (ssefltmodesuffix, ssefltvecmode): New define_mode_attrs. > (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt, > *<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint, > *<sse4_1_avx2>_pblendvb_lt): New define_insns. > > * g++.target/i386/sse4_1-pr54700-1.C: New test. > * g++.target/i386/sse4_1-pr54700-2.C: New test. > * g++.target/i386/avx-pr54700-1.C: New test. > * g++.target/i386/avx-pr54700-2.C: New test. > * g++.target/i386/avx2-pr54700-1.C: New test. > * g++.target/i386/avx2-pr54700-2.C: New test. > * g++.target/i386/sse4_1-check.h: New file. > * g++.target/i386/avx-check.h: New file. 
> * g++.target/i386/avx2-check.h: New file. > * g++.target/i386/m128-check.h: New file. > * g++.target/i386/m256-check.h: New file. > * g++.target/i386/avx-os-support.h: New file. OK. Thanks, Uros. > --- gcc/config/i386/sse.md.jj 2018-11-28 09:53:55.607998012 +0100 > +++ gcc/config/i386/sse.md 2018-11-28 18:18:34.042124095 +0100 > @@ -601,7 +601,8 @@ (define_mode_attr ssedoublemode > (V4DI "V8DI") (V8DI "V16DI")]) > > (define_mode_attr ssebytemode > - [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")]) > + [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI") > + (V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")]) > > ;; All 128bit vector integer modes > (define_mode_iterator VI_128 [V16QI V8HI V4SI V2DI]) > @@ -15681,6 +15682,60 @@ (define_insn "sse4_1_blendv<ssemodesuffi > ] > (const_string "<ssevecmode>")))]) > > +(define_insn "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" > + [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") > + (unspec:VF_128_256 > + [(match_operand:VF_128_256 1 "register_operand" "0,0,x") > + (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") > + (subreg:VF_128_256 > + (lt:<sseintvecmode> > + (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") > + (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)] > + UNSPEC_BLENDV))] > + "TARGET_SSE4_1" > + "@ > + blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} > + blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} > + vblendv<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "isa" "noavx,noavx,avx") > + (set_attr "type" "ssemov") > + (set_attr "length_immediate" "1") > + (set_attr "prefix_data16" "1,1,*") > + (set_attr "prefix_extra" "1") > + (set_attr "prefix" "orig,orig,vex") > + (set_attr "btver2_decode" "vector,vector,vector") > + (set_attr "mode" "<MODE>")]) > + > +(define_mode_attr ssefltmodesuffix > + [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")]) > + > +(define_mode_attr ssefltvecmode > + [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")]) > 
+ > +(define_insn "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint" > + [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x") > + (unspec:<ssebytemode> > + [(match_operand:<ssebytemode> 1 "register_operand" "0,0,x") > + (match_operand:<ssebytemode> 2 "vector_operand" "YrBm,*xBm,xm") > + (subreg:<ssebytemode> > + (lt:VI48_AVX > + (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x") > + (match_operand:VI48_AVX 4 "const0_operand" "C,C,C")) 0)] > + UNSPEC_BLENDV))] > + "TARGET_SSE4_1" > + "@ > + blendv<ssefltmodesuffix>\t{%3, %2, %0|%0, %2, %3} > + blendv<ssefltmodesuffix>\t{%3, %2, %0|%0, %2, %3} > + vblendv<ssefltmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "isa" "noavx,noavx,avx") > + (set_attr "type" "ssemov") > + (set_attr "length_immediate" "1") > + (set_attr "prefix_data16" "1,1,*") > + (set_attr "prefix_extra" "1") > + (set_attr "prefix" "orig,orig,vex") > + (set_attr "btver2_decode" "vector,vector,vector") > + (set_attr "mode" "<ssefltvecmode>")]) > + > (define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>" > [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") > (unspec:VF_128_256 > @@ -15768,6 +15823,27 @@ (define_insn "<sse4_1_avx2>_pblendvb" > UNSPEC_BLENDV))] > "TARGET_SSE4_1" > "@ > + pblendvb\t{%3, %2, %0|%0, %2, %3} > + pblendvb\t{%3, %2, %0|%0, %2, %3} > + vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "isa" "noavx,noavx,avx") > + (set_attr "type" "ssemov") > + (set_attr "prefix_extra" "1") > + (set_attr "length_immediate" "*,*,1") > + (set_attr "prefix" "orig,orig,vex") > + (set_attr "btver2_decode" "vector,vector,vector") > + (set_attr "mode" "<sseinsnmode>")]) > + > +(define_insn "*<sse4_1_avx2>_pblendvb_lt" > + [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") > + (unspec:VI1_AVX2 > + [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") > + (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") > + (lt:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand" 
"Yz,Yz,x") > + (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))] > + UNSPEC_BLENDV))] > + "TARGET_SSE4_1" > + "@ > pblendvb\t{%3, %2, %0|%0, %2, %3} > pblendvb\t{%3, %2, %0|%0, %2, %3} > vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" > --- gcc/testsuite/g++.target/i386/sse4_1-pr54700-1.C.jj 2018-11-28 18:33:14.085526427 +0100 > +++ gcc/testsuite/g++.target/i386/sse4_1-pr54700-1.C 2018-11-28 18:53:01.519834866 +0100 > @@ -0,0 +1,69 @@ > +/* PR target/54700 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -std=c++14 -msse4 -mno-avx -mno-xop" } */ > +/* { dg-final { scan-assembler-not "pcmpgt\[bdq]" } } */ > +/* { dg-final { scan-assembler-times "pblendvb" 2 } } */ > +/* { dg-final { scan-assembler-times "blendvps" 4 } } */ > +/* { dg-final { scan-assembler-times "blendvpd" 4 } } */ > + > +#include <x86intrin.h> > + > +__attribute__((noipa)) __v16qi > +f1 (__v16qi a, __v16qi b, __v16qi c) > +{ > + return a < 0 ? b : c; > +} > + > +__attribute__((noipa)) __v16qi > +f2 (__v16qi a, __v16qi b, __v16qi c) > +{ > + return a >= 0 ? b : c; > +} > + > +__attribute__((noipa)) __v4si > +f3 (__v4si a, __v4si b, __v4si c) > +{ > + return a < 0 ? b : c; > +} > + > +__attribute__((noipa)) __v4si > +f4 (__v4si a, __v4si b, __v4si c) > +{ > + return a >= 0 ? b : c; > +} > + > +__attribute__((noipa)) __v2di > +f5 (__v2di a, __v2di b, __v2di c) > +{ > + return a < 0 ? b : c; > +} > + > +__attribute__((noipa)) __v2di > +f6 (__v2di a, __v2di b, __v2di c) > +{ > + return a >= 0 ? b : c; > +} > + > +__attribute__((noipa)) __v4sf > +f7 (__v4si a, __v4sf b, __v4sf c) > +{ > + return a < 0 ? b : c; > +} > + > +__attribute__((noipa)) __v4sf > +f8 (__v4si a, __v4sf b, __v4sf c) > +{ > + return a >= 0 ? b : c; > +} > + > +__attribute__((noipa)) __v2df > +f9 (__v2di a, __v2df b, __v2df c) > +{ > + return a < 0 ? b : c; > +} > + > +__attribute__((noipa)) __v2df > +f10 (__v2di a, __v2df b, __v2df c) > +{ > + return a >= 0 ? 
b : c; > +} > --- gcc/testsuite/g++.target/i386/sse4_1-pr54700-2.C.jj 2018-11-28 18:40:01.036774967 +0100 > +++ gcc/testsuite/g++.target/i386/sse4_1-pr54700-2.C 2018-11-28 19:09:01.730917128 +0100 > @@ -0,0 +1,73 @@ > +/* PR target/54700 */ > +/* { dg-do run { target sse4 } } */ > +/* { dg-options "-O2 -std=c++14 -msse4 -mno-avx -mno-xop" } */ > + > +#ifndef CHECK_H > +#define CHECK_H "sse4_1-check.h" > +#endif > + > +#ifndef TEST > +#define TEST sse4_1_test > +#endif > + > +#include CHECK_H > + > +#include "sse4_1-pr54700-1.C" > + > +static void > +TEST () > +{ > + __v16qi v16qia = { -128, 12, -1, 127, 115, 0, -19, 125, -125, 12, 0, -37, 37, 15, 98, -105 }; > + __v16qi v16qib = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; > + __v16qi v16qic = { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 }; > + __v16qi v16qie = { 1, 18, 3, 20, 21, 22, 7, 24, 9, 26, 27, 12, 29, 30, 31, 16 }; > + __v16qi v16qif = { 17, 2, 19, 4, 5, 6, 23, 8, 25, 10, 11, 28, 13, 14, 15, 32 }; > + __v16qi v16qir = f1 (v16qia, v16qib, v16qic); > + if (__builtin_memcmp (&v16qir, &v16qie, sizeof (__v16qi))) > + __builtin_abort (); > + v16qir = f2 (v16qia, v16qib, v16qic); > + if (__builtin_memcmp (&v16qir, &v16qif, sizeof (__v16qi))) > + __builtin_abort (); > + __v4si v4sia = { __INT_MAX__, -__INT_MAX__ - 1, -32, 12 }; > + __v4si v4sib = { 1, 2, 3, 4 }; > + __v4si v4sic = { 5, 6, 7, 8 }; > + __v4si v4sie = { 5, 2, 3, 8 }; > + __v4si v4sif = { 1, 6, 7, 4 }; > + __v4si v4sir = f3 (v4sia, v4sib, v4sic); > + if (__builtin_memcmp (&v4sir, &v4sie, sizeof (__v4si))) > + __builtin_abort (); > + v4sir = f4 (v4sia, v4sib, v4sic); > + if (__builtin_memcmp (&v4sir, &v4sif, sizeof (__v4si))) > + __builtin_abort (); > + __v2di v2dia = { -__LONG_LONG_MAX__, 1000LL * __INT_MAX__ }; > + __v2di v2dib = { 1, 2 }; > + __v2di v2dic = { 3, 4 }; > + __v2di v2die = { 1, 4 }; > + __v2di v2dif = { 3, 2 }; > + __v2di v2dir = f5 (v2dia, v2dib, v2dic); > + if (__builtin_memcmp (&v2dir, &v2die, sizeof 
(__v2di))) > + __builtin_abort (); > + v2dir = f6 (v2dia, v2dib, v2dic); > + if (__builtin_memcmp (&v2dir, &v2dif, sizeof (__v2di))) > + __builtin_abort (); > + __v4sf v4sfb = { 1.0f, 2.0f, 3.0f, 4.0f }; > + __v4sf v4sfc = { 5.0f, 6.0f, 7.0f, 8.0f }; > + __v4sf v4sfe = { 5.0f, 2.0f, 3.0f, 8.0f }; > + __v4sf v4sff = { 1.0f, 6.0f, 7.0f, 4.0f }; > + __v4sf v4sfr = f7 (v4sia, v4sfb, v4sfc); > + if (__builtin_memcmp (&v4sfr, &v4sfe, sizeof (__v4sf))) > + __builtin_abort (); > + v4sfr = f8 (v4sia, v4sfb, v4sfc); > + if (__builtin_memcmp (&v4sfr, &v4sff, sizeof (__v4sf))) > + __builtin_abort (); > + __v2df v2dfb = { 1.0, 2.0 }; > + __v2df v2dfc = { 3.0, 4.0 }; > + __v2df v2dfe = { 1.0, 4.0 }; > + __v2df v2dff = { 3.0, 2.0 }; > + __v2df v2dfr = f9 (v2dia, v2dfb, v2dfc); > + if (__builtin_memcmp (&v2dfr, &v2dfe, sizeof (__v2df))) > + __builtin_abort (); > + v2dfr = f10 (v2dia, v2dfb, v2dfc); > + if (__builtin_memcmp (&v2dfr, &v2dff, sizeof (__v2df))) > + __builtin_abort (); > +} > --- gcc/testsuite/g++.target/i386/avx-pr54700-1.C.jj 2018-11-28 18:34:24.237362509 +0100 > +++ gcc/testsuite/g++.target/i386/avx-pr54700-1.C 2018-11-28 18:53:19.388538643 +0100 > @@ -0,0 +1,9 @@ > +/* PR target/54700 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -std=c++14 -mavx -mno-xop -mno-avx2" } */ > +/* { dg-final { scan-assembler-not "vpcmpgt\[bdq]" } } */ > +/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */ > +/* { dg-final { scan-assembler-times "vblendvps" 4 } } */ > +/* { dg-final { scan-assembler-times "vblendvpd" 4 } } */ > + > +#include "sse4_1-pr54700-1.C" > --- gcc/testsuite/g++.target/i386/avx-pr54700-2.C.jj 2018-11-28 19:04:18.460612953 +0100 > +++ gcc/testsuite/g++.target/i386/avx-pr54700-2.C 2018-11-28 19:09:11.430756336 +0100 > @@ -0,0 +1,8 @@ > +/* PR target/54700 */ > +/* { dg-do run { target avx } } */ > +/* { dg-options "-O2 -std=c++14 -mavx -mno-xop -mno-avx2" } */ > + > +#define CHECK_H "avx-check.h" > +#define TEST avx_test > + > +#include 
"sse4_1-pr54700-2.C" > --- gcc/testsuite/g++.target/i386/avx2-pr54700-1.C.jj 2018-11-28 18:37:43.990048342 +0100 > +++ gcc/testsuite/g++.target/i386/avx2-pr54700-1.C 2018-11-28 18:53:26.209425564 +0100 > @@ -0,0 +1,69 @@ > +/* PR target/54700 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -std=c++14 -mavx2 -mno-xop -mno-avx512f" } */ > +/* { dg-final { scan-assembler-not "vpcmpgt\[bdq]" } } */ > +/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */ > +/* { dg-final { scan-assembler-times "vblendvps" 4 } } */ > +/* { dg-final { scan-assembler-times "vblendvpd" 4 } } */ > + > +#include <x86intrin.h> > + > +__attribute__((noipa)) __v32qi > +f1 (__v32qi a, __v32qi b, __v32qi c) > +{ > + return a < 0 ? b : c; > +} > + > +__attribute__((noipa)) __v32qi > +f2 (__v32qi a, __v32qi b, __v32qi c) > +{ > + return a >= 0 ? b : c; > +} > + > +__attribute__((noipa)) __v8si > +f3 (__v8si a, __v8si b, __v8si c) > +{ > + return a < 0 ? b : c; > +} > + > +__attribute__((noipa)) __v8si > +f4 (__v8si a, __v8si b, __v8si c) > +{ > + return a >= 0 ? b : c; > +} > + > +__attribute__((noipa)) __v4di > +f5 (__v4di a, __v4di b, __v4di c) > +{ > + return a < 0 ? b : c; > +} > + > +__attribute__((noipa)) __v4di > +f6 (__v4di a, __v4di b, __v4di c) > +{ > + return a >= 0 ? b : c; > +} > + > +__attribute__((noipa)) __v8sf > +f7 (__v8si a, __v8sf b, __v8sf c) > +{ > + return a < 0 ? b : c; > +} > + > +__attribute__((noipa)) __v8sf > +f8 (__v8si a, __v8sf b, __v8sf c) > +{ > + return a >= 0 ? b : c; > +} > + > +__attribute__((noipa)) __v4df > +f9 (__v4di a, __v4df b, __v4df c) > +{ > + return a < 0 ? b : c; > +} > + > +__attribute__((noipa)) __v4df > +f10 (__v4di a, __v4df b, __v4df c) > +{ > + return a >= 0 ? 
b : c; > +} > --- gcc/testsuite/g++.target/i386/avx2-pr54700-2.C.jj 2018-11-28 19:08:12.269737055 +0100 > +++ gcc/testsuite/g++.target/i386/avx2-pr54700-2.C 2018-11-28 19:17:58.058021174 +0100 > @@ -0,0 +1,72 @@ > +/* PR target/54700 */ > +/* { dg-do run { target avx2 } } */ > +/* { dg-options "-O2 -std=c++14 -mavx2 -mno-xop -mno-avx512f" } */ > + > +#include "avx2-check.h" > + > +#define TEST avx2_test > + > +#include "avx2-pr54700-1.C" > + > +static void > +TEST () > +{ > + __v32qi v32qia = { -128, 12, -1, 127, 115, 0, -19, 125, -125, 12, 0, -37, 37, 15, 98, -105, > + 0, 1, 2, 3, -1, -2, -3, -4, 4, -5, 5, -6, 6, -7, 7, -8 }; > + __v32qi v32qib = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, > + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16 }; > + __v32qi v32qic = { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, > + -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32 }; > + __v32qi v32qie = { 1, 18, 3, 20, 21, 22, 7, 24, 9, 26, 27, 12, 29, 30, 31, 16, > + -17, -18, -19, -20, -5, -6, -7, -8, -25, -10, -27, -12, -29, -14, -31, -16 }; > + __v32qi v32qif = { 17, 2, 19, 4, 5, 6, 23, 8, 25, 10, 11, 28, 13, 14, 15, 32, > + -1, -2, -3, -4, -21, -22, -23, -24, -9, -26, -11, -28, -13, -30, -15, -32 }; > + __v32qi v32qir = f1 (v32qia, v32qib, v32qic); > + if (__builtin_memcmp (&v32qir, &v32qie, sizeof (__v32qi))) > + __builtin_abort (); > + v32qir = f2 (v32qia, v32qib, v32qic); > + if (__builtin_memcmp (&v32qir, &v32qif, sizeof (__v32qi))) > + __builtin_abort (); > + __v8si v8sia = { __INT_MAX__, -__INT_MAX__ - 1, -32, 12, __INT_MAX__ - 2, -__INT_MAX__, 15, -1 }; > + __v8si v8sib = { 1, 2, 3, 4, -1, -2, -3, -4 }; > + __v8si v8sic = { 5, 6, 7, 8, -5, -6, -7, -8 }; > + __v8si v8sie = { 5, 2, 3, 8, -5, -2, -7, -4 }; > + __v8si v8sif = { 1, 6, 7, 4, -1, -6, -3, -8 }; > + __v8si v8sir = f3 (v8sia, v8sib, v8sic); > + if (__builtin_memcmp (&v8sir, &v8sie, sizeof (__v8si))) > + __builtin_abort (); > + 
v8sir = f4 (v8sia, v8sib, v8sic); > + if (__builtin_memcmp (&v8sir, &v8sif, sizeof (__v8si))) > + __builtin_abort (); > + __v4di v4dia = { -__LONG_LONG_MAX__, 1000LL * __INT_MAX__, __LONG_LONG_MAX__, -2 }; > + __v4di v4dib = { 1, 2, -1, -2 }; > + __v4di v4dic = { 3, 4, -3, -4 }; > + __v4di v4die = { 1, 4, -3, -2 }; > + __v4di v4dif = { 3, 2, -1, -4 }; > + __v4di v4dir = f5 (v4dia, v4dib, v4dic); > + if (__builtin_memcmp (&v4dir, &v4die, sizeof (__v4di))) > + __builtin_abort (); > + v4dir = f6 (v4dia, v4dib, v4dic); > + if (__builtin_memcmp (&v4dir, &v4dif, sizeof (__v4di))) > + __builtin_abort (); > + __v8sf v8sfb = { 1.0f, 2.0f, 3.0f, 4.0f, -1.0f, -2.0f, -3.0f, -4.0f }; > + __v8sf v8sfc = { 5.0f, 6.0f, 7.0f, 8.0f, -5.0f, -6.0f, -7.0f, -8.0f }; > + __v8sf v8sfe = { 5.0f, 2.0f, 3.0f, 8.0f, -5.0f, -2.0f, -7.0f, -4.0f }; > + __v8sf v8sff = { 1.0f, 6.0f, 7.0f, 4.0f, -1.0f, -6.0f, -3.0f, -8.0f }; > + __v8sf v8sfr = f7 (v8sia, v8sfb, v8sfc); > + if (__builtin_memcmp (&v8sfr, &v8sfe, sizeof (__v8sf))) > + __builtin_abort (); > + v8sfr = f8 (v8sia, v8sfb, v8sfc); > + if (__builtin_memcmp (&v8sfr, &v8sff, sizeof (__v8sf))) > + __builtin_abort (); > + __v4df v4dfb = { 1.0, 2.0, -1.0, -2.0 }; > + __v4df v4dfc = { 3.0, 4.0, -3.0, -4.0 }; > + __v4df v4dfe = { 1.0, 4.0, -3.0, -2.0 }; > + __v4df v4dff = { 3.0, 2.0, -1.0, -4.0 }; > + __v4df v4dfr = f9 (v4dia, v4dfb, v4dfc); > + if (__builtin_memcmp (&v4dfr, &v4dfe, sizeof (__v4df))) > + __builtin_abort (); > + v4dfr = f10 (v4dia, v4dfb, v4dfc); > + if (__builtin_memcmp (&v4dfr, &v4dff, sizeof (__v4df))) > + __builtin_abort (); > +} > --- gcc/testsuite/g++.target/i386/sse4_1-check.h.jj 2018-11-28 18:42:21.280449620 +0100 > +++ gcc/testsuite/g++.target/i386/sse4_1-check.h 2018-11-28 18:42:16.404530462 +0100 > @@ -0,0 +1 @@ > +#include "../../gcc.target/i386/sse4_1-check.h" > --- gcc/testsuite/g++.target/i386/avx-check.h.jj 2018-11-28 19:05:38.688283010 +0100 > +++ gcc/testsuite/g++.target/i386/avx-check.h 2018-11-28 
19:05:43.296206617 +0100 > @@ -0,0 +1 @@ > +#include "../../gcc.target/i386/avx-check.h" > --- gcc/testsuite/g++.target/i386/avx2-check.h.jj 2018-11-28 19:06:41.619239790 +0100 > +++ gcc/testsuite/g++.target/i386/avx2-check.h 2018-11-28 19:06:46.474159302 +0100 > @@ -0,0 +1 @@ > +#include "../../gcc.target/i386/avx2-check.h" > --- gcc/testsuite/g++.target/i386/m128-check.h.jj 2018-11-28 18:42:33.576245739 +0100 > +++ gcc/testsuite/g++.target/i386/m128-check.h 2018-11-28 18:42:39.245151746 +0100 > @@ -0,0 +1 @@ > +#include "../../gcc.target/i386/m128-check.h" > --- gcc/testsuite/g++.target/i386/m256-check.h.jj 2018-11-28 19:06:21.805568241 +0100 > +++ gcc/testsuite/g++.target/i386/m256-check.h 2018-11-28 19:06:26.463491032 +0100 > @@ -0,0 +1 @@ > +#include "../../gcc.target/i386/m256-check.h" > --- gcc/testsuite/g++.target/i386/avx-os-support.h.jj 2018-11-28 19:05:56.615985814 +0100 > +++ gcc/testsuite/g++.target/i386/avx-os-support.h 2018-11-28 19:06:01.759900538 +0100 > @@ -0,0 +1 @@ > +#include "../../gcc.target/i386/avx-os-support.h" > > Jakub
On Thu, Nov 29, 2018 at 10:54 AM Uros Bizjak <ubizjak@gmail.com> wrote: > > On Thu, Nov 29, 2018 at 9:00 AM Jakub Jelinek <jakub@redhat.com> wrote: > > > > Hi! > > > > The following patch optimizes > > - pxor %xmm3, %xmm3 > > - pcmpgtb %xmm0, %xmm3 > > - movdqa %xmm3, %xmm0 > > pblendvb %xmm0, %xmm1, %xmm2 > > movdqa %xmm2, %xmm0 > > ret > > > > - vpxor %xmm3, %xmm3, %xmm3 > > - vpcmpgtq %ymm0, %ymm3, %ymm0 > > - vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 > > + vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 > > ret > > > > etc. As the *blendv* instructions only look at the most significant > > bit, we don't really need to perform pcmpgt* or vpcmpgt* instructions; > > while they set also the other bits based on the most significant one, > > the only consumer doesn't care about those other bits. > > > > I believe we can't do this for floating point comparisons even with > > -ffast-math, because -fno-signed-zeros isn't a guarantee that -0.0 won't > > appear, just that it will appear randomly when 0.0 is wanted and vice versa, > > and having x < 0.0 be suddenly false if x is -0.0 would IMHO break too much > > code. > > I agree with the above. This would mean that a comparison x < 0.0 > would be substituted with an equivalent to a signbit (). We don't do > this even for -ffast-math or -funsafe-math-optimizations. > > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > > > 2018-11-28 Jakub Jelinek <jakub@redhat.com> > > > > PR target/54700 > > * config/i386/sse.md (ssebytemode): Add V16SI, V8SI and V4SI entries. > > (ssefltmodesuffix, ssefltvecmode): New define_mode_attrs. > > (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt, > > *<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint, > > *<sse4_1_avx2>_pblendvb_lt): New define_insns. > > > > * g++.target/i386/sse4_1-pr54700-1.C: New test. > > * g++.target/i386/sse4_1-pr54700-2.C: New test. > > * g++.target/i386/avx-pr54700-1.C: New test. > > * g++.target/i386/avx-pr54700-2.C: New test. 
> > * g++.target/i386/avx2-pr54700-1.C: New test. > > * g++.target/i386/avx2-pr54700-2.C: New test. > > * g++.target/i386/sse4_1-check.h: New file. > > * g++.target/i386/avx-check.h: New file. > > * g++.target/i386/avx2-check.h: New file. > > * g++.target/i386/m128-check.h: New file. > > * g++.target/i386/m256-check.h: New file. > > * g++.target/i386/avx-os-support.h: New file. > > OK. On a second thought, should we rather use (pre-reload?) define_insn_and_split to split the combination to the blend insn? Uros.
On Thu, Nov 29, 2018 at 05:21:53PM +0100, Uros Bizjak wrote: > > > * g++.target/i386/avx2-check.h: New file. > > > * g++.target/i386/m128-check.h: New file. > > > * g++.target/i386/m256-check.h: New file. > > > * g++.target/i386/avx-os-support.h: New file. > > > > OK. > > On a second thought, should we rather use (pre-reload?) > define_insn_and_split to split the combination to the blend insn? I've already committed it. But can work on a patch that does that tomorrow. Jakub
On Thu, Nov 29, 2018 at 5:28 PM Jakub Jelinek <jakub@redhat.com> wrote: > > On Thu, Nov 29, 2018 at 05:21:53PM +0100, Uros Bizjak wrote: > > > > * g++.target/i386/avx2-check.h: New file. > > > > * g++.target/i386/m128-check.h: New file. > > > > * g++.target/i386/m256-check.h: New file. > > > > * g++.target/i386/avx-os-support.h: New file. > > > > > > OK. > > > > On a second thought, should we rather use (pre-reload?) > > define_insn_and_split to split the combination to the blend insn? > > I've already committed it. But can work on a patch that does that tomorrow. Thanks. You will probably need to split it after reload, since a change from intvec->FPvec is needed. Uros.
On Thu, Nov 29, 2018 at 05:41:59PM +0100, Uros Bizjak wrote: > On Thu, Nov 29, 2018 at 5:28 PM Jakub Jelinek <jakub@redhat.com> wrote: > > > > On Thu, Nov 29, 2018 at 05:21:53PM +0100, Uros Bizjak wrote: > > > > > * g++.target/i386/avx2-check.h: New file. > > > > > * g++.target/i386/m128-check.h: New file. > > > > > * g++.target/i386/m256-check.h: New file. > > > > > * g++.target/i386/avx-os-support.h: New file. > > > > > > > > OK. > > > > > > On a second thought, should we rather use (pre-reload?) > > > define_insn_and_split to split the combination to the blend insn? > > > > I've already committed it. But can work on a patch that does that tomorrow. > > Thanks. You will probably need to split it after reload, since a > change from intvec->FPvec is needed. Like this? Bootstrapped/regtested on x86_64-linux and i686-linux. 2018-11-29 Jakub Jelinek <jakub@redhat.com> PR target/54700 * config/i386/sse.md (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt, *<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint, *<sse4_1_avx2>_pblendvb_lt): Change define_insn into define_insn_and_split. 
--- gcc/config/i386/sse.md.jj 2018-11-29 15:32:27.597301378 +0100 +++ gcc/config/i386/sse.md 2018-11-29 18:52:42.747904630 +0100 @@ -15682,7 +15682,7 @@ (define_insn "sse4_1_blendv<ssemodesuffi ] (const_string "<ssevecmode>")))]) -(define_insn "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" +(define_insn_and_split "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") (unspec:VF_128_256 [(match_operand:VF_128_256 1 "register_operand" "0,0,x") @@ -15693,10 +15693,12 @@ (define_insn "*<sse4_1>_blendv<ssemodesu (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)] UNSPEC_BLENDV))] "TARGET_SSE4_1" - "@ - blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - vblendv<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:VF_128_256 + [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] + "operands[3] = gen_lowpart (<MODE>mode, operands[3]);" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "length_immediate" "1") @@ -15712,7 +15714,7 @@ (define_mode_attr ssefltmodesuffix (define_mode_attr ssefltvecmode [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")]) -(define_insn "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint" +(define_insn_and_split "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint" [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x") (unspec:<ssebytemode> [(match_operand:<ssebytemode> 1 "register_operand" "0,0,x") @@ -15723,10 +15725,17 @@ (define_insn "*<sse4_1>_blendv<ssefltmod (match_operand:VI48_AVX 4 "const0_operand" "C,C,C")) 0)] UNSPEC_BLENDV))] "TARGET_SSE4_1" - "@ - blendv<ssefltmodesuffix>\t{%3, %2, %0|%0, %2, %3} - blendv<ssefltmodesuffix>\t{%3, %2, %0|%0, %2, %3} - vblendv<ssefltmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:<ssefltvecmode> + 
[(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] +{ + operands[0] = gen_lowpart (<ssefltvecmode>mode, operands[0]); + operands[1] = gen_lowpart (<ssefltvecmode>mode, operands[1]); + operands[2] = gen_lowpart (<ssefltvecmode>mode, operands[2]); + operands[3] = gen_lowpart (<ssefltvecmode>mode, operands[3]); +} [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "length_immediate" "1") @@ -15834,7 +15843,7 @@ (define_insn "<sse4_1_avx2>_pblendvb" (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "*<sse4_1_avx2>_pblendvb_lt" +(define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt" [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") (unspec:VI1_AVX2 [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") @@ -15843,10 +15852,12 @@ (define_insn "*<sse4_1_avx2>_pblendvb_lt (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))] UNSPEC_BLENDV))] "TARGET_SSE4_1" - "@ - pblendvb\t{%3, %2, %0|%0, %2, %3} - pblendvb\t{%3, %2, %0|%0, %2, %3} - vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "#" + "" + [(set (match_dup 0) + (unspec:VI1_AVX2 + [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] + "" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") Jakub
On Thu, Nov 29, 2018 at 10:40 PM Jakub Jelinek <jakub@redhat.com> wrote: > > On Thu, Nov 29, 2018 at 05:41:59PM +0100, Uros Bizjak wrote: > > On Thu, Nov 29, 2018 at 5:28 PM Jakub Jelinek <jakub@redhat.com> wrote: > > > > > > On Thu, Nov 29, 2018 at 05:21:53PM +0100, Uros Bizjak wrote: > > > > > > * g++.target/i386/avx2-check.h: New file. > > > > > > * g++.target/i386/m128-check.h: New file. > > > > > > * g++.target/i386/m256-check.h: New file. > > > > > > * g++.target/i386/avx-os-support.h: New file. > > > > > > > > > > OK. > > > > > > > > On a second thought, should we rather use (pre-reload?) > > > > define_insn_and_split to split the combination to the blend insn? > > > > > > I've already committed it. But can work on a patch that does that tomorrow. > > > > Thanks. You will probably need to split it after reload, since a > > change from intvec->FPvec is needed. > > Like this? Bootstrapped/regtested on x86_64-linux and i686-linux. > > 2018-11-29 Jakub Jelinek <jakub@redhat.com> > > PR target/54700 > * config/i386/sse.md > (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt, > *<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint, > *<sse4_1_avx2>_pblendvb_lt): Change define_insn into > define_insn_and_split. OK. Thanks, Uros. 
> --- gcc/config/i386/sse.md.jj 2018-11-29 15:32:27.597301378 +0100 > +++ gcc/config/i386/sse.md 2018-11-29 18:52:42.747904630 +0100 > @@ -15682,7 +15682,7 @@ (define_insn "sse4_1_blendv<ssemodesuffi > ] > (const_string "<ssevecmode>")))]) > > -(define_insn "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" > +(define_insn_and_split "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" > [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") > (unspec:VF_128_256 > [(match_operand:VF_128_256 1 "register_operand" "0,0,x") > @@ -15693,10 +15693,12 @@ (define_insn "*<sse4_1>_blendv<ssemodesu > (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)] > UNSPEC_BLENDV))] > "TARGET_SSE4_1" > - "@ > - blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} > - blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} > - vblendv<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "#" > + "&& reload_completed" > + [(set (match_dup 0) > + (unspec:VF_128_256 > + [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] > + "operands[3] = gen_lowpart (<MODE>mode, operands[3]);" > [(set_attr "isa" "noavx,noavx,avx") > (set_attr "type" "ssemov") > (set_attr "length_immediate" "1") > @@ -15712,7 +15714,7 @@ (define_mode_attr ssefltmodesuffix > (define_mode_attr ssefltvecmode > [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")]) > > -(define_insn "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint" > +(define_insn_and_split "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint" > [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x") > (unspec:<ssebytemode> > [(match_operand:<ssebytemode> 1 "register_operand" "0,0,x") > @@ -15723,10 +15725,17 @@ (define_insn "*<sse4_1>_blendv<ssefltmod > (match_operand:VI48_AVX 4 "const0_operand" "C,C,C")) 0)] > UNSPEC_BLENDV))] > "TARGET_SSE4_1" > - "@ > - blendv<ssefltmodesuffix>\t{%3, %2, %0|%0, %2, %3} > - blendv<ssefltmodesuffix>\t{%3, %2, %0|%0, %2, %3} > - vblendv<ssefltmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, 
%3}" > + "#" > + "&& reload_completed" > + [(set (match_dup 0) > + (unspec:<ssefltvecmode> > + [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] > +{ > + operands[0] = gen_lowpart (<ssefltvecmode>mode, operands[0]); > + operands[1] = gen_lowpart (<ssefltvecmode>mode, operands[1]); > + operands[2] = gen_lowpart (<ssefltvecmode>mode, operands[2]); > + operands[3] = gen_lowpart (<ssefltvecmode>mode, operands[3]); > +} > [(set_attr "isa" "noavx,noavx,avx") > (set_attr "type" "ssemov") > (set_attr "length_immediate" "1") > @@ -15834,7 +15843,7 @@ (define_insn "<sse4_1_avx2>_pblendvb" > (set_attr "btver2_decode" "vector,vector,vector") > (set_attr "mode" "<sseinsnmode>")]) > > -(define_insn "*<sse4_1_avx2>_pblendvb_lt" > +(define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt" > [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") > (unspec:VI1_AVX2 > [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") > @@ -15843,10 +15852,12 @@ (define_insn "*<sse4_1_avx2>_pblendvb_lt > (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))] > UNSPEC_BLENDV))] > "TARGET_SSE4_1" > - "@ > - pblendvb\t{%3, %2, %0|%0, %2, %3} > - pblendvb\t{%3, %2, %0|%0, %2, %3} > - vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "#" > + "" > + [(set (match_dup 0) > + (unspec:VI1_AVX2 > + [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] > + "" > [(set_attr "isa" "noavx,noavx,avx") > (set_attr "type" "ssemov") > (set_attr "prefix_extra" "1") > > > Jakub
--- gcc/config/i386/sse.md.jj 2018-11-28 09:53:55.607998012 +0100 +++ gcc/config/i386/sse.md 2018-11-28 18:18:34.042124095 +0100 @@ -601,7 +601,8 @@ (define_mode_attr ssedoublemode (V4DI "V8DI") (V8DI "V16DI")]) (define_mode_attr ssebytemode - [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")]) + [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI") + (V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")]) ;; All 128bit vector integer modes (define_mode_iterator VI_128 [V16QI V8HI V4SI V2DI]) @@ -15681,6 +15682,60 @@ (define_insn "sse4_1_blendv<ssemodesuffi ] (const_string "<ssevecmode>")))]) +(define_insn "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" + [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") + (unspec:VF_128_256 + [(match_operand:VF_128_256 1 "register_operand" "0,0,x") + (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") + (subreg:VF_128_256 + (lt:<sseintvecmode> + (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") + (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "@ + blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} + blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} + vblendv<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "<MODE>")]) + +(define_mode_attr ssefltmodesuffix + [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")]) + +(define_mode_attr ssefltvecmode + [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")]) + +(define_insn "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint" + [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x") + (unspec:<ssebytemode> + [(match_operand:<ssebytemode> 1 "register_operand" "0,0,x") + (match_operand:<ssebytemode> 2 "vector_operand" 
"YrBm,*xBm,xm") + (subreg:<ssebytemode> + (lt:VI48_AVX + (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x") + (match_operand:VI48_AVX 4 "const0_operand" "C,C,C")) 0)] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "@ + blendv<ssefltmodesuffix>\t{%3, %2, %0|%0, %2, %3} + blendv<ssefltmodesuffix>\t{%3, %2, %0|%0, %2, %3} + vblendv<ssefltmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "<ssefltvecmode>")]) + (define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") (unspec:VF_128_256 @@ -15768,6 +15823,27 @@ (define_insn "<sse4_1_avx2>_pblendvb" UNSPEC_BLENDV))] "TARGET_SSE4_1" "@ + pblendvb\t{%3, %2, %0|%0, %2, %3} + pblendvb\t{%3, %2, %0|%0, %2, %3} + vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "*,*,1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*<sse4_1_avx2>_pblendvb_lt" + [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") + (unspec:VI1_AVX2 + [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") + (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") + (lt:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x") + (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "@ pblendvb\t{%3, %2, %0|%0, %2, %3} pblendvb\t{%3, %2, %0|%0, %2, %3} vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" --- gcc/testsuite/g++.target/i386/sse4_1-pr54700-1.C.jj 2018-11-28 18:33:14.085526427 +0100 +++ gcc/testsuite/g++.target/i386/sse4_1-pr54700-1.C 2018-11-28 
18:53:01.519834866 +0100 @@ -0,0 +1,69 @@ +/* PR target/54700 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -std=c++14 -msse4 -mno-avx -mno-xop" } */ +/* { dg-final { scan-assembler-not "pcmpgt\[bdq]" } } */ +/* { dg-final { scan-assembler-times "pblendvb" 2 } } */ +/* { dg-final { scan-assembler-times "blendvps" 4 } } */ +/* { dg-final { scan-assembler-times "blendvpd" 4 } } */ + +#include <x86intrin.h> + +__attribute__((noipa)) __v16qi +f1 (__v16qi a, __v16qi b, __v16qi c) +{ + return a < 0 ? b : c; +} + +__attribute__((noipa)) __v16qi +f2 (__v16qi a, __v16qi b, __v16qi c) +{ + return a >= 0 ? b : c; +} + +__attribute__((noipa)) __v4si +f3 (__v4si a, __v4si b, __v4si c) +{ + return a < 0 ? b : c; +} + +__attribute__((noipa)) __v4si +f4 (__v4si a, __v4si b, __v4si c) +{ + return a >= 0 ? b : c; +} + +__attribute__((noipa)) __v2di +f5 (__v2di a, __v2di b, __v2di c) +{ + return a < 0 ? b : c; +} + +__attribute__((noipa)) __v2di +f6 (__v2di a, __v2di b, __v2di c) +{ + return a >= 0 ? b : c; +} + +__attribute__((noipa)) __v4sf +f7 (__v4si a, __v4sf b, __v4sf c) +{ + return a < 0 ? b : c; +} + +__attribute__((noipa)) __v4sf +f8 (__v4si a, __v4sf b, __v4sf c) +{ + return a >= 0 ? b : c; +} + +__attribute__((noipa)) __v2df +f9 (__v2di a, __v2df b, __v2df c) +{ + return a < 0 ? b : c; +} + +__attribute__((noipa)) __v2df +f10 (__v2di a, __v2df b, __v2df c) +{ + return a >= 0 ? 
b : c; +} --- gcc/testsuite/g++.target/i386/sse4_1-pr54700-2.C.jj 2018-11-28 18:40:01.036774967 +0100 +++ gcc/testsuite/g++.target/i386/sse4_1-pr54700-2.C 2018-11-28 19:09:01.730917128 +0100 @@ -0,0 +1,73 @@ +/* PR target/54700 */ +/* { dg-do run { target sse4 } } */ +/* { dg-options "-O2 -std=c++14 -msse4 -mno-avx -mno-xop" } */ + +#ifndef CHECK_H +#define CHECK_H "sse4_1-check.h" +#endif + +#ifndef TEST +#define TEST sse4_1_test +#endif + +#include CHECK_H + +#include "sse4_1-pr54700-1.C" + +static void +TEST () +{ + __v16qi v16qia = { -128, 12, -1, 127, 115, 0, -19, 125, -125, 12, 0, -37, 37, 15, 98, -105 }; + __v16qi v16qib = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; + __v16qi v16qic = { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 }; + __v16qi v16qie = { 1, 18, 3, 20, 21, 22, 7, 24, 9, 26, 27, 12, 29, 30, 31, 16 }; + __v16qi v16qif = { 17, 2, 19, 4, 5, 6, 23, 8, 25, 10, 11, 28, 13, 14, 15, 32 }; + __v16qi v16qir = f1 (v16qia, v16qib, v16qic); + if (__builtin_memcmp (&v16qir, &v16qie, sizeof (__v16qi))) + __builtin_abort (); + v16qir = f2 (v16qia, v16qib, v16qic); + if (__builtin_memcmp (&v16qir, &v16qif, sizeof (__v16qi))) + __builtin_abort (); + __v4si v4sia = { __INT_MAX__, -__INT_MAX__ - 1, -32, 12 }; + __v4si v4sib = { 1, 2, 3, 4 }; + __v4si v4sic = { 5, 6, 7, 8 }; + __v4si v4sie = { 5, 2, 3, 8 }; + __v4si v4sif = { 1, 6, 7, 4 }; + __v4si v4sir = f3 (v4sia, v4sib, v4sic); + if (__builtin_memcmp (&v4sir, &v4sie, sizeof (__v4si))) + __builtin_abort (); + v4sir = f4 (v4sia, v4sib, v4sic); + if (__builtin_memcmp (&v4sir, &v4sif, sizeof (__v4si))) + __builtin_abort (); + __v2di v2dia = { -__LONG_LONG_MAX__, 1000LL * __INT_MAX__ }; + __v2di v2dib = { 1, 2 }; + __v2di v2dic = { 3, 4 }; + __v2di v2die = { 1, 4 }; + __v2di v2dif = { 3, 2 }; + __v2di v2dir = f5 (v2dia, v2dib, v2dic); + if (__builtin_memcmp (&v2dir, &v2die, sizeof (__v2di))) + __builtin_abort (); + v2dir = f6 (v2dia, v2dib, v2dic); + if (__builtin_memcmp (&v2dir, 
&v2dif, sizeof (__v2di))) + __builtin_abort (); + __v4sf v4sfb = { 1.0f, 2.0f, 3.0f, 4.0f }; + __v4sf v4sfc = { 5.0f, 6.0f, 7.0f, 8.0f }; + __v4sf v4sfe = { 5.0f, 2.0f, 3.0f, 8.0f }; + __v4sf v4sff = { 1.0f, 6.0f, 7.0f, 4.0f }; + __v4sf v4sfr = f7 (v4sia, v4sfb, v4sfc); + if (__builtin_memcmp (&v4sfr, &v4sfe, sizeof (__v4sf))) + __builtin_abort (); + v4sfr = f8 (v4sia, v4sfb, v4sfc); + if (__builtin_memcmp (&v4sfr, &v4sff, sizeof (__v4sf))) + __builtin_abort (); + __v2df v2dfb = { 1.0, 2.0 }; + __v2df v2dfc = { 3.0, 4.0 }; + __v2df v2dfe = { 1.0, 4.0 }; + __v2df v2dff = { 3.0, 2.0 }; + __v2df v2dfr = f9 (v2dia, v2dfb, v2dfc); + if (__builtin_memcmp (&v2dfr, &v2dfe, sizeof (__v2df))) + __builtin_abort (); + v2dfr = f10 (v2dia, v2dfb, v2dfc); + if (__builtin_memcmp (&v2dfr, &v2dff, sizeof (__v2df))) + __builtin_abort (); +} --- gcc/testsuite/g++.target/i386/avx-pr54700-1.C.jj 2018-11-28 18:34:24.237362509 +0100 +++ gcc/testsuite/g++.target/i386/avx-pr54700-1.C 2018-11-28 18:53:19.388538643 +0100 @@ -0,0 +1,9 @@ +/* PR target/54700 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -std=c++14 -mavx -mno-xop -mno-avx2" } */ +/* { dg-final { scan-assembler-not "vpcmpgt\[bdq]" } } */ +/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */ +/* { dg-final { scan-assembler-times "vblendvps" 4 } } */ +/* { dg-final { scan-assembler-times "vblendvpd" 4 } } */ + +#include "sse4_1-pr54700-1.C" --- gcc/testsuite/g++.target/i386/avx-pr54700-2.C.jj 2018-11-28 19:04:18.460612953 +0100 +++ gcc/testsuite/g++.target/i386/avx-pr54700-2.C 2018-11-28 19:09:11.430756336 +0100 @@ -0,0 +1,8 @@ +/* PR target/54700 */ +/* { dg-do run { target avx } } */ +/* { dg-options "-O2 -std=c++14 -mavx -mno-xop -mno-avx2" } */ + +#define CHECK_H "avx-check.h" +#define TEST avx_test + +#include "sse4_1-pr54700-2.C" --- gcc/testsuite/g++.target/i386/avx2-pr54700-1.C.jj 2018-11-28 18:37:43.990048342 +0100 +++ gcc/testsuite/g++.target/i386/avx2-pr54700-1.C 2018-11-28 18:53:26.209425564 +0100 @@ -0,0 +1,69 
@@ +/* PR target/54700 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -std=c++14 -mavx2 -mno-xop -mno-avx512f" } */ +/* { dg-final { scan-assembler-not "vpcmpgt\[bdq]" } } */ +/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */ +/* { dg-final { scan-assembler-times "vblendvps" 4 } } */ +/* { dg-final { scan-assembler-times "vblendvpd" 4 } } */ + +#include <x86intrin.h> + +__attribute__((noipa)) __v32qi +f1 (__v32qi a, __v32qi b, __v32qi c) +{ + return a < 0 ? b : c; +} + +__attribute__((noipa)) __v32qi +f2 (__v32qi a, __v32qi b, __v32qi c) +{ + return a >= 0 ? b : c; +} + +__attribute__((noipa)) __v8si +f3 (__v8si a, __v8si b, __v8si c) +{ + return a < 0 ? b : c; +} + +__attribute__((noipa)) __v8si +f4 (__v8si a, __v8si b, __v8si c) +{ + return a >= 0 ? b : c; +} + +__attribute__((noipa)) __v4di +f5 (__v4di a, __v4di b, __v4di c) +{ + return a < 0 ? b : c; +} + +__attribute__((noipa)) __v4di +f6 (__v4di a, __v4di b, __v4di c) +{ + return a >= 0 ? b : c; +} + +__attribute__((noipa)) __v8sf +f7 (__v8si a, __v8sf b, __v8sf c) +{ + return a < 0 ? b : c; +} + +__attribute__((noipa)) __v8sf +f8 (__v8si a, __v8sf b, __v8sf c) +{ + return a >= 0 ? b : c; +} + +__attribute__((noipa)) __v4df +f9 (__v4di a, __v4df b, __v4df c) +{ + return a < 0 ? b : c; +} + +__attribute__((noipa)) __v4df +f10 (__v4di a, __v4df b, __v4df c) +{ + return a >= 0 ? 
b : c; +} --- gcc/testsuite/g++.target/i386/avx2-pr54700-2.C.jj 2018-11-28 19:08:12.269737055 +0100 +++ gcc/testsuite/g++.target/i386/avx2-pr54700-2.C 2018-11-28 19:17:58.058021174 +0100 @@ -0,0 +1,72 @@ +/* PR target/54700 */ +/* { dg-do run { target avx2 } } */ +/* { dg-options "-O2 -std=c++14 -mavx2 -mno-xop -mno-avx512f" } */ + +#include "avx2-check.h" + +#define TEST avx2_test + +#include "avx2-pr54700-1.C" + +static void +TEST () +{ + __v32qi v32qia = { -128, 12, -1, 127, 115, 0, -19, 125, -125, 12, 0, -37, 37, 15, 98, -105, + 0, 1, 2, 3, -1, -2, -3, -4, 4, -5, 5, -6, 6, -7, 7, -8 }; + __v32qi v32qib = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16 }; + __v32qi v32qic = { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32 }; + __v32qi v32qie = { 1, 18, 3, 20, 21, 22, 7, 24, 9, 26, 27, 12, 29, 30, 31, 16, + -17, -18, -19, -20, -5, -6, -7, -8, -25, -10, -27, -12, -29, -14, -31, -16 }; + __v32qi v32qif = { 17, 2, 19, 4, 5, 6, 23, 8, 25, 10, 11, 28, 13, 14, 15, 32, + -1, -2, -3, -4, -21, -22, -23, -24, -9, -26, -11, -28, -13, -30, -15, -32 }; + __v32qi v32qir = f1 (v32qia, v32qib, v32qic); + if (__builtin_memcmp (&v32qir, &v32qie, sizeof (__v32qi))) + __builtin_abort (); + v32qir = f2 (v32qia, v32qib, v32qic); + if (__builtin_memcmp (&v32qir, &v32qif, sizeof (__v32qi))) + __builtin_abort (); + __v8si v8sia = { __INT_MAX__, -__INT_MAX__ - 1, -32, 12, __INT_MAX__ - 2, -__INT_MAX__, 15, -1 }; + __v8si v8sib = { 1, 2, 3, 4, -1, -2, -3, -4 }; + __v8si v8sic = { 5, 6, 7, 8, -5, -6, -7, -8 }; + __v8si v8sie = { 5, 2, 3, 8, -5, -2, -7, -4 }; + __v8si v8sif = { 1, 6, 7, 4, -1, -6, -3, -8 }; + __v8si v8sir = f3 (v8sia, v8sib, v8sic); + if (__builtin_memcmp (&v8sir, &v8sie, sizeof (__v8si))) + __builtin_abort (); + v8sir = f4 (v8sia, v8sib, v8sic); + if (__builtin_memcmp (&v8sir, &v8sif, sizeof 
(__v8si))) + __builtin_abort (); + __v4di v4dia = { -__LONG_LONG_MAX__, 1000LL * __INT_MAX__, __LONG_LONG_MAX__, -2 }; + __v4di v4dib = { 1, 2, -1, -2 }; + __v4di v4dic = { 3, 4, -3, -4 }; + __v4di v4die = { 1, 4, -3, -2 }; + __v4di v4dif = { 3, 2, -1, -4 }; + __v4di v4dir = f5 (v4dia, v4dib, v4dic); + if (__builtin_memcmp (&v4dir, &v4die, sizeof (__v4di))) + __builtin_abort (); + v4dir = f6 (v4dia, v4dib, v4dic); + if (__builtin_memcmp (&v4dir, &v4dif, sizeof (__v4di))) + __builtin_abort (); + __v8sf v8sfb = { 1.0f, 2.0f, 3.0f, 4.0f, -1.0f, -2.0f, -3.0f, -4.0f }; + __v8sf v8sfc = { 5.0f, 6.0f, 7.0f, 8.0f, -5.0f, -6.0f, -7.0f, -8.0f }; + __v8sf v8sfe = { 5.0f, 2.0f, 3.0f, 8.0f, -5.0f, -2.0f, -7.0f, -4.0f }; + __v8sf v8sff = { 1.0f, 6.0f, 7.0f, 4.0f, -1.0f, -6.0f, -3.0f, -8.0f }; + __v8sf v8sfr = f7 (v8sia, v8sfb, v8sfc); + if (__builtin_memcmp (&v8sfr, &v8sfe, sizeof (__v8sf))) + __builtin_abort (); + v8sfr = f8 (v8sia, v8sfb, v8sfc); + if (__builtin_memcmp (&v8sfr, &v8sff, sizeof (__v8sf))) + __builtin_abort (); + __v4df v4dfb = { 1.0, 2.0, -1.0, -2.0 }; + __v4df v4dfc = { 3.0, 4.0, -3.0, -4.0 }; + __v4df v4dfe = { 1.0, 4.0, -3.0, -2.0 }; + __v4df v4dff = { 3.0, 2.0, -1.0, -4.0 }; + __v4df v4dfr = f9 (v4dia, v4dfb, v4dfc); + if (__builtin_memcmp (&v4dfr, &v4dfe, sizeof (__v4df))) + __builtin_abort (); + v4dfr = f10 (v4dia, v4dfb, v4dfc); + if (__builtin_memcmp (&v4dfr, &v4dff, sizeof (__v4df))) + __builtin_abort (); +} --- gcc/testsuite/g++.target/i386/sse4_1-check.h.jj 2018-11-28 18:42:21.280449620 +0100 +++ gcc/testsuite/g++.target/i386/sse4_1-check.h 2018-11-28 18:42:16.404530462 +0100 @@ -0,0 +1 @@ +#include "../../gcc.target/i386/sse4_1-check.h" --- gcc/testsuite/g++.target/i386/avx-check.h.jj 2018-11-28 19:05:38.688283010 +0100 +++ gcc/testsuite/g++.target/i386/avx-check.h 2018-11-28 19:05:43.296206617 +0100 @@ -0,0 +1 @@ +#include "../../gcc.target/i386/avx-check.h" --- gcc/testsuite/g++.target/i386/avx2-check.h.jj 2018-11-28 19:06:41.619239790 +0100 +++ 
gcc/testsuite/g++.target/i386/avx2-check.h 2018-11-28 19:06:46.474159302 +0100 @@ -0,0 +1 @@ +#include "../../gcc.target/i386/avx2-check.h" --- gcc/testsuite/g++.target/i386/m128-check.h.jj 2018-11-28 18:42:33.576245739 +0100 +++ gcc/testsuite/g++.target/i386/m128-check.h 2018-11-28 18:42:39.245151746 +0100 @@ -0,0 +1 @@ +#include "../../gcc.target/i386/m128-check.h" --- gcc/testsuite/g++.target/i386/m256-check.h.jj 2018-11-28 19:06:21.805568241 +0100 +++ gcc/testsuite/g++.target/i386/m256-check.h 2018-11-28 19:06:26.463491032 +0100 @@ -0,0 +1 @@ +#include "../../gcc.target/i386/m256-check.h" --- gcc/testsuite/g++.target/i386/avx-os-support.h.jj 2018-11-28 19:05:56.615985814 +0100 +++ gcc/testsuite/g++.target/i386/avx-os-support.h 2018-11-28 19:06:01.759900538 +0100 @@ -0,0 +1 @@ +#include "../../gcc.target/i386/avx-os-support.h"