Message ID | 20181128074335.GU12380@tucnak |
---|---|
State | New |
Headers | show |
Series | Use {,v}blendvp{s,d} for [SD]Fmode sse_movcc (PR target/88189) | expand |
On Wed, Nov 28, 2018 at 8:43 AM Jakub Jelinek <jakub@redhat.com> wrote: > > Hi! > > This patch implements Marc's idea of using {,v}blendvp{s,d} for scalar > [SD]Fmode ix86_expand_sse_movcc for -msse4.1 and above. > Without this patch we emit sequences like > andpd %xmm2, %xmm0 > andnpd %xmm1, %xmm2 > orpd %xmm2, %xmm0 > or > andps %xmm2, %xmm0 > andnps %xmm1, %xmm2 > orps %xmm2, %xmm0 > and this replaces it with > blendvpd %xmm0, %xmm2, %xmm1 > movapd %xmm1, %xmm0 > or > blendvps %xmm0, %xmm2, %xmm1 > movaps %xmm1, %xmm0 > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2018-11-28 Jakub Jelinek <jakub@redhat.com> > > PR target/88189 > * config/i386/i386.c (ix86_expand_sse_movcc): Handle DFmode and > SFmode using sse4_1_blendvs[sd] with TARGET_SSE4_1. Formatting fixes. > * config/i386/sse.md (sse4_1_blendv<ssemodesuffix>): New pattern. > > * gcc.target/i386/sse4_1-pr88189-1.c: New test. > * gcc.target/i386/sse4_1-pr88189-2.c: New test. > * gcc.target/i386/avx-pr88189-1.c: New test. > * gcc.target/i386/avx-pr88189-2.c: New test. OK. Thanks, Uros. 
> --- gcc/config/i386/i386.c.jj 2018-11-26 22:25:50.716253308 +0100 > +++ gcc/config/i386/i386.c 2018-11-27 11:18:23.135715272 +0100 > @@ -23585,15 +23585,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp > { > emit_insn (gen_rtx_SET (dest, cmp)); > } > - else if (op_false == CONST0_RTX (mode) > - && !maskcmp) > + else if (op_false == CONST0_RTX (mode) && !maskcmp) > { > op_true = force_reg (mode, op_true); > x = gen_rtx_AND (mode, cmp, op_true); > emit_insn (gen_rtx_SET (dest, x)); > } > - else if (op_true == CONST0_RTX (mode) > - && !maskcmp) > + else if (op_true == CONST0_RTX (mode) && !maskcmp) > { > op_false = force_reg (mode, op_false); > x = gen_rtx_NOT (mode, cmp); > @@ -23601,14 +23599,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp > emit_insn (gen_rtx_SET (dest, x)); > } > else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode) > - && !maskcmp) > + && !maskcmp) > { > op_false = force_reg (mode, op_false); > x = gen_rtx_IOR (mode, cmp, op_false); > emit_insn (gen_rtx_SET (dest, x)); > } > - else if (TARGET_XOP > - && !maskcmp) > + else if (TARGET_XOP && !maskcmp) > { > op_true = force_reg (mode, op_true); > > @@ -23639,6 +23636,20 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp > if (TARGET_SSE4_1) > gen = gen_sse4_1_blendvpd; > break; > + case E_SFmode: > + if (TARGET_SSE4_1) > + { > + gen = gen_sse4_1_blendvss; > + op_true = force_reg (mode, op_true); > + } > + break; > + case E_DFmode: > + if (TARGET_SSE4_1) > + { > + gen = gen_sse4_1_blendvsd; > + op_true = force_reg (mode, op_true); > + } > + break; > case E_V16QImode: > case E_V8HImode: > case E_V4SImode: > --- gcc/config/i386/sse.md.jj 2018-11-21 17:39:51.000000000 +0100 > +++ gcc/config/i386/sse.md 2018-11-27 10:48:38.500120925 +0100 > @@ -15641,6 +15641,46 @@ (define_insn "<sse4_1>_blendv<ssemodesuf > (set_attr "btver2_decode" "vector,vector,vector") > (set_attr "mode" "<MODE>")]) > > +;; Also define scalar versions. These are used for conditional move. 
> +;; Using subregs into vector modes causes register allocation lossage. > +;; These patterns do not allow memory operands because the native > +;; instructions read the full 128-bits. > + > +(define_insn "sse4_1_blendv<ssemodesuffix>" > + [(set (match_operand:MODEF 0 "register_operand" "=Yr,*x,x") > + (unspec:MODEF > + [(match_operand:MODEF 1 "register_operand" "0,0,x") > + (match_operand:MODEF 2 "register_operand" "Yr,*x,x") > + (match_operand:MODEF 3 "register_operand" "Yz,Yz,x")] > + UNSPEC_BLENDV))] > + "TARGET_SSE4_1" > +{ > + if (get_attr_mode (insn) == MODE_V4SF) > + return (which_alternative == 2 > + ? "vblendvps\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + : "blendvps\t{%3, %2, %0|%0, %2, %3}"); > + else > + return (which_alternative == 2 > + ? "vblendv<ssevecmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + : "blendv<ssevecmodesuffix>\t{%3, %2, %0|%0, %2, %3}"); > +} > + [(set_attr "isa" "noavx,noavx,avx") > + (set_attr "type" "ssemov") > + (set_attr "length_immediate" "1") > + (set_attr "prefix_data16" "1,1,*") > + (set_attr "prefix_extra" "1") > + (set_attr "prefix" "orig,orig,vex") > + (set_attr "btver2_decode" "vector,vector,vector") > + (set (attr "mode") > + (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") > + (const_string "V4SF") > + (match_test "TARGET_AVX") > + (const_string "<ssevecmode>") > + (match_test "optimize_function_for_size_p (cfun)") > + (const_string "V4SF") > + ] > + (const_string "<ssevecmode>")))]) > + > (define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>" > [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") > (unspec:VF_128_256 > --- gcc/testsuite/gcc.target/i386/sse4_1-pr88189-1.c.jj 2018-11-27 11:00:34.746322991 +0100 > +++ gcc/testsuite/gcc.target/i386/sse4_1-pr88189-1.c 2018-11-27 11:11:36.116423601 +0100 > @@ -0,0 +1,35 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target sse4 } */ > +/* { dg-options "-O2 -msse4.1 -mfpmath=sse" } */ > + > +#ifndef CHECK_H > +#define CHECK_H "sse4_1-check.h" > 
+#endif > + > +#ifndef TEST > +#define TEST sse4_1_test > +#endif > + > +#include CHECK_H > + > +__attribute__((noipa)) double > +f1 (double a, double b) > +{ > + return a < 0 ? a : b; > +} > + > +__attribute__((noipa)) float > +f2 (float a, float b) > +{ > + return a < 0 ? a : b; > +} > + > +static void > +TEST (void) > +{ > + if (f1 (5.0, 7.0) != 7.0 > + || f1 (-2.0, 7.0) != -2.0 > + || f2 (1.0f, 2.0f) != 2.0f > + || f2 (-1.0f, -3.0f) != -1.0f) > + abort (); > +} > --- gcc/testsuite/gcc.target/i386/sse4_1-pr88189-2.c.jj 2018-11-27 11:06:39.842306204 +0100 > +++ gcc/testsuite/gcc.target/i386/sse4_1-pr88189-2.c 2018-11-27 11:11:26.536581478 +0100 > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse4.1 -mno-avx -mfpmath=sse" } */ > + > +__attribute__((noipa)) double > +f1 (double a, double b) > +{ > + return a < 0 ? a : b; > +} > + > +__attribute__((noipa)) float > +f2 (float a, float b) > +{ > + return a < 0 ? a : b; > +} > + > +/* { dg-final { scan-assembler-times "blendvp\[sd]" 2 } } */ > --- gcc/testsuite/gcc.target/i386/avx-pr88189-1.c.jj 2018-11-27 11:01:28.998428914 +0100 > +++ gcc/testsuite/gcc.target/i386/avx-pr88189-1.c 2018-11-27 11:11:49.642200671 +0100 > @@ -0,0 +1,8 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target avx } */ > +/* { dg-options "-O2 -mavx -mfpmath=sse" } */ > + > +#define CHECK_H "avx-check.h" > +#define TEST avx_test > + > +#include "sse4_1-pr88189-1.c" > --- gcc/testsuite/gcc.target/i386/avx-pr88189-2.c.jj 2018-11-27 11:08:12.677776273 +0100 > +++ gcc/testsuite/gcc.target/i386/avx-pr88189-2.c 2018-11-27 11:11:13.726792579 +0100 > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx -mfpmath=sse" } */ > + > +__attribute__((noipa)) double > +f1 (double a, double b) > +{ > + return a < 0 ? a : b; > +} > + > +__attribute__((noipa)) float > +f2 (float a, float b) > +{ > + return a < 0 ? a : b; > +} > + > +/* { dg-final { scan-assembler-times "vblendvp\[sd]" 2 } } */ > > Jakub
--- gcc/config/i386/i386.c.jj 2018-11-26 22:25:50.716253308 +0100 +++ gcc/config/i386/i386.c 2018-11-27 11:18:23.135715272 +0100 @@ -23585,15 +23585,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp { emit_insn (gen_rtx_SET (dest, cmp)); } - else if (op_false == CONST0_RTX (mode) - && !maskcmp) + else if (op_false == CONST0_RTX (mode) && !maskcmp) { op_true = force_reg (mode, op_true); x = gen_rtx_AND (mode, cmp, op_true); emit_insn (gen_rtx_SET (dest, x)); } - else if (op_true == CONST0_RTX (mode) - && !maskcmp) + else if (op_true == CONST0_RTX (mode) && !maskcmp) { op_false = force_reg (mode, op_false); x = gen_rtx_NOT (mode, cmp); @@ -23601,14 +23599,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp emit_insn (gen_rtx_SET (dest, x)); } else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode) - && !maskcmp) + && !maskcmp) { op_false = force_reg (mode, op_false); x = gen_rtx_IOR (mode, cmp, op_false); emit_insn (gen_rtx_SET (dest, x)); } - else if (TARGET_XOP - && !maskcmp) + else if (TARGET_XOP && !maskcmp) { op_true = force_reg (mode, op_true); @@ -23639,6 +23636,20 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp if (TARGET_SSE4_1) gen = gen_sse4_1_blendvpd; break; + case E_SFmode: + if (TARGET_SSE4_1) + { + gen = gen_sse4_1_blendvss; + op_true = force_reg (mode, op_true); + } + break; + case E_DFmode: + if (TARGET_SSE4_1) + { + gen = gen_sse4_1_blendvsd; + op_true = force_reg (mode, op_true); + } + break; case E_V16QImode: case E_V8HImode: case E_V4SImode: --- gcc/config/i386/sse.md.jj 2018-11-21 17:39:51.000000000 +0100 +++ gcc/config/i386/sse.md 2018-11-27 10:48:38.500120925 +0100 @@ -15641,6 +15641,46 @@ (define_insn "<sse4_1>_blendv<ssemodesuf (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<MODE>")]) +;; Also define scalar versions. These are used for conditional move. +;; Using subregs into vector modes causes register allocation lossage. 
+;; These patterns do not allow memory operands because the native +;; instructions read the full 128-bits. + +(define_insn "sse4_1_blendv<ssemodesuffix>" + [(set (match_operand:MODEF 0 "register_operand" "=Yr,*x,x") + (unspec:MODEF + [(match_operand:MODEF 1 "register_operand" "0,0,x") + (match_operand:MODEF 2 "register_operand" "Yr,*x,x") + (match_operand:MODEF 3 "register_operand" "Yz,Yz,x")] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" +{ + if (get_attr_mode (insn) == MODE_V4SF) + return (which_alternative == 2 + ? "vblendvps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + : "blendvps\t{%3, %2, %0|%0, %2, %3}"); + else + return (which_alternative == 2 + ? "vblendv<ssevecmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + : "blendv<ssevecmodesuffix>\t{%3, %2, %0|%0, %2, %3}"); +} + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector,vector,vector") + (set (attr "mode") + (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (const_string "V4SF") + (match_test "TARGET_AVX") + (const_string "<ssevecmode>") + (match_test "optimize_function_for_size_p (cfun)") + (const_string "V4SF") + ] + (const_string "<ssevecmode>")))]) + (define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") (unspec:VF_128_256 --- gcc/testsuite/gcc.target/i386/sse4_1-pr88189-1.c.jj 2018-11-27 11:00:34.746322991 +0100 +++ gcc/testsuite/gcc.target/i386/sse4_1-pr88189-1.c 2018-11-27 11:11:36.116423601 +0100 @@ -0,0 +1,35 @@ +/* { dg-do run } */ +/* { dg-require-effective-target sse4 } */ +/* { dg-options "-O2 -msse4.1 -mfpmath=sse" } */ + +#ifndef CHECK_H +#define CHECK_H "sse4_1-check.h" +#endif + +#ifndef TEST +#define TEST sse4_1_test +#endif + +#include CHECK_H + +__attribute__((noipa)) double +f1 (double a, double b) +{ + return a < 0 ? 
a : b; +} + +__attribute__((noipa)) float +f2 (float a, float b) +{ + return a < 0 ? a : b; +} + +static void +TEST (void) +{ + if (f1 (5.0, 7.0) != 7.0 + || f1 (-2.0, 7.0) != -2.0 + || f2 (1.0f, 2.0f) != 2.0f + || f2 (-1.0f, -3.0f) != -1.0f) + abort (); +} --- gcc/testsuite/gcc.target/i386/sse4_1-pr88189-2.c.jj 2018-11-27 11:06:39.842306204 +0100 +++ gcc/testsuite/gcc.target/i386/sse4_1-pr88189-2.c 2018-11-27 11:11:26.536581478 +0100 @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse4.1 -mno-avx -mfpmath=sse" } */ + +__attribute__((noipa)) double +f1 (double a, double b) +{ + return a < 0 ? a : b; +} + +__attribute__((noipa)) float +f2 (float a, float b) +{ + return a < 0 ? a : b; +} + +/* { dg-final { scan-assembler-times "blendvp\[sd]" 2 } } */ --- gcc/testsuite/gcc.target/i386/avx-pr88189-1.c.jj 2018-11-27 11:01:28.998428914 +0100 +++ gcc/testsuite/gcc.target/i386/avx-pr88189-1.c 2018-11-27 11:11:49.642200671 +0100 @@ -0,0 +1,8 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx } */ +/* { dg-options "-O2 -mavx -mfpmath=sse" } */ + +#define CHECK_H "avx-check.h" +#define TEST avx_test + +#include "sse4_1-pr88189-1.c" --- gcc/testsuite/gcc.target/i386/avx-pr88189-2.c.jj 2018-11-27 11:08:12.677776273 +0100 +++ gcc/testsuite/gcc.target/i386/avx-pr88189-2.c 2018-11-27 11:11:13.726792579 +0100 @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx -mfpmath=sse" } */ + +__attribute__((noipa)) double +f1 (double a, double b) +{ + return a < 0 ? a : b; +} + +__attribute__((noipa)) float +f2 (float a, float b) +{ + return a < 0 ? a : b; +} + +/* { dg-final { scan-assembler-times "vblendvp\[sd]" 2 } } */