Message ID | b620e634-3adf-dba7-51a1-25220a150d6c@suse.com |
---|---|
State | New |
Headers | show |
Series | [v3] x86: make better use of VBROADCASTSS / VPBROADCASTD | expand |
> -----Original Message----- > From: Jan Beulich <jbeulich@suse.com> > Sent: Tuesday, July 11, 2023 2:04 PM > To: gcc-patches@gcc.gnu.org > Cc: Kirill Yukhin <kirill.yukhin@gmail.com>; Liu, Hongtao > <hongtao.liu@intel.com> > Subject: [PATCH v3] x86: make better use of VBROADCASTSS / > VPBROADCASTD > > ... in vec_dupv4sf / *vec_dupv4si. The respective broadcast insns are never > longer (yet sometimes shorter) than the corresponding VSHUFPS / VPSHUFD, > due to the immediate operand of the shuffle insns balancing the > (uniform) need for VEX3 in the broadcast ones. When EVEX encoding is > respective the broadcast insns are always shorter. > > Add new alternatives to cover the AVX2 and AVX512 cases as appropriate. > > While touching this anyway, switch to consistently using "sseshuf1" in the > "type" attributes for all shuffle forms. > > gcc/ > > * config/i386/sse.md (vec_dupv4sf): Make first alternative use > vbroadcastss for AVX2. New AVX512F alternative. > (*vec_dupv4si): New AVX2 and AVX512F alternatives using > vpbroadcastd. Replace sselog1 by sseshuf1 in "type" attribute. > > gcc/testsuite/ > > * gcc.target/i386/avx2-dupv4sf.c: New test. > * gcc.target/i386/avx2-dupv4si.c: Likewise. > * gcc.target/i386/avx512f-dupv4sf.c: Likewise. > * gcc.target/i386/avx512f-dupv4si.c: Likewise. > --- > Note that unlike originally intended, "prefix_extra" isn't dropped: > "length_vex" uses it to determine whether 2-byte VEX encoding is possible > (which it isn't for VBROADCASTSS / VPBROADCASTD). "length" > itself specifically does not use it for VEX/EVEX encoded insns. > > Especially with the added "enabled" attribute I didn't really see how to > (further) fold alternatives 0 and 1. Instead *vec_dupv4si might benefit from > using sse2_noavx2 instead of sse2 for alternative 2, except that there is no > sse2_noavx2, only sse2_noavx. > > I'm working from the assumption that the isa attributes to the original 1st and > 2nd alternatives don't need further restricting (to sse2_noavx2 or > avx_noavx2 as applicable), as the new earlier alternatives cover all operand > forms already when at least AVX2 is enabled. Yes, the patch LGTM. > --- > v3: Testcases for new alternatives. "type" and "prefix_extra" > adjustments. > v2: Correct operand constraints. Respect -mprefer-vector-width=. Fold > two alternatives of vec_dupv4sf. > > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -25969,41 +25969,64 @@ > (const_int 1)))]) > > (define_insn "vec_dupv4sf" > - [(set (match_operand:V4SF 0 "register_operand" "=v,v,x") > + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v,x") > (vec_duplicate:V4SF > - (match_operand:SF 1 "nonimmediate_operand" "Yv,m,0")))] > + (match_operand:SF 1 "nonimmediate_operand" "Yv,v,m,0")))] > "TARGET_SSE" > "@ > - vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} > + * return TARGET_AVX2 ? \"vbroadcastss\t{%1, %0|%0, %1}\" : > \"vshufps\t{$0, %d1, %0|%0, %d1, 0}\"; > + vbroadcastss\t{%1, %g0|%g0, %1} > vbroadcastss\t{%1, %0|%0, %1} > shufps\t{$0, %0, %0|%0, %0, 0}" > - [(set_attr "isa" "avx,avx,noavx") > - (set_attr "type" "sseshuf1,ssemov,sseshuf1") > - (set_attr "length_immediate" "1,0,1") > - (set_attr "prefix_extra" "0,1,*") > - (set_attr "prefix" "maybe_evex,maybe_evex,orig") > - (set_attr "mode" "V4SF")]) > + [(set_attr "isa" "avx,*,avx,noavx") > + (set (attr "type") > + (cond [(and (eq_attr "alternative" "0") > + (match_test "!TARGET_AVX2")) > + (const_string "sseshuf1") > + (eq_attr "alternative" "3") > + (const_string "sseshuf1") > + ] > + (const_string "ssemov"))) > + (set (attr "length_immediate") > + (if_then_else (eq_attr "type" "sseshuf1") > + (const_string "1") > + (const_string "0"))) > + (set_attr "prefix_extra" "0,1,1,*") > + (set_attr "prefix" "maybe_evex,evex,maybe_evex,orig") > + (set_attr "mode" "V4SF,V16SF,V4SF,V4SF") > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL > + && !TARGET_PREFER_AVX256") > + (const_string "*")))]) > > (define_insn "*vec_dupv4si" > - [(set (match_operand:V4SI 0 "register_operand" "=v,v,x") > + [(set (match_operand:V4SI 0 "register_operand" "=v,v,v,v,x") > (vec_duplicate:V4SI > - (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))] > + (match_operand:SI 1 "nonimmediate_operand" "Yvm,v,Yv,m,0")))] > "TARGET_SSE" > "@ > + vpbroadcastd\t{%1, %0|%0, %1} > + vpbroadcastd\t{%1, %g0|%g0, %1} > %vpshufd\t{$0, %1, %0|%0, %1, 0} > vbroadcastss\t{%1, %0|%0, %1} > shufps\t{$0, %0, %0|%0, %0, 0}" > - [(set_attr "isa" "sse2,avx,noavx") > - (set_attr "type" "sselog1,ssemov,sselog1") > - (set_attr "length_immediate" "1,0,1") > - (set_attr "prefix_extra" "0,1,*") > - (set_attr "prefix" "maybe_vex,maybe_evex,orig") > - (set_attr "mode" "TI,V4SF,V4SF") > + [(set_attr "isa" "avx2,*,sse2,avx,noavx") > + (set_attr "type" "ssemov,ssemov,sseshuf1,ssemov,sseshuf1") > + (set_attr "length_immediate" "0,0,1,0,1") > + (set_attr "prefix_extra" "1,1,0,1,*") > + (set_attr "prefix" "maybe_evex,evex,maybe_vex,maybe_evex,orig") > + (set_attr "mode" "TI,XI,TI,V4SF,V4SF") > (set (attr "preferred_for_speed") > - (cond [(eq_attr "alternative" "1") > + (cond [(eq_attr "alternative" "3") > (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC") > ] > - (symbol_ref "true")))]) > + (symbol_ref "true"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL > + && !TARGET_PREFER_AVX256") > + (const_string "*")))]) > > (define_insn "*vec_dupv2di" > [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,v,x") > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx2-dupv4sf.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx2 -O2" } */ > +/* { dg-final { scan-assembler-times "vbroadcastss" 2 } } */ > + > +typedef float __attribute__ ((vector_size (16))) v4sf; > + > +v4sf bcst_reg (float f) > +{ > + register float x asm ("xmm7") = f; > + > + asm ("" : "+v" (x)); > + return (v4sf) {x, x, x, x}; > +} > + > +v4sf bcst_mem (const float *f) > +{ > + return (v4sf) {*f, *f, *f, *f}; > +} > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx2-dupv4si.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx2 -O2" } */ > +/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */ > + > +typedef int __attribute__ ((vector_size (16))) v4si; > + > +v4si bcst_reg (int i) > +{ > + register int x asm ("xmm7") = i; > + > + asm ("" : "+v" (x)); > + return (v4si) {x, x, x, x}; > +} > + > +v4si bcst_mem (const int *i) > +{ > + return (v4si) {*i, *i, *i, *i}; > +} > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512f-dupv4sf.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" > +} */ > +/* { dg-final { scan-assembler "vbroadcastss\[^\n\]*%xmm17, *%zmm" } } > +*/ > + > +typedef float __attribute__ ((vector_size (16))) v4sf; > + > +v4sf bcst (float f) > +{ > + register float x asm ("xmm17") = f; > + > + asm ("" : "+v" (x)); > + return (v4sf) {x, x, x, x}; > +} > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512f-dupv4si.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" > +} */ > +/* { dg-final { scan-assembler "vpbroadcastd\[^\n\]*%xmm17, *%zmm" } } > +*/ > + > +typedef int __attribute__ ((vector_size (16))) v4si; > + > +v4si bcst (int i) > +{ > + register int x asm ("xmm17") = i; > + > + asm ("" : "+v" (x)); > + return (v4si) {x, x, x, x}; > +}
--- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -25969,41 +25969,64 @@ (const_int 1)))]) (define_insn "vec_dupv4sf" - [(set (match_operand:V4SF 0 "register_operand" "=v,v,x") + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v,x") (vec_duplicate:V4SF - (match_operand:SF 1 "nonimmediate_operand" "Yv,m,0")))] + (match_operand:SF 1 "nonimmediate_operand" "Yv,v,m,0")))] "TARGET_SSE" "@ - vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} + * return TARGET_AVX2 ? \"vbroadcastss\t{%1, %0|%0, %1}\" : \"vshufps\t{$0, %d1, %0|%0, %d1, 0}\"; + vbroadcastss\t{%1, %g0|%g0, %1} vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "avx,avx,noavx") - (set_attr "type" "sseshuf1,ssemov,sseshuf1") - (set_attr "length_immediate" "1,0,1") - (set_attr "prefix_extra" "0,1,*") - (set_attr "prefix" "maybe_evex,maybe_evex,orig") - (set_attr "mode" "V4SF")]) + [(set_attr "isa" "avx,*,avx,noavx") + (set (attr "type") + (cond [(and (eq_attr "alternative" "0") + (match_test "!TARGET_AVX2")) + (const_string "sseshuf1") + (eq_attr "alternative" "3") + (const_string "sseshuf1") + ] + (const_string "ssemov"))) + (set (attr "length_immediate") + (if_then_else (eq_attr "type" "sseshuf1") + (const_string "1") + (const_string "0"))) + (set_attr "prefix_extra" "0,1,1,*") + (set_attr "prefix" "maybe_evex,evex,maybe_evex,orig") + (set_attr "mode" "V4SF,V16SF,V4SF,V4SF") + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL + && !TARGET_PREFER_AVX256") + (const_string "*")))]) (define_insn "*vec_dupv4si" - [(set (match_operand:V4SI 0 "register_operand" "=v,v,x") + [(set (match_operand:V4SI 0 "register_operand" "=v,v,v,v,x") (vec_duplicate:V4SI - (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))] + (match_operand:SI 1 "nonimmediate_operand" "Yvm,v,Yv,m,0")))] "TARGET_SSE" "@ + vpbroadcastd\t{%1, %0|%0, %1} + vpbroadcastd\t{%1, %g0|%g0, %1} %vpshufd\t{$0, %1, %0|%0, %1, 0} vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "sse2,avx,noavx") - (set_attr "type" "sselog1,ssemov,sselog1") - (set_attr "length_immediate" "1,0,1") - (set_attr "prefix_extra" "0,1,*") - (set_attr "prefix" "maybe_vex,maybe_evex,orig") - (set_attr "mode" "TI,V4SF,V4SF") + [(set_attr "isa" "avx2,*,sse2,avx,noavx") + (set_attr "type" "ssemov,ssemov,sseshuf1,ssemov,sseshuf1") + (set_attr "length_immediate" "0,0,1,0,1") + (set_attr "prefix_extra" "1,1,0,1,*") + (set_attr "prefix" "maybe_evex,evex,maybe_vex,maybe_evex,orig") + (set_attr "mode" "TI,XI,TI,V4SF,V4SF") (set (attr "preferred_for_speed") - (cond [(eq_attr "alternative" "1") + (cond [(eq_attr "alternative" "3") (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC") ] - (symbol_ref "true")))]) + (symbol_ref "true"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL + && !TARGET_PREFER_AVX256") + (const_string "*")))]) (define_insn "*vec_dupv2di" [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,v,x") --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-dupv4sf.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx2 -O2" } */ +/* { dg-final { scan-assembler-times "vbroadcastss" 2 } } */ + +typedef float __attribute__ ((vector_size (16))) v4sf; + +v4sf bcst_reg (float f) +{ + register float x asm ("xmm7") = f; + + asm ("" : "+v" (x)); + return (v4sf) {x, x, x, x}; +} + +v4sf bcst_mem (const float *f) +{ + return (v4sf) {*f, *f, *f, *f}; +} --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-dupv4si.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx2 -O2" } */ +/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */ + +typedef int __attribute__ ((vector_size (16))) v4si; + +v4si bcst_reg (int i) +{ + register int x asm ("xmm7") = i; + + asm ("" : "+v" (x)); + return (v4si) {x, x, x, x}; +} + +v4si bcst_mem (const int *i) +{ + return (v4si) {*i, *i, *i, *i}; +} --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-dupv4sf.c @@ -0,0 +1,13 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ +/* { dg-final { scan-assembler "vbroadcastss\[^\n\]*%xmm17, *%zmm" } } */ + +typedef float __attribute__ ((vector_size (16))) v4sf; + +v4sf bcst (float f) +{ + register float x asm ("xmm17") = f; + + asm ("" : "+v" (x)); + return (v4sf) {x, x, x, x}; +} --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-dupv4si.c @@ -0,0 +1,13 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ +/* { dg-final { scan-assembler "vpbroadcastd\[^\n\]*%xmm17, *%zmm" } } */ + +typedef int __attribute__ ((vector_size (16))) v4si; + +v4si bcst (int i) +{ + register int x asm ("xmm17") = i; + + asm ("" : "+v" (x)); + return (v4si) {x, x, x, x}; +}