Message ID | c643ed57-cdba-5487-1781-47904dbe6208@suse.com |
---|---|
State | New |
Headers | show |
Series | [v2] x86: make better use of VBROADCASTSS / VPBROADCASTD | expand |
On Wed, Jun 21, 2023 at 2:06 PM Jan Beulich via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > ... in vec_dupv4sf / *vec_dupv4si. The respective broadcast insns are > never longer (yet sometimes shorter) than the corresponding VSHUFPS / > VPSHUFD, due to the immediate operand of the shuffle insns balancing the > possible need for VEX3 in the broadcast ones. When EVEX encoding is > required the broadcast insns are always shorter. > > Add new alternatives to cover the AVX2 and AVX512 cases as appropriate. > > gcc/ > > * config/i386/sse.md (vec_dupv4sf): Make first alternative use > vbroadcastss for AVX2. New AVX512F alternative. > (*vec_dupv4si): New AVX2 and AVX512F alternatives using > vpbroadcastd. > --- > Especially with the added "enabled" attribute I didn't really see how to > (further) fold alternatives 0 and 1. Instead *vec_dupv4si might benefit > from using sse2_noavx2 instead of sse2 for alternative 2, except that > there is no sse2_noavx2, only sse2_noavx. > > Is there a reason why vec_dupv4sf uses sseshuf1 for its shuffle > alternatives, but *vec_dupv4si uses sselog1? I'd be happy to correct > this in whichever is the appropriate direction, while touching this > anyway. It should be sseshuf1(or sseshuf depending on input operands number in the pattern) for shufps, sselog means logical instructions. I think it comes from Intel SDM Vlolumes1: 5.6.1 Intel® SSE2 Packed and Scalar Double Precision Floating-Point Instructions there're descriptions for different kinds of instructions. > > I'm working from the assumption that the isa attributes to the original > 1st and 2nd alternatives don't need further restricting (to sse2_noavx2 > or avx_noavx2 as applicable), as the new earlier alternatives cover all > operand forms already when at least AVX2 is enabled. > > Isn't prefix_extra use bogus here? What extra prefix does vbroadcastss According to comments, yes, no extra prefix is needed. ;; There are also additional prefixes in 3DNOW, SSSE3. ;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte, ;; sseiadd1,ssecvt1 to 0f7a with no DREX byte. ;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a. > use? (Same further down in *vec_dupv4si and avx2_vbroadcasti128_<mode> > and elsewhere.) > --- > v2: Correct operand constraints. Respect -mprefer-vector-width=. Fold > two alternatives of vec_dupv4sf. > > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -26141,41 +26141,64 @@ > (const_int 1)))]) > > (define_insn "vec_dupv4sf" > - [(set (match_operand:V4SF 0 "register_operand" "=v,v,x") > + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v,x") > (vec_duplicate:V4SF > - (match_operand:SF 1 "nonimmediate_operand" "Yv,m,0")))] > + (match_operand:SF 1 "nonimmediate_operand" "Yv,v,m,0")))] > "TARGET_SSE" > "@ > - vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} > + * return TARGET_AVX2 ? \"vbroadcastss\t{%1, %0|%0, %1}\" : \"vshufps\t{$0, %d1, %0|%0, %d1, 0}\"; > + vbroadcastss\t{%1, %g0|%g0, %1} > vbroadcastss\t{%1, %0|%0, %1} > shufps\t{$0, %0, %0|%0, %0, 0}" > - [(set_attr "isa" "avx,avx,noavx") > - (set_attr "type" "sseshuf1,ssemov,sseshuf1") > - (set_attr "length_immediate" "1,0,1") > - (set_attr "prefix_extra" "0,1,*") > - (set_attr "prefix" "maybe_evex,maybe_evex,orig") > - (set_attr "mode" "V4SF")]) > + [(set_attr "isa" "avx,*,avx,noavx") > + (set (attr "type") > + (cond [(and (eq_attr "alternative" "0") > + (match_test "!TARGET_AVX2")) > + (const_string "sseshuf1") > + (eq_attr "alternative" "3") > + (const_string "sseshuf1") > + ] > + (const_string "ssemov"))) > + (set (attr "length_immediate") > + (if_then_else (eq_attr "type" "sseshuf1") > + (const_string "1") > + (const_string "0"))) > + (set_attr "prefix_extra" "0,0,1,*") > + (set_attr "prefix" "maybe_evex,evex,maybe_evex,orig") > + (set_attr "mode" "V4SF,V16SF,V4SF,V4SF") > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL > + && !TARGET_PREFER_AVX256") > + (const_string "*")))]) > > (define_insn "*vec_dupv4si" > - [(set (match_operand:V4SI 0 "register_operand" "=v,v,x") > + [(set (match_operand:V4SI 0 "register_operand" "=v,v,v,v,x") > (vec_duplicate:V4SI > - (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))] > + (match_operand:SI 1 "nonimmediate_operand" "Yvm,v,Yv,m,0")))] > "TARGET_SSE" > "@ > + vpbroadcastd\t{%1, %0|%0, %1} > + vpbroadcastd\t{%1, %g0|%g0, %1} > %vpshufd\t{$0, %1, %0|%0, %1, 0} > vbroadcastss\t{%1, %0|%0, %1} > shufps\t{$0, %0, %0|%0, %0, 0}" > - [(set_attr "isa" "sse2,avx,noavx") > - (set_attr "type" "sselog1,ssemov,sselog1") > - (set_attr "length_immediate" "1,0,1") > - (set_attr "prefix_extra" "0,1,*") > - (set_attr "prefix" "maybe_vex,maybe_evex,orig") > - (set_attr "mode" "TI,V4SF,V4SF") > + [(set_attr "isa" "avx2,*,sse2,avx,noavx") > + (set_attr "type" "ssemov,ssemov,sselog1,ssemov,sselog1") > + (set_attr "length_immediate" "0,0,1,0,1") > + (set_attr "prefix_extra" "0,0,0,1,*") > + (set_attr "prefix" "maybe_evex,evex,maybe_vex,maybe_evex,orig") > + (set_attr "mode" "TI,XI,TI,V4SF,V4SF") > (set (attr "preferred_for_speed") > - (cond [(eq_attr "alternative" "1") > + (cond [(eq_attr "alternative" "3") > (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC") > ] > - (symbol_ref "true")))]) > + (symbol_ref "true"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL > + && !TARGET_PREFER_AVX256") > + (const_string "*")))]) > > (define_insn "*vec_dupv2di" > [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,v,x") Could you write a testcase for the newly added constraint? You can use an inline assembly to explicitly use an evex xmm register. .i.e. typedef float v4sf __attribute__((vector_size(16))); typedef int v4si __attribute__((vector_size(16))); v4sf foo (float a) { register float c __asm("xmm16") = a; asm volatile ("": "+v"(c)); return __extension__(v4sf) {c, c, c, c}; } v4si foo1 (int a) { register int c __asm("xmm16") = a; asm volatile ("": "+v"(c)); return __extension__(v4si) {c, c, c, c}; } with your patch, the above testcase should generate vbroadcastss/vpbroadcastd %xmm16, %zmm0 with -mavx512f -mno-avx512vl -mprefer-vector-width=512. Refer to gcc/testsuite/gcc.target/i386/avx512bw-pack-2.c Otherwise LGTM. -- BR, Hongtao
On 21.06.2023 09:37, Hongtao Liu wrote: > On Wed, Jun 21, 2023 at 2:06 PM Jan Beulich via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: >> >> Is there a reason why vec_dupv4sf uses sseshuf1 for its shuffle >> alternatives, but *vec_dupv4si uses sselog1? I'd be happy to correct >> this in whichever is the appropriate direction, while touching this >> anyway. > It should be sseshuf1(or sseshuf depending on input operands number in > the pattern) for shufps, sselog means logical instructions. Would you be okay for me to fold in that adjustment, or do you insist on a separate patch? >> I'm working from the assumption that the isa attributes to the original >> 1st and 2nd alternatives don't need further restricting (to sse2_noavx2 >> or avx_noavx2 as applicable), as the new earlier alternatives cover all >> operand forms already when at least AVX2 is enabled. >> >> Isn't prefix_extra use bogus here? What extra prefix does vbroadcastss > According to comments, yes, no extra prefix is needed. > > ;; There are also additional prefixes in 3DNOW, SSSE3. > ;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte, > ;; sseiadd1,ssecvt1 to 0f7a with no DREX byte. > ;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a. Right, that's what triggered my question. I guess dropping these "prefix_extra" really wants to be a separate patch (or maybe even multiple, but it's hard to see how to split), dealing with all of the instances which likely have accumulated simply via copy-and-paste. >> --- a/gcc/config/i386/sse.md >> +++ b/gcc/config/i386/sse.md >> @@ -26141,41 +26141,64 @@ >> (const_int 1)))]) >> >> (define_insn "vec_dupv4sf" >> - [(set (match_operand:V4SF 0 "register_operand" "=v,v,x") >> + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v,x") >> (vec_duplicate:V4SF >> - (match_operand:SF 1 "nonimmediate_operand" "Yv,m,0")))] >> + (match_operand:SF 1 "nonimmediate_operand" "Yv,v,m,0")))] >> "TARGET_SSE" >> "@ >> - vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} >> + * return TARGET_AVX2 ? \"vbroadcastss\t{%1, %0|%0, %1}\" : \"vshufps\t{$0, %d1, %0|%0, %d1, 0}\"; >> + vbroadcastss\t{%1, %g0|%g0, %1} >> vbroadcastss\t{%1, %0|%0, %1} >> shufps\t{$0, %0, %0|%0, %0, 0}" >> - [(set_attr "isa" "avx,avx,noavx") >> - (set_attr "type" "sseshuf1,ssemov,sseshuf1") >> - (set_attr "length_immediate" "1,0,1") >> - (set_attr "prefix_extra" "0,1,*") >> - (set_attr "prefix" "maybe_evex,maybe_evex,orig") >> - (set_attr "mode" "V4SF")]) >> + [(set_attr "isa" "avx,*,avx,noavx") >> + (set (attr "type") >> + (cond [(and (eq_attr "alternative" "0") >> + (match_test "!TARGET_AVX2")) >> + (const_string "sseshuf1") >> + (eq_attr "alternative" "3") >> + (const_string "sseshuf1") >> + ] >> + (const_string "ssemov"))) >> + (set (attr "length_immediate") >> + (if_then_else (eq_attr "type" "sseshuf1") >> + (const_string "1") >> + (const_string "0"))) >> + (set_attr "prefix_extra" "0,0,1,*") >> + (set_attr "prefix" "maybe_evex,evex,maybe_evex,orig") >> + (set_attr "mode" "V4SF,V16SF,V4SF,V4SF") >> + (set (attr "enabled") >> + (if_then_else (eq_attr "alternative" "1") >> + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL >> + && !TARGET_PREFER_AVX256") >> + (const_string "*")))]) >> >> (define_insn "*vec_dupv4si" >> - [(set (match_operand:V4SI 0 "register_operand" "=v,v,x") >> + [(set (match_operand:V4SI 0 "register_operand" "=v,v,v,v,x") >> (vec_duplicate:V4SI >> - (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))] >> + (match_operand:SI 1 "nonimmediate_operand" "Yvm,v,Yv,m,0")))] >> "TARGET_SSE" >> "@ >> + vpbroadcastd\t{%1, %0|%0, %1} >> + vpbroadcastd\t{%1, %g0|%g0, %1} >> %vpshufd\t{$0, %1, %0|%0, %1, 0} >> vbroadcastss\t{%1, %0|%0, %1} >> shufps\t{$0, %0, %0|%0, %0, 0}" >> - [(set_attr "isa" "sse2,avx,noavx") >> - (set_attr "type" "sselog1,ssemov,sselog1") >> - (set_attr "length_immediate" "1,0,1") >> - (set_attr "prefix_extra" "0,1,*") >> - (set_attr "prefix" "maybe_vex,maybe_evex,orig") >> - (set_attr "mode" "TI,V4SF,V4SF") >> + [(set_attr "isa" "avx2,*,sse2,avx,noavx") >> + (set_attr "type" "ssemov,ssemov,sselog1,ssemov,sselog1") >> + (set_attr "length_immediate" "0,0,1,0,1") >> + (set_attr "prefix_extra" "0,0,0,1,*") >> + (set_attr "prefix" "maybe_evex,evex,maybe_vex,maybe_evex,orig") >> + (set_attr "mode" "TI,XI,TI,V4SF,V4SF") >> (set (attr "preferred_for_speed") >> - (cond [(eq_attr "alternative" "1") >> + (cond [(eq_attr "alternative" "3") >> (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC") >> ] >> - (symbol_ref "true")))]) >> + (symbol_ref "true"))) >> + (set (attr "enabled") >> + (if_then_else (eq_attr "alternative" "1") >> + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL >> + && !TARGET_PREFER_AVX256") >> + (const_string "*")))]) >> >> (define_insn "*vec_dupv2di" >> [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,v,x") > Could you write a testcase for the newly added constraint? > > You can use an inline assembly to explicitly use an evex xmm register. > .i.e. > > typedef float v4sf __attribute__((vector_size(16))); > typedef int v4si __attribute__((vector_size(16))); > v4sf > foo (float a) > { > register float c __asm("xmm16") = a; > asm volatile ("": "+v"(c)); > return __extension__(v4sf) {c, c, c, c}; > } > > > v4si > foo1 (int a) > { > register int c __asm("xmm16") = a; > asm volatile ("": "+v"(c)); > return __extension__(v4si) {c, c, c, c}; > } > > with your patch, the above testcase should generate > vbroadcastss/vpbroadcastd %xmm16, %zmm0 with -mavx512f -mno-avx512vl > -mprefer-vector-width=512. > > Refer to gcc/testsuite/gcc.target/i386/avx512bw-pack-2.c I can certainly do that. Not having had a need to adjust any existing tests made me conclude that the earlier alternatives also don't (all) have a corresponding testcase. Jan
On 21.06.2023 09:44, Jan Beulich wrote: > On 21.06.2023 09:37, Hongtao Liu wrote: >> On Wed, Jun 21, 2023 at 2:06 PM Jan Beulich via Gcc-patches >> <gcc-patches@gcc.gnu.org> wrote: >>> >>> Isn't prefix_extra use bogus here? What extra prefix does vbroadcastss >> According to comments, yes, no extra prefix is needed. >> >> ;; There are also additional prefixes in 3DNOW, SSSE3. >> ;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte, >> ;; sseiadd1,ssecvt1 to 0f7a with no DREX byte. >> ;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a. > > Right, that's what triggered my question. I guess dropping these > "prefix_extra" really wants to be a separate patch (or maybe even > multiple, but it's hard to see how to split), dealing with all of the > instances which likely have accumulated simply via copy-and-paste. Or wait - I'm altering those lines anyway, so I could as well drop them right away (and slightly shrink patch size), if that's okay with you. Of course I should then not forget to also mention this in the changelog entry. Jan
> -----Original Message----- > From: Jan Beulich <jbeulich@suse.com> > Sent: Wednesday, June 21, 2023 8:40 PM > To: Hongtao Liu <crazylht@gmail.com> > Cc: gcc-patches@gcc.gnu.org; Kirill Yukhin <kirill.yukhin@gmail.com>; Liu, > Hongtao <hongtao.liu@intel.com> > Subject: Re: [PATCH v2] x86: make better use of VBROADCASTSS / > VPBROADCASTD > > On 21.06.2023 09:44, Jan Beulich wrote: > > On 21.06.2023 09:37, Hongtao Liu wrote: > >> On Wed, Jun 21, 2023 at 2:06 PM Jan Beulich via Gcc-patches > >> <gcc-patches@gcc.gnu.org> wrote: > >>> > >>> Isn't prefix_extra use bogus here? What extra prefix does > >>> vbroadcastss > >> According to comments, yes, no extra prefix is needed. > >> > >> ;; There are also additional prefixes in 3DNOW, SSSE3. > >> ;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte, ;; > >> sseiadd1,ssecvt1 to 0f7a with no DREX byte. > >> ;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a. > > > > Right, that's what triggered my question. I guess dropping these > > "prefix_extra" really wants to be a separate patch (or maybe even > > multiple, but it's hard to see how to split), dealing with all of the > > instances which likely have accumulated simply via copy-and-paste. > > Or wait - I'm altering those lines anyway, so I could as well drop them right > away (and slightly shrink patch size), if that's okay with you. Of course I > should then not forget to also mention this in the changelog entry. > Yes. > Jan
On Sun, Jun 25, 2023 at 9:17 AM Liu, Hongtao <hongtao.liu@intel.com> wrote: > > > > > -----Original Message----- > > From: Jan Beulich <jbeulich@suse.com> > > Sent: Wednesday, June 21, 2023 8:40 PM > > To: Hongtao Liu <crazylht@gmail.com> > > Cc: gcc-patches@gcc.gnu.org; Kirill Yukhin <kirill.yukhin@gmail.com>; Liu, > > Hongtao <hongtao.liu@intel.com> > > Subject: Re: [PATCH v2] x86: make better use of VBROADCASTSS / > > VPBROADCASTD > > > > On 21.06.2023 09:44, Jan Beulich wrote: > > > On 21.06.2023 09:37, Hongtao Liu wrote: > > >> On Wed, Jun 21, 2023 at 2:06 PM Jan Beulich via Gcc-patches > > >> <gcc-patches@gcc.gnu.org> wrote: > > >>> > > >>> Isn't prefix_extra use bogus here? What extra prefix does > > >>> vbroadcastss > > >> According to comments, yes, no extra prefix is needed. > > >> > > >> ;; There are also additional prefixes in 3DNOW, SSSE3. > > >> ;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte, ;; > > >> sseiadd1,ssecvt1 to 0f7a with no DREX byte. > > >> ;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a. > > > > > > Right, that's what triggered my question. I guess dropping these > > > "prefix_extra" really wants to be a separate patch (or maybe even > > > multiple, but it's hard to see how to split), dealing with all of the > > > instances which likely have accumulated simply via copy-and-paste. > > > > Or wait - I'm altering those lines anyway, so I could as well drop them right > > away (and slightly shrink patch size), if that's okay with you. Of course I > > should then not forget to also mention this in the changelog entry. > > > Yes. >Would you be okay for me to fold in that adjustment, or do you >insist on a separate patch? Also for this, no need for a separate patch. > > Jan
--- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -26141,41 +26141,64 @@ (const_int 1)))]) (define_insn "vec_dupv4sf" - [(set (match_operand:V4SF 0 "register_operand" "=v,v,x") + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v,x") (vec_duplicate:V4SF - (match_operand:SF 1 "nonimmediate_operand" "Yv,m,0")))] + (match_operand:SF 1 "nonimmediate_operand" "Yv,v,m,0")))] "TARGET_SSE" "@ - vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} + * return TARGET_AVX2 ? \"vbroadcastss\t{%1, %0|%0, %1}\" : \"vshufps\t{$0, %d1, %0|%0, %d1, 0}\"; + vbroadcastss\t{%1, %g0|%g0, %1} vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "avx,avx,noavx") - (set_attr "type" "sseshuf1,ssemov,sseshuf1") - (set_attr "length_immediate" "1,0,1") - (set_attr "prefix_extra" "0,1,*") - (set_attr "prefix" "maybe_evex,maybe_evex,orig") - (set_attr "mode" "V4SF")]) + [(set_attr "isa" "avx,*,avx,noavx") + (set (attr "type") + (cond [(and (eq_attr "alternative" "0") + (match_test "!TARGET_AVX2")) + (const_string "sseshuf1") + (eq_attr "alternative" "3") + (const_string "sseshuf1") + ] + (const_string "ssemov"))) + (set (attr "length_immediate") + (if_then_else (eq_attr "type" "sseshuf1") + (const_string "1") + (const_string "0"))) + (set_attr "prefix_extra" "0,0,1,*") + (set_attr "prefix" "maybe_evex,evex,maybe_evex,orig") + (set_attr "mode" "V4SF,V16SF,V4SF,V4SF") + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL + && !TARGET_PREFER_AVX256") + (const_string "*")))]) (define_insn "*vec_dupv4si" - [(set (match_operand:V4SI 0 "register_operand" "=v,v,x") + [(set (match_operand:V4SI 0 "register_operand" "=v,v,v,v,x") (vec_duplicate:V4SI - (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))] + (match_operand:SI 1 "nonimmediate_operand" "Yvm,v,Yv,m,0")))] "TARGET_SSE" "@ + vpbroadcastd\t{%1, %0|%0, %1} + vpbroadcastd\t{%1, %g0|%g0, %1} %vpshufd\t{$0, %1, %0|%0, %1, 0} vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "sse2,avx,noavx") - (set_attr "type" "sselog1,ssemov,sselog1") - (set_attr "length_immediate" "1,0,1") - (set_attr "prefix_extra" "0,1,*") - (set_attr "prefix" "maybe_vex,maybe_evex,orig") - (set_attr "mode" "TI,V4SF,V4SF") + [(set_attr "isa" "avx2,*,sse2,avx,noavx") + (set_attr "type" "ssemov,ssemov,sselog1,ssemov,sselog1") + (set_attr "length_immediate" "0,0,1,0,1") + (set_attr "prefix_extra" "0,0,0,1,*") + (set_attr "prefix" "maybe_evex,evex,maybe_vex,maybe_evex,orig") + (set_attr "mode" "TI,XI,TI,V4SF,V4SF") (set (attr "preferred_for_speed") - (cond [(eq_attr "alternative" "1") + (cond [(eq_attr "alternative" "3") (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC") ] - (symbol_ref "true")))]) + (symbol_ref "true"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL + && !TARGET_PREFER_AVX256") + (const_string "*")))]) (define_insn "*vec_dupv2di" [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,v,x")