Message ID | 169ca252-3828-b466-4d47-a8fe720ec4ef@suse.com |
---|---|
State | New |
Headers | show |
Series | [v3] x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F | expand |
On Tue, Jun 20, 2023 at 3:07 PM Jan Beulich via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > There's no reason to constrain this to AVX512VL, unless instructed so by > -mprefer-vector-width=, as the wider operation is unusable for more > narrow operands only when the possible memory source is a non-broadcast > one. This way even the scalar copysign<mode>3 can benefit from the > operation being a single-insn one (leaving aside moves which the > compiler decides to insert for unclear reasons, and leaving aside the > fact that bcst_mem_operand() is too restrictive for broadcast to be > embedded right into VPTERNLOG*). > > While there also bring *<avx512>_vternlog<mode>_all's in sync with that > of the three splitters. > > Along with this also request value duplication in > ix86_expand_copysign()'s call to ix86_build_signbit_mask(), eliminating > excess space allocation in .rodata.*, filled with zeros which are never > read. > > gcc/ > > * config/i386/i386-expand.cc (ix86_expand_copysign): Request > value duplication by ix86_build_signbit_mask() when AVX512F and > not HFmode. > * config/i386/sse.md (*<avx512>_vternlog<mode>_all): Convert to > 2-alternative form. Adjust "mode" attribute. Add "enabled" > attribute. > (*<avx512>_vpternlog<mode>_1): Also permit when TARGET_AVX512F > && !TARGET_PREFER_AVX256. > (*<avx512>_vpternlog<mode>_2): Likewise. > (*<avx512>_vpternlog<mode>_3): Likewise. > > gcc/testsuite/ > * gcc.target/i386/avx512f-copysign.c: New test. > --- > I haven't been able to find documentation on the dejagnu(?) regex syntax > (?:...). With ordinary (...) failing (producing twice as many matches), > I could only derive this from other scan-assembler patterns. > > I guess the underlying pattern, going along the lines of what > <mask_codefor>one_cmpl<mode>2<mask_name> uses, can be applied elsewhere > as well. That should be guarded with !TARGET_PREFER_AVX256, let's handle that in a separate patch. 
> > HFmode could use embedded broadcast too for copysign and alike, but that > would need to be V2HF -> V8HF (for which I don't think there are any > existing patterns). > --- > v3: Adjust insn conditional as well. Add testcase. > v2: Respect -mprefer-vector-width=. > > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -2266,7 +2266,7 @@ ix86_expand_copysign (rtx operands[]) > else > dest = NULL_RTX; > op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode); > - mask = ix86_build_signbit_mask (vmode, 0, 0); > + mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0); > > if (CONST_DOUBLE_P (operands[1])) > { > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -12399,22 +12399,35 @@ > (set_attr "mode" "<sseinsnmode>")]) > > (define_insn "*<avx512>_vternlog<mode>_all" > - [(set (match_operand:V 0 "register_operand" "=v") > + [(set (match_operand:V 0 "register_operand" "=v,v") > (unspec:V > - [(match_operand:V 1 "register_operand" "0") > - (match_operand:V 2 "register_operand" "v") > - (match_operand:V 3 "bcst_vector_operand" "vmBr") > + [(match_operand:V 1 "register_operand" "0,0") > + (match_operand:V 2 "register_operand" "v,v") > + (match_operand:V 3 "bcst_vector_operand" "vBr,m") > (match_operand:SI 4 "const_0_to_255_operand")] > UNSPEC_VTERNLOG))] > - "TARGET_AVX512F > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > /* Disallow embeded broadcast for vector HFmode since > it's not real AVX512FP16 instruction. 
*/ > && (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) >= 4 > || GET_CODE (operands[3]) != VEC_DUPLICATE)" > - "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}" > +{ > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"; > + else > + return "vpternlog<ternlogsuffix>\t{%4, %g3, %g2, %g0|%g0, %g2, %g3, %4}"; > +} > [(set_attr "type" "sselog") > (set_attr "prefix" "evex") > - (set_attr "mode" "<sseinsnmode>")]) > + (set (attr "mode") > + (if_then_else (match_test "TARGET_AVX512VL") > + (const_string "<sseinsnmode>") > + (const_string "XI"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > + (const_string "*")))]) > > ;; There must be lots of other combinations like > ;; > @@ -12443,7 +12456,8 @@ > (any_logic2:V > (match_operand:V 3 "regmem_or_bitnot_regmem_operand") > (match_operand:V 4 "regmem_or_bitnot_regmem_operand"))))] > - "(<MODE_SIZE> == 64 || TARGET_AVX512VL) > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > && ix86_pre_reload_split () > && (rtx_equal_p (STRIP_UNARY (operands[1]), > STRIP_UNARY (operands[4])) > @@ -12527,7 +12541,8 @@ > (match_operand:V 2 "regmem_or_bitnot_regmem_operand")) > (match_operand:V 3 "regmem_or_bitnot_regmem_operand")) > (match_operand:V 4 "regmem_or_bitnot_regmem_operand")))] > - "(<MODE_SIZE> == 64 || TARGET_AVX512VL) > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > && ix86_pre_reload_split () > && (rtx_equal_p (STRIP_UNARY (operands[1]), > STRIP_UNARY (operands[4])) > @@ -12610,7 +12625,8 @@ > (match_operand:V 1 "regmem_or_bitnot_regmem_operand") > (match_operand:V 2 "regmem_or_bitnot_regmem_operand")) > (match_operand:V 3 "regmem_or_bitnot_regmem_operand")))] > - "(<MODE_SIZE> == 64 || TARGET_AVX512VL) > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > && 
ix86_pre_reload_split ()" > "#" > "&& 1" > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c > @@ -0,0 +1,32 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512f -mno-avx512vl -O2" } */ Please explicitly add -mprefer-vector-width=512: our tester also runs with unix{-m32 \-march=cascadelake,\ -march=cascadelake}, which sets -mprefer-vector-width=256; putting -mprefer-vector-width=512 in dg-options overrides that. Others LGTM. > +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$(?:216|228|0xd8|0xe4)," 5 } } */ > + > +double cs_df (double x, double y) > +{ > + return __builtin_copysign (x, y); > +} > + > +float cs_sf (float x, float y) > +{ > + return __builtin_copysignf (x, y); > +} > + > +typedef double __attribute__ ((vector_size (16))) v2df; > +typedef double __attribute__ ((vector_size (32))) v4df; > +typedef double __attribute__ ((vector_size (64))) v8df; > + > +v2df cs_v2df (v2df x, v2df y) > +{ > + return __builtin_ia32_copysignpd (x, y); > +} > + > +v4df cs_v4df (v4df x, v4df y) > +{ > + return __builtin_ia32_copysignpd256 (x, y); > +} > + > +v8df cs_v8df (v8df x, v8df y) > +{ > + return __builtin_ia32_copysignpd512 (x, y); > +}
On 20.06.2023 10:33, Hongtao Liu wrote: > On Tue, Jun 20, 2023 at 3:07 PM Jan Beulich via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: >> >> I guess the underlying pattern, going along the lines of what >> <mask_codefor>one_cmpl<mode>2<mask_name> uses, can be applied elsewhere >> as well. > That should be guarded with !TARGET_PREFER_AVX256, let's handle that > in a separate patch. Sure, and as indicated there are more places where similar things could be done. >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c >> @@ -0,0 +1,32 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-mavx512f -mno-avx512vl -O2" } */ > Please explicitly add -mprefer-vector-width=512, our tester will also > test unix{-m32 \-march=cascadelake,\ -march=cascadelake} which set the > - mprefer-vector-width=256, -mprefer-vector-width=512 in dg-options > can overwrite that. Oh, I see. Will do. And I expect I then also need to adjust the newly added avx512f-dupv2di.c from the earlier patch. I guess I could commit that option addition there as obvious? > Others LGTM. May I take this as "okay with that change", or should I submit v4? Jan
On Tue, Jun 20, 2023 at 5:03 PM Jan Beulich <jbeulich@suse.com> wrote: > > On 20.06.2023 10:33, Hongtao Liu wrote: > > On Tue, Jun 20, 2023 at 3:07 PM Jan Beulich via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > >> > >> I guess the underlying pattern, going along the lines of what > >> <mask_codefor>one_cmpl<mode>2<mask_name> uses, can be applied elsewhere > >> as well. > > That should be guarded with !TARGET_PREFER_AVX256, let's handle that > > in a separate patch. > > Sure, and as indicated there are more places where similar things could > be done. > > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c > >> @@ -0,0 +1,32 @@ > >> +/* { dg-do compile } */ > >> +/* { dg-options "-mavx512f -mno-avx512vl -O2" } */ > > Please explicitly add -mprefer-vector-width=512, our tester will also > > test unix{-m32 \-march=cascadelake,\ -march=cascadelake} which set the > > - mprefer-vector-width=256, -mprefer-vector-width=512 in dg-options > > can overwrite that. > > Oh, I see. Will do. And I expect I then also need to adjust the newly > added avx512f-dupv2di.c from the earlier patch. I guess I could commit > that option addition there as obvious? You still need to send that patch to the list, but you can then commit it as an obvious fix. > > > Others LGTM. > > May I take this as "okay with that change", or should I submit v4? Okay with that change; no need for a v4. > > Jan
On Tue, Jun 20, 2023 at 5:34 PM Hongtao Liu <crazylht@gmail.com> wrote: > > On Tue, Jun 20, 2023 at 5:03 PM Jan Beulich <jbeulich@suse.com> wrote: > > > > On 20.06.2023 10:33, Hongtao Liu wrote: > > > On Tue, Jun 20, 2023 at 3:07 PM Jan Beulich via Gcc-patches > > > <gcc-patches@gcc.gnu.org> wrote: > > >> > > >> I guess the underlying pattern, going along the lines of what > > >> <mask_codefor>one_cmpl<mode>2<mask_name> uses, can be applied elsewhere > > >> as well. > > > That should be guarded with !TARGET_PREFER_AVX256, let's handle that > > > in a separate patch. > > > > Sure, and as indicated there are more places where similar things could > > be done. > > > > >> --- /dev/null > > >> +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c > > >> @@ -0,0 +1,32 @@ > > >> +/* { dg-do compile } */ > > >> +/* { dg-options "-mavx512f -mno-avx512vl -O2" } */ > > > Please explicitly add -mprefer-vector-width=512, our tester will also > > > test unix{-m32 \-march=cascadelake,\ -march=cascadelake} which set the > > > - mprefer-vector-width=256, -mprefer-vector-width=512 in dg-options > > > can overwrite that. > > > > Oh, I see. Will do. And I expect I then also need to adjust the newly > > added avx512f-dupv2di.c from the earlier patch. I guess I could commit > > that option addition there as obvious? > Still need to send out the patch, and commit as an obvious fix. > > > > > Others LGTM. > > > > May I take this as "okay with that change", or should I submit v4? > Okay. no need for a v4 version. > > avx512f-copysign.c fails with -m32: the scalar copysign functions are compiled to x87 code there (see the assembly below), so no vpternlog is emitted for them. We need to add -mfpmath=sse to dg-options.
cs_df: .LFB0: .cfi_startproc fldl 12(%esp) fxam fnstsw %ax fstp %st(0) fldl 4(%esp) fabs testb $2, %ah je .L1 fchs .L1: ret .cfi_endproc .LFE0: .size cs_df, .-cs_df .p2align 4 .globl cs_sf .type cs_sf, @function cs_sf: .LFB1: .cfi_startproc flds 8(%esp) fxam fnstsw %ax fstp %st(0) flds 4(%esp) testb $2, %ah fabs fld %st(0) fchs fcmove %st(1), %st fstp %st(1) ret .cfi_endproc .LFE1: .size cs_sf, .-cs_sf .p2align 4 .globl cs_v2df .type cs_v2df, @function > > Jan > > > > -- > BR, > Hongtao
On 27.06.2023 07:11, Hongtao Liu wrote: > On Tue, Jun 20, 2023 at 5:34 PM Hongtao Liu <crazylht@gmail.com> wrote: >> >> On Tue, Jun 20, 2023 at 5:03 PM Jan Beulich <jbeulich@suse.com> wrote: >>> >>> On 20.06.2023 10:33, Hongtao Liu wrote: >>>> On Tue, Jun 20, 2023 at 3:07 PM Jan Beulich via Gcc-patches >>>> <gcc-patches@gcc.gnu.org> wrote: >>>>> >>>>> I guess the underlying pattern, going along the lines of what >>>>> <mask_codefor>one_cmpl<mode>2<mask_name> uses, can be applied elsewhere >>>>> as well. >>>> That should be guarded with !TARGET_PREFER_AVX256, let's handle that >>>> in a separate patch. >>> >>> Sure, and as indicated there are more places where similar things could >>> be done. >>> >>>>> --- /dev/null >>>>> +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c >>>>> @@ -0,0 +1,32 @@ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-mavx512f -mno-avx512vl -O2" } */ >>>> Please explicitly add -mprefer-vector-width=512, our tester will also >>>> test unix{-m32 \-march=cascadelake,\ -march=cascadelake} which set the >>>> - mprefer-vector-width=256, -mprefer-vector-width=512 in dg-options >>>> can overwrite that. >>> >>> Oh, I see. Will do. And I expect I then also need to adjust the newly >>> added avx512f-dupv2di.c from the earlier patch. I guess I could commit >>> that option addition there as obvious? >> Still need to send out the patch, and commit as an obvious fix. >>> >>>> Others LGTM. >>> >>> May I take this as "okay with that change", or should I submit v4? >> Okay. no need for a v4 version. >>> > avx512f-copysign.c failed for -m32, we need to add -mfpmath=sse to dg-options. Oh, of course. I will take care of this, but it may take me a couple of days, as I just came back from a week of vacation. One question though: Elsewhere such tests are simply suppressed for 32-bit. Personally I'd prefer going that route, but if you think adding -mfpmath=sse is indeed better, I'll follow your request. Jan
> -----Original Message----- > From: Jan Beulich <jbeulich@suse.com> > Sent: Tuesday, July 4, 2023 11:30 PM > To: Hongtao Liu <crazylht@gmail.com> > Cc: gcc-patches@gcc.gnu.org; Kirill Yukhin <kirill.yukhin@gmail.com>; Liu, > Hongtao <hongtao.liu@intel.com> > Subject: Re: [PATCH v3] x86: make VPTERNLOG* usable on less than 512-bit > operands with just AVX512F > > On 27.06.2023 07:11, Hongtao Liu wrote: > > On Tue, Jun 20, 2023 at 5:34 PM Hongtao Liu <crazylht@gmail.com> wrote: > >> > >> On Tue, Jun 20, 2023 at 5:03 PM Jan Beulich <jbeulich@suse.com> wrote: > >>> > >>> On 20.06.2023 10:33, Hongtao Liu wrote: > >>>> On Tue, Jun 20, 2023 at 3:07 PM Jan Beulich via Gcc-patches > >>>> <gcc-patches@gcc.gnu.org> wrote: > >>>>> > >>>>> I guess the underlying pattern, going along the lines of what > >>>>> <mask_codefor>one_cmpl<mode>2<mask_name> uses, can be > applied > >>>>> elsewhere as well. > >>>> That should be guarded with !TARGET_PREFER_AVX256, let's handle > >>>> that in a separate patch. > >>> > >>> Sure, and as indicated there are more places where similar things > >>> could be done. > >>> > >>>>> --- /dev/null > >>>>> +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c > >>>>> @@ -0,0 +1,32 @@ > >>>>> +/* { dg-do compile } */ > >>>>> +/* { dg-options "-mavx512f -mno-avx512vl -O2" } */ > >>>> Please explicitly add -mprefer-vector-width=512, our tester will > >>>> also test unix{-m32 \-march=cascadelake,\ -march=cascadelake} which > >>>> set the > >>>> - mprefer-vector-width=256, -mprefer-vector-width=512 in dg-options > >>>> can overwrite that. > >>> > >>> Oh, I see. Will do. And I expect I then also need to adjust the > >>> newly added avx512f-dupv2di.c from the earlier patch. I guess I > >>> could commit that option addition there as obvious? > >> Still need to send out the patch, and commit as an obvious fix. > >>> > >>>> Others LGTM. > >>> > >>> May I take this as "okay with that change", or should I submit v4? > >> Okay. no need for a v4 version. 
> >>> > > avx512f-copysign.c failed for -m32, we need to add -mfpmath=sse to dg- > options. > > Oh, of course. I will take care of this, but it may take me a couple of days, as I > just came back from a week of vacation. One question though: > Elsewhere such tests are simply suppressed for 32-bit. Personally I'd prefer > going that route, but if you think adding -mfpmath=sse is indeed better, I'll > follow your request. Either is ok. > > Jan
--- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -2266,7 +2266,7 @@ ix86_expand_copysign (rtx operands[]) else dest = NULL_RTX; op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode); - mask = ix86_build_signbit_mask (vmode, 0, 0); + mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0); if (CONST_DOUBLE_P (operands[1])) { --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -12399,22 +12399,35 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "*<avx512>_vternlog<mode>_all" - [(set (match_operand:V 0 "register_operand" "=v") + [(set (match_operand:V 0 "register_operand" "=v,v") (unspec:V - [(match_operand:V 1 "register_operand" "0") - (match_operand:V 2 "register_operand" "v") - (match_operand:V 3 "bcst_vector_operand" "vmBr") + [(match_operand:V 1 "register_operand" "0,0") + (match_operand:V 2 "register_operand" "v,v") + (match_operand:V 3 "bcst_vector_operand" "vBr,m") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG))] - "TARGET_AVX512F + "(<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) /* Disallow embeded broadcast for vector HFmode since it's not real AVX512FP16 instruction. 
*/ && (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) >= 4 || GET_CODE (operands[3]) != VEC_DUPLICATE)" - "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}" +{ + if (TARGET_AVX512VL) + return "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"; + else + return "vpternlog<ternlogsuffix>\t{%4, %g3, %g2, %g0|%g0, %g2, %g3, %4}"; +} [(set_attr "type" "sselog") (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) + (set (attr "mode") + (if_then_else (match_test "TARGET_AVX512VL") + (const_string "<sseinsnmode>") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") + (const_string "*")))]) ;; There must be lots of other combinations like ;; @@ -12443,7 +12456,8 @@ (any_logic2:V (match_operand:V 3 "regmem_or_bitnot_regmem_operand") (match_operand:V 4 "regmem_or_bitnot_regmem_operand"))))] - "(<MODE_SIZE> == 64 || TARGET_AVX512VL) + "(<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && ix86_pre_reload_split () && (rtx_equal_p (STRIP_UNARY (operands[1]), STRIP_UNARY (operands[4])) @@ -12527,7 +12541,8 @@ (match_operand:V 2 "regmem_or_bitnot_regmem_operand")) (match_operand:V 3 "regmem_or_bitnot_regmem_operand")) (match_operand:V 4 "regmem_or_bitnot_regmem_operand")))] - "(<MODE_SIZE> == 64 || TARGET_AVX512VL) + "(<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && ix86_pre_reload_split () && (rtx_equal_p (STRIP_UNARY (operands[1]), STRIP_UNARY (operands[4])) @@ -12610,7 +12625,8 @@ (match_operand:V 1 "regmem_or_bitnot_regmem_operand") (match_operand:V 2 "regmem_or_bitnot_regmem_operand")) (match_operand:V 3 "regmem_or_bitnot_regmem_operand")))] - "(<MODE_SIZE> == 64 || TARGET_AVX512VL) + "(<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && ix86_pre_reload_split ()" "#" "&& 1" --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c @@ 
-0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -mno-avx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$(?:216|228|0xd8|0xe4)," 5 } } */ + +double cs_df (double x, double y) +{ + return __builtin_copysign (x, y); +} + +float cs_sf (float x, float y) +{ + return __builtin_copysignf (x, y); +} + +typedef double __attribute__ ((vector_size (16))) v2df; +typedef double __attribute__ ((vector_size (32))) v4df; +typedef double __attribute__ ((vector_size (64))) v8df; + +v2df cs_v2df (v2df x, v2df y) +{ + return __builtin_ia32_copysignpd (x, y); +} + +v4df cs_v4df (v4df x, v4df y) +{ + return __builtin_ia32_copysignpd256 (x, y); +} + +v8df cs_v8df (v8df x, v8df y) +{ + return __builtin_ia32_copysignpd512 (x, y); +}