Message ID | b2607ae7-045a-d1bc-2cc8-d2f114677cb6@suse.com |
---|---|
State | New |
Headers | show |
Series | x86: make better use of VPTERNLOG{D,Q} | expand |
On Wed, Jun 21, 2023 at 2:28 PM Jan Beulich via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > With respective two-operand bitwise operations now expressable by a > single VPTERNLOG, add splitters to also deal with ior and xor > counterparts of the original and-only case. Note that the splitters need > to be separate, as the placement of "not" differs in the final insns > (*iornot<mode>3, *xnor<mode>3) which are intended to pick up one half of > the result. > > gcc/ > > * config/i386/sse.md: New splitters to simplify > not;vec_duplicate;{ior,xor} as vec_duplicate;{iornot,xnor}. > > gcc/testsuite/ > > * gcc.target/i386/pr100711-4.c: New test. > * gcc.target/i386/pr100711-5.c: New test. > > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -17366,6 +17366,36 @@ > (match_dup 2)))] > "operands[3] = gen_reg_rtx (<MODE>mode);") > > +(define_split > + [(set (match_operand:VI 0 "register_operand") > + (ior:VI > + (vec_duplicate:VI > + (not:<ssescalarmode> > + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))) > + (match_operand:VI 2 "vector_operand")))] > + "<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" > + [(set (match_dup 3) > + (vec_duplicate:VI (match_dup 1))) > + (set (match_dup 0) > + (ior:VI (not:VI (match_dup 3)) (match_dup 2)))] > + "operands[3] = gen_reg_rtx (<MODE>mode);") > + > +(define_split > + [(set (match_operand:VI 0 "register_operand") > + (xor:VI > + (vec_duplicate:VI > + (not:<ssescalarmode> > + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))) > + (match_operand:VI 2 "vector_operand")))] > + "<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" > + [(set (match_dup 3) > + (vec_duplicate:VI (match_dup 1))) > + (set (match_dup 0) > + (not:VI (xor:VI (match_dup 3) (match_dup 2))))] > + "operands[3] = gen_reg_rtx (<MODE>mode);") > + Can we merge this splitter(xor:not) into ior:not one with a code iterator for xor,ior, They look the same except for 
the xor/ior. No need to merge it into and:not case which have different guard conditions. Others LGTM. > (define_insn "*andnot<mode>3_mask" > [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") > (vec_merge:VI48_AVX512VL > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr100711-4.c > @@ -0,0 +1,42 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */ > + > +typedef char v64qi __attribute__ ((vector_size (64))); > +typedef short v32hi __attribute__ ((vector_size (64))); > +typedef int v16si __attribute__ ((vector_size (64))); > +typedef long long v8di __attribute__((vector_size (64))); > + > +v64qi foo_v64qi (char a, v64qi b) > +{ > + return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; > +} > + > +v32hi foo_v32hi (short a, v32hi b) > +{ > + return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; > +} > + > +v16si foo_v16si (int a, v16si b) > +{ > + return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; > +} > + > +v8di foo_v8di (long long a, v8di b) > +{ > + return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; > +} > + > +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 4 { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xdd" 2 { target { ia32 } } } } */ > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr100711-5.c > @@ -0,0 +1,40 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */ > + > +typedef char v64qi __attribute__ ((vector_size (64))); > +typedef short v32hi __attribute__ ((vector_size (64))); > +typedef int v16si __attribute__ ((vector_size (64))); > +typedef long long v8di __attribute__((vector_size (64))); > + > +v64qi foo_v64qi (char a, v64qi b) > +{ > + return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; > +} > + > +v32hi foo_v32hi (short a, v32hi b) > +{ > + return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; > +} > + > +v16si foo_v16si (int a, v16si b) > +{ > + return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, > + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; > +} > + > +v8di foo_v8di (long long a, v8di b) > +{ > + return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; > +} > + > +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0x99" 4 } } */ >
On 25.06.2023 07:06, Hongtao Liu wrote: > On Wed, Jun 21, 2023 at 2:28 PM Jan Beulich via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: >> >> With respective two-operand bitwise operations now expressable by a >> single VPTERNLOG, add splitters to also deal with ior and xor >> counterparts of the original and-only case. Note that the splitters need >> to be separate, as the placement of "not" differs in the final insns >> (*iornot<mode>3, *xnor<mode>3) which are intended to pick up one half of >> the result. >> >> gcc/ >> >> * config/i386/sse.md: New splitters to simplify >> not;vec_duplicate;{ior,xor} as vec_duplicate;{iornot,xnor}. >> >> gcc/testsuite/ >> >> * gcc.target/i386/pr100711-4.c: New test. >> * gcc.target/i386/pr100711-5.c: New test. >> >> --- a/gcc/config/i386/sse.md >> +++ b/gcc/config/i386/sse.md >> @@ -17366,6 +17366,36 @@ >> (match_dup 2)))] >> "operands[3] = gen_reg_rtx (<MODE>mode);") >> >> +(define_split >> + [(set (match_operand:VI 0 "register_operand") >> + (ior:VI >> + (vec_duplicate:VI >> + (not:<ssescalarmode> >> + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))) >> + (match_operand:VI 2 "vector_operand")))] >> + "<MODE_SIZE> == 64 || TARGET_AVX512VL >> + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" >> + [(set (match_dup 3) >> + (vec_duplicate:VI (match_dup 1))) >> + (set (match_dup 0) >> + (ior:VI (not:VI (match_dup 3)) (match_dup 2)))] >> + "operands[3] = gen_reg_rtx (<MODE>mode);") >> + >> +(define_split >> + [(set (match_operand:VI 0 "register_operand") >> + (xor:VI >> + (vec_duplicate:VI >> + (not:<ssescalarmode> >> + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))) >> + (match_operand:VI 2 "vector_operand")))] >> + "<MODE_SIZE> == 64 || TARGET_AVX512VL >> + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" >> + [(set (match_dup 3) >> + (vec_duplicate:VI (match_dup 1))) >> + (set (match_dup 0) >> + (not:VI (xor:VI (match_dup 3) (match_dup 2))))] >> + "operands[3] = gen_reg_rtx (<MODE>mode);") >> + > Can we merge this 
splitter(xor:not) into ior:not one with a code > iterator for xor,ior? They look the same except for the xor/ior. They're only almost the same: Note (ior (not )) vs (not (xor )) as the result of the splitting. The difference is necessary to fit with what patch 1 introduces (which in turn is the way it is to fit with what generic code transforms things to up front). (I had it the way you suggest initially, until I figured out why one of the two would end up never being used.) Jan
On Sun, Jun 25, 2023 at 2:16 PM Jan Beulich <jbeulich@suse.com> wrote: > > On 25.06.2023 07:06, Hongtao Liu wrote: > > On Wed, Jun 21, 2023 at 2:28 PM Jan Beulich via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > >> > >> With respective two-operand bitwise operations now expressable by a > >> single VPTERNLOG, add splitters to also deal with ior and xor > >> counterparts of the original and-only case. Note that the splitters need > >> to be separate, as the placement of "not" differs in the final insns > >> (*iornot<mode>3, *xnor<mode>3) which are intended to pick up one half of > >> the result. > >> > >> gcc/ > >> > >> * config/i386/sse.md: New splitters to simplify > >> not;vec_duplicate;{ior,xor} as vec_duplicate;{iornot,xnor}. > >> > >> gcc/testsuite/ > >> > >> * gcc.target/i386/pr100711-4.c: New test. > >> * gcc.target/i386/pr100711-5.c: New test. > >> > >> --- a/gcc/config/i386/sse.md > >> +++ b/gcc/config/i386/sse.md > >> @@ -17366,6 +17366,36 @@ > >> (match_dup 2)))] > >> "operands[3] = gen_reg_rtx (<MODE>mode);") > >> > >> +(define_split > >> + [(set (match_operand:VI 0 "register_operand") > >> + (ior:VI > >> + (vec_duplicate:VI > >> + (not:<ssescalarmode> > >> + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))) > >> + (match_operand:VI 2 "vector_operand")))] > >> + "<MODE_SIZE> == 64 || TARGET_AVX512VL > >> + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" > >> + [(set (match_dup 3) > >> + (vec_duplicate:VI (match_dup 1))) > >> + (set (match_dup 0) > >> + (ior:VI (not:VI (match_dup 3)) (match_dup 2)))] > >> + "operands[3] = gen_reg_rtx (<MODE>mode);") > >> + > >> +(define_split > >> + [(set (match_operand:VI 0 "register_operand") > >> + (xor:VI > >> + (vec_duplicate:VI > >> + (not:<ssescalarmode> > >> + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))) > >> + (match_operand:VI 2 "vector_operand")))] > >> + "<MODE_SIZE> == 64 || TARGET_AVX512VL > >> + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" > >> + [(set (match_dup 3) > >> + 
(vec_duplicate:VI (match_dup 1))) > >> + (set (match_dup 0) > >> + (not:VI (xor:VI (match_dup 3) (match_dup 2))))] > >> + "operands[3] = gen_reg_rtx (<MODE>mode);") > >> + > > Can we merge this splitter(xor:not) into ior:not one with a code > > iterator for xor,ior, They look the same except for the xor/ior. > > They're only almost the same: Note (ior (not )) vs (not (xor )) as > the result of the splitting. The difference is necessary to fit > with what patch 1 introduces (which in turn is the way it is to > fit with what generic code transforms things to up front). (I had > it the way you suggest initially, until I figured why one of the > two would end up never being used.) > 3597 /* Convert (XOR (NOT x) (NOT y)) to (XOR x y). 3598 Also convert (XOR (NOT x) y) to (NOT (XOR x y)), similarly for 3599 (NOT y). */ 3600 { 3601 int num_negated = 0; 3602 3603 if (GET_CODE (op0) == NOT) 3604 num_negated++, op0 = XEXP (op0, 0); 3605 if (GET_CODE (op1) == NOT) 3606 num_negated++, op1 = XEXP (op1, 0); It looks like simplify_rtx does the trick. And it's documented. 8602 @cindex @code{xor}, canonicalization of 8603 @item 8604 The only possible RTL expressions involving both bitwise exclusive-or 8605 and bitwise negation are @code{(xor:@var{m} @var{x} @var{y})} 8606 and @code{(not:@var{m} (xor:@var{m} @var{x} @var{y}))}. Then the original patch LGTM. > Jan
--- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17366,6 +17366,36 @@ (match_dup 2)))] "operands[3] = gen_reg_rtx (<MODE>mode);") +(define_split + [(set (match_operand:VI 0 "register_operand") + (ior:VI + (vec_duplicate:VI + (not:<ssescalarmode> + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))) + (match_operand:VI 2 "vector_operand")))] + "<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" + [(set (match_dup 3) + (vec_duplicate:VI (match_dup 1))) + (set (match_dup 0) + (ior:VI (not:VI (match_dup 3)) (match_dup 2)))] + "operands[3] = gen_reg_rtx (<MODE>mode);") + +(define_split + [(set (match_operand:VI 0 "register_operand") + (xor:VI + (vec_duplicate:VI + (not:<ssescalarmode> + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))) + (match_operand:VI 2 "vector_operand")))] + "<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" + [(set (match_dup 3) + (vec_duplicate:VI (match_dup 1))) + (set (match_dup 0) + (not:VI (xor:VI (match_dup 3) (match_dup 2))))] + "operands[3] = gen_reg_rtx (<MODE>mode);") + (define_insn "*andnot<mode>3_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (vec_merge:VI48_AVX512VL --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr100711-4.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */ + +typedef char v64qi __attribute__ ((vector_size (64))); +typedef short v32hi __attribute__ ((vector_size (64))); +typedef int v16si __attribute__ ((vector_size (64))); +typedef long long v8di __attribute__((vector_size (64))); + +v64qi foo_v64qi (char a, v64qi b) +{ + return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + 
+v32hi foo_v32hi (short a, v32hi b) +{ + return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +v16si foo_v16si (int a, v16si b) +{ + return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +v8di foo_v8di (long long a, v8di b) +{ + return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xdd" 2 { target { ia32 } } } } */ --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr100711-5.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */ + +typedef char v64qi __attribute__ ((vector_size (64))); +typedef short v32hi __attribute__ ((vector_size (64))); +typedef int v16si __attribute__ ((vector_size (64))); +typedef long long v8di __attribute__((vector_size (64))); + +v64qi foo_v64qi (char a, v64qi b) +{ + return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; +} + +v32hi foo_v32hi (short a, v32hi b) +{ + return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; +} + +v16si foo_v16si (int a, v16si b) +{ + return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; +} + +v8di foo_v8di (long long a, v8di b) +{ + return (__extension__ (v8di) {~a, ~a, 
~a, ~a, ~a, ~a, ~a, ~a}) ^ b; +} + +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0x99" 4 } } */