Message ID | 457ffad0-9ecd-3e19-f5ab-6153ce4b8bad@suse.com |
---|---|
State | New |
Headers | show |
Series | x86: make better use of VPTERNLOG{D,Q} | expand |
On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > All combinations of and, ior, xor, and not involving two operands can be > expressed that way in a single insn. > > gcc/ > > PR target/93768 > * config/i386/i386.cc (ix86_rtx_costs): Further special-case > bitwise vector operations. > * config/i386/sse.md (*iornot<mode>3): New insn. > (*xnor<mode>3): Likewise. > (*<nlogic><mode>3): Likewise. > (andor): New code iterator. > (nlogic): New code attribute. > (ternlog_nlogic): Likewise. > > gcc/testsuite/ > > PR target/93768 > gcc.target/i386/avx512-binop-not-1.h: New. > gcc.target/i386/avx512-binop-not-2.h: New. > gcc.target/i386/avx512f-orn-si-zmm-1.c: New test. > gcc.target/i386/avx512f-orn-si-zmm-2.c: New test. > --- > The use of VI matches that in e.g. one_cmpl<mode>2 / > <mask_codefor>one_cmpl<mode>2<mask_name> and *andnot<mode>3, despite > (here and there) > - V64QI and V32HI being needlessly excluded when AVX512BW isn't enabled, > - V<n>TI not being covered, > - vector modes more narrow than 16 bytes not being covered. > > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -21178,6 +21178,32 @@ ix86_rtx_costs (rtx x, machine_mode mode > return false; > > case IOR: > + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) > + { > + /* (ior (not ...) ...) can be a single insn in AVX512. */ > + if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F > + && (GET_MODE_SIZE (mode) == 64 > + || (TARGET_AVX512VL > + && (GET_MODE_SIZE (mode) == 32 > + || GET_MODE_SIZE (mode) == 16)))) > + { > + rtx right = GET_CODE (XEXP (x, 1)) != NOT > + ? XEXP (x, 1) : XEXP (XEXP (x, 1), 0); > + > + *total = ix86_vec_cost (mode, cost->sse_op) > + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, > + outer_code, opno, speed) > + + rtx_cost (right, mode, outer_code, opno, speed); > + return true; > + } > + *total = ix86_vec_cost (mode, cost->sse_op); > + } > + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) > + *total = cost->add * 2; > + else > + *total = cost->add; > + return false; > + > case XOR: > if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) > *total = ix86_vec_cost (mode, cost->sse_op); > @@ -21198,11 +21224,20 @@ ix86_rtx_costs (rtx x, machine_mode mode > /* pandn is a single instruction. */ > if (GET_CODE (XEXP (x, 0)) == NOT) > { > + rtx right = XEXP (x, 1); > + > + /* (and (not ...) (not ...)) can be a single insn in AVX512. */ > + if (GET_CODE (right) == NOT && TARGET_AVX512F > + && (GET_MODE_SIZE (mode) == 64 > + || (TARGET_AVX512VL > + && (GET_MODE_SIZE (mode) == 32 > + || GET_MODE_SIZE (mode) == 16)))) > + right = XEXP (right, 0); > + > *total = ix86_vec_cost (mode, cost->sse_op) > + rtx_cost (XEXP (XEXP (x, 0), 0), mode, > outer_code, opno, speed) > - + rtx_cost (XEXP (x, 1), mode, > - outer_code, opno, speed); > + + rtx_cost (right, mode, outer_code, opno, speed); > return true; > } > else if (GET_CODE (XEXP (x, 1)) == NOT) > @@ -21260,8 +21295,25 @@ ix86_rtx_costs (rtx x, machine_mode mode > > case NOT: > if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) > - // vnot is pxor -1. > - *total = ix86_vec_cost (mode, cost->sse_op) + 1; > + { > + /* (not (xor ...)) can be a single insn in AVX512. */ > + if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F > + && (GET_MODE_SIZE (mode) == 64 > + || (TARGET_AVX512VL > + && (GET_MODE_SIZE (mode) == 32 > + || GET_MODE_SIZE (mode) == 16)))) > + { > + *total = ix86_vec_cost (mode, cost->sse_op) > + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, > + outer_code, opno, speed) > + + rtx_cost (XEXP (XEXP (x, 0), 1), mode, > + outer_code, opno, speed); > + return true; > + } > + > + // vnot is pxor -1. > + *total = ix86_vec_cost (mode, cost->sse_op) + 1; > + } > else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) > *total = cost->add * 2; > else > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -17616,6 +17616,98 @@ > operands[2] = force_reg (V1TImode, CONSTM1_RTX (V1TImode)); > }) > > +(define_insn "*iornot<mode>3" > + [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") > + (ior:VI > + (not:VI > + (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) > + (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > + && (register_operand (operands[1], <MODE>mode) > + || register_operand (operands[2], <MODE>mode))" > +{ > + if (!register_operand (operands[1], <MODE>mode)) > + { > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{$0xdd, %1, %2, %0|%0, %2, %1, 0xdd}"; > + return "vpternlog<ternlogsuffix>\t{$0xdd, %g1, %g2, %g0|%g0, %g2, %g1, 0xdd}"; > + } > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{$0xbb, %2, %1, %0|%0, %1, %2, 0xbb}"; > + return "vpternlog<ternlogsuffix>\t{$0xbb, %g2, %g1, %g0|%g0, %g1, %g2, 0xbb}"; > +} > + [(set_attr "type" "sselog") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set (attr "mode") > + (if_then_else (match_test "TARGET_AVX512VL") > + (const_string "<sseinsnmode>") > + (const_string "XI"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "2,3") > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > + (const_string "*")))]) > + > +(define_insn "*xnor<mode>3" > + [(set (match_operand:VI 0 "register_operand" "=v,v") > + (not:VI > + (xor:VI > + (match_operand:VI 1 "bcst_vector_operand" "%v,v") > + (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > + && (register_operand (operands[1], <MODE>mode) > + || register_operand (operands[2], <MODE>mode))" > +{ > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{$0x99, %2, %1, %0|%0, %1, %2, 0x99}"; > + else > + return "vpternlog<ternlogsuffix>\t{$0x99, %g2, %g1, %g0|%g0, %g1, %g2, 0x99}"; > +} > + [(set_attr "type" "sselog") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set (attr "mode") > + (if_then_else (match_test "TARGET_AVX512VL") > + (const_string "<sseinsnmode>") > + (const_string "XI"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > + (const_string "*")))]) > + > +(define_code_iterator andor [and ior]) > +(define_code_attr nlogic [(and "nor") (ior "nand")]) > +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) > + > +(define_insn "*<nlogic><mode>3" > + [(set (match_operand:VI 0 "register_operand" "=v,v") > + (andor:VI > + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) > + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] I'm thinking of doing it in simplify_rtx or gimple match.pd to transform (and (not op1)) (not op2)) -> (not: (ior: op1 op2)) (ior (not op1) (not op2)) -> (not : (and op1 op2)) Even w/o avx512f, the transformation should also benefit since it takes less logic operations 3 -> 2.(or 2 -> 2 for pandn). The other 2 patterns: *xnor<mode>3 and iornot<mode>3 LGTM. > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > + && (register_operand (operands[1], <MODE>mode) > + || register_operand (operands[2], <MODE>mode))" > +{ > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %2, %1, %0|%0, %1, %2, <ternlog_nlogic>}"; > + else > + return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %g2, %g1, %g0|%g0, %g1, %g2, <ternlog_nlogic>}"; > +} > + [(set_attr "type" "sselog") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set (attr "mode") > + (if_then_else (match_test "TARGET_AVX512VL") > + (const_string "<sseinsnmode>") > + (const_string "XI"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > + (const_string "*")))]) > + > (define_mode_iterator AVX512ZEXTMASK > [(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI]) > > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-1.h > @@ -0,0 +1,13 @@ > +#include <immintrin.h> > + > +#define PASTER2(x,y) x##y > +#define PASTER3(x,y,z) _mm##x##_##y##_##z > +#define OP(vec, op, suffix) PASTER3 (vec, op, suffix) > +#define DUP(vec, suffix, val) PASTER3 (vec, set1, suffix) (val) > + > +type > +foo (type x, SCALAR *f) > +{ > + return OP (vec, op, suffix) (x, OP (vec, xor, suffix) (DUP (vec, suffix, *f), > + DUP (vec, suffix, ~0))); > +} > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-2.h > @@ -0,0 +1,13 @@ > +#include <immintrin.h> > + > +#define PASTER2(x,y) x##y > +#define PASTER3(x,y,z) _mm##x##_##y##_##z > +#define OP(vec, op, suffix) PASTER3 (vec, op, suffix) > +#define DUP(vec, suffix, val) PASTER3 (vec, set1, suffix) (val) > + > +type > +foo (type x, SCALAR *f) > +{ > + return OP (vec, op, suffix) (OP (vec, xor, suffix) (x, DUP (vec, suffix, ~0)), > + DUP (vec, suffix, *f)); > +} > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-1.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ > +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xdd, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */ > +/* { dg-final { scan-assembler-not "vpbroadcast" } } */ > + > +#define type __m512i > +#define vec 512 > +#define op or > +#define suffix epi32 > +#define SCALAR int > + > +#include "avx512-binop-not-1.h" > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-2.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ > +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xbb, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */ > +/* { dg-final { scan-assembler-not "vpbroadcast" } } */ > + > +#define type __m512i > +#define vec 512 > +#define op or > +#define suffix epi32 > +#define SCALAR int > + > +#include "avx512-binop-not-2.h" >
On 25.06.2023 06:42, Hongtao Liu wrote: > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: >> >> +(define_code_iterator andor [and ior]) >> +(define_code_attr nlogic [(and "nor") (ior "nand")]) >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) >> + >> +(define_insn "*<nlogic><mode>3" >> + [(set (match_operand:VI 0 "register_operand" "=v,v") >> + (andor:VI >> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) >> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform > (and (not op1)) (not op2)) -> (not: (ior: op1 op2)) This wouldn't be a win (not + andn) -> (or + not), but what's more important is ... > (ior (not op1) (not op2)) -> (not : (and op1 op2)) > > Even w/o avx512f, the transformation should also benefit since it > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn). ... that these transformations (from the, as per the doc, canonical representation of nand and nor) are already occurring in common code, _if_ no suitable insn can be found. That was at least the conclusion I drew from looking around a lot, supported by the code that's generated prior to this change. Jan
On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote: > > On 25.06.2023 06:42, Hongtao Liu wrote: > > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > >> > >> +(define_code_iterator andor [and ior]) > >> +(define_code_attr nlogic [(and "nor") (ior "nand")]) > >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) > >> + > >> +(define_insn "*<nlogic><mode>3" > >> + [(set (match_operand:VI 0 "register_operand" "=v,v") > >> + (andor:VI > >> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) > >> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform > > (and (not op1)) (not op2)) -> (not: (ior: op1 op2)) > > This wouldn't be a win (not + andn) -> (or + not), but what's > more important is ... > > > (ior (not op1) (not op2)) -> (not : (and op1 op2)) > > > > Even w/o avx512f, the transformation should also benefit since it > > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn). > > ... that these transformations (from the, as per the doc, > canonical representation of nand and nor) are already occurring I see, there're already such simplifications in the gimple phase, so the question: is there any need for and/ior:not not pattern? Can you provide a testcase to demonstrate that and/ior: not not pattern is needed? > in common code, _if_ no suitable insn can be found. That was at > least the conclusion I drew from looking around a lot, supported > by the code that's generated prior to this change. > > Jan
On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote: > > On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote: > > > > On 25.06.2023 06:42, Hongtao Liu wrote: > > > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches > > > <gcc-patches@gcc.gnu.org> wrote: > > >> > > >> +(define_code_iterator andor [and ior]) > > >> +(define_code_attr nlogic [(and "nor") (ior "nand")]) > > >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) > > >> + > > >> +(define_insn "*<nlogic><mode>3" > > >> + [(set (match_operand:VI 0 "register_operand" "=v,v") > > >> + (andor:VI > > >> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) > > >> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > > > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform > > > (and (not op1)) (not op2)) -> (not: (ior: op1 op2)) > > > > This wouldn't be a win (not + andn) -> (or + not), but what's > > more important is ... > > > > > (ior (not op1) (not op2)) -> (not : (and op1 op2)) > > > > > > Even w/o avx512f, the transformation should also benefit since it > > > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn). > > > > ... that these transformations (from the, as per the doc, > > canonical representation of nand and nor) are already occurring > I see, there're already such simplifications in the gimple phase, so > the question: is there any need for and/ior:not not pattern? > Can you provide a testcase to demonstrate that and/ior: not not > pattern is needed? typedef int v4si __attribute__((vector_size(16))); v4si foo1 (v4si a, v4si b) { return ~a & ~b; } I only gimple have optimized it to <bb 2> [local count: 1073741824]: # DEBUG BEGIN_STMT _1 = a_2(D) | b_3(D); _4 = ~_1; return _4; But rtl still try to match (set (reg:V4SI 86) (and:V4SI (not:V4SI (reg:V4SI 88)) (not:V4SI (reg:V4SI 89)))) Hmm. > > in common code, _if_ no suitable insn can be found. That was at > > least the conclusion I drew from looking around a lot, supported > > by the code that's generated prior to this change. > > > > Jan > > > > -- > BR, > Hongtao
On Sun, Jun 25, 2023 at 3:23 PM Hongtao Liu <crazylht@gmail.com> wrote: > > On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote: > > > > On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote: > > > > > > On 25.06.2023 06:42, Hongtao Liu wrote: > > > > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches > > > > <gcc-patches@gcc.gnu.org> wrote: > > > >> > > > >> +(define_code_iterator andor [and ior]) > > > >> +(define_code_attr nlogic [(and "nor") (ior "nand")]) > > > >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) > > > >> + > > > >> +(define_insn "*<nlogic><mode>3" > > > >> + [(set (match_operand:VI 0 "register_operand" "=v,v") > > > >> + (andor:VI > > > >> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) > > > >> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > > > > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform > > > > (and (not op1)) (not op2)) -> (not: (ior: op1 op2)) > > > > > > This wouldn't be a win (not + andn) -> (or + not), but what's > > > more important is ... > > > > > > > (ior (not op1) (not op2)) -> (not : (and op1 op2)) > > > > > > > > Even w/o avx512f, the transformation should also benefit since it > > > > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn). > > > > > > ... that these transformations (from the, as per the doc, > > > canonical representation of nand and nor) are already occurring > > I see, there're already such simplifications in the gimple phase, so > > the question: is there any need for and/ior:not not pattern? > > Can you provide a testcase to demonstrate that and/ior: not not > > pattern is needed? > > typedef int v4si __attribute__((vector_size(16))); > v4si > foo1 (v4si a, v4si b) > { > return ~a & ~b; > } > > I only gimple have optimized it to > > <bb 2> [local count: 1073741824]: > # DEBUG BEGIN_STMT > _1 = a_2(D) | b_3(D); > _4 = ~_1; > return _4; > > > But rtl still try to match > > (set (reg:V4SI 86) > (and:V4SI (not:V4SI (reg:V4SI 88)) > (not:V4SI (reg:V4SI 89)))) > > Hmm. In rtl, we're using xor -1 for not, so it's (insn 8 7 9 2 (set (reg:V4SI 87) (ior:V4SI (reg:V4SI 88) (reg:V4SI 89))) "/app/example.cpp":6:15 6830 {*iorv4si3} (expr_list:REG_DEAD (reg:V4SI 89) (expr_list:REG_DEAD (reg:V4SI 88) (nil)))) (insn 9 8 14 2 (set (reg:V4SI 86) (xor:V4SI (reg:V4SI 87) (const_vector:V4SI [ (const_int -1 [0xffffffffffffffff]) repeated x4 ]))) "/app/example.cpp":6:18 6792 {*one_cmplv4si2} Then simplified to > (set (reg:V4SI 86) > (and:V4SI (not:V4SI (reg:V4SI 88)) > (not:V4SI (reg:V4SI 89)))) > by 3565 case XOR: 3566 if (trueop1 == CONST0_RTX (mode)) 3567 return op0; 3568 if (INTEGRAL_MODE_P (mode) && trueop1 == CONSTM1_RTX (mode)) 3569 return simplify_gen_unary (NOT, mode, op0, mode); and 1018 /* Apply De Morgan's laws to reduce number of patterns for machines 1019 with negating logical insns (and-not, nand, etc.). If result has 1020 only one NOT, put it first, since that is how the patterns are 1021 coded. */ 1022 if (GET_CODE (op) == IOR || GET_CODE (op) == AND) 1023 { 1024 rtx in1 = XEXP (op, 0), in2 = XEXP (op, 1); 1025 machine_mode op_mode; 1026 1027 op_mode = GET_MODE (in1); 1028 in1 = simplify_gen_unary (NOT, op_mode, in1, op_mode); 1029 1030 op_mode = GET_MODE (in2); 1031 if (op_mode == VOIDmode) 1032 op_mode = mode; 1033 in2 = simplify_gen_unary (NOT, op_mode, in2, op_mode); 1034 1035 if (GET_CODE (in2) == NOT && GET_CODE (in1) != NOT) 1036 std::swap (in1, in2); 1037 1038 return gen_rtx_fmt_ee (GET_CODE (op) == IOR ? AND : IOR, 1039 mode, in1, in2); 1040 } Ok, got it, and/ior:not not pattern LGTM then. > > > in common code, _if_ no suitable insn can be found. That was at > > > least the conclusion I drew from looking around a lot, supported > > > by the code that's generated prior to this change. > > > > > > Jan > > > > > > > > -- > > BR, > > Hongtao > > > > -- > BR, > Hongtao
On 25.06.2023 09:30, Hongtao Liu wrote: > On Sun, Jun 25, 2023 at 3:23 PM Hongtao Liu <crazylht@gmail.com> wrote: >> >> On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote: >>> >>> On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote: >>>> >>>> On 25.06.2023 06:42, Hongtao Liu wrote: >>>>> On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches >>>>> <gcc-patches@gcc.gnu.org> wrote: >>>>>> >>>>>> +(define_code_iterator andor [and ior]) >>>>>> +(define_code_attr nlogic [(and "nor") (ior "nand")]) >>>>>> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) >>>>>> + >>>>>> +(define_insn "*<nlogic><mode>3" >>>>>> + [(set (match_operand:VI 0 "register_operand" "=v,v") >>>>>> + (andor:VI >>>>>> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) >>>>>> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] >>>>> I'm thinking of doing it in simplify_rtx or gimple match.pd to transform >>>>> (and (not op1)) (not op2)) -> (not: (ior: op1 op2)) >>>> >>>> This wouldn't be a win (not + andn) -> (or + not), but what's >>>> more important is ... >>>> >>>>> (ior (not op1) (not op2)) -> (not : (and op1 op2)) >>>>> >>>>> Even w/o avx512f, the transformation should also benefit since it >>>>> takes less logic operations 3 -> 2.(or 2 -> 2 for pandn). >>>> >>>> ... that these transformations (from the, as per the doc, >>>> canonical representation of nand and nor) are already occurring >>> I see, there're already such simplifications in the gimple phase, so >>> the question: is there any need for and/ior:not not pattern? >>> Can you provide a testcase to demonstrate that and/ior: not not >>> pattern is needed? >> >> typedef int v4si __attribute__((vector_size(16))); >> v4si >> foo1 (v4si a, v4si b) >> { >> return ~a & ~b; >> } >> >> I only gimple have optimized it to >> >> <bb 2> [local count: 1073741824]: >> # DEBUG BEGIN_STMT >> _1 = a_2(D) | b_3(D); >> _4 = ~_1; >> return _4; >> >> >> But rtl still try to match >> >> (set (reg:V4SI 86) >> (and:V4SI (not:V4SI (reg:V4SI 88)) >> (not:V4SI (reg:V4SI 89)))) >> >> Hmm. > In rtl, we're using xor -1 for not, so it's > > (insn 8 7 9 2 (set (reg:V4SI 87) > (ior:V4SI (reg:V4SI 88) > (reg:V4SI 89))) "/app/example.cpp":6:15 6830 {*iorv4si3} > (expr_list:REG_DEAD (reg:V4SI 89) > (expr_list:REG_DEAD (reg:V4SI 88) > (nil)))) > (insn 9 8 14 2 (set (reg:V4SI 86) > (xor:V4SI (reg:V4SI 87) > (const_vector:V4SI [ > (const_int -1 [0xffffffffffffffff]) repeated x4 > ]))) "/app/example.cpp":6:18 6792 {*one_cmplv4si2} > > Then simplified to >> (set (reg:V4SI 86) >> (and:V4SI (not:V4SI (reg:V4SI 88)) >> (not:V4SI (reg:V4SI 89)))) >> > > by > > 3565 case XOR: > 3566 if (trueop1 == CONST0_RTX (mode)) > 3567 return op0; > 3568 if (INTEGRAL_MODE_P (mode) && trueop1 == CONSTM1_RTX (mode)) > 3569 return simplify_gen_unary (NOT, mode, op0, mode); > > and > > 1018 /* Apply De Morgan's laws to reduce number of patterns for machines > 1019 with negating logical insns (and-not, nand, etc.). If result has > 1020 only one NOT, put it first, since that is how the patterns are > 1021 coded. */ > 1022 if (GET_CODE (op) == IOR || GET_CODE (op) == AND) > 1023 { > 1024 rtx in1 = XEXP (op, 0), in2 = XEXP (op, 1); > 1025 machine_mode op_mode; > 1026 > 1027 op_mode = GET_MODE (in1); > 1028 in1 = simplify_gen_unary (NOT, op_mode, in1, op_mode); > 1029 > 1030 op_mode = GET_MODE (in2); > 1031 if (op_mode == VOIDmode) > 1032 op_mode = mode; > 1033 in2 = simplify_gen_unary (NOT, op_mode, in2, op_mode); > 1034 > 1035 if (GET_CODE (in2) == NOT && GET_CODE (in1) != NOT) > 1036 std::swap (in1, in2); > 1037 > 1038 return gen_rtx_fmt_ee (GET_CODE (op) == IOR ? AND : IOR, > 1039 mode, in1, in2); > 1040 } > > > Ok, got it, and/ior:not not pattern LGTM then. Just to avoid misunderstandings - together with your initial reply that's then an "okay" to the patch as a whole, right? Thanks, Jan
On Sun, Jun 25, 2023 at 9:35 PM Jan Beulich <jbeulich@suse.com> wrote: > > On 25.06.2023 09:30, Hongtao Liu wrote: > > On Sun, Jun 25, 2023 at 3:23 PM Hongtao Liu <crazylht@gmail.com> wrote: > >> > >> On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote: > >>> > >>> On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote: > >>>> > >>>> On 25.06.2023 06:42, Hongtao Liu wrote: > >>>>> On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches > >>>>> <gcc-patches@gcc.gnu.org> wrote: > >>>>>> > >>>>>> +(define_code_iterator andor [and ior]) > >>>>>> +(define_code_attr nlogic [(and "nor") (ior "nand")]) > >>>>>> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) > >>>>>> + > >>>>>> +(define_insn "*<nlogic><mode>3" > >>>>>> + [(set (match_operand:VI 0 "register_operand" "=v,v") > >>>>>> + (andor:VI > >>>>>> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) > >>>>>> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > >>>>> I'm thinking of doing it in simplify_rtx or gimple match.pd to transform > >>>>> (and (not op1)) (not op2)) -> (not: (ior: op1 op2)) > >>>> > >>>> This wouldn't be a win (not + andn) -> (or + not), but what's > >>>> more important is ... > >>>> > >>>>> (ior (not op1) (not op2)) -> (not : (and op1 op2)) > >>>>> > >>>>> Even w/o avx512f, the transformation should also benefit since it > >>>>> takes less logic operations 3 -> 2.(or 2 -> 2 for pandn). > >>>> > >>>> ... that these transformations (from the, as per the doc, > >>>> canonical representation of nand and nor) are already occurring > >>> I see, there're already such simplifications in the gimple phase, so > >>> the question: is there any need for and/ior:not not pattern? > >>> Can you provide a testcase to demonstrate that and/ior: not not > >>> pattern is needed? > >> > >> typedef int v4si __attribute__((vector_size(16))); > >> v4si > >> foo1 (v4si a, v4si b) > >> { > >> return ~a & ~b; > >> } > >> > >> I only gimple have optimized it to > >> > >> <bb 2> [local count: 1073741824]: > >> # DEBUG BEGIN_STMT > >> _1 = a_2(D) | b_3(D); > >> _4 = ~_1; > >> return _4; > >> > >> > >> But rtl still try to match > >> > >> (set (reg:V4SI 86) > >> (and:V4SI (not:V4SI (reg:V4SI 88)) > >> (not:V4SI (reg:V4SI 89)))) > >> > >> Hmm. > > In rtl, we're using xor -1 for not, so it's > > > > (insn 8 7 9 2 (set (reg:V4SI 87) > > (ior:V4SI (reg:V4SI 88) > > (reg:V4SI 89))) "/app/example.cpp":6:15 6830 {*iorv4si3} > > (expr_list:REG_DEAD (reg:V4SI 89) > > (expr_list:REG_DEAD (reg:V4SI 88) > > (nil)))) > > (insn 9 8 14 2 (set (reg:V4SI 86) > > (xor:V4SI (reg:V4SI 87) > > (const_vector:V4SI [ > > (const_int -1 [0xffffffffffffffff]) repeated x4 > > ]))) "/app/example.cpp":6:18 6792 {*one_cmplv4si2} > > > > Then simplified to > >> (set (reg:V4SI 86) > >> (and:V4SI (not:V4SI (reg:V4SI 88)) > >> (not:V4SI (reg:V4SI 89)))) > >> > > > > by > > > > 3565 case XOR: > > 3566 if (trueop1 == CONST0_RTX (mode)) > > 3567 return op0; > > 3568 if (INTEGRAL_MODE_P (mode) && trueop1 == CONSTM1_RTX (mode)) > > 3569 return simplify_gen_unary (NOT, mode, op0, mode); > > > > and > > > > 1018 /* Apply De Morgan's laws to reduce number of patterns for machines > > 1019 with negating logical insns (and-not, nand, etc.). If result has > > 1020 only one NOT, put it first, since that is how the patterns are > > 1021 coded. */ > > 1022 if (GET_CODE (op) == IOR || GET_CODE (op) == AND) > > 1023 { > > 1024 rtx in1 = XEXP (op, 0), in2 = XEXP (op, 1); > > 1025 machine_mode op_mode; > > 1026 > > 1027 op_mode = GET_MODE (in1); > > 1028 in1 = simplify_gen_unary (NOT, op_mode, in1, op_mode); > > 1029 > > 1030 op_mode = GET_MODE (in2); > > 1031 if (op_mode == VOIDmode) > > 1032 op_mode = mode; > > 1033 in2 = simplify_gen_unary (NOT, op_mode, in2, op_mode); > > 1034 > > 1035 if (GET_CODE (in2) == NOT && GET_CODE (in1) != NOT) > > 1036 std::swap (in1, in2); > > 1037 > > 1038 return gen_rtx_fmt_ee (GET_CODE (op) == IOR ? AND : IOR, > > 1039 mode, in1, in2); > > 1040 } > > > > > > Ok, got it, and/ior:not not pattern LGTM then. > > Just to avoid misunderstandings - together with your initial > reply that's then an "okay" to the patch as a whole, right? Yes. > > Thanks, Jan
--- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -21178,6 +21178,32 @@ ix86_rtx_costs (rtx x, machine_mode mode return false; case IOR: + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + /* (ior (not ...) ...) can be a single insn in AVX512. */ + if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F + && (GET_MODE_SIZE (mode) == 64 + || (TARGET_AVX512VL + && (GET_MODE_SIZE (mode) == 32 + || GET_MODE_SIZE (mode) == 16)))) + { + rtx right = GET_CODE (XEXP (x, 1)) != NOT + ? XEXP (x, 1) : XEXP (XEXP (x, 1), 0); + + *total = ix86_vec_cost (mode, cost->sse_op) + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed) + + rtx_cost (right, mode, outer_code, opno, speed); + return true; + } + *total = ix86_vec_cost (mode, cost->sse_op); + } + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + *total = cost->add * 2; + else + *total = cost->add; + return false; + case XOR: if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) *total = ix86_vec_cost (mode, cost->sse_op); @@ -21198,11 +21224,20 @@ ix86_rtx_costs (rtx x, machine_mode mode /* pandn is a single instruction. */ if (GET_CODE (XEXP (x, 0)) == NOT) { + rtx right = XEXP (x, 1); + + /* (and (not ...) (not ...)) can be a single insn in AVX512. */ + if (GET_CODE (right) == NOT && TARGET_AVX512F + && (GET_MODE_SIZE (mode) == 64 + || (TARGET_AVX512VL + && (GET_MODE_SIZE (mode) == 32 + || GET_MODE_SIZE (mode) == 16)))) + right = XEXP (right, 0); + *total = ix86_vec_cost (mode, cost->sse_op) + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code, opno, speed) - + rtx_cost (XEXP (x, 1), mode, - outer_code, opno, speed); + + rtx_cost (right, mode, outer_code, opno, speed); return true; } else if (GET_CODE (XEXP (x, 1)) == NOT) @@ -21260,8 +21295,25 @@ ix86_rtx_costs (rtx x, machine_mode mode case NOT: if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - // vnot is pxor -1. - *total = ix86_vec_cost (mode, cost->sse_op) + 1; + { + /* (not (xor ...)) can be a single insn in AVX512. */ + if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F + && (GET_MODE_SIZE (mode) == 64 + || (TARGET_AVX512VL + && (GET_MODE_SIZE (mode) == 32 + || GET_MODE_SIZE (mode) == 16)))) + { + *total = ix86_vec_cost (mode, cost->sse_op) + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed) + + rtx_cost (XEXP (XEXP (x, 0), 1), mode, + outer_code, opno, speed); + return true; + } + + // vnot is pxor -1. + *total = ix86_vec_cost (mode, cost->sse_op) + 1; + } else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) *total = cost->add * 2; else --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17616,6 +17616,98 @@ operands[2] = force_reg (V1TImode, CONSTM1_RTX (V1TImode)); }) +(define_insn "*iornot<mode>3" + [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") + (ior:VI + (not:VI + (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) + (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] + "(<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) + && (register_operand (operands[1], <MODE>mode) + || register_operand (operands[2], <MODE>mode))" +{ + if (!register_operand (operands[1], <MODE>mode)) + { + if (TARGET_AVX512VL) + return "vpternlog<ternlogsuffix>\t{$0xdd, %1, %2, %0|%0, %2, %1, 0xdd}"; + return "vpternlog<ternlogsuffix>\t{$0xdd, %g1, %g2, %g0|%g0, %g2, %g1, 0xdd}"; + } + if (TARGET_AVX512VL) + return "vpternlog<ternlogsuffix>\t{$0xbb, %2, %1, %0|%0, %1, %2, 0xbb}"; + return "vpternlog<ternlogsuffix>\t{$0xbb, %g2, %g1, %g0|%g0, %g1, %g2, 0xbb}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set (attr "mode") + (if_then_else (match_test "TARGET_AVX512VL") + (const_string "<sseinsnmode>") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "2,3") + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") + (const_string "*")))]) + +(define_insn "*xnor<mode>3" + [(set (match_operand:VI 0 "register_operand" "=v,v") + (not:VI + (xor:VI + (match_operand:VI 1 "bcst_vector_operand" "%v,v") + (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] + "(<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) + && (register_operand (operands[1], <MODE>mode) + || register_operand (operands[2], <MODE>mode))" +{ + if (TARGET_AVX512VL) + return "vpternlog<ternlogsuffix>\t{$0x99, %2, %1, %0|%0, %1, %2, 0x99}"; + else + return "vpternlog<ternlogsuffix>\t{$0x99, %g2, %g1, %g0|%g0, %g1, %g2, 0x99}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set (attr "mode") + (if_then_else (match_test "TARGET_AVX512VL") + (const_string "<sseinsnmode>") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") + (const_string "*")))]) + +(define_code_iterator andor [and ior]) +(define_code_attr nlogic [(and "nor") (ior "nand")]) +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) + +(define_insn "*<nlogic><mode>3" + [(set (match_operand:VI 0 "register_operand" "=v,v") + (andor:VI + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] + "(<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) + && (register_operand (operands[1], <MODE>mode) + || register_operand (operands[2], <MODE>mode))" +{ + if (TARGET_AVX512VL) + return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %2, %1, %0|%0, %1, %2, <ternlog_nlogic>}"; + else + return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %g2, %g1, %g0|%g0, %g1, %g2, <ternlog_nlogic>}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set (attr "mode") + (if_then_else (match_test "TARGET_AVX512VL") + (const_string "<sseinsnmode>") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") + (const_string "*")))]) + (define_mode_iterator AVX512ZEXTMASK [(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI]) --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-1.h @@ -0,0 +1,13 @@ +#include <immintrin.h> + +#define PASTER2(x,y) x##y +#define PASTER3(x,y,z) _mm##x##_##y##_##z +#define OP(vec, op, suffix) PASTER3 (vec, op, suffix) +#define DUP(vec, suffix, val) PASTER3 (vec, set1, suffix) (val) + +type +foo (type x, SCALAR *f) +{ + return OP (vec, op, suffix) (x, OP (vec, xor, suffix) (DUP (vec, suffix, *f), + DUP (vec, suffix, ~0))); +} --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-2.h @@ -0,0 +1,13 @@ +#include <immintrin.h> + +#define PASTER2(x,y) x##y +#define PASTER3(x,y,z) _mm##x##_##y##_##z +#define OP(vec, op, suffix) PASTER3 (vec, op, suffix) +#define DUP(vec, suffix, val) PASTER3 (vec, set1, suffix) (val) + +type +foo (type x, SCALAR *f) +{ + return OP (vec, op, suffix) (OP (vec, xor, suffix) (x, DUP (vec, suffix, ~0)), + DUP (vec, suffix, *f)); +} --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-1.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xdd, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */ +/* { dg-final { scan-assembler-not "vpbroadcast" } } */ + +#define type __m512i +#define vec 512 +#define op or +#define suffix epi32 +#define SCALAR int + +#include "avx512-binop-not-1.h" --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-2.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xbb, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */ +/* { dg-final { scan-assembler-not "vpbroadcast" } } */ + +#define type __m512i +#define vec 512 +#define op or +#define suffix epi32 +#define SCALAR int + +#include "avx512-binop-not-2.h"