Message ID | 20211221062659.102153-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | [i386] Add define_insn_and_split for vpcmp{b, w, d, q} vpcmp{ph, ps, pd}. | expand |
On Tue, Dec 21, 2021 at 2:27 PM liuhongt <hongtao.liu@intel.com> wrote: > > The purpose of those define_insn_and_split: > 1. Combine vpcmpuw and zero_extend into vpcmpuw. > 2. Canonicalize vpcmpuw pattern so CSE can replace duplicate vpcmpuw to just kmov > 3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant kmov. Use DImode as dest of zero_extend is too aggressive which causes several regression. New patch add define_insn_and_split just combine vpcmpuw and zero_extend into vpcmpuw. Here's the patch i'm checking in. > > It should partially fix the issue in PR. > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ready to push to trunk. > > gcc/ChangeLog: > > PR target/103750 > * config/i386/sse.md > (*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>): > New define_insn_and_split. > (*<avx512>_cmp<mode>3): Ditto. > (*<avx512>_cmp<mode>3_zero_extenddi): New define_insn. > (*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>): > New define_insn_and_split. > (*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>): > Ditto. > (*<avx512>_ucmp<mode>3): Ditto. > (*<avx512>_ucmp<mode>3_zero_extenddi): New define_insn. > (*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>): > New define_insn_and_split. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/bitwise_mask_op-3.c: Adjust test/ > * g++.target/i386/pr103750-1.C: New test. > --- > gcc/config/i386/sse.md | 267 ++++++++++++++++++ > gcc/testsuite/g++.target/i386/pr103750-1.C | 50 ++++ > .../gcc.target/i386/bitwise_mask_op-3.c | 6 +- > 3 files changed, 320 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 5196149ee32..fb885d58272 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -3702,6 +3702,75 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>" > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > > +;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transfrom > +;; duplicated vpcmpuw to vpcmpuw and kmov > +;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg. > +(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec:<V48H_AVX512VL:avx512fmaskmode> > + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") > + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<V48H_AVX512VL:cmp_imm_predicate>" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) > + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" > + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& <SWI248x:MODE>mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<V48H_AVX512VL:avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")]) > + > +(define_insn_and_split "*<avx512>_cmp<mode>3" > + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") > + (unspec:<avx512fmaskmode> > + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") > + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<MODE>mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") > + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < 64" > + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_insn_and_split "*<avx512>_cmp<mode>3" > [(set (match_operand:<avx512fmaskmode> 0 "register_operand") > (not:<avx512fmaskmode> > @@ -3735,6 +3804,72 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>" > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec:<VI12_AVX512VL:avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<VI12_AVX512VL:cmp_imm_predicate>" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) > + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" > + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& <SWI248x:MODE>mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<VI12_AVX512VL:avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")]) > + > +(define_insn_and_split "*<avx512>_cmp<mode>3" > + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") > + (unspec:<avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" > + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_int_iterator UNSPEC_PCMP_ITER > [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP]) > > @@ -3771,6 +3906,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec:<VI12_AVX512VL:avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "const_0_to_7_operand" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) > + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" > + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& <SWI248x:MODE>mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<VI12_AVX512VL:avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")]) > + > +(define_insn_and_split "*<avx512>_ucmp<mode>3" > + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") > + (unspec:<avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_UNSIGNED_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" > + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" > [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") > (unspec:<avx512fmaskmode> > @@ -3785,6 +3986,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec:<VI48_AVX512VL:avx512fmaskmode> > + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") > + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "const_0_to_7_operand" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) > + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" > + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& <SWI248x:MODE>mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<VI48_AVX512VL:avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")]) > + > +(define_insn_and_split "*<avx512>_ucmp<mode>3" > + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") > + (unspec:<avx512fmaskmode> > + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") > + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_UNSIGNED_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") > + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64" > + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_insn_and_split "*<avx512>_ucmp<mode>3" > [(set (match_operand:<avx512fmaskmode> 0 "register_operand") > (not:<avx512fmaskmode> > diff --git a/gcc/testsuite/g++.target/i386/pr103750-1.C b/gcc/testsuite/g++.target/i386/pr103750-1.C > new file mode 100644 > index 00000000000..83f471331b3 > --- /dev/null > +++ b/gcc/testsuite/g++.target/i386/pr103750-1.C > @@ -0,0 +1,50 @@ > +/* PR target/103750 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */ > +/* { dg-final { scan-assembler-times "kmovw" 2 } } */ > +/* { dg-final { scan-assembler-times "kmovd" 2 } } */ > +/* There shouldn't be any kmovw/kmovd inside the loop. */ > +#include <immintrin.h> > + > +const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept > +{ > + __m256i mch256 = _mm256_set1_epi16(c); > + for ( ; n < e; n += 32) { > + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); > + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1); > + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); > + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); > + if (_kortestz_mask16_u8(mask1, mask2)) > + continue; > + > + unsigned idx = _tzcnt_u32(mask1); > + if (mask1 == 0) { > + idx = __tzcnt_u16(mask2); > + n += 16; > + } > + return n + idx; > + } > + return e; > +} > + > +const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept > +{ > + __m256i mch256 = _mm256_set1_epi16(c); > + for ( ; n < e; n += 32) { > + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); > + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1); > + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); > + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); > + if (_kortestz_mask32_u8(mask1, mask2)) > + continue; > + > + unsigned idx = _tzcnt_u32(mask1); > + if (mask1 == 0) { > + idx = __tzcnt_u16(mask2); > + n += 16; > + } > + return n + idx; > + } > + return e; > +} > + > diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c > index 352c49d6c6b..82bb99e30af 100644 > --- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c > +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c > @@ -12,7 +12,7 @@ foo_orb (__m512i a, __m512i b) > foo = m1 | m2; > } > > -/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */ > > void > foo_xorb (__m512i a, __m512i b) > @@ -22,7 +22,7 @@ foo_xorb (__m512i a, __m512i b) > foo = m1 ^ m2; > } > > -/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */ > > void > foo_andb (__m512i a, __m512i b) > @@ -40,4 +40,4 @@ foo_andnb (__m512i a, __m512i b) > foo = m1 & ~m2; > } > > -/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { ! ia32 } } } } } */ > -- > 2.18.1 >
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5196149ee32..fb885d58272 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -3702,6 +3702,75 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transfrom +;; duplicated vpcmpuw to vpcmpuw and kmov +;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg. +(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec:<V48H_AVX512VL:avx512fmaskmode> + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<V48H_AVX512VL:cmp_imm_predicate>" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "&& <SWI248x:MODE>mode != E_DImode" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<V48H_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")]) + +(define_insn_and_split "*<avx512>_cmp<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_PCMP))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<MODE>mode) < 64" + "#" + "&& 1" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < 64" + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn_and_split "*<avx512>_cmp<mode>3" [(set (match_operand:<avx512fmaskmode> 0 "register_operand") (not:<avx512fmaskmode> @@ -3735,6 +3804,72 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec:<VI12_AVX512VL:avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<VI12_AVX512VL:cmp_imm_predicate>" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "&& <SWI248x:MODE>mode != E_DImode" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<VI12_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")]) + +(define_insn_and_split "*<avx512>_cmp<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_PCMP))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" + "#" + "&& 1" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_int_iterator UNSPEC_PCMP_ITER [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP]) @@ -3771,6 +3906,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec:<VI12_AVX512VL:avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512BW + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "&& <SWI248x:MODE>mode != E_DImode" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<VI12_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_UNSIGNED_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")]) + +(define_insn_and_split "*<avx512>_ucmp<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_UNSIGNED_PCMP))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" + "#" + "&& 1" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_UNSIGNED_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> @@ -3785,6 +3986,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec:<VI48_AVX512VL:avx512fmaskmode> + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512BW + && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "&& <SWI248x:MODE>mode != E_DImode" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<VI48_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_UNSIGNED_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")]) + +(define_insn_and_split "*<avx512>_ucmp<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_UNSIGNED_PCMP))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64" + "#" + "&& 1" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_UNSIGNED_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64" + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn_and_split "*<avx512>_ucmp<mode>3" [(set (match_operand:<avx512fmaskmode> 0 "register_operand") (not:<avx512fmaskmode> diff --git a/gcc/testsuite/g++.target/i386/pr103750-1.C b/gcc/testsuite/g++.target/i386/pr103750-1.C new file mode 100644 index 00000000000..83f471331b3 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr103750-1.C @@ -0,0 +1,50 @@ +/* PR target/103750 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */ +/* { dg-final { scan-assembler-times "kmovw" 2 } } */ +/* { dg-final { scan-assembler-times "kmovd" 2 } } */ +/* There shouldn't be any kmovw/kmovd inside the loop. */ +#include <immintrin.h> + +const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept +{ + __m256i mch256 = _mm256_set1_epi16(c); + for ( ; n < e; n += 32) { + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1); + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); + if (_kortestz_mask16_u8(mask1, mask2)) + continue; + + unsigned idx = _tzcnt_u32(mask1); + if (mask1 == 0) { + idx = __tzcnt_u16(mask2); + n += 16; + } + return n + idx; + } + return e; +} + +const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept +{ + __m256i mch256 = _mm256_set1_epi16(c); + for ( ; n < e; n += 32) { + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1); + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); + if (_kortestz_mask32_u8(mask1, mask2)) + continue; + + unsigned idx = _tzcnt_u32(mask1); + if (mask1 == 0) { + idx = __tzcnt_u16(mask2); + n += 16; + } + return n + idx; + } + return e; +} + diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c index 352c49d6c6b..82bb99e30af 100644 --- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c @@ -12,7 +12,7 @@ foo_orb (__m512i a, __m512i b) foo = m1 | m2; } -/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */ void foo_xorb (__m512i a, __m512i b) @@ -22,7 +22,7 @@ foo_xorb (__m512i a, __m512i b) foo = m1 ^ m2; } -/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */ void foo_andb (__m512i a, __m512i b) @@ -40,4 +40,4 @@ foo_andnb (__m512i a, __m512i b) foo = m1 & ~m2; } -/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { ! ia32 } } } } } */