diff mbox series

Improve memcmpeq for 512-bit vector with vpcmpeq + kortest.

Message ID 20231027054736.3529877-1-hongtao.liu@intel.com
State New
Headers show
Series Improve memcmpeq for 512-bit vector with vpcmpeq + kortest. | expand

Commit Message

Liu, Hongtao Oct. 27, 2023, 5:47 a.m. UTC
When 2 vectors are equal, kmask is allones and kortest will set CF,
else CF will be cleared.

So CF bit can be used to check for the result of the comparison.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

Before:
        vmovdqu (%rsi), %ymm0
        vpxorq  (%rdi), %ymm0, %ymm0
        vptest  %ymm0, %ymm0
        jne     .L2
        vmovdqu 32(%rsi), %ymm0
        vpxorq  32(%rdi), %ymm0, %ymm0
        vptest  %ymm0, %ymm0
        je      .L5
.L2:
        movl    $1, %eax
        xorl    $1, %eax
        vzeroupper
        ret

After:
        vmovdqu64       (%rsi), %zmm0
        xorl    %eax, %eax
        vpcmpeqd        (%rdi), %zmm0, %k0
        kortestw        %k0, %k0
        setc    %al
        vzeroupper
        ret

gcc/ChangeLog:

	PR target/104610
	* config/i386/i386-expand.cc (ix86_expand_branch): Handle
	512-bit vector with vpcmpeq + kortest.
	* config/i386/i386.md (cbranchxi4): New expander.
	* config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
	and V8DImode.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr104610-2.c: New test.
---
 gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
 gcc/config/i386/i386.md                    | 16 +++++++
 gcc/config/i386/sse.md                     | 36 +++++++++++---
 gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
 4 files changed, 99 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c

Comments

Richard Biener Oct. 27, 2023, 6:49 a.m. UTC | #1
> Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao.liu@intel.com>:
> 
> When 2 vectors are equal, kmask is allones and kortest will set CF,
> else CF will be cleared.
> 
> So CF bit can be used to check for the result of the comparison.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

Is that also profitable for 256bit aka AVX10?
Is there a jump on carry in case the result feeds control flow rather than a value and is using ktest better then (does combine figure this out?)

> Before:
>        vmovdqu (%rsi), %ymm0
>        vpxorq  (%rdi), %ymm0, %ymm0
>        vptest  %ymm0, %ymm0
>        jne     .L2
>        vmovdqu 32(%rsi), %ymm0
>        vpxorq  32(%rdi), %ymm0, %ymm0
>        vptest  %ymm0, %ymm0
>        je      .L5
> .L2:
>        movl    $1, %eax
>        xorl    $1, %eax
>        vzeroupper
>        ret
> 
> After:
>        vmovdqu64       (%rsi), %zmm0
>        xorl    %eax, %eax
>        vpcmpeqd        (%rdi), %zmm0, %k0
>        kortestw        %k0, %k0
>        setc    %al
>        vzeroupper
>        ret
> 
> gcc/ChangeLog:
> 
>    PR target/104610
>    * config/i386/i386-expand.cc (ix86_expand_branch): Handle
>    512-bit vector with vpcmpeq + kortest.
>    * config/i386/i386.md (cbranchxi4): New expander.
>    * config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
>    and V8DImode.
> 
> gcc/testsuite/ChangeLog:
> 
>    * gcc.target/i386/pr104610-2.c: New test.
> ---
> gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
> gcc/config/i386/i386.md                    | 16 +++++++
> gcc/config/i386/sse.md                     | 36 +++++++++++---
> gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
> 4 files changed, 99 insertions(+), 22 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
> 
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 1eae9d7c78c..c664cb61e80 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
>   rtx tmp;
> 
>   /* Handle special case - vector comparsion with boolean result, transform
> -     it using ptest instruction.  */
> +     it using ptest instruction or vpcmpeq + kortest.  */
>   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
>       || (mode == TImode && !TARGET_64BIT)
> -      || mode == OImode)
> +      || mode == OImode
> +      || GET_MODE_SIZE (mode) == 64)
>     {
> -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
> -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
> +      unsigned msize = GET_MODE_SIZE (mode);
> +      machine_mode p_mode
> +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
> +      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
> +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
> 
>       gcc_assert (code == EQ || code == NE);
> 
> -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
> +      if (msize == 64)
>    {
> -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> -      mode = p_mode;
> +      if (mode != V16SImode)
> +        {
> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> +        }
> +
> +      tmp = gen_reg_rtx (HImode);
> +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
> +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
> +    }
> +      /* Using ptest for 128/256-bit vectors.  */
> +      else
> +    {
> +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> +        {
> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> +          mode = p_mode;
> +        }
> +
> +      /* Generate XOR since we can't check that one operand is zero
> +         vector.  */
> +      tmp = gen_reg_rtx (mode);
> +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> +      tmp = gen_lowpart (p_mode, tmp);
> +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> +                  gen_rtx_UNSPEC (CCZmode,
> +                          gen_rtvec (2, tmp, tmp),
> +                          UNSPEC_PTEST)));
>    }
> -      /* Generate XOR since we can't check that one operand is zero vector.  */
> -      tmp = gen_reg_rtx (mode);
> -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> -      tmp = gen_lowpart (p_mode, tmp);
> -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> -                  gen_rtx_UNSPEC (CCZmode,
> -                          gen_rtvec (2, tmp, tmp),
> -                          UNSPEC_PTEST)));
>       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
>       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
>                  gen_rtx_LABEL_REF (VOIDmode, label),
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index abaf2f311e8..51d8d0c3b97 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
>   DONE;
> })
> 
> +(define_expand "cbranchxi4"
> +  [(set (reg:CC FLAGS_REG)
> +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
> +            (match_operand:XI 2 "nonimmediate_operand")))
> +   (set (pc) (if_then_else
> +           (match_operator 0 "bt_comparison_operator"
> +        [(reg:CC FLAGS_REG) (const_int 0)])
> +           (label_ref (match_operand 3))
> +           (pc)))]
> +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
> +{
> +  ix86_expand_branch (GET_CODE (operands[0]),
> +              operands[1], operands[2], operands[3]);
> +  DONE;
> +})
> +
> (define_expand "cstore<mode>4"
>   [(set (reg:CC FLAGS_REG)
>    (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index c988935d4df..88fb1154699 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
>    (set_attr "type" "msklog")
>    (set_attr "prefix" "vex")])
> 
> -(define_insn "kortest<mode>"
> -  [(set (reg:CC FLAGS_REG)
> -    (unspec:CC
> +(define_insn "*kortest<mode>"
> +  [(set (reg FLAGS_REG)
> +    (unspec
>      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
>       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
>      UNSPEC_KORTEST))]
> @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
>    (set_attr "type" "msklog")
>    (set_attr "prefix" "vex")])
> 
> +(define_insn "kortest<mode>_ccc"
> +  [(set (reg:CCC FLAGS_REG)
> +    (unspec:CCC
> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> +      UNSPEC_KORTEST))]
> +  "TARGET_AVX512F")
> +
> +(define_insn "kortest<mode>_ccz"
> +  [(set (reg:CCZ FLAGS_REG)
> +    (unspec:CCZ
> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> +      UNSPEC_KORTEST))]
> +  "TARGET_AVX512F")
> +
> +(define_expand "kortest<mode>"
> +  [(set (reg:CC FLAGS_REG)
> +    (unspec:CC
> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> +      UNSPEC_KORTEST))]
> +  "TARGET_AVX512F")
> +
> (define_insn "kunpckhi"
>   [(set (match_operand:HI 0 "register_operand" "=k")
>    (ior:HI
> @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
> 
> (define_expand "cbranch<mode>4"
>   [(set (reg:CC FLAGS_REG)
> -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
> -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
> +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
> +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
>    (set (pc) (if_then_else
>           (match_operator 0 "bt_comparison_operator"
>        [(reg:CC FLAGS_REG) (const_int 0)])
>           (label_ref (match_operand 3))
>           (pc)))]
> -  "TARGET_SSE4_1"
> +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
> {
>   ix86_expand_branch (GET_CODE (operands[0]),
>              operands[1], operands[2], operands[3]);
> diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> new file mode 100644
> index 00000000000..999ef926a18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
> +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
> +
> +int compare (const char* s1, const char* s2)
> +{
> +  return __builtin_memcmp (s1, s2, 64) == 0;
> +}
> +
> +int compare1 (const char* s1, const char* s2)
> +{
> +  return __builtin_memcmp (s1, s2, 64) != 0;
> +}
> -- 
> 2.31.1
>
Hongtao Liu Oct. 27, 2023, 7:21 a.m. UTC | #2
On Fri, Oct 27, 2023 at 2:49 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
>
>
> > Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao.liu@intel.com>:
> >
> > When 2 vectors are equal, kmask is allones and kortest will set CF,
> > else CF will be cleared.
> >
> > So CF bit can be used to check for the result of the comparison.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
>
> Is that also profitable for 256bit aka AVX10?
Yes, it's also available for both 128-bit and 256-bit with AVX10, from
performance perspective it's better.
AVX10:
  vpcmp + kortest
 vs
AVX2:
 vpxor + vptest

 vptest is more expensive than vpcmp + kortest

> Is there a jump on carry in case the result feeds control flow rather than a value and is using ktest better then (does combine figure this out?)
There are JC and JNC, there're many pattern matches for ptest which
can't be automatically adjusted to kortest by combining, backend needs
to manually transform them.
That's why my patch only handles 64-bit vectors(to avoid regressing
those pattern match stuff).

>
> > Before:
> >        vmovdqu (%rsi), %ymm0
> >        vpxorq  (%rdi), %ymm0, %ymm0
> >        vptest  %ymm0, %ymm0
> >        jne     .L2
> >        vmovdqu 32(%rsi), %ymm0
> >        vpxorq  32(%rdi), %ymm0, %ymm0
> >        vptest  %ymm0, %ymm0
> >        je      .L5
> > .L2:
> >        movl    $1, %eax
> >        xorl    $1, %eax
> >        vzeroupper
> >        ret
> >
> > After:
> >        vmovdqu64       (%rsi), %zmm0
> >        xorl    %eax, %eax
> >        vpcmpeqd        (%rdi), %zmm0, %k0
> >        kortestw        %k0, %k0
> >        setc    %al
> >        vzeroupper
> >        ret
> >
> > gcc/ChangeLog:
> >
> >    PR target/104610
> >    * config/i386/i386-expand.cc (ix86_expand_branch): Handle
> >    512-bit vector with vpcmpeq + kortest.
> >    * config/i386/i386.md (cbranchxi4): New expander.
> >    * config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
> >    and V8DImode.
> >
> > gcc/testsuite/ChangeLog:
> >
> >    * gcc.target/i386/pr104610-2.c: New test.
> > ---
> > gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
> > gcc/config/i386/i386.md                    | 16 +++++++
> > gcc/config/i386/sse.md                     | 36 +++++++++++---
> > gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
> > 4 files changed, 99 insertions(+), 22 deletions(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index 1eae9d7c78c..c664cb61e80 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
> >   rtx tmp;
> >
> >   /* Handle special case - vector comparsion with boolean result, transform
> > -     it using ptest instruction.  */
> > +     it using ptest instruction or vpcmpeq + kortest.  */
> >   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> >       || (mode == TImode && !TARGET_64BIT)
> > -      || mode == OImode)
> > +      || mode == OImode
> > +      || GET_MODE_SIZE (mode) == 64)
> >     {
> > -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
> > -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
> > +      unsigned msize = GET_MODE_SIZE (mode);
> > +      machine_mode p_mode
> > +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
> > +      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
> > +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
> >
> >       gcc_assert (code == EQ || code == NE);
> >
> > -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
> > +      if (msize == 64)
> >    {
> > -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > -      mode = p_mode;
> > +      if (mode != V16SImode)
> > +        {
> > +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > +        }
> > +
> > +      tmp = gen_reg_rtx (HImode);
> > +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
> > +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
> > +    }
> > +      /* Using ptest for 128/256-bit vectors.  */
> > +      else
> > +    {
> > +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > +        {
> > +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > +          mode = p_mode;
> > +        }
> > +
> > +      /* Generate XOR since we can't check that one operand is zero
> > +         vector.  */
> > +      tmp = gen_reg_rtx (mode);
> > +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> > +      tmp = gen_lowpart (p_mode, tmp);
> > +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> > +                  gen_rtx_UNSPEC (CCZmode,
> > +                          gen_rtvec (2, tmp, tmp),
> > +                          UNSPEC_PTEST)));
> >    }
> > -      /* Generate XOR since we can't check that one operand is zero vector.  */
> > -      tmp = gen_reg_rtx (mode);
> > -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> > -      tmp = gen_lowpart (p_mode, tmp);
> > -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> > -                  gen_rtx_UNSPEC (CCZmode,
> > -                          gen_rtvec (2, tmp, tmp),
> > -                          UNSPEC_PTEST)));
> >       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
> >       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
> >                  gen_rtx_LABEL_REF (VOIDmode, label),
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > index abaf2f311e8..51d8d0c3b97 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
> >   DONE;
> > })
> >
> > +(define_expand "cbranchxi4"
> > +  [(set (reg:CC FLAGS_REG)
> > +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
> > +            (match_operand:XI 2 "nonimmediate_operand")))
> > +   (set (pc) (if_then_else
> > +           (match_operator 0 "bt_comparison_operator"
> > +        [(reg:CC FLAGS_REG) (const_int 0)])
> > +           (label_ref (match_operand 3))
> > +           (pc)))]
> > +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
> > +{
> > +  ix86_expand_branch (GET_CODE (operands[0]),
> > +              operands[1], operands[2], operands[3]);
> > +  DONE;
> > +})
> > +
> > (define_expand "cstore<mode>4"
> >   [(set (reg:CC FLAGS_REG)
> >    (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index c988935d4df..88fb1154699 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
> >    (set_attr "type" "msklog")
> >    (set_attr "prefix" "vex")])
> >
> > -(define_insn "kortest<mode>"
> > -  [(set (reg:CC FLAGS_REG)
> > -    (unspec:CC
> > +(define_insn "*kortest<mode>"
> > +  [(set (reg FLAGS_REG)
> > +    (unspec
> >      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
> >       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
> >      UNSPEC_KORTEST))]
> > @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
> >    (set_attr "type" "msklog")
> >    (set_attr "prefix" "vex")])
> >
> > +(define_insn "kortest<mode>_ccc"
> > +  [(set (reg:CCC FLAGS_REG)
> > +    (unspec:CCC
> > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > +      UNSPEC_KORTEST))]
> > +  "TARGET_AVX512F")
> > +
> > +(define_insn "kortest<mode>_ccz"
> > +  [(set (reg:CCZ FLAGS_REG)
> > +    (unspec:CCZ
> > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > +      UNSPEC_KORTEST))]
> > +  "TARGET_AVX512F")
> > +
> > +(define_expand "kortest<mode>"
> > +  [(set (reg:CC FLAGS_REG)
> > +    (unspec:CC
> > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > +      UNSPEC_KORTEST))]
> > +  "TARGET_AVX512F")
> > +
> > (define_insn "kunpckhi"
> >   [(set (match_operand:HI 0 "register_operand" "=k")
> >    (ior:HI
> > @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
> >
> > (define_expand "cbranch<mode>4"
> >   [(set (reg:CC FLAGS_REG)
> > -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
> > -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
> > +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
> > +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
> >    (set (pc) (if_then_else
> >           (match_operator 0 "bt_comparison_operator"
> >        [(reg:CC FLAGS_REG) (const_int 0)])
> >           (label_ref (match_operand 3))
> >           (pc)))]
> > -  "TARGET_SSE4_1"
> > +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
> > {
> >   ix86_expand_branch (GET_CODE (operands[0]),
> >              operands[1], operands[2], operands[3]);
> > diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> > new file mode 100644
> > index 00000000000..999ef926a18
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
> > +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
> > +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
> > +
> > +int compare (const char* s1, const char* s2)
> > +{
> > +  return __builtin_memcmp (s1, s2, 64) == 0;
> > +}
> > +
> > +int compare1 (const char* s1, const char* s2)
> > +{
> > +  return __builtin_memcmp (s1, s2, 64) != 0;
> > +}
> > --
> > 2.31.1
> >
Hongtao Liu Oct. 27, 2023, 7:37 a.m. UTC | #3
On Fri, Oct 27, 2023 at 3:21 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Oct 27, 2023 at 2:49 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> >
> >
> > > Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao.liu@intel.com>:
> > >
> > > When 2 vectors are equal, kmask is allones and kortest will set CF,
> > > else CF will be cleared.
> > >
> > > So CF bit can be used to check for the result of the comparison.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk?
> >
> > Is that also profitable for 256bit aka AVX10?
> Yes, it's also available for both 128-bit and 256-bit with AVX10, from
> performance perspective it's better.
> AVX10:
>   vpcmp + kortest
>  vs
> AVX2:
>  vpxor + vptest
>
>  vptest is more expensive than vpcmp + kortest
>
> > Is there a jump on carry in case the result feeds control flow rather than a value and is using ktest better then (does combine figure this out?)
> There are JC and JNC, there're many pattern matches for ptest which
> can't be automatically adjusted to kortest by combining, backend needs
> to manually transform them.
> That's why my patch only handles 64-bit vectors(to avoid regressing
I mean 64 bytes.
> those pattern match stuff).
>
> >
> > > Before:
> > >        vmovdqu (%rsi), %ymm0
> > >        vpxorq  (%rdi), %ymm0, %ymm0
> > >        vptest  %ymm0, %ymm0
> > >        jne     .L2
> > >        vmovdqu 32(%rsi), %ymm0
> > >        vpxorq  32(%rdi), %ymm0, %ymm0
> > >        vptest  %ymm0, %ymm0
> > >        je      .L5
> > > .L2:
> > >        movl    $1, %eax
> > >        xorl    $1, %eax
> > >        vzeroupper
> > >        ret
> > >
> > > After:
> > >        vmovdqu64       (%rsi), %zmm0
> > >        xorl    %eax, %eax
> > >        vpcmpeqd        (%rdi), %zmm0, %k0
> > >        kortestw        %k0, %k0
> > >        setc    %al
> > >        vzeroupper
> > >        ret
> > >
> > > gcc/ChangeLog:
> > >
> > >    PR target/104610
> > >    * config/i386/i386-expand.cc (ix86_expand_branch): Handle
> > >    512-bit vector with vpcmpeq + kortest.
> > >    * config/i386/i386.md (cbranchxi4): New expander.
> > >    * config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
> > >    and V8DImode.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >    * gcc.target/i386/pr104610-2.c: New test.
> > > ---
> > > gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
> > > gcc/config/i386/i386.md                    | 16 +++++++
> > > gcc/config/i386/sse.md                     | 36 +++++++++++---
> > > gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
> > > 4 files changed, 99 insertions(+), 22 deletions(-)
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
> > >
> > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > > index 1eae9d7c78c..c664cb61e80 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
> > >   rtx tmp;
> > >
> > >   /* Handle special case - vector comparsion with boolean result, transform
> > > -     it using ptest instruction.  */
> > > +     it using ptest instruction or vpcmpeq + kortest.  */
> > >   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> > >       || (mode == TImode && !TARGET_64BIT)
> > > -      || mode == OImode)
> > > +      || mode == OImode
> > > +      || GET_MODE_SIZE (mode) == 64)
> > >     {
> > > -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
> > > -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
> > > +      unsigned msize = GET_MODE_SIZE (mode);
> > > +      machine_mode p_mode
> > > +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
> > > +      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
> > > +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
> > >
> > >       gcc_assert (code == EQ || code == NE);
> > >
> > > -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > > +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
> > > +      if (msize == 64)
> > >    {
> > > -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > > -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > > -      mode = p_mode;
> > > +      if (mode != V16SImode)
> > > +        {
> > > +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > > +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > > +        }
> > > +
> > > +      tmp = gen_reg_rtx (HImode);
> > > +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
> > > +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
> > > +    }
> > > +      /* Using ptest for 128/256-bit vectors.  */
> > > +      else
> > > +    {
> > > +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > > +        {
> > > +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> > > +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> > > +          mode = p_mode;
> > > +        }
> > > +
> > > +      /* Generate XOR since we can't check that one operand is zero
> > > +         vector.  */
> > > +      tmp = gen_reg_rtx (mode);
> > > +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> > > +      tmp = gen_lowpart (p_mode, tmp);
> > > +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> > > +                  gen_rtx_UNSPEC (CCZmode,
> > > +                          gen_rtvec (2, tmp, tmp),
> > > +                          UNSPEC_PTEST)));
> > >    }
> > > -      /* Generate XOR since we can't check that one operand is zero vector.  */
> > > -      tmp = gen_reg_rtx (mode);
> > > -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> > > -      tmp = gen_lowpart (p_mode, tmp);
> > > -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> > > -                  gen_rtx_UNSPEC (CCZmode,
> > > -                          gen_rtvec (2, tmp, tmp),
> > > -                          UNSPEC_PTEST)));
> > >       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
> > >       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
> > >                  gen_rtx_LABEL_REF (VOIDmode, label),
> > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > > index abaf2f311e8..51d8d0c3b97 100644
> > > --- a/gcc/config/i386/i386.md
> > > +++ b/gcc/config/i386/i386.md
> > > @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
> > >   DONE;
> > > })
> > >
> > > +(define_expand "cbranchxi4"
> > > +  [(set (reg:CC FLAGS_REG)
> > > +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
> > > +            (match_operand:XI 2 "nonimmediate_operand")))
> > > +   (set (pc) (if_then_else
> > > +           (match_operator 0 "bt_comparison_operator"
> > > +        [(reg:CC FLAGS_REG) (const_int 0)])
> > > +           (label_ref (match_operand 3))
> > > +           (pc)))]
> > > +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
> > > +{
> > > +  ix86_expand_branch (GET_CODE (operands[0]),
> > > +              operands[1], operands[2], operands[3]);
> > > +  DONE;
> > > +})
> > > +
> > > (define_expand "cstore<mode>4"
> > >   [(set (reg:CC FLAGS_REG)
> > >    (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index c988935d4df..88fb1154699 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
> > >    (set_attr "type" "msklog")
> > >    (set_attr "prefix" "vex")])
> > >
> > > -(define_insn "kortest<mode>"
> > > -  [(set (reg:CC FLAGS_REG)
> > > -    (unspec:CC
> > > +(define_insn "*kortest<mode>"
> > > +  [(set (reg FLAGS_REG)
> > > +    (unspec
> > >      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
> > >       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
> > >      UNSPEC_KORTEST))]
> > > @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
> > >    (set_attr "type" "msklog")
> > >    (set_attr "prefix" "vex")])
> > >
> > > +(define_insn "kortest<mode>_ccc"
> > > +  [(set (reg:CCC FLAGS_REG)
> > > +    (unspec:CCC
> > > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > > +      UNSPEC_KORTEST))]
> > > +  "TARGET_AVX512F")
> > > +
> > > +(define_insn "kortest<mode>_ccz"
> > > +  [(set (reg:CCZ FLAGS_REG)
> > > +    (unspec:CCZ
> > > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > > +      UNSPEC_KORTEST))]
> > > +  "TARGET_AVX512F")
> > > +
> > > +(define_expand "kortest<mode>"
> > > +  [(set (reg:CC FLAGS_REG)
> > > +    (unspec:CC
> > > +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> > > +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> > > +      UNSPEC_KORTEST))]
> > > +  "TARGET_AVX512F")
> > > +
> > > (define_insn "kunpckhi"
> > >   [(set (match_operand:HI 0 "register_operand" "=k")
> > >    (ior:HI
> > > @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
> > >
> > > (define_expand "cbranch<mode>4"
> > >   [(set (reg:CC FLAGS_REG)
> > > -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
> > > -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
> > > +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
> > > +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
> > >    (set (pc) (if_then_else
> > >           (match_operator 0 "bt_comparison_operator"
> > >        [(reg:CC FLAGS_REG) (const_int 0)])
> > >           (label_ref (match_operand 3))
> > >           (pc)))]
> > > -  "TARGET_SSE4_1"
> > > +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
> > > {
> > >   ix86_expand_branch (GET_CODE (operands[0]),
> > >              operands[1], operands[2], operands[3]);
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> > > new file mode 100644
> > > index 00000000000..999ef926a18
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> > > @@ -0,0 +1,14 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
> > > +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
> > > +
> > > +int compare (const char* s1, const char* s2)
> > > +{
> > > +  return __builtin_memcmp (s1, s2, 64) == 0;
> > > +}
> > > +
> > > +int compare1 (const char* s1, const char* s2)
> > > +{
> > > +  return __builtin_memcmp (s1, s2, 64) != 0;
> > > +}
> > > --
> > > 2.31.1
> > >
>
>
>
> --
> BR,
> Hongtao
Richard Biener Oct. 27, 2023, 10:16 a.m. UTC | #4
> Am 27.10.2023 um 09:13 schrieb Hongtao Liu <crazylht@gmail.com>:
> 
> On Fri, Oct 27, 2023 at 2:49 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
>> 
>> 
>> 
>>>> Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao.liu@intel.com>:
>>> 
>>> When 2 vectors are equal, kmask is allones and kortest will set CF,
>>> else CF will be cleared.
>>> 
>>> So CF bit can be used to check for the result of the comparison.
>>> 
>>> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>>> Ok for trunk?
>> 
>> Is that also profitable for 256bit aka AVX10?
> Yes, it's also available for both 128-bit and 256-bit with AVX10, from
> performance perspective it's better.
> AVX10:
>  vpcmp + kortest
> vs
> AVX2:
> vpxor + vptest
> 
> vptest is more expensive than vpcmp + kortest
> 
>> Is there a jump on carry in case the result feeds control flow rather than a value and is using ktest better then (does combine figure this out?)
> There are JC and JNC, there're many pattern matches for ptest which
> can't be automatically adjusted to kortest by combining, backend needs
> to manually transform them.
> That's why my patch only handles 64-bit vectors(to avoid regressing
> those pattern match stuff).

Ah, I see.  That’s exactly what I was wondering.

Richard 

> 
>> 
>>> Before:
>>>       vmovdqu (%rsi), %ymm0
>>>       vpxorq  (%rdi), %ymm0, %ymm0
>>>       vptest  %ymm0, %ymm0
>>>       jne     .L2
>>>       vmovdqu 32(%rsi), %ymm0
>>>       vpxorq  32(%rdi), %ymm0, %ymm0
>>>       vptest  %ymm0, %ymm0
>>>       je      .L5
>>> .L2:
>>>       movl    $1, %eax
>>>       xorl    $1, %eax
>>>       vzeroupper
>>>       ret
>>> 
>>> After:
>>>       vmovdqu64       (%rsi), %zmm0
>>>       xorl    %eax, %eax
>>>       vpcmpeqd        (%rdi), %zmm0, %k0
>>>       kortestw        %k0, %k0
>>>       setc    %al
>>>       vzeroupper
>>>       ret
>>> 
>>> gcc/ChangeLog:
>>> 
>>>   PR target/104610
>>>   * config/i386/i386-expand.cc (ix86_expand_branch): Handle
>>>   512-bit vector with vpcmpeq + kortest.
>>>   * config/i386/i386.md (cbranchxi4): New expander.
>>>   * config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
>>>   and V8DImode.
>>> 
>>> gcc/testsuite/ChangeLog:
>>> 
>>>   * gcc.target/i386/pr104610-2.c: New test.
>>> ---
>>> gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
>>> gcc/config/i386/i386.md                    | 16 +++++++
>>> gcc/config/i386/sse.md                     | 36 +++++++++++---
>>> gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
>>> 4 files changed, 99 insertions(+), 22 deletions(-)
>>> create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> 
>>> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
>>> index 1eae9d7c78c..c664cb61e80 100644
>>> --- a/gcc/config/i386/i386-expand.cc
>>> +++ b/gcc/config/i386/i386-expand.cc
>>> @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
>>>  rtx tmp;
>>> 
>>>  /* Handle special case - vector comparsion with boolean result, transform
>>> -     it using ptest instruction.  */
>>> +     it using ptest instruction or vpcmpeq + kortest.  */
>>>  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
>>>      || (mode == TImode && !TARGET_64BIT)
>>> -      || mode == OImode)
>>> +      || mode == OImode
>>> +      || GET_MODE_SIZE (mode) == 64)
>>>    {
>>> -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
>>> -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
>>> +      unsigned msize = GET_MODE_SIZE (mode);
>>> +      machine_mode p_mode
>>> +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
>>> +      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
>>> +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
>>> 
>>>      gcc_assert (code == EQ || code == NE);
>>> 
>>> -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
>>> +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
>>> +      if (msize == 64)
>>>   {
>>> -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> -      mode = p_mode;
>>> +      if (mode != V16SImode)
>>> +        {
>>> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> +        }
>>> +
>>> +      tmp = gen_reg_rtx (HImode);
>>> +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
>>> +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
>>> +    }
>>> +      /* Using ptest for 128/256-bit vectors.  */
>>> +      else
>>> +    {
>>> +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
>>> +        {
>>> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> +          mode = p_mode;
>>> +        }
>>> +
>>> +      /* Generate XOR since we can't check that one operand is zero
>>> +         vector.  */
>>> +      tmp = gen_reg_rtx (mode);
>>> +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
>>> +      tmp = gen_lowpart (p_mode, tmp);
>>> +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
>>> +                  gen_rtx_UNSPEC (CCZmode,
>>> +                          gen_rtvec (2, tmp, tmp),
>>> +                          UNSPEC_PTEST)));
>>>   }
>>> -      /* Generate XOR since we can't check that one operand is zero vector.  */
>>> -      tmp = gen_reg_rtx (mode);
>>> -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
>>> -      tmp = gen_lowpart (p_mode, tmp);
>>> -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
>>> -                  gen_rtx_UNSPEC (CCZmode,
>>> -                          gen_rtvec (2, tmp, tmp),
>>> -                          UNSPEC_PTEST)));
>>>      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
>>>      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
>>>                 gen_rtx_LABEL_REF (VOIDmode, label),
>>> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
>>> index abaf2f311e8..51d8d0c3b97 100644
>>> --- a/gcc/config/i386/i386.md
>>> +++ b/gcc/config/i386/i386.md
>>> @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
>>>  DONE;
>>> })
>>> 
>>> +(define_expand "cbranchxi4"
>>> +  [(set (reg:CC FLAGS_REG)
>>> +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
>>> +            (match_operand:XI 2 "nonimmediate_operand")))
>>> +   (set (pc) (if_then_else
>>> +           (match_operator 0 "bt_comparison_operator"
>>> +        [(reg:CC FLAGS_REG) (const_int 0)])
>>> +           (label_ref (match_operand 3))
>>> +           (pc)))]
>>> +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
>>> +{
>>> +  ix86_expand_branch (GET_CODE (operands[0]),
>>> +              operands[1], operands[2], operands[3]);
>>> +  DONE;
>>> +})
>>> +
>>> (define_expand "cstore<mode>4"
>>>  [(set (reg:CC FLAGS_REG)
>>>   (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
>>> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>>> index c988935d4df..88fb1154699 100644
>>> --- a/gcc/config/i386/sse.md
>>> +++ b/gcc/config/i386/sse.md
>>> @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
>>>   (set_attr "type" "msklog")
>>>   (set_attr "prefix" "vex")])
>>> 
>>> -(define_insn "kortest<mode>"
>>> -  [(set (reg:CC FLAGS_REG)
>>> -    (unspec:CC
>>> +(define_insn "*kortest<mode>"
>>> +  [(set (reg FLAGS_REG)
>>> +    (unspec
>>>     [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
>>>      (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
>>>     UNSPEC_KORTEST))]
>>> @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
>>>   (set_attr "type" "msklog")
>>>   (set_attr "prefix" "vex")])
>>> 
>>> +(define_insn "kortest<mode>_ccc"
>>> +  [(set (reg:CCC FLAGS_REG)
>>> +    (unspec:CCC
>>> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> +      UNSPEC_KORTEST))]
>>> +  "TARGET_AVX512F")
>>> +
>>> +(define_insn "kortest<mode>_ccz"
>>> +  [(set (reg:CCZ FLAGS_REG)
>>> +    (unspec:CCZ
>>> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> +      UNSPEC_KORTEST))]
>>> +  "TARGET_AVX512F")
>>> +
>>> +(define_expand "kortest<mode>"
>>> +  [(set (reg:CC FLAGS_REG)
>>> +    (unspec:CC
>>> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> +      UNSPEC_KORTEST))]
>>> +  "TARGET_AVX512F")
>>> +
>>> (define_insn "kunpckhi"
>>>  [(set (match_operand:HI 0 "register_operand" "=k")
>>>   (ior:HI
>>> @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
>>> 
>>> (define_expand "cbranch<mode>4"
>>>  [(set (reg:CC FLAGS_REG)
>>> -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
>>> -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
>>> +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
>>> +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
>>>   (set (pc) (if_then_else
>>>          (match_operator 0 "bt_comparison_operator"
>>>       [(reg:CC FLAGS_REG) (const_int 0)])
>>>          (label_ref (match_operand 3))
>>>          (pc)))]
>>> -  "TARGET_SSE4_1"
>>> +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
>>> {
>>>  ix86_expand_branch (GET_CODE (operands[0]),
>>>             operands[1], operands[2], operands[3]);
>>> diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> new file mode 100644
>>> index 00000000000..999ef926a18
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> @@ -0,0 +1,14 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
>>> +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
>>> +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
>>> +
>>> +int compare (const char* s1, const char* s2)
>>> +{
>>> +  return __builtin_memcmp (s1, s2, 64) == 0;
>>> +}
>>> +
>>> +int compare1 (const char* s1, const char* s2)
>>> +{
>>> +  return __builtin_memcmp (s1, s2, 64) != 0;
>>> +}
>>> --
>>> 2.31.1
>>> 
> 
> 
> 
> -- 
> BR,
> Hongtao
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1eae9d7c78c..c664cb61e80 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2411,30 +2411,53 @@  ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
   rtx tmp;
 
   /* Handle special case - vector comparsion with boolean result, transform
-     it using ptest instruction.  */
+     it using ptest instruction or vpcmpeq + kortest.  */
   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
       || (mode == TImode && !TARGET_64BIT)
-      || mode == OImode)
+      || mode == OImode
+      || GET_MODE_SIZE (mode) == 64)
     {
-      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
-      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
+      unsigned msize = GET_MODE_SIZE (mode);
+      machine_mode p_mode
+	= msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
+      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
+      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
 
       gcc_assert (code == EQ || code == NE);
 
-      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
+      if (msize == 64)
 	{
-	  op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
-	  op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
-	  mode = p_mode;
+	  if (mode != V16SImode)
+	    {
+	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
+	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
+	    }
+
+	  tmp = gen_reg_rtx (HImode);
+	  emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
+	  emit_insn (gen_kortesthi_ccc (tmp, tmp));
+	}
+      /* Using ptest for 128/256-bit vectors.  */
+      else
+	{
+	  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+	    {
+	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
+	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
+	      mode = p_mode;
+	    }
+
+	  /* Generate XOR since we can't check that one operand is zero
+	     vector.  */
+	  tmp = gen_reg_rtx (mode);
+	  emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
+	  tmp = gen_lowpart (p_mode, tmp);
+	  emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+				  gen_rtx_UNSPEC (CCZmode,
+						  gen_rtvec (2, tmp, tmp),
+						  UNSPEC_PTEST)));
 	}
-      /* Generate XOR since we can't check that one operand is zero vector.  */
-      tmp = gen_reg_rtx (mode);
-      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
-      tmp = gen_lowpart (p_mode, tmp);
-      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
-			      gen_rtx_UNSPEC (CCZmode,
-					      gen_rtvec (2, tmp, tmp),
-					      UNSPEC_PTEST)));
       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
 				  gen_rtx_LABEL_REF (VOIDmode, label),
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index abaf2f311e8..51d8d0c3b97 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1442,6 +1442,22 @@  (define_expand "cbranchoi4"
   DONE;
 })
 
+(define_expand "cbranchxi4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:XI 1 "nonimmediate_operand")
+		    (match_operand:XI 2 "nonimmediate_operand")))
+   (set (pc) (if_then_else
+	       (match_operator 0 "bt_comparison_operator"
+		[(reg:CC FLAGS_REG) (const_int 0)])
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
 (define_expand "cstore<mode>4"
   [(set (reg:CC FLAGS_REG)
 	(compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c988935d4df..88fb1154699 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2175,9 +2175,9 @@  (define_insn "ktest<mode>"
    (set_attr "type" "msklog")
    (set_attr "prefix" "vex")])
 
-(define_insn "kortest<mode>"
-  [(set (reg:CC FLAGS_REG)
-	(unspec:CC
+(define_insn "*kortest<mode>"
+  [(set (reg FLAGS_REG)
+	(unspec
 	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
 	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
 	  UNSPEC_KORTEST))]
@@ -2187,6 +2187,30 @@  (define_insn "kortest<mode>"
    (set_attr "type" "msklog")
    (set_attr "prefix" "vex")])
 
+(define_insn "kortest<mode>_ccc"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC
+	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+	  UNSPEC_KORTEST))]
+  "TARGET_AVX512F")
+
+(define_insn "kortest<mode>_ccz"
+  [(set (reg:CCZ FLAGS_REG)
+	(unspec:CCZ
+	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+	  UNSPEC_KORTEST))]
+  "TARGET_AVX512F")
+
+(define_expand "kortest<mode>"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+	   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+	  UNSPEC_KORTEST))]
+  "TARGET_AVX512F")
+
 (define_insn "kunpckhi"
   [(set (match_operand:HI 0 "register_operand" "=k")
 	(ior:HI
@@ -27840,14 +27864,14 @@  (define_insn "<avx512>_store<mode>_mask"
 
 (define_expand "cbranch<mode>4"
   [(set (reg:CC FLAGS_REG)
-	(compare:CC (match_operand:VI48_AVX 1 "register_operand")
-		    (match_operand:VI48_AVX 2 "nonimmediate_operand")))
+	(compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
+		    (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
    (set (pc) (if_then_else
 	       (match_operator 0 "bt_comparison_operator"
 		[(reg:CC FLAGS_REG) (const_int 0)])
 	       (label_ref (match_operand 3))
 	       (pc)))]
-  "TARGET_SSE4_1"
+  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
 {
   ix86_expand_branch (GET_CODE (operands[0]),
 		      operands[1], operands[2], operands[3]);
diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
new file mode 100644
index 00000000000..999ef926a18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
+/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
+
+int compare (const char* s1, const char* s2)
+{
+  return __builtin_memcmp (s1, s2, 64) == 0;
+}
+
+int compare1 (const char* s1, const char* s2)
+{
+  return __builtin_memcmp (s1, s2, 64) != 0;
+}