diff mbox series

Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'

Message ID 20230710011714.3615931-1-hongtao.liu@intel.com
State New
Headers show
Series Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' | expand

Commit Message

Liu, Hongtao July 10, 2023, 1:17 a.m. UTC
False dependency happens when destination is only updated by
pternlog. There is no false dependency when destination is also used
in source. So either a pxor should be inserted, or input operand
should be set with constraint '0'.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

	PR target/110438
	PR target/110202
	* config/i386/predicates.md
	(int_float_vector_all_ones_operand): New predicate.
	* config/i386/sse.md (*vmov<mode>_constm1_pternlog_false_dep): New
	define_insn.
	(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
	Ditto.
	(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
	Ditto.
	(*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to
	define_insn_and_split to avoid false dependence.
	(*<avx512>_cvtmask2<ssemodesuffix><mode>): Ditto.
	(<mask_codefor>one_cmpl<mode>2<mask_name>): Adjust constraint
	of operands 1 to '0' to avoid false dependence.
	(*andnot<mode>3): Ditto.
	(iornot<mode>3): Ditto.
	(*<nlogic><mode>3): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr110438.c: New test.
---
 gcc/config/i386/predicates.md            |   8 +-
 gcc/config/i386/sse.md                   | 113 ++++++++++++++++++++---
 gcc/testsuite/gcc.target/i386/pr110438.c |  30 ++++++
 3 files changed, 135 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c

Comments

Alexander Monakov July 10, 2023, 4:23 p.m. UTC | #1
On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote:

> False dependency happens when destination is only updated by
> pternlog. There is no false dependency when destination is also used
> in source. So either a pxor should be inserted, or input operand
> should be set with constraint '0'.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ready to push to trunk.

Shouldn't this patch also remove uses of vpternlog in
standard_sse_constant_opcode?

A couple more questions below:

> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal"
>  	      ]
>  	      (symbol_ref "true")))])
>  
> +; False dependency happens on destination register which is not really
> +; used when moving all ones to vector register
> +(define_split
> +  [(set (match_operand:VMOVE 0 "register_operand")
> +	(match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> +  "TARGET_AVX512F && reload_completed
> +  && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
> +  && optimize_function_for_speed_p (cfun)"

Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate.
Doesn't it work here as well?

> +  [(set (match_dup 0) (match_dup 2))
> +   (parallel
> +     [(set (match_dup 0) (match_dup 1))
> +      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> +  "operands[2] = CONST0_RTX (<MODE>mode);")
> +
> +(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
> +  [(set (match_operand:VMOVE 0 "register_operand" "=v")
> +	(match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
> +   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> +   "TARGET_AVX512VL || <MODE_SIZE> == 64"
> +   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
> +  [(set_attr "type" "sselog1")
> +   (set_attr "prefix" "evex")])
> +
>  ;; If mem_addr points to a memory region with less than whole vector size bytes
>  ;; of accessible memory and k is a mask that would prevent reading the inaccessible
>  ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
> @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
>      operands[3] = CONST0_RTX (<MODE>mode);
>    }")
>  
> -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
>    [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
>  	(vec_merge:VI48_AVX512VL
>  	  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> @@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
>    "@
>     vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
>     vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> +  "&& !TARGET_AVX512DQ && reload_completed
> +   && optimize_function_for_speed_p (cfun)"
> +  [(set (match_dup 0) (match_dup 4))
> +   (parallel
> +    [(set (match_dup 0)
> +	  (vec_merge:VI48_AVX512VL
> +	    (match_dup 2)
> +	    (match_dup 3)
> +	    (match_dup 1)))
> +     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> +  "operands[4] = CONST0_RTX (<MODE>mode);"
>    [(set_attr "isa" "avx512dq,*")
>     (set_attr "length_immediate" "0,1")
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>  
> +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
> +  [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> +	(vec_merge:VI48_AVX512VL
> +	  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> +	  (match_operand:VI48_AVX512VL 3 "const0_operand")
> +	  (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
> +   (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> +  "TARGET_AVX512F && !TARGET_AVX512DQ"
> +  "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> +  [(set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_expand "extendv2sfv2df2"
>    [(set (match_operand:V2DF 0 "register_operand")
>  	(float_extend:V2DF
> @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2"
>      operands[2] = force_reg (<MODE>mode, operands[2]);
>  })
>  
> -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> -  [(set (match_operand:VI 0 "register_operand" "=v,v")
> -	(xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
> -		(match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
> +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
> +  [(set (match_operand:VI 0 "register_operand" "=v,v,v")
> +	(xor:VI (match_operand:VI 1 "bcst_vector_operand"     " 0, m,Br")
> +		(match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
>    "TARGET_AVX512F
>     && (!<mask_applied>
>         || <ssescalarmode>mode == SImode
>         || <ssescalarmode>mode == DImode)"
>  {
> +  if (!<mask_applied> && which_alternative
> +      && optimize_function_for_speed_p (cfun))
> +    return "#";
> +
>    if (TARGET_AVX512VL)
>      return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
>    else
>      return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
>  }
> +  "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
> +   && optimize_function_for_speed_p (cfun)"
> +  [(set (match_dup 0) (match_dup 3))
> +   (parallel
> +     [(set (match_dup 0)
> +	   (xor:VI (match_dup 1) (match_dup 2)))
> +      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> +  "operands[3] = CONST0_RTX (<MODE>mode);"

Perhaps I'm misreading this, but this seems to result in

  vpxor zmm0, zmm0
  vpternlog zmm0, zmm0, [mem], 0x55

while in the PR the agreement was to emit

  vmovdq? zmm0, [mem]
  vpternlog zmm0, zmm0, zmm0, 0x55

when the source is in memory, because the former has three uops in fused domain?

>    [(set_attr "type" "sselog")

>     (set_attr "prefix" "evex")
>     (set (attr "mode")
> @@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
>  		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
>  		      (const_int 1)))])
>  
> +(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
> +  [(set (match_operand:VI 0 "register_operand" "=v,v")
> +	(xor:VI (match_operand:VI 1 "bcst_vector_operand"     "m, Br")
> +		(match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))
> +   (unspec [(match_operand:VI 3 "register_operand" "0,0")]
> +     UNSPEC_INSN_FALSE_DEP)]
> +  "TARGET_AVX512F"
> +{
> +  if (TARGET_AVX512VL)
> +    return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> +  else
> +    return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> +}
> +  [(set_attr "type" "sselog")
> +   (set_attr "prefix" "evex")
> +   (set (attr "mode")
> +        (if_then_else (match_test "TARGET_AVX512VL")
> +		      (const_string "<sseinsnmode>")
> +		      (const_string "XI")))
> +   (set (attr "enabled")
> +	(if_then_else (eq_attr "alternative" "0")
> +		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> +		      (const_int 1)))])
> +
>  (define_split
>    [(set (match_operand:VI48_AVX512F 0 "register_operand")
>  	(vec_duplicate:VI48_AVX512F
> @@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3"
>    [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
>  	(and:VI
>  	  (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
> -	  (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
> +	  (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
>    "TARGET_SSE
>     && (register_operand (operands[1], <MODE>mode)
>         || register_operand (operands[2], <MODE>mode))"
> @@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3"
>    [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
>  	(ior:VI
>  	  (not:VI
> -	    (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
> -	  (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
> +	    (match_operand:VI 1 "bcst_vector_operand" "0,m,  0,vBr"))
> +	  (match_operand:VI 2 "bcst_vector_operand"   "m,0,vBr,  0")))]
>    "(<MODE_SIZE> == 64 || TARGET_AVX512VL
>      || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
>     && (register_operand (operands[1], <MODE>mode)
> @@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3"
>  		      (const_string "<sseinsnmode>")
>  		      (const_string "XI")))
>     (set (attr "enabled")
> -	(if_then_else (eq_attr "alternative" "2,3")
> +	(if_then_else (eq_attr "alternative" "0,1")
>  		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
>  		      (const_string "*")))])
>  
> @@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3"
>    [(set (match_operand:VI 0 "register_operand" "=v,v")
>  	(not:VI
>  	  (xor:VI
> -	    (match_operand:VI 1 "bcst_vector_operand" "%v,v")
> -	    (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> +	    (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
> +	    (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
>    "(<MODE_SIZE> == 64 || TARGET_AVX512VL
>      || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
>     && (register_operand (operands[1], <MODE>mode)
> @@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3"
>  		      (const_string "<sseinsnmode>")
>  		      (const_string "XI")))
>     (set (attr "enabled")
> -	(if_then_else (eq_attr "alternative" "1")
> +	(if_then_else (eq_attr "alternative" "0")
>  		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
>  		      (const_string "*")))])
>  
> @@ -17749,8 +17832,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
>  (define_insn "*<nlogic><mode>3"
>    [(set (match_operand:VI 0 "register_operand" "=v,v")
>  	(andor:VI
> -	  (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> -	  (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> +	  (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
> +	  (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
>    "(<MODE_SIZE> == 64 || TARGET_AVX512VL
>      || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
>     && (register_operand (operands[1], <MODE>mode)
> @@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3"
>  		      (const_string "<sseinsnmode>")
>  		      (const_string "XI")))
>     (set (attr "enabled")
> -	(if_then_else (eq_attr "alternative" "1")
> +	(if_then_else (eq_attr "alternative" "0")
>  		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
>  		      (const_string "*")))])
>  
> diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
> new file mode 100644
> index 00000000000..11b8cc59fd2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr110438.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
> +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
> +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
> +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
> +
> +
> +#include <immintrin.h>
> +
> +__m512i g(void)
> +{
> +  return (__m512i){ 0 } - 1;
> +}
> +
> +__m512i g1(__m512i* a)
> +{
> +  return ~(*a);
> +}
> +
> +void
> +foo (int* a, int* __restrict b)
> +{
> +  for (int i = 0; i != 16; i++)
> +    {
> +      if (b[i])
> +	a[i] = -1;
> +      else
> +	a[i] = 0;
> +    }
> +}
>
Hongtao Liu July 11, 2023, 12:03 a.m. UTC | #2
On Tue, Jul 11, 2023 at 12:24 AM Alexander Monakov via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
> On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote:
>
> > False dependency happens when destination is only updated by
> > pternlog. There is no false dependency when destination is also used
> > in source. So either a pxor should be inserted, or input operand
> > should be set with constraint '0'.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ready to push to trunk.
>
> Shouldn't this patch also remove uses of vpternlog in
> standard_sse_constant_opcode?
It's still needed when !optimize_function_for_speed_p (cfun).
>
> A couple more questions below:
>
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal"
> >             ]
> >             (symbol_ref "true")))])
> >
> > +; False dependency happens on destination register which is not really
> > +; used when moving all ones to vector register
> > +(define_split
> > +  [(set (match_operand:VMOVE 0 "register_operand")
> > +     (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> > +  "TARGET_AVX512F && reload_completed
> > +  && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
> > +  && optimize_function_for_speed_p (cfun)"
>
> Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate.
> Doesn't it work here as well?
I'm just aligned with lzcnt/popcnt case, the difference between
option_insn_for_speed_p and optimized_function_for_speed_p is the
former will consider
!crtl->maybe_hot_insn_p but the latter just returns
!optimize_function_for_size_p (cfun). It looks
optimize_insn_for_speed_p() is more reasonable for single insn.

 350optimize_insn_for_size_p (void)
 351{
 352  enum optimize_size_level ret = optimize_function_for_size_p (cfun);
 353  if (ret < OPTIMIZE_SIZE_BALANCED && !crtl->maybe_hot_insn_p)
 354    ret = OPTIMIZE_SIZE_BALANCED;
 355  return ret;

>
> > +  [(set (match_dup 0) (match_dup 2))
> > +   (parallel
> > +     [(set (match_dup 0) (match_dup 1))
> > +      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > +  "operands[2] = CONST0_RTX (<MODE>mode);")
> > +
> > +(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
> > +  [(set (match_operand:VMOVE 0 "register_operand" "=v")
> > +     (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
> > +   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> > +   "TARGET_AVX512VL || <MODE_SIZE> == 64"
> > +   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
> > +  [(set_attr "type" "sselog1")
> > +   (set_attr "prefix" "evex")])
> > +
> >  ;; If mem_addr points to a memory region with less than whole vector size bytes
> >  ;; of accessible memory and k is a mask that would prevent reading the inaccessible
> >  ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
> > @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
> >      operands[3] = CONST0_RTX (<MODE>mode);
> >    }")
> >
> > -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> > +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> >    [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
> >       (vec_merge:VI48_AVX512VL
> >         (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> > @@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> >    "@
> >     vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
> >     vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> > +  "&& !TARGET_AVX512DQ && reload_completed
> > +   && optimize_function_for_speed_p (cfun)"
> > +  [(set (match_dup 0) (match_dup 4))
> > +   (parallel
> > +    [(set (match_dup 0)
> > +       (vec_merge:VI48_AVX512VL
> > +         (match_dup 2)
> > +         (match_dup 3)
> > +         (match_dup 1)))
> > +     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > +  "operands[4] = CONST0_RTX (<MODE>mode);"
> >    [(set_attr "isa" "avx512dq,*")
> >     (set_attr "length_immediate" "0,1")
> >     (set_attr "prefix" "evex")
> >     (set_attr "mode" "<sseinsnmode>")])
> >
> > +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
> > +  [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> > +     (vec_merge:VI48_AVX512VL
> > +       (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> > +       (match_operand:VI48_AVX512VL 3 "const0_operand")
> > +       (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
> > +   (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> > +  "TARGET_AVX512F && !TARGET_AVX512DQ"
> > +  "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> > +  [(set_attr "length_immediate" "1")
> > +   (set_attr "prefix" "evex")
> > +   (set_attr "mode" "<sseinsnmode>")])
> > +
> >  (define_expand "extendv2sfv2df2"
> >    [(set (match_operand:V2DF 0 "register_operand")
> >       (float_extend:V2DF
> > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2"
> >      operands[2] = force_reg (<MODE>mode, operands[2]);
> >  })
> >
> > -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> > -  [(set (match_operand:VI 0 "register_operand" "=v,v")
> > -     (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
> > -             (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
> > +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
> > +  [(set (match_operand:VI 0 "register_operand" "=v,v,v")
> > +     (xor:VI (match_operand:VI 1 "bcst_vector_operand"     " 0, m,Br")
> > +             (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
> >    "TARGET_AVX512F
> >     && (!<mask_applied>
> >         || <ssescalarmode>mode == SImode
> >         || <ssescalarmode>mode == DImode)"
> >  {
> > +  if (!<mask_applied> && which_alternative
> > +      && optimize_function_for_speed_p (cfun))
> > +    return "#";
> > +
> >    if (TARGET_AVX512VL)
> >      return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> >    else
> >      return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> >  }
> > +  "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
> > +   && optimize_function_for_speed_p (cfun)"
> > +  [(set (match_dup 0) (match_dup 3))
> > +   (parallel
> > +     [(set (match_dup 0)
> > +        (xor:VI (match_dup 1) (match_dup 2)))
> > +      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > +  "operands[3] = CONST0_RTX (<MODE>mode);"
>
> Perhaps I'm misreading this, but this seems to result in
>
>   vpxor zmm0, zmm0
>   vpternlog zmm0, zmm0, [mem], 0x55
>
I thought the first alternative (v,0,BC) would handle that, looks not,
i'll adjust the splitter to explicitly put operands[1] into
operands[0] when it's memory.
> while in the PR the agreement was to emit
>
>   vmovdq? zmm0, [mem]
>   vpternlog zmm0, zmm0, zmm0, 0x55
>
> when the source is in memory, because the former has three uops in fused domain?
>
> >    [(set_attr "type" "sselog")
>
> >     (set_attr "prefix" "evex")
> >     (set (attr "mode")
> > @@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> >                     (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> >                     (const_int 1)))])
> >
> > +(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
> > +  [(set (match_operand:VI 0 "register_operand" "=v,v")
> > +     (xor:VI (match_operand:VI 1 "bcst_vector_operand"     "m, Br")
> > +             (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))
> > +   (unspec [(match_operand:VI 3 "register_operand" "0,0")]
> > +     UNSPEC_INSN_FALSE_DEP)]
> > +  "TARGET_AVX512F"
> > +{
> > +  if (TARGET_AVX512VL)
> > +    return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> > +  else
> > +    return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> > +}
> > +  [(set_attr "type" "sselog")
> > +   (set_attr "prefix" "evex")
> > +   (set (attr "mode")
> > +        (if_then_else (match_test "TARGET_AVX512VL")
> > +                   (const_string "<sseinsnmode>")
> > +                   (const_string "XI")))
> > +   (set (attr "enabled")
> > +     (if_then_else (eq_attr "alternative" "0")
> > +                   (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > +                   (const_int 1)))])
> > +
> >  (define_split
> >    [(set (match_operand:VI48_AVX512F 0 "register_operand")
> >       (vec_duplicate:VI48_AVX512F
> > @@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3"
> >    [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
> >       (and:VI
> >         (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
> > -       (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
> > +       (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
> >    "TARGET_SSE
> >     && (register_operand (operands[1], <MODE>mode)
> >         || register_operand (operands[2], <MODE>mode))"
> > @@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3"
> >    [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
> >       (ior:VI
> >         (not:VI
> > -         (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
> > -       (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
> > +         (match_operand:VI 1 "bcst_vector_operand" "0,m,  0,vBr"))
> > +       (match_operand:VI 2 "bcst_vector_operand"   "m,0,vBr,  0")))]
> >    "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> >      || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> >     && (register_operand (operands[1], <MODE>mode)
> > @@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3"
> >                     (const_string "<sseinsnmode>")
> >                     (const_string "XI")))
> >     (set (attr "enabled")
> > -     (if_then_else (eq_attr "alternative" "2,3")
> > +     (if_then_else (eq_attr "alternative" "0,1")
> >                     (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> >                     (const_string "*")))])
> >
> > @@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3"
> >    [(set (match_operand:VI 0 "register_operand" "=v,v")
> >       (not:VI
> >         (xor:VI
> > -         (match_operand:VI 1 "bcst_vector_operand" "%v,v")
> > -         (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > +         (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
> > +         (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
> >    "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> >      || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> >     && (register_operand (operands[1], <MODE>mode)
> > @@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3"
> >                     (const_string "<sseinsnmode>")
> >                     (const_string "XI")))
> >     (set (attr "enabled")
> > -     (if_then_else (eq_attr "alternative" "1")
> > +     (if_then_else (eq_attr "alternative" "0")
> >                     (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> >                     (const_string "*")))])
> >
> > @@ -17749,8 +17832,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> >  (define_insn "*<nlogic><mode>3"
> >    [(set (match_operand:VI 0 "register_operand" "=v,v")
> >       (andor:VI
> > -       (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> > -       (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > +       (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
> > +       (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
> >    "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> >      || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> >     && (register_operand (operands[1], <MODE>mode)
> > @@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3"
> >                     (const_string "<sseinsnmode>")
> >                     (const_string "XI")))
> >     (set (attr "enabled")
> > -     (if_then_else (eq_attr "alternative" "1")
> > +     (if_then_else (eq_attr "alternative" "0")
> >                     (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> >                     (const_string "*")))])
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
> > new file mode 100644
> > index 00000000000..11b8cc59fd2
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr110438.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
> > +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
> > +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
> > +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
> > +
> > +
> > +#include <immintrin.h>
> > +
> > +__m512i g(void)
> > +{
> > +  return (__m512i){ 0 } - 1;
> > +}
> > +
> > +__m512i g1(__m512i* a)
> > +{
> > +  return ~(*a);
> > +}
> > +
> > +void
> > +foo (int* a, int* __restrict b)
> > +{
> > +  for (int i = 0; i != 16; i++)
> > +    {
> > +      if (b[i])
> > +     a[i] = -1;
> > +      else
> > +     a[i] = 0;
> > +    }
> > +}
> >
diff mbox series

Patch

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7ddbe01a6f9..37d20c6303a 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@  (define_predicate "float_vector_all_ones_operand"
     return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
        (match_test "INTEGRAL_MODE_P (GET_MODE (op))")
        (match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+       (match_operand 0 "float_vector_all_ones_operand")
+       (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
    that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 418c337a775..56920a3e1d3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,29 @@  (define_insn "mov<mode>_internal"
 	      ]
 	      (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+	(match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
+  && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+     [(set (match_dup 0) (match_dup 1))
+      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (<MODE>mode);")
+
+(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+	(match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL || <MODE_SIZE> == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
@@ -9336,7 +9359,7 @@  (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
     operands[3] = CONST0_RTX (<MODE>mode);
   }")
 
-(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
+(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
 	(vec_merge:VI48_AVX512VL
 	  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9346,11 +9369,35 @@  (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
   "@
    vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
    vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+  "&& !TARGET_AVX512DQ && reload_completed
+   && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+    [(set (match_dup 0)
+	  (vec_merge:VI48_AVX512VL
+	    (match_dup 2)
+	    (match_dup 3)
+	    (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] = CONST0_RTX (<MODE>mode);"
   [(set_attr "isa" "avx512dq,*")
    (set_attr "length_immediate" "0,1")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
+	(vec_merge:VI48_AVX512VL
+	  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
+	  (match_operand:VI48_AVX512VL 3 "const0_operand")
+	  (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
+   (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_AVX512F && !TARGET_AVX512DQ"
+  "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+  [(set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_expand "extendv2sfv2df2"
   [(set (match_operand:V2DF 0 "register_operand")
 	(float_extend:V2DF
@@ -17166,20 +17213,32 @@  (define_expand "one_cmpl<mode>2"
     operands[2] = force_reg (<MODE>mode, operands[2]);
 })
 
-(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
-  [(set (match_operand:VI 0 "register_operand" "=v,v")
-	(xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
-		(match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
+(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
+  [(set (match_operand:VI 0 "register_operand" "=v,v,v")
+	(xor:VI (match_operand:VI 1 "bcst_vector_operand"     " 0, m,Br")
+		(match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
   "TARGET_AVX512F
    && (!<mask_applied>
        || <ssescalarmode>mode == SImode
        || <ssescalarmode>mode == DImode)"
 {
+  if (!<mask_applied> && which_alternative
+      && optimize_function_for_speed_p (cfun))
+    return "#";
+
   if (TARGET_AVX512VL)
     return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
   else
     return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
 }
+  "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
+   && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 3))
+   (parallel
+     [(set (match_dup 0)
+	   (xor:VI (match_dup 1) (match_dup 2)))
+      (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[3] = CONST0_RTX (<MODE>mode);"
   [(set_attr "type" "sselog")
    (set_attr "prefix" "evex")
    (set (attr "mode")
@@ -17191,6 +17250,30 @@  (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
 		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
 		      (const_int 1)))])
 
+(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
+  [(set (match_operand:VI 0 "register_operand" "=v,v")
+	(xor:VI (match_operand:VI 1 "bcst_vector_operand"     "m, Br")
+		(match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))
+   (unspec [(match_operand:VI 3 "register_operand" "0,0")]
+     UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_AVX512F"
+{
+  if (TARGET_AVX512VL)
+    return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
+  else
+    return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set (attr "mode")
+        (if_then_else (match_test "TARGET_AVX512VL")
+		      (const_string "<sseinsnmode>")
+		      (const_string "XI")))
+   (set (attr "enabled")
+	(if_then_else (eq_attr "alternative" "0")
+		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+		      (const_int 1)))])
+
 (define_split
   [(set (match_operand:VI48_AVX512F 0 "register_operand")
 	(vec_duplicate:VI48_AVX512F
@@ -17226,7 +17309,7 @@  (define_insn "*andnot<mode>3"
   [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
 	(and:VI
 	  (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
-	  (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
+	  (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
   "TARGET_SSE
    && (register_operand (operands[1], <MODE>mode)
        || register_operand (operands[2], <MODE>mode))"
@@ -17685,8 +17768,8 @@  (define_insn "*iornot<mode>3"
   [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
 	(ior:VI
 	  (not:VI
-	    (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
-	  (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
+	    (match_operand:VI 1 "bcst_vector_operand" "0,m,  0,vBr"))
+	  (match_operand:VI 2 "bcst_vector_operand"   "m,0,vBr,  0")))]
   "(<MODE_SIZE> == 64 || TARGET_AVX512VL
     || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
    && (register_operand (operands[1], <MODE>mode)
@@ -17710,7 +17793,7 @@  (define_insn "*iornot<mode>3"
 		      (const_string "<sseinsnmode>")
 		      (const_string "XI")))
    (set (attr "enabled")
-	(if_then_else (eq_attr "alternative" "2,3")
+	(if_then_else (eq_attr "alternative" "0,1")
 		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
 		      (const_string "*")))])
 
@@ -17718,8 +17801,8 @@  (define_insn "*xnor<mode>3"
   [(set (match_operand:VI 0 "register_operand" "=v,v")
 	(not:VI
 	  (xor:VI
-	    (match_operand:VI 1 "bcst_vector_operand" "%v,v")
-	    (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+	    (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
+	    (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
   "(<MODE_SIZE> == 64 || TARGET_AVX512VL
     || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
    && (register_operand (operands[1], <MODE>mode)
@@ -17738,7 +17821,7 @@  (define_insn "*xnor<mode>3"
 		      (const_string "<sseinsnmode>")
 		      (const_string "XI")))
    (set (attr "enabled")
-	(if_then_else (eq_attr "alternative" "1")
+	(if_then_else (eq_attr "alternative" "0")
 		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
 		      (const_string "*")))])
 
@@ -17749,8 +17832,8 @@  (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
 (define_insn "*<nlogic><mode>3"
   [(set (match_operand:VI 0 "register_operand" "=v,v")
 	(andor:VI
-	  (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
-	  (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+	  (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
+	  (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
   "(<MODE_SIZE> == 64 || TARGET_AVX512VL
     || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
    && (register_operand (operands[1], <MODE>mode)
@@ -17769,7 +17852,7 @@  (define_insn "*<nlogic><mode>3"
 		      (const_string "<sseinsnmode>")
 		      (const_string "XI")))
    (set (attr "enabled")
-	(if_then_else (eq_attr "alternative" "1")
+	(if_then_else (eq_attr "alternative" "0")
 		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
 		      (const_string "*")))])
 
diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
new file mode 100644
index 00000000000..11b8cc59fd2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110438.c
@@ -0,0 +1,30 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
+/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
+/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
+
+
+#include <immintrin.h>
+
+__m512i g(void)
+{
+  return (__m512i){ 0 } - 1;
+}
+
+__m512i g1(__m512i* a)
+{
+  return ~(*a);
+}
+
+void
+foo (int* a, int* __restrict b)
+{
+  for (int i = 0; i != 16; i++)
+    {
+      if (b[i])
+	a[i] = -1;
+      else
+	a[i] = 0;
+    }
+}