diff mbox series

Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).

Message ID 20210602053907.86765-2-hongtao.liu@intel.com
State New
Headers show
Series Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)). | expand

Commit Message

Liu, Hongtao June 2, 2021, 5:39 a.m. UTC
For i386, it will enable below opt

from
	notl    %edi
      	vpbroadcastd    %edi, %xmm0
      	vpand   %xmm1, %xmm0, %xmm0
to
      	vpbroadcastd    %edi, %xmm0
      	vpandn   %xmm1, %xmm0, %xmm0

gcc/ChangeLog:

	PR target/100711
	* simplify-rtx.c (simplify_unary_operation_1):
	Canonicalize (vec_duplicate (not A)) to
	(not (vec_duplicate A)).
	* doc/md.texi (Insn Canonicalizations): Document
	canonicalization of vec_duplicate.

gcc/testsuite/ChangeLog:

	PR target/100711
	* gcc.target/i386/avx2-pr100711.c: New test.
	* gcc.target/i386/avx512bw-pr100711.c: New test.
---
 gcc/doc/md.texi                               |  5 ++
 gcc/simplify-rtx.c                            |  6 ++
 gcc/testsuite/gcc.target/i386/avx2-pr100711.c | 73 +++++++++++++++++++
 .../gcc.target/i386/avx512bw-pr100711.c       | 48 ++++++++++++
 4 files changed, 132 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr100711.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c

Comments

Richard Biener June 2, 2021, 7:07 a.m. UTC | #1
On Wed, Jun 2, 2021 at 7:41 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> For i386, it will enable below opt
>
> from
>         notl    %edi
>         vpbroadcastd    %edi, %xmm0
>         vpand   %xmm1, %xmm0, %xmm0
> to
>         vpbroadcastd    %edi, %xmm0
>         vpandn   %xmm1, %xmm0, %xmm0

There will be cases where (vec_duplicate (not A)) is better
than (not (vec_duplicate A)), so I'm not sure it is a good idea
to forcefully canonicalize unary operations.  I suppose the
simplification happens inside combine - doesn't combine
already have code to try variants of an expression and isn't
this a good candidate that can be added there, avoiding
the canonicalization?

Richard.

> gcc/ChangeLog:
>
>         PR target/100711
>         * simplify-rtx.c (simplify_unary_operation_1):
>         Canonicalize (vec_duplicate (not A)) to
>         (not (vec_duplicate A)).
>         * doc/md.texi (Insn Canonicalizations): Document
>         canonicalization of vec_duplicate.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/100711
>         * gcc.target/i386/avx2-pr100711.c: New test.
>         * gcc.target/i386/avx512bw-pr100711.c: New test.
> ---
>  gcc/doc/md.texi                               |  5 ++
>  gcc/simplify-rtx.c                            |  6 ++
>  gcc/testsuite/gcc.target/i386/avx2-pr100711.c | 73 +++++++++++++++++++
>  .../gcc.target/i386/avx512bw-pr100711.c       | 48 ++++++++++++
>  4 files changed, 132 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr100711.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
>
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 0e65b3ae663..06b42901413 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -8297,6 +8297,11 @@ operand of @code{mult} is also a shift, then that is extended also.
>  This transformation is only applied when it can be proven that the
>  original operation had sufficient precision to prevent overflow.
>
> +@cindex @code{vec_duplicate}, canonicalization of
> +@item
> +@code{(vec_duplicate (not @var{a}))} is converted to
> +@code{(not (vec_duplicate @var{a}))}.
> +
>  @end itemize
>
>  Further canonicalization rules are defined in the function
> diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
> index 04423bbd195..171fc447d50 100644
> --- a/gcc/simplify-rtx.c
> +++ b/gcc/simplify-rtx.c
> @@ -1708,6 +1708,12 @@ simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode,
>  #endif
>        break;
>
> +      /* Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).  */
> +    case VEC_DUPLICATE:
> +      if (GET_CODE (op) == NOT)
> +       return gen_rtx_NOT (mode, gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0)));
> +      break;
> +
>      default:
>        break;
>      }
> diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr100711.c b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
> new file mode 100644
> index 00000000000..5b144623873
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
> @@ -0,0 +1,73 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -O2" } */
> +/* { dg-final { scan-assembler-times "pandn" 8 } } */
> +/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
> +typedef char v16qi __attribute__((vector_size(16)));
> +typedef char v32qi __attribute__((vector_size(32)));
> +typedef short v8hi __attribute__((vector_size(16)));
> +typedef short v16hi __attribute__((vector_size(32)));
> +typedef int v4si __attribute__((vector_size(16)));
> +typedef int v8si __attribute__((vector_size(32)));
> +typedef long long v2di __attribute__((vector_size(16)));
> +typedef long long v4di __attribute__((vector_size(32)));
> +
> +v16qi
> +f1 (char a, v16qi c)
> +{
> +  char b = ~a;
> +  return (__extension__(v16qi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v32qi
> +f2 (char a, v32qi c)
> +{
> +  char b = ~a;
> +  return (__extension__(v32qi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v8hi
> +f3 (short a, v8hi c)
> +{
> +  short b = ~a;
> +  return (__extension__(v8hi) {b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v16hi
> +f4 (short a, v16hi c)
> +{
> +  short b = ~a;
> +  return (__extension__(v16hi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v4si
> +f5 (int a, v4si c)
> +{
> +  int b = ~a;
> +  return (__extension__(v4si) {b, b, b, b}) & c;
> +}
> +
> +v8si
> +f6 (int a, v8si c)
> +{
> +  int b = ~a;
> +  return (__extension__(v8si) {b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v2di
> +f7 (long long a, v2di c)
> +{
> +  long long b = ~a;
> +  return (__extension__(v2di) {b, b}) & c;
> +}
> +
> +v4di
> +f8 (long long a, v4di c)
> +{
> +  long long b = ~a;
> +  return (__extension__(v4di) {b, b, b, b}) & c;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
> new file mode 100644
> index 00000000000..f0a103d0bc2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
> @@ -0,0 +1,48 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -O2" } */
> +/* { dg-final { scan-assembler-times "pandn" 4 } } */
> +/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
> +
> +typedef char v64qi __attribute__((vector_size(64)));
> +typedef short v32hi __attribute__((vector_size(64)));
> +typedef int v16si __attribute__((vector_size(64)));
> +typedef long long v8di __attribute__((vector_size(64)));
> +
> +v64qi
> +f1 (char a, v64qi c)
> +{
> +  char b = ~a;
> +  return (__extension__(v64qi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v32hi
> +f2 (short a, v32hi c)
> +{
> +  short b = ~a;
> +  return (__extension__(v32hi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v16si
> +f3 (int a, v16si c)
> +{
> +  int b = ~a;
> +  return (__extension__(v16si) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v8di
> +f4 (long long a, v8di c)
> +{
> +  long long b = ~a;
> +  return (__extension__(v8di) {b, b, b, b, b, b, b, b}) & c;
> +}
> --
> 2.18.1
>
Segher Boessenkool June 2, 2021, 8:46 p.m. UTC | #2
Hi!

On Wed, Jun 02, 2021 at 09:07:35AM +0200, Richard Biener wrote:
> On Wed, Jun 2, 2021 at 7:41 AM liuhongt via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> > For i386, it will enable below opt
> >
> > from
> >         notl    %edi
> >         vpbroadcastd    %edi, %xmm0
> >         vpand   %xmm1, %xmm0, %xmm0
> > to
> >         vpbroadcastd    %edi, %xmm0
> >         vpandn   %xmm1, %xmm0, %xmm0
> 
> There will be cases where (vec_duplicate (not A)) is better
> than (not (vec_duplicate A)), so I'm not sure it is a good idea
> to forcefully canonicalize unary operations.

It is two unaries in sequence, where the order does not matter either.
As in all such cases you either have to handle both cases everywhere, or
have a canonical order.

> I suppose the
> simplification happens inside combine

combine uses simplify-rtx for most cases (it is part of combine, but
used in quite a few other places these days).

> - doesn't combine
> already have code to try variants of an expression and isn't
> this a good candidate that can be added there, avoiding
> the canonicalization?

As I mentioned, this is done in simplify-rtx in cases that do not have a
canonical representation.  This is critical because it prevents loops.

A very typical example is how UMIN is optimised:

   case UMIN:
      if (trueop1 == CONST0_RTX (mode) && ! side_effects_p (op0))
	return op1;
      if (rtx_equal_p (trueop0, trueop1) && ! side_effects_p (op0))
	return op0;
      tem = simplify_associative_operation (code, mode, op0, op1);
      if (tem)
	return tem;
      break;

(the stuff using "tem").

Hongtao, can we do something similar here?  Does that work well?  Please
try it out :-)


Segher
Li, Pan2 via Gcc-patches June 3, 2021, 11:03 a.m. UTC | #3
>-----Original Message-----
>From: Segher Boessenkool <segher@kernel.crashing.org>
>Sent: Thursday, June 3, 2021 4:46 AM
>To: Richard Biener <richard.guenther@gmail.com>
>Cc: Liu, Hongtao <hongtao.liu@intel.com>; GCC Patches <gcc-
>patches@gcc.gnu.org>
>Subject: Re: [PATCH] Canonicalize (vec_duplicate (not A)) to (not
>(vec_duplicate A)).
>
>Hi!
>
>On Wed, Jun 02, 2021 at 09:07:35AM +0200, Richard Biener wrote:
>> On Wed, Jun 2, 2021 at 7:41 AM liuhongt via Gcc-patches
>> <gcc-patches@gcc.gnu.org> wrote:
>> > For i386, it will enable below opt
>> >
>> > from
>> >         notl    %edi
>> >         vpbroadcastd    %edi, %xmm0
>> >         vpand   %xmm1, %xmm0, %xmm0
>> > to
>> >         vpbroadcastd    %edi, %xmm0
>> >         vpandn   %xmm1, %xmm0, %xmm0
>>
>> There will be cases where (vec_duplicate (not A)) is better than (not
>> (vec_duplicate A)), so I'm not sure it is a good idea to forcefully
>> canonicalize unary operations.
>
>It is two unaries in sequence, where the order does not matter either.
>As in all such cases you either have to handle both cases everywhere, or have
>a canonical order.
>
>> I suppose the
>> simplification happens inside combine
>
>combine uses simplify-rtx for most cases (it is part of combine, but used in
>quite a few other places these days).
>
>> - doesn't combine
>> already have code to try variants of an expression and isn't this a
>> good candidate that can be added there, avoiding the canonicalization?
>
>As I mentioned, this is done in simplify-rtx in cases that do not have a
>canonical representation.  This is critical because it prevents loops.
>
>A very typical example is how UMIN is optimised:
>
>   case UMIN:
>      if (trueop1 == CONST0_RTX (mode) && ! side_effects_p (op0))
>	return op1;
>      if (rtx_equal_p (trueop0, trueop1) && ! side_effects_p (op0))
>	return op0;
>      tem = simplify_associative_operation (code, mode, op0, op1);
>      if (tem)
>	return tem;
>      break;
>
>(the stuff using "tem").
>
>Hongtao, can we do something similar here?  Does that work well?  Please try
>it out :-)

In simplify_rtx, no simplification occurs; there is just the difference between
 (vec_duplicate (not REG)) and (not (vec_duplicate REG)). So here tem will only be 0.
Basically we don't know it's a simplification until combine successfully splits the
3->2 instructions (not + broadcast + and to andnot + broadcast), but it's pretty awkward
to do this in combine.

Considering andnot exists for many backends, I think a canonicalization is needed here.
Maybe we can add an insn canonicalization for transforming (and (vec_duplicate (not A)) B) to
(and (not (vec_duplicate A)) B) instead of (vec_duplicate (not A)) to (not (vec_duplicate A))?

>
>
>Segher
Jakub Jelinek June 3, 2021, 11:06 a.m. UTC | #4
On Thu, Jun 03, 2021 at 11:03:43AM +0000, Liu, Hongtao via Gcc-patches wrote:
> In simplify_rtx, no simplication occurs, there is just the difference between
>  (vec_duplicate (not REG)) and (not (vec_duplicate (REG)). So here tem will only be 0.
> Basically we don't know it's a simplication until combine successfully split the
> 3->2 instructions (not + broadcast + and to andnot + broadcast), but it's pretty awkward
> to do this in combine.
> 
> Consider andnot is existed for many backends, I think a canonicalization is needed here.
> Maybe we can add insn canonicalization for transforming (and (vect_duplicate (not A)) B) to 
> (and (not (duplicate (not A)) B) instead of (vec_duplicate (not A)) to (not (vec_duplicate A))?

For the (not (vec_duplicate)) vs. (vec_duplicate (not)) it isn't clear which
one is generally a win on major targets, so I'd say it is better to add a
combine splitter to swap it in backends that want that.

	Jakub
Segher Boessenkool June 3, 2021, 7:59 p.m. UTC | #5
On Thu, Jun 03, 2021 at 11:03:43AM +0000, Liu, Hongtao wrote:
> >A very typical example is how UMIN is optimised:
> >
> >   case UMIN:
> >      if (trueop1 == CONST0_RTX (mode) && ! side_effects_p (op0))
> >	return op1;
> >      if (rtx_equal_p (trueop0, trueop1) && ! side_effects_p (op0))
> >	return op0;
> >      tem = simplify_associative_operation (code, mode, op0, op1);
> >      if (tem)
> >	return tem;
> >      break;
> >
> >(the stuff using "tem").
> >
> >Hongtao, can we do something similar here?  Does that work well?  Please try
> >it out :-)
> 
> In simplify_rtx, no simplication occurs, there is just the difference between
>  (vec_duplicate (not REG)) and (not (vec_duplicate (REG)). So here tem will only be 0.

simplify-rtx is used by combine.  When you do and+not+splat for example
my suggestion should kick in.  Try it out, don't just dismiss it?

> Basically we don't know it's a simplication until combine successfully split the
> 3->2 instructions (not + broadcast + and to andnot + broadcast), but it's pretty awkward
> to do this in combine.

But you need to do this *before* it is split.  That is the whole point.

> Consider andnot is existed for many backends, I think a canonicalization is needed here.

Please do note that that is not as easy as you may think: you need to
make sure nothing ever creates non-canonical code.

> Maybe we can add insn canonicalization for transforming (and (vect_duplicate (not A)) B) to 
> (and (not (duplicate (not A)) B) instead of (vec_duplicate (not A)) to (not (vec_duplicate A))?

I don't understand what this means?


Segher
Li, Pan2 via Gcc-patches June 4, 2021, 2:48 a.m. UTC | #6
>-----Original Message-----
>From: Segher Boessenkool <segher@kernel.crashing.org>
>Sent: Friday, June 4, 2021 4:00 AM
>To: Liu, Hongtao <hongtao.liu@intel.com>
>Cc: Richard Biener <richard.guenther@gmail.com>; GCC Patches <gcc-
>patches@gcc.gnu.org>
>Subject: Re: [PATCH] Canonicalize (vec_duplicate (not A)) to (not
>(vec_duplicate A)).
>
>On Thu, Jun 03, 2021 at 11:03:43AM +0000, Liu, Hongtao wrote:
>> >A very typical example is how UMIN is optimised:
>> >
>> >   case UMIN:
>> >      if (trueop1 == CONST0_RTX (mode) && ! side_effects_p (op0))
>> >	return op1;
>> >      if (rtx_equal_p (trueop0, trueop1) && ! side_effects_p (op0))
>> >	return op0;
>> >      tem = simplify_associative_operation (code, mode, op0, op1);
>> >      if (tem)
>> >	return tem;
>> >      break;
>> >
>> >(the stuff using "tem").
>> >
>> >Hongtao, can we do something similar here?  Does that work well?
>> >Please try it out :-)
>>
>> In simplify_rtx, no simplication occurs, there is just the difference
>> between  (vec_duplicate (not REG)) and (not (vec_duplicate (REG)). So here
>tem will only be 0.
>
>simplify-rtx is used by combine.  When you do and+not+splat for example my
>suggestion should kick in.  Try it out, don't just dismiss it?
>
Forgive my obtuseness — do you mean to try the following changes? If so, then there will be no "kick in":
temp will be 0; there's no simplification here since it's just the difference between (vec_duplicate (not REG))
 and (not (vec_duplicate REG)). Or maybe you mean something else?

@@ -1708,6 +1708,17 @@ simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode,
 #endif
       break;

+      /* Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).  */
+    case VEC_DUPLICATE:
+      if (GET_CODE (op) == NOT)
+       {
+         rtx vec_dup = gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0));
+         temp = simplify_unary_operation (NOT, mode, vec_dup, GET_MODE (op));
+         if (temp)
+           return temp;
+       }
+      break;
+
>> Basically we don't know it's a simplication until combine successfully
>> split the
>> 3->2 instructions (not + broadcast + and to andnot + broadcast), but
>> 3->it's pretty awkward
>> to do this in combine.
>
>But you need to do this *before* it is split.  That is the whole point.
>
>> Consider andnot is existed for many backends, I think a canonicalization is
>needed here.
>
>Please do note that that is not as easy as yoou may think: you need to make
>sure nothing ever creates non-canonical code.
>
>> Maybe we can add insn canonicalization for transforming (and
>> (vect_duplicate (not A)) B) to (and (not (duplicate (not A)) B) instead of
>(vec_duplicate (not A)) to (not (vec_duplicate A))?
>
>I don't understand what this means?
I mean let's give a last shot for andnot in case AND like below

@ -3702,6 +3702,16 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
       tem = simplify_associative_operation (code, mode, op0, op1);
       if (tem)
        return tem;
+
+      if (GET_CODE (op0) == VEC_DUPLICATE
+         && GET_CODE (XEXP (op0, 0)) == NOT)
+       {
+         rtx vec_dup = gen_rtx_VEC_DUPLICATE (GET_MODE (op0),
+                                              XEXP (XEXP (op0, 0), 0));
+         return simplify_gen_binary (AND, mode,
+                                     gen_rtx_NOT (mode, vec_dup),
+                                     op1);
+       }
       break;
>
>
>Segher
diff mbox series

Patch

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 0e65b3ae663..06b42901413 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -8297,6 +8297,11 @@  operand of @code{mult} is also a shift, then that is extended also.
 This transformation is only applied when it can be proven that the
 original operation had sufficient precision to prevent overflow.
 
+@cindex @code{vec_duplicate}, canonicalization of
+@item
+@code{(vec_duplicate (not @var{a}))} is converted to
+@code{(not (vec_duplicate @var{a}))}.
+
 @end itemize
 
 Further canonicalization rules are defined in the function
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 04423bbd195..171fc447d50 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -1708,6 +1708,12 @@  simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode,
 #endif
       break;
 
+      /* Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).  */
+    case VEC_DUPLICATE:
+      if (GET_CODE (op) == NOT)
+	return gen_rtx_NOT (mode, gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0)));
+      break;
+
     default:
       break;
     }
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr100711.c b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
new file mode 100644
index 00000000000..5b144623873
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
@@ -0,0 +1,73 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "pandn" 8 } } */
+/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+
+v16qi
+f1 (char a, v16qi c)
+{
+  char b = ~a;
+  return (__extension__(v16qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v32qi
+f2 (char a, v32qi c)
+{
+  char b = ~a;
+  return (__extension__(v32qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v8hi
+f3 (short a, v8hi c)
+{
+  short b = ~a;
+  return (__extension__(v8hi) {b, b, b, b, b, b, b, b}) & c;
+}
+
+v16hi
+f4 (short a, v16hi c)
+{
+  short b = ~a;
+  return (__extension__(v16hi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v4si
+f5 (int a, v4si c)
+{
+  int b = ~a;
+  return (__extension__(v4si) {b, b, b, b}) & c;
+}
+
+v8si
+f6 (int a, v8si c)
+{
+  int b = ~a;
+  return (__extension__(v8si) {b, b, b, b, b, b, b, b}) & c;
+}
+
+v2di
+f7 (long long a, v2di c)
+{
+  long long b = ~a;
+  return (__extension__(v2di) {b, b}) & c;
+}
+
+v4di
+f8 (long long a, v4di c)
+{
+  long long b = ~a;
+  return (__extension__(v4di) {b, b, b, b}) & c;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
new file mode 100644
index 00000000000..f0a103d0bc2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
@@ -0,0 +1,48 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "pandn" 4 } } */
+/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef long long v8di __attribute__((vector_size(64)));
+
+v64qi
+f1 (char a, v64qi c)
+{
+  char b = ~a;
+  return (__extension__(v64qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v32hi
+f2 (short a, v32hi c)
+{
+  short b = ~a;
+  return (__extension__(v32hi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v16si
+f3 (int a, v16si c)
+{
+  int b = ~a;
+  return (__extension__(v16si) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v8di
+f4 (long long a, v8di c)
+{
+  long long b = ~a;
+  return (__extension__(v8di) {b, b, b, b, b, b, b, b}) & c;
+}