diff mbox

[i386,AVX-512] Update extract_even_odd w/ AVX-512BW insns.

Message ID 20151001111155.GA17847@msticlxl57.ims.intel.com
State New
Headers show

Commit Message

Kirill Yukhin Oct. 1, 2015, 11:11 a.m. UTC
Hello,
The patch at the bottom improves the instruction sequences
generated for strided loads.
E.g. on `-march=skylake-avx512' for this test:
 unsigned char yy[10000];
 unsigned char xx[10000];

  void
  __attribute__ ((noinline)) generateMTFValues (unsigned char s)
  {
     unsigned char i;
     for (i = 0; i < s; i++)
       yy[i] = xx [i*2 + 1];
  }

We have:
        vmovdqa64       .LC0(%rip), %zmm0       # 34    *movv32hi_internal/2    [length = 11]
        vmovdqa64       .LC1(%rip), %zmm1       # 36    *movv32hi_internal/2    [length = 11]
        vmovdqu64       xx+1(%rip), %zmm3       # 29    *movv64qi_internal/2    [length = 11]
        vmovdqu64       xx+65(%rip), %zmm4      # 32    *movv64qi_internal/2    [length = 11]
        vmovdqa64       %zmm0, %zmm2    # 153   *movv32hi_internal/2    [length = 6]
        vmovdqa64       %zmm1, %zmm5    # 155   *movv32hi_internal/2    [length = 6]
        vpermi2w        %zmm4, %zmm3, %zmm2     # 35    avx512bw_vpermi2varv32hi3       [length = 6]
        vpermi2w        %zmm4, %zmm3, %zmm5     # 37    avx512bw_vpermi2varv32hi3       [length = 6]
        vmovdqa64       .LC2(%rip), %zmm4       # 38    *movv64qi_internal/2    [length = 11]
        vmovdqa64       .LC3(%rip), %zmm3       # 40    *movv64qi_internal/2    [length = 11]
        vpshufb %zmm4, %zmm2, %zmm2     # 39    avx512bw_pshufbv64qi3/2 [length = 6]
        vpshufb %zmm3, %zmm5, %zmm5     # 41    avx512bw_pshufbv64qi3/2 [length = 6]
        vporq   %zmm5, %zmm2, %zmm2     # 42    *iorv64qi3/2    [length = 4]
        vmovdqu32       %zmm2, yy(%rip) # 44    avx512f_storedquv16si   [length = 11]
This is because the most generic permute expander took charge.

Patch reduces the code to:
        vmovdqu64       xx+1(%rip), %zmm0       # 28    *movv16si_internal/2    [length = 11]
        vmovdqu64       xx+65(%rip), %zmm1      # 134   *movv16si_internal/2    [length = 11]
        vpmovwb %zmm0, %ymm0    # 34    avx512bw_truncatev32hiv32qi2/1  [length = 6]
        vpmovwb %zmm1, %ymm1    # 35    avx512bw_truncatev32hiv32qi2/1  [length = 6]
        vinserti64x4    $0x1, %ymm1, %zmm0, %zmm0       # 36    avx_vec_concatv64qi/1   [length = 7]
        vmovdqu32       %zmm0, yy(%rip) # 38    avx512f_storedquv16si   [length = 11]

It also enables extract_even_odd for V64QI.

Bootstrapped. New tests pass (fail w/o the change). Regtesting is in progress.

Is it ok for trunk?

gcc/
	* config/i386/i386.c (expand_vec_perm_even_odd_trunc): New.
	(expand_vec_perm_even_odd_1): Handle V64QImode.
	(ix86_expand_vec_perm_const_1): Try expansion with
	expand_vec_perm_even_odd_trunc as well.
	* config/i386/sse.md (VI124_AVX512F): Rename to ...
	(define_mode_iterator VI124_AVX2_24_AVX512F_1_AVX512BW): This. Extend
	to V64QI.
	(define_mode_iterator VI248_AVX2_8_AVX512F): Rename to ...
	(define_mode_iterator VI248_AVX2_8_AVX512F_24_AVX512BW): This. Extend
	to V32HI and V16SI.
	(define_insn "avx512bw_<code>v32hiv32qi2"): Unhide pattern name.
	(define_expand "vec_pack_trunc_<mode>"): Update iterator name.
	(define_expand "vec_unpacks_lo_<mode>"): Ditto.
	(define_expand "vec_unpacks_hi_<mode>"): Ditto.
	(define_expand "vec_unpacku_lo_<mode>"): Ditto.
	(define_expand "vec_unpacku_hi_<mode>"): Ditto.

gcc/testsuite/
	* gcc.target/i386/vect-pack-trunc-1.c: New test.
	* gcc.target/i386/vect-pack-trunc-2.c: Ditto.
	* gcc.target/i386/vect-perm-even-1.c: Ditto.
	* gcc.target/i386/vect-perm-odd-1.c: Ditto.
	* gcc.target/i386/vect-unpack-1.c: Ditto.
	* gcc.target/i386/vect-unpack-2.c: Ditto.
--
Thanks, K

Comments

Kirill Yukhin Oct. 2, 2015, 2:37 p.m. UTC | #1
On 01 Oct 14:11, Kirill Yukhin wrote:
> Bootstrapped. New tests pass (fail w/o the change). Regtesting is in progress.
> 
> Is it ok for trunk?
> 
> gcc/
> 	* config/i386/i386.c (expand_vec_perm_even_odd_trunc): New.
> 	(expand_vec_perm_even_odd_1): Handle V64QImode.
> 	(ix86_expand_vec_perm_const_1): Try expansion with
> 	expand_vec_perm_even_odd_trunc as well.
> 	* config/i386/sse.md (VI124_AVX512F): Rename to ...
> 	(define_mode_iterator VI124_AVX2_24_AVX512F_1_AVX512BW): This. Extend
> 	to V54QI.
> 	(define_mode_iterator VI248_AVX2_8_AVX512F): Rename to ...
> 	(define_mode_iterator VI248_AVX2_8_AVX512F_24_AVX512BW): This. Extend
> 	to V32HI and V16SI.
> 	(define_insn "avx512bw_<code>v32hiv32qi2"): Unhide pattern name.
> 	(define_expand "vec_pack_trunc_<mode>"): Update iterator name.
> 	(define_expand "vec_unpacks_lo_<mode>"): Ditto.
> 	(define_expand "vec_unpacks_hi_<mode>"): Ditto.
> 	(define_expand "vec_unpacku_lo_<mode>"): Ditto.
> 	(define_expand "vec_unpacku_hi_<mode>"): Ditto.
> 
> gcc/testsuite/
> 	* gcc.target/i386/vect-pack-trunc-1.c: New test.
> 	* gcc.target/i386/vect-pack-trunc-2.c: Ditto.
> 	* gcc.target/i386/vect-perm-even-1.c: Ditto.
> 	* gcc.target/i386/vect-perm-odd-1.c: Ditto.
> 	* gcc.target/i386/vect-unpack-1.c: Ditto.
> 	* gcc.target/i386/vect-unpack-2.c: Ditto.
Checked into main trunk. I'll also check it into gcc-5-branch
next work week if there are no objections from the RMs.

--
Thanks, K
H.J. Lu Oct. 6, 2015, 11:05 a.m. UTC | #2
On Fri, Oct 2, 2015 at 7:37 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
> On 01 Oct 14:11, Kirill Yukhin wrote:
>> Bootstrapped. New tests pass (fail w/o the change). Regtesting is in progress.
>>
>> Is it ok for trunk?
>>
>> gcc/
>>       * config/i386/i386.c (expand_vec_perm_even_odd_trunc): New.
>>       (expand_vec_perm_even_odd_1): Handle V64QImode.
>>       (ix86_expand_vec_perm_const_1): Try expansion with
>>       expand_vec_perm_even_odd_trunc as well.
>>       * config/i386/sse.md (VI124_AVX512F): Rename to ...
>>       (define_mode_iterator VI124_AVX2_24_AVX512F_1_AVX512BW): This. Extend
>>       to V54QI.
>>       (define_mode_iterator VI248_AVX2_8_AVX512F): Rename to ...
>>       (define_mode_iterator VI248_AVX2_8_AVX512F_24_AVX512BW): This. Extend
>>       to V32HI and V16SI.
>>       (define_insn "avx512bw_<code>v32hiv32qi2"): Unhide pattern name.
>>       (define_expand "vec_pack_trunc_<mode>"): Update iterator name.
>>       (define_expand "vec_unpacks_lo_<mode>"): Ditto.
>>       (define_expand "vec_unpacks_hi_<mode>"): Ditto.
>>       (define_expand "vec_unpacku_lo_<mode>"): Ditto.
>>       (define_expand "vec_unpacku_hi_<mode>"): Ditto.
>>
>> gcc/testsuite/
>>       * gcc.target/i386/vect-pack-trunc-1.c: New test.
>>       * gcc.target/i386/vect-pack-trunc-2.c: Ditto.
>>       * gcc.target/i386/vect-perm-even-1.c: Ditto.
>>       * gcc.target/i386/vect-perm-odd-1.c: Ditto.
>>       * gcc.target/i386/vect-unpack-1.c: Ditto.
>>       * gcc.target/i386/vect-unpack-2.c: Ditto.
> Checked into main trunk. I'll also check it into gcc-5-branch
> if no objections from RMs next ww.
>

This caused:

FAIL: gcc.target/i386/vect-perm-odd-1.c (test for excess errors)

on gcc-5-branch.
H.J. Lu Oct. 6, 2015, 11:18 a.m. UTC | #3
On Tue, Oct 6, 2015 at 4:05 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Fri, Oct 2, 2015 at 7:37 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
>> On 01 Oct 14:11, Kirill Yukhin wrote:
>>> Bootstrapped. New tests pass (fail w/o the change). Regtesting is in progress.
>>>
>>> Is it ok for trunk?
>>>
>>> gcc/
>>>       * config/i386/i386.c (expand_vec_perm_even_odd_trunc): New.
>>>       (expand_vec_perm_even_odd_1): Handle V64QImode.
>>>       (ix86_expand_vec_perm_const_1): Try expansion with
>>>       expand_vec_perm_even_odd_trunc as well.
>>>       * config/i386/sse.md (VI124_AVX512F): Rename to ...
>>>       (define_mode_iterator VI124_AVX2_24_AVX512F_1_AVX512BW): This. Extend
>>>       to V54QI.
>>>       (define_mode_iterator VI248_AVX2_8_AVX512F): Rename to ...
>>>       (define_mode_iterator VI248_AVX2_8_AVX512F_24_AVX512BW): This. Extend
>>>       to V32HI and V16SI.
>>>       (define_insn "avx512bw_<code>v32hiv32qi2"): Unhide pattern name.
>>>       (define_expand "vec_pack_trunc_<mode>"): Update iterator name.
>>>       (define_expand "vec_unpacks_lo_<mode>"): Ditto.
>>>       (define_expand "vec_unpacks_hi_<mode>"): Ditto.
>>>       (define_expand "vec_unpacku_lo_<mode>"): Ditto.
>>>       (define_expand "vec_unpacku_hi_<mode>"): Ditto.
>>>
>>> gcc/testsuite/
>>>       * gcc.target/i386/vect-pack-trunc-1.c: New test.
>>>       * gcc.target/i386/vect-pack-trunc-2.c: Ditto.
>>>       * gcc.target/i386/vect-perm-even-1.c: Ditto.
>>>       * gcc.target/i386/vect-perm-odd-1.c: Ditto.
>>>       * gcc.target/i386/vect-unpack-1.c: Ditto.
>>>       * gcc.target/i386/vect-unpack-2.c: Ditto.
>> Checked into main trunk. I'll also check it into gcc-5-branch
>> if no objections from RMs next ww.
>>
>
> This caused:
>
> FAIL: gcc.target/i386/vect-perm-odd-1.c (test for excess errors)
>
> on gcc-5-branch.
>

vect-perm-odd-1.s: Assembler messages:
vect-perm-odd-1.s:233: Error: operand type mismatch for `vpor'
vect-perm-odd-1.s:240: Error: operand type mismatch for `vpor'

 vpor    %zmm1, %zmm2, %zmm1

It should be

 vporq    %zmm1, %zmm2, %zmm1
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d370521..24202b3 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -49216,6 +49216,62 @@  expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
+   and extract-odd permutations of two V64QI operands
+   with two "shifts", two "truncs" and one "concat" insns for "odd"
+   and two "truncs" and one concat insn for "even."
+   Have already failed all two instruction sequences.  */
+
+static bool
+expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
+{
+  rtx t1, t2, t3, t4;
+  unsigned i, odd, nelt = d->nelt;
+
+  if (!TARGET_AVX512BW
+      || d->one_operand_p
+      || d->vmode != V64QImode)
+    return false;
+
+  /* Check that permutation is even or odd.  */
+  odd = d->perm[0];
+  if (odd > 1)
+    return false;
+
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != 2 * i + odd)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+
+  if (odd)
+    {
+      t1 = gen_reg_rtx (V32HImode);
+      t2 = gen_reg_rtx (V32HImode);
+      emit_insn (gen_lshrv32hi3 (t1,
+				 gen_lowpart (V32HImode, d->op0),
+				 GEN_INT (8)));
+      emit_insn (gen_lshrv32hi3 (t2,
+				 gen_lowpart (V32HImode, d->op1),
+				 GEN_INT (8)));
+    }
+  else
+    {
+      t1 = gen_lowpart (V32HImode, d->op0);
+      t2 = gen_lowpart (V32HImode, d->op1);
+    }
+
+  t3 = gen_reg_rtx (V32QImode);
+  t4 = gen_reg_rtx (V32QImode);
+  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
+  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
+  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
    and extract-odd permutations.  */
 
@@ -49318,6 +49374,9 @@  expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
     case V32QImode:
       return expand_vec_perm_even_odd_pack (d);
 
+    case V64QImode:
+      return expand_vec_perm_even_odd_trunc (d);
+
     case V4DImode:
       if (!TARGET_AVX2)
 	{
@@ -49779,6 +49838,8 @@  ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 
   /* Try sequences of four instructions.  */
 
+  if (expand_vec_perm_even_odd_trunc (d))
+    return true;
   if (expand_vec_perm_vpshufb2_vpermq (d))
     return true;
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 4eefb45..013681c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -381,8 +381,8 @@ 
   [(V16HI "TARGET_AVX2") V8HI
    (V8SI "TARGET_AVX2") V4SI])
 
-(define_mode_iterator VI124_AVX512F
-  [(V32QI "TARGET_AVX2") V16QI
+(define_mode_iterator VI124_AVX2_24_AVX512F_1_AVX512BW
+  [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI
    (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI
    (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI])
 
@@ -398,9 +398,9 @@ 
   [(V8SI "TARGET_AVX2") V4SI
    (V4DI "TARGET_AVX2") V2DI])
 
-(define_mode_iterator VI248_AVX2_8_AVX512F
-  [(V16HI "TARGET_AVX2") V8HI
-   (V8SI "TARGET_AVX2") V4SI
+(define_mode_iterator VI248_AVX2_8_AVX512F_24_AVX512BW
+  [(V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI
+   (V16SI "TARGET_AVX512BW") (V8SI "TARGET_AVX2") V4SI
    (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
 
 (define_mode_iterator VI248_AVX512BW_AVX512VL
@@ -8749,7 +8749,7 @@ 
       (match_operand:<avx512fmaskmode> 2 "register_operand")))]
   "TARGET_AVX512F")
 
-(define_insn "*avx512bw_<code>v32hiv32qi2"
+(define_insn "avx512bw_<code>v32hiv32qi2"
   [(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m")
 	(any_truncate:V32QI
 	    (match_operand:V32HI 1 "register_operand" "v,v")))]
@@ -11331,8 +11331,8 @@ 
 
 (define_expand "vec_pack_trunc_<mode>"
   [(match_operand:<ssepackmode> 0 "register_operand")
-   (match_operand:VI248_AVX2_8_AVX512F 1 "register_operand")
-   (match_operand:VI248_AVX2_8_AVX512F 2 "register_operand")]
+   (match_operand:VI248_AVX2_8_AVX512F_24_AVX512BW 1 "register_operand")
+   (match_operand:VI248_AVX2_8_AVX512F_24_AVX512BW 2 "register_operand")]
   "TARGET_SSE2"
 {
   rtx op1 = gen_lowpart (<ssepackmode>mode, operands[1]);
@@ -13221,25 +13221,25 @@ 
 
 (define_expand "vec_unpacks_lo_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
-   (match_operand:VI124_AVX512F 1 "register_operand")]
+   (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
   "TARGET_SSE2"
   "ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;")
 
 (define_expand "vec_unpacks_hi_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
-   (match_operand:VI124_AVX512F 1 "register_operand")]
+   (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
   "TARGET_SSE2"
   "ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;")
 
 (define_expand "vec_unpacku_lo_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
-   (match_operand:VI124_AVX512F 1 "register_operand")]
+   (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
   "TARGET_SSE2"
   "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;")
 
 (define_expand "vec_unpacku_hi_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
-   (match_operand:VI124_AVX512F 1 "register_operand")]
+   (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
   "TARGET_SSE2"
   "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;")
 
diff --git a/gcc/testsuite/gcc.target/i386/vect-pack-trunc-1.c b/gcc/testsuite/gcc.target/i386/vect-pack-trunc-1.c
new file mode 100644
index 0000000..774d4bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-pack-trunc-1.c
@@ -0,0 +1,28 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -mavx512bw -save-temps" } */
+
+#include "avx512bw-check.h"
+
+#define N 400
+unsigned char yy[10000];
+
+void
+__attribute__ ((noinline)) foo (unsigned short s)
+{
+   unsigned short i;
+   for (i = 0; i < s; i++)
+     yy[i] = (unsigned char) i;
+}
+
+void
+avx512bw_test ()
+{
+  unsigned short i;
+  foo (N);
+
+  for (i = 0; i < N; i++)
+    if ( (unsigned char)i != yy [i] )
+      abort ();
+}
+
+/* { dg-final { scan-assembler-times "vpmovwb\[ \\t\]+\[^\n\]*%zmm" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-pack-trunc-2.c b/gcc/testsuite/gcc.target/i386/vect-pack-trunc-2.c
new file mode 100644
index 0000000..a1a075f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-pack-trunc-2.c
@@ -0,0 +1,27 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -mavx512bw -save-temps" } */
+
+#include "avx512bw-check.h"
+
+#define N 400
+unsigned short yy[10000];
+
+void
+__attribute__ ((noinline)) foo (unsigned int s)
+{
+   unsigned int i;
+   for (i = 0; i < s; i++)
+     yy[i] = (unsigned short) i;
+}
+
+void
+avx512bw_test ()
+{
+  unsigned int i;
+  foo (N);
+  for (i = 0; i < N; i++)
+    if ( (unsigned short)i != yy [i] )
+      abort ();
+}
+
+/* { dg-final { scan-assembler-times "vpermi2w\[ \\t\]+\[^\n\]*%zmm" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-perm-even-1.c b/gcc/testsuite/gcc.target/i386/vect-perm-even-1.c
new file mode 100644
index 0000000..a2ff73d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-perm-even-1.c
@@ -0,0 +1,33 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -mavx512bw -save-temps" } */
+
+#include "avx512bw-check.h"
+
+#define N 400
+unsigned char yy[10000];
+unsigned char xx[10000];
+
+void
+__attribute__ ((noinline)) foo (unsigned short s)
+{
+   unsigned short i;
+   for (i = 0; i < s; i++)
+     yy[i] = xx [i*2 + 1];
+}
+
+void
+avx512bw_test ()
+{
+  unsigned short i;
+  unsigned char j = 0;
+  for (i = 0; i < 2 * N + 1; i++, j++)
+    xx [i] = j;
+
+  foo (N);
+
+  for (i = 0; i < N; i++)
+    if ( (unsigned char)(2*i+1) != yy [i] )
+      abort ();
+}
+
+/* { dg-final { scan-assembler-times "vpmovwb\[ \\t\]+\[^\n\]*%zmm" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-perm-odd-1.c b/gcc/testsuite/gcc.target/i386/vect-perm-odd-1.c
new file mode 100644
index 0000000..65f1a80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-perm-odd-1.c
@@ -0,0 +1,45 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -mavx512bw -save-temps" } */
+
+#include "avx512bw-check.h"
+
+#define N 400
+
+typedef struct
+{
+  unsigned char real;
+  unsigned char imag;
+} complex8_t;
+
+void
+__attribute__ ((noinline)) foo (unsigned char *a,
+				complex8_t *x, unsigned len)
+{
+  unsigned i;
+  for (i = 0; i < len; i++)
+    a[i] = x[i].imag + x[i].real;
+}
+
+void
+avx512bw_test ()
+{
+  unsigned short i;
+  unsigned char j = 0;
+  complex8_t x [N];
+  unsigned char a [N];
+
+  for (i = 0; i < N; i++, j++)
+    {
+      x [i].real = j;
+      x [i].imag = j;
+    }
+
+  foo (a, x, N);
+
+  j = 0;
+  for (i = 0; i < N; i++, j++)
+    if ( a[i] != (unsigned char)(j+j) )
+      abort ();
+}
+
+/* { dg-final { scan-assembler-times "vpmovwb\[ \\t\]+\[^\n\]*%zmm" 4 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-unpack-1.c b/gcc/testsuite/gcc.target/i386/vect-unpack-1.c
new file mode 100644
index 0000000..eedca47
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-unpack-1.c
@@ -0,0 +1,27 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -mavx512bw -save-temps" } */
+
+#include "avx512bw-check.h"
+
+#define N 255
+unsigned int yy[10000];
+
+void
+__attribute__ ((noinline)) foo (unsigned char s)
+{
+   unsigned char i;
+   for (i = 0; i < s; i++)
+     yy[i] = (unsigned int) i;
+}
+
+void
+avx512bw_test ()
+{
+  unsigned char i;
+  foo (N);
+  for (i = 0; i < N; i++)
+    if ( (unsigned int)i != yy [i] )
+      abort ();
+}
+
+/* { dg-final { scan-assembler-times "vpmovzxbw\[ \\t\]+\[^\n\]*%zmm" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-unpack-2.c b/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
new file mode 100644
index 0000000..b825f0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
@@ -0,0 +1,27 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -mavx512bw -save-temps" } */
+
+#include "avx512bw-check.h"
+
+#define N 120
+signed int yy[10000];
+
+void
+__attribute__ ((noinline)) foo (signed char s)
+{
+   signed char i;
+   for (i = 0; i < s; i++)
+     yy[i] = (signed int) i;
+}
+
+void
+avx512bw_test ()
+{
+  signed char i;
+  foo (N);
+  for (i = 0; i < N; i++)
+    if ( (signed int)i != yy [i] )
+      abort ();
+}
+
+/* { dg-final { scan-assembler-times "vpmovsxbw\[ \\t\]+\[^\n\]*%zmm" 2 } } */