@@ -51910,7 +51910,7 @@ ix86_expand_vecop_qihi (enum rtx_code co
{
/* For SSE2, we used an full interleave, so the desired
results are in the even elements. */
- for (i = 0; i < 64; ++i)
+ for (i = 0; i < d.nelt; ++i)
d.perm[i] = i * 2;
}
else
@@ -51918,8 +51918,8 @@ ix86_expand_vecop_qihi (enum rtx_code co
/* For AVX, the interleave used above was not cross-lane. So the
extraction is evens but with the second and third quarter swapped.
Happily, that is even one insn shorter than even extraction. */
- for (i = 0; i < 64; ++i)
- d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
+ for (i = 0; i < d.nelt; ++i)
+ d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
}
ok = ix86_expand_vec_perm_const_1 (&d);
@@ -0,0 +1,27 @@
+/* PR target/70329 */
+/* { dg-do run } */
+/* { dg-options "-O0 -mavx512bw" } */
+/* { dg-require-effective-target avx512bw } */
+
+#define AVX512BW
+#include "avx512f-helper.h"
+
+typedef unsigned char A __attribute__ ((vector_size (64)));
+typedef unsigned int B __attribute__ ((vector_size (64)));
+
+unsigned __attribute__ ((noinline, noclone))
+foo (A a, A b, B c)
+{
+ a *= b;
+ c[1] += a[8];
+ return c[1];
+}
+
+void
+TEST (void)
+{
+ A a = (A) { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+ unsigned x = foo (a, a, (B) { 1, 2 });
+ if (x != 83)
+ abort ();
+}
@@ -0,0 +1,33 @@
+/* PR target/70329 */
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -mavx512bw" } */
+/* { dg-require-effective-target avx512bw } */
+
+#define AVX512BW
+#include "avx512f-helper.h"
+
+__attribute__((noinline, noclone)) void
+foo (unsigned char *src1, unsigned char *src2, unsigned char *dst)
+{
+ int i;
+
+ for (i = 0; i < 64; i++)
+ dst[i] = (unsigned char) ((int) src1[i] * (int) src2[i]);
+}
+
+void
+TEST (void)
+{
+ unsigned char a[64], b[64], c[64];
+ int i;
+
+ for (i = 0; i < 64; i++)
+ {
+ a[i] = i;
+ b[i] = (i + 1);
+ }
+ foo (a, b, c);
+ for (i = 0; i < 64; i++)
+ if (c[i] != (unsigned char) (i * (i + 1)))
+ abort ();
+}