Message ID | 20181214194708.GR12380@tucnak |
---|---|
State | New |
Headers | show |
Series | Fix up AVX512F masked gather vectorization, add support for AVX512F 512-bit masked scatter vectorization (PR tree-optimization/88464) | expand |
On 12/14/18, Jakub Jelinek <jakub@redhat.com> wrote: > Hi! > > In the previous patch I've unfortunately left out one important case from the > testcase and apparently it wasn't covered by anything else in the > testsuite. > The 3 functions covered float and double gathers with indexes with the same > bitsize and WIDENING gather (double gather with int index), but didn't > cover > NARROWING case (float gather with long index with -m64). That was the only > case that tried to permute the mask, unfortunately that isn't really > supported and ICEs. What works is VEC_UNPACK_{LO,HI}_EXPR on the > VECTOR_BOOLEAN_TYPE_P, that is what other spots in the vectorizer emit for > those. > > I had to also fix up the x86 backend, which had in expansion of these > NARROWING gather builtins code cut&pasted from the 256-bit builtin, > unfortunately it wasn't adjusted for the fact that the 512-bit builtin uses > integral mask argument while the 256-bit one doesn't. And even in the > 256-bit one there was a bug, it relied on the mask and src arguments to be > always in the same register (which is actually what the vectorizer > generates > for those right now, but it could do something else). > > This patch fixes that and enables also masked x86 AVX512F 512-bit > scatter support. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > What is still unhandled (doesn't vectorize) is 128-bit or 256-bit scatters, > I bet the mask operand is vectorized using normal non-bool vectors, but the > instructions with AVX512VL actually need a mask register. There are > instructions that can handle that, but let's defer that for later. > > 2018-12-14 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/88464 > * tree-vect-stmts.c (vect_build_gather_load_calls): For NARROWING > and mask with integral masktype, don't try to permute mask vectors, > instead emit VEC_UNPACK_{LO,HI}_EXPR. Fix up NOP_EXPR operand. 
> (vectorizable_store): Handle masked scatters with decl and integral > mask type. > (permute_vec_elements): Allow scalar_dest to be NULL. > * config/i386/i386.c (ix86_get_builtin) > <case IX86_BUILTIN_GATHER3ALTDIV16SF>: Use lowpart_subreg for masks. > <case IX86_BUILTIN_GATHER3ALTDIV8SF>: Don't assume mask and src have > to be the same. > > * gcc.target/i386/avx512f-pr88462-1.c: Rename to ... > * gcc.target/i386/avx512f-pr88464-1.c: ... this. Fix up PR number. > Expect 4 vectorized loops instead of 3. > (f4): New function. > * gcc.target/i386/avx512f-pr88462-2.c: Rename to ... > * gcc.target/i386/avx512f-pr88464-2.c: ... this. Fix up PR number > and #include. > (avx512f_test): Prepare arguments for f4 and check the results. > * gcc.target/i386/avx512f-pr88464-3.c: New test. > * gcc.target/i386/avx512f-pr88464-4.c: New test. LGTM for the x86 part. Thanks, Uros. > --- gcc/tree-vect-stmts.c.jj 2018-12-13 18:01:13.000000000 +0100 > +++ gcc/tree-vect-stmts.c 2018-12-14 17:10:42.079054458 +0100 > @@ -2655,6 +2655,7 @@ vect_build_gather_load_calls (stmt_vec_i > if (mask && TREE_CODE (masktype) == INTEGER_TYPE) > masktype = build_same_sized_truth_vector_type (srctype); > > + tree mask_halftype = masktype; > tree perm_mask = NULL_TREE; > tree mask_perm_mask = NULL_TREE; > if (known_eq (nunits, gather_off_nunits)) > @@ -2690,13 +2691,16 @@ vect_build_gather_load_calls (stmt_vec_i > > ncopies *= 2; > > - if (mask) > + if (mask && masktype == real_masktype) > { > for (int i = 0; i < count; ++i) > sel[i] = i | (count / 2); > indices.new_vector (sel, 2, count); > mask_perm_mask = vect_gen_perm_mask_checked (masktype, indices); > } > + else if (mask) > + mask_halftype > + = build_same_sized_truth_vector_type (gs_info->offset_vectype); > } > else > gcc_unreachable (); > @@ -2761,7 +2765,7 @@ vect_build_gather_load_calls (stmt_vec_i > { > if (j == 0) > vec_mask = vect_get_vec_def_for_operand (mask, stmt_info); > - else > + else if (modifier != NARROW || (j & 1) == 0) > vec_mask = 
vect_get_vec_def_for_stmt_copy (loop_vinfo, > vec_mask); > > @@ -2779,17 +2783,27 @@ vect_build_gather_load_calls (stmt_vec_i > mask_op = var; > } > } > + if (modifier == NARROW && masktype != real_masktype) > + { > + var = vect_get_new_ssa_name (mask_halftype, vect_simple_var); > + gassign *new_stmt > + = gimple_build_assign (var, (j & 1) ? VEC_UNPACK_HI_EXPR > + : VEC_UNPACK_LO_EXPR, > + mask_op); > + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); > + mask_op = var; > + } > src_op = mask_op; > } > > tree mask_arg = mask_op; > if (masktype != real_masktype) > { > - tree utype; > - if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype)) > + tree utype, optype = TREE_TYPE (mask_op); > + if (TYPE_MODE (real_masktype) == TYPE_MODE (optype)) > utype = real_masktype; > else > - utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1); > + utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1); > var = vect_get_new_ssa_name (utype, vect_scalar_var); > mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op); > gassign *new_stmt > @@ -2801,7 +2815,7 @@ vect_build_gather_load_calls (stmt_vec_i > gcc_assert (TYPE_PRECISION (utype) > <= TYPE_PRECISION (real_masktype)); > var = vect_get_new_ssa_name (real_masktype, vect_scalar_var); > - new_stmt = gimple_build_assign (var, NOP_EXPR, utype); > + new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg); > vect_finish_stmt_generation (stmt_info, new_stmt, gsi); > mask_arg = var; > } > @@ -6361,7 +6375,8 @@ vectorizable_store (stmt_vec_info stmt_i > return false; > } > else if (memory_access_type != VMAT_LOAD_STORE_LANES > - && (memory_access_type != VMAT_GATHER_SCATTER || gs_info.decl)) > + && (memory_access_type != VMAT_GATHER_SCATTER > + || (gs_info.decl && !VECTOR_BOOLEAN_TYPE_P (mask_vectype)))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > @@ -6419,7 +6434,9 @@ vectorizable_store (stmt_vec_info stmt_i > tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, src; 
> tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); > tree rettype, srctype, ptrtype, idxtype, masktype, scaletype; > - tree ptr, mask, var, scale, perm_mask = NULL_TREE; > + tree ptr, var, scale, vec_mask; > + tree mask_arg = NULL_TREE, mask_op = NULL_TREE, perm_mask = > NULL_TREE; > + tree mask_halfvectype = mask_vectype; > edge pe = loop_preheader_edge (loop); > gimple_seq seq; > basic_block new_bb; > @@ -6460,6 +6477,10 @@ vectorizable_store (stmt_vec_info stmt_i > perm_mask = vect_gen_perm_mask_checked (vectype, indices); > gcc_assert (perm_mask != NULL_TREE); > ncopies *= 2; > + > + if (mask) > + mask_halfvectype > + = build_same_sized_truth_vector_type (gs_info.offset_vectype); > } > else > gcc_unreachable (); > @@ -6482,10 +6503,11 @@ vectorizable_store (stmt_vec_info stmt_i > gcc_assert (!new_bb); > } > > - /* Currently we support only unconditional scatter stores, > - so mask should be all ones. */ > - mask = build_int_cst (masktype, -1); > - mask = vect_init_vector (stmt_info, mask, masktype, NULL); > + if (mask == NULL_TREE) > + { > + mask_arg = build_int_cst (masktype, -1); > + mask_arg = vect_init_vector (stmt_info, mask_arg, masktype, NULL); > + } > > scale = build_int_cst (scaletype, gs_info.scale); > > @@ -6494,36 +6516,46 @@ vectorizable_store (stmt_vec_info stmt_i > { > if (j == 0) > { > - src = vec_oprnd1 > - = vect_get_vec_def_for_operand (op, stmt_info); > - op = vec_oprnd0 > - = vect_get_vec_def_for_operand (gs_info.offset, stmt_info); > + src = vec_oprnd1 = vect_get_vec_def_for_operand (op, stmt_info); > + op = vec_oprnd0 = vect_get_vec_def_for_operand (gs_info.offset, > + stmt_info); > + if (mask) > + mask_op = vec_mask = vect_get_vec_def_for_operand (mask, > + stmt_info); > } > else if (modifier != NONE && (j & 1)) > { > if (modifier == WIDEN) > { > - src = vec_oprnd1 > - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1); > + src > + = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, > + vec_oprnd1); > op = 
permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask, > stmt_info, gsi); > + if (mask) > + mask_op > + = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo, > + vec_mask); > } > else if (modifier == NARROW) > { > src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask, > stmt_info, gsi); > - op = vec_oprnd0 > - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0); > + op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo, > + vec_oprnd0); > } > else > gcc_unreachable (); > } > else > { > - src = vec_oprnd1 > - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1); > - op = vec_oprnd0 > - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0); > + src = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, > + vec_oprnd1); > + op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo, > + vec_oprnd0); > + if (mask) > + mask_op = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo, > + vec_mask); > } > > if (!useless_type_conversion_p (srctype, TREE_TYPE (src))) > @@ -6550,8 +6582,45 @@ vectorizable_store (stmt_vec_info stmt_i > op = var; > } > > + if (mask) > + { > + tree utype; > + mask_arg = mask_op; > + if (modifier == NARROW) > + { > + var = vect_get_new_ssa_name (mask_halfvectype, > + vect_simple_var); > + gassign *new_stmt > + = gimple_build_assign (var, (j & 1) ? 
VEC_UNPACK_HI_EXPR > + : VEC_UNPACK_LO_EXPR, > + mask_op); > + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); > + mask_arg = var; > + } > + tree optype = TREE_TYPE (mask_arg); > + if (TYPE_MODE (masktype) == TYPE_MODE (optype)) > + utype = masktype; > + else > + utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1); > + var = vect_get_new_ssa_name (utype, vect_scalar_var); > + mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_arg); > + gassign *new_stmt > + = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg); > + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); > + mask_arg = var; > + if (!useless_type_conversion_p (masktype, utype)) > + { > + gcc_assert (TYPE_PRECISION (utype) > + <= TYPE_PRECISION (masktype)); > + var = vect_get_new_ssa_name (masktype, vect_scalar_var); > + new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg); > + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); > + mask_arg = var; > + } > + } > + > gcall *new_stmt > - = gimple_build_call (gs_info.decl, 5, ptr, mask, op, src, scale); > + = gimple_build_call (gs_info.decl, 5, ptr, mask_arg, op, src, scale); > stmt_vec_info new_stmt_info > = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); > > @@ -7284,7 +7353,7 @@ permute_vec_elements (tree x, tree y, tr > gimple *perm_stmt; > > tree scalar_dest = gimple_get_lhs (stmt_info->stmt); > - if (TREE_CODE (scalar_dest) == SSA_NAME) > + if (scalar_dest && TREE_CODE (scalar_dest) == SSA_NAME) > perm_dest = vect_create_destination_var (scalar_dest, vectype); > else > perm_dest = vect_get_new_vect_var (vectype, vect_simple_var, NULL); > --- gcc/config/i386/i386.c.jj 2018-12-13 13:45:11.000000000 +0100 > +++ gcc/config/i386/i386.c 2018-12-14 17:34:11.131135056 +0100 > @@ -37605,13 +37605,7 @@ rdseed_step: > op0 = copy_to_mode_reg (GET_MODE (op0), op0); > emit_insn (gen (half, op0)); > op0 = half; > - if (GET_MODE (op3) != VOIDmode) > - { > - if (!nonimmediate_operand (op3, GET_MODE (op3))) > - op3 = 
copy_to_mode_reg (GET_MODE (op3), op3); > - emit_insn (gen (half, op3)); > - op3 = half; > - } > + op3 = lowpart_subreg (QImode, op3, HImode); > break; > case IX86_BUILTIN_GATHER3ALTDIV8SF: > case IX86_BUILTIN_GATHER3ALTDIV8SI: > @@ -37628,6 +37622,7 @@ rdseed_step: > op0 = half; > if (GET_MODE (op3) != VOIDmode) > { > + half = gen_reg_rtx (mode0); > if (!nonimmediate_operand (op3, GET_MODE (op3))) > op3 = copy_to_mode_reg (GET_MODE (op3), op3); > emit_insn (gen (half, op3)); > --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c.jj 2018-12-14 > 16:34:55.361955571 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c 2018-12-14 > 18:07:25.694686784 +0100 > @@ -0,0 +1,45 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 > -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte > vectors" 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 > "vect" } } */ > + > +__attribute__((noipa)) void > +f1 (double * __restrict__ a, const double * __restrict__ b, const int * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (a[i] > 10.0) > + a[i] = b[c[i]]; > +} > + > +__attribute__((noipa)) void > +f2 (double * __restrict__ a, const double * __restrict__ b, const long * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (a[i] > 10.0) > + a[i] = b[c[i]]; > +} > + > +__attribute__((noipa)) void > +f3 (float * __restrict__ a, const float * __restrict__ b, const int * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (a[i] > 10.0f) > + a[i] = b[c[i]]; > +} > + > +__attribute__((noipa)) void > +f4 (float * __restrict__ a, const float * __restrict__ b, const long * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + 
if (a[i] > 10.0f) > + a[i] = b[c[i]]; > +} > --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c.jj 2018-12-14 > 16:35:00.681869029 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c 2018-12-14 > 17:43:40.294876267 +0100 > @@ -0,0 +1,61 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do run { target { avx512f } } } */ > +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 > -mtune=skylake-avx512" } */ > + > +#include "avx512f-check.h" > + > +#include "avx512f-pr88464-1.c" > + > +static void > +avx512f_test (void) > +{ > + double a[1024], b[1024]; > + float c[1024], f[1024]; > + int d[1024]; > + long e[1024]; > + int i; > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + a[i] = (i % 3) != 0 ? 15.0 : -5.0; > + b[i] = 2 * i; > + d[i] = (i % 3) ? 1023 - i : __INT_MAX__; > + } > + f1 (a, b, d, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) > + abort (); > + a[i] = (i % 3) != 1 ? 15.0 : -5.0; > + b[i] = 3 * i; > + e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; > + } > + f2 (a, b, e, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0)) > + abort (); > + c[i] = (i % 3) != 2 ? 15.0f : -5.0f; > + d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; > + f[i] = 4 * i; > + } > + f3 (c, f, d, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f)) > + abort (); > + c[i] = (i % 3) != 0 ? 15.0f : -5.0f; > + e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; > + f[i] = 5 * i; > + } > + f4 (c, f, e, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (c[i] != ((i % 3) != 0 ? 
(1023 - i) * 5.0f : -5.0f)) > + abort (); > + } > +} > --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c.jj 2018-12-14 > 18:01:19.297647800 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c 2018-12-14 > 18:07:14.906862302 +0100 > @@ -0,0 +1,45 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 > -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte > vectors" 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 > "vect" } } */ > + > +__attribute__((noipa)) void > +f1 (double * __restrict__ a, const double * __restrict__ b, const int * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (b[i] > -2.0) > + a[c[i]] = b[i]; > +} > + > +__attribute__((noipa)) void > +f2 (double * __restrict__ a, const double * __restrict__ b, const long * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (b[i] > -2.0) > + a[c[i]] = b[i]; > +} > + > +__attribute__((noipa)) void > +f3 (float * __restrict__ a, const float * __restrict__ b, const int * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (b[i] > -2.0f) > + a[c[i]] = b[i]; > +} > + > +__attribute__((noipa)) void > +f4 (float * __restrict__ a, const float * __restrict__ b, const long * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (b[i] > -2.0f) > + a[c[i]] = b[i]; > +} > --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c.jj 2018-12-14 > 18:03:03.100958998 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c 2018-12-14 > 18:12:32.209699741 +0100 > @@ -0,0 +1,61 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do run { target { avx512f } } } */ > +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 > -mtune=skylake-avx512" } */ > + 
> +#include "avx512f-check.h" > + > +#include "avx512f-pr88464-3.c" > + > +static void > +avx512f_test (void) > +{ > + double a[1024], b[1024]; > + float c[1024], f[1024]; > + int d[1024]; > + long e[1024]; > + int i; > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + a[i] = -5.0; > + b[i] = (i % 3) != 0 ? 2.0 * i : -5.0; > + d[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; > + } > + f1 (a, b, d, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) > + abort (); > + a[i] = -5.0; > + b[i] = (i % 3) != 1 ? 3.0 * i : -5.0; > + e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; > + } > + f2 (a, b, e, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (a[i] != ((i % 3) != 2 ? (1023 - i) * 3.0 : -5.0)) > + abort (); > + c[i] = -5.0f; > + d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; > + f[i] = (i % 3) != 2 ? 4.0f * i : -5.0f; > + } > + f3 (c, f, d, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (c[i] != ((i % 3) != 1 ? (1023 - i) * 4.0f : -5.0f)) > + abort (); > + c[i] = -5.0f; > + e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; > + f[i] = (i % 3) != 0 ? 5.0f * i : -5.0f; > + } > + f4 (c, f, e, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (c[i] != ((i % 3) != 0 ? 
(1023 - i) * 5.0f : -5.0f)) > + abort (); > + } > +} > --- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj 2018-12-13 > 18:01:13.913271190 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c 2018-11-06 > 14:56:08.851174491 +0100 > @@ -1,35 +0,0 @@ > -/* PR tree-optimization/88462 */ > -/* { dg-do compile } */ > -/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 > -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > -/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte > vectors" 3 "vect" } } */ > -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 > "vect" } } */ > - > -__attribute__((noipa)) void > -f1 (double * __restrict__ a, const double * __restrict__ b, const int * > __restrict__ c, int n) > -{ > - int i; > -#pragma GCC ivdep > - for (i = 0; i < n; ++i) > - if (a[i] > 10.0) > - a[i] = b[c[i]]; > -} > - > -__attribute__((noipa)) void > -f2 (double * __restrict__ a, const double * __restrict__ b, const long * > __restrict__ c, int n) > -{ > - int i; > -#pragma GCC ivdep > - for (i = 0; i < n; ++i) > - if (a[i] > 10.0) > - a[i] = b[c[i]]; > -} > - > -__attribute__((noipa)) void > -f3 (float * __restrict__ a, const float * __restrict__ b, const int * > __restrict__ c, int n) > -{ > - int i; > -#pragma GCC ivdep > - for (i = 0; i < n; ++i) > - if (a[i] > 10.0f) > - a[i] = b[c[i]]; > -} > --- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj 2018-12-13 > 18:01:13.914271174 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c 2018-11-06 > 14:56:08.851174491 +0100 > @@ -1,51 +0,0 @@ > -/* PR tree-optimization/88462 */ > -/* { dg-do run { target { avx512f } } } */ > -/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 > -mtune=skylake-avx512" } */ > - > -#include "avx512f-check.h" > - > -#include "avx512f-pr88462-1.c" > - > -static void > -avx512f_test (void) > -{ > - double a[1024], b[1024]; > - float c[1024], f[1024]; > - int d[1024]; > - long e[1024]; > - int i; > - for (i = 0; i < 1024; 
i++) > - { > - asm volatile ("" : "+g" (i)); > - a[i] = (i % 3) != 0 ? 15.0 : -5.0; > - b[i] = 2 * i; > - d[i] = (i % 3) ? 1023 - i : __INT_MAX__; > - } > - f1 (a, b, d, 1024); > - for (i = 0; i < 1024; i++) > - { > - asm volatile ("" : "+g" (i)); > - if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) > - abort (); > - a[i] = (i % 3) != 1 ? 15.0 : -5.0; > - b[i] = 3 * i; > - e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; > - } > - f2 (a, b, e, 1024); > - for (i = 0; i < 1024; i++) > - { > - asm volatile ("" : "+g" (i)); > - if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0)) > - abort (); > - c[i] = (i % 3) != 2 ? 15.0f : -5.0f; > - d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; > - f[i] = 4 * i; > - } > - f3 (c, f, d, 1024); > - for (i = 0; i < 1024; i++) > - { > - asm volatile ("" : "+g" (i)); > - if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f)) > - abort (); > - } > -} > > Jakub >
On December 14, 2018 8:47:08 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote: >Hi! > >In the previous patch I've unfortunately left out one important case from >the >testcase and apparently it wasn't covered by anything else in the >testsuite. >The 3 functions covered float and double gathers with indexes with the >same >bitsize and WIDENING gather (double gather with int index), but didn't >cover >NARROWING case (float gather with long index with -m64). That was the >only >case that tried to permute the mask, unfortunately that isn't really >supported and ICEs. What works is VEC_UNPACK_{LO,HI}_EXPR on the >VECTOR_BOOLEAN_TYPE_P, that is what other spots in the vectorizer emit >for >those. > >I had to also fix up the x86 backend, which had in expansion of these >NARROWING gather builtins code cut&pasted from the 256-bit builtin, >unfortunately it wasn't adjusted for the fact that the 512-bit builtin >uses >integral mask argument while the 256-bit one doesn't. And even in the >256-bit one there was a bug, it relied on the mask and src arguments to >be >always in the same register (which is actually what the vectorizer >generates >for those right now, but it could do something else). > >This patch fixes that and enables also masked x86 AVX512F 512-bit >scatter support. > >Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? OK. Richard. > >What is still unhandled (doesn't vectorize) is 128-bit or 256-bit >scatters, >I bet the mask operand is vectorized using normal non-bool vectors, but >the >instructions with AVX512VL actually need a mask register. There are >instructions that can handle that, but let's defer that for later. > >2018-12-14 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/88464 > * tree-vect-stmts.c (vect_build_gather_load_calls): For NARROWING > and mask with integral masktype, don't try to permute mask vectors, > instead emit VEC_UNPACK_{LO,HI}_EXPR. Fix up NOP_EXPR operand. 
> (vectorizable_store): Handle masked scatters with decl and integral > mask type. > (permute_vec_elements): Allow scalar_dest to be NULL. > * config/i386/i386.c (ix86_get_builtin) > <case IX86_BUILTIN_GATHER3ALTDIV16SF>: Use lowpart_subreg for masks. > <case IX86_BUILTIN_GATHER3ALTDIV8SF>: Don't assume mask and src have > to be the same. > > * gcc.target/i386/avx512f-pr88462-1.c: Rename to ... > * gcc.target/i386/avx512f-pr88464-1.c: ... this. Fix up PR number. > Expect 4 vectorized loops instead of 3. > (f4): New function. > * gcc.target/i386/avx512f-pr88462-2.c: Rename to ... > * gcc.target/i386/avx512f-pr88464-2.c: ... this. Fix up PR number > and #include. > (avx512f_test): Prepare arguments for f4 and check the results. > * gcc.target/i386/avx512f-pr88464-3.c: New test. > * gcc.target/i386/avx512f-pr88464-4.c: New test. > >--- gcc/tree-vect-stmts.c.jj 2018-12-13 18:01:13.000000000 +0100 >+++ gcc/tree-vect-stmts.c 2018-12-14 17:10:42.079054458 +0100 >@@ -2655,6 +2655,7 @@ vect_build_gather_load_calls (stmt_vec_i > if (mask && TREE_CODE (masktype) == INTEGER_TYPE) > masktype = build_same_sized_truth_vector_type (srctype); > >+ tree mask_halftype = masktype; > tree perm_mask = NULL_TREE; > tree mask_perm_mask = NULL_TREE; > if (known_eq (nunits, gather_off_nunits)) >@@ -2690,13 +2691,16 @@ vect_build_gather_load_calls (stmt_vec_i > > ncopies *= 2; > >- if (mask) >+ if (mask && masktype == real_masktype) > { > for (int i = 0; i < count; ++i) > sel[i] = i | (count / 2); > indices.new_vector (sel, 2, count); > mask_perm_mask = vect_gen_perm_mask_checked (masktype, indices); > } >+ else if (mask) >+ mask_halftype >+ = build_same_sized_truth_vector_type (gs_info->offset_vectype); > } > else > gcc_unreachable (); >@@ -2761,7 +2765,7 @@ vect_build_gather_load_calls (stmt_vec_i > { > if (j == 0) > vec_mask = vect_get_vec_def_for_operand (mask, stmt_info); >- else >+ else if (modifier != NARROW || (j & 1) == 0) > vec_mask = vect_get_vec_def_for_stmt_copy (loop_vinfo, > 
vec_mask); > >@@ -2779,17 +2783,27 @@ vect_build_gather_load_calls (stmt_vec_i > mask_op = var; > } > } >+ if (modifier == NARROW && masktype != real_masktype) >+ { >+ var = vect_get_new_ssa_name (mask_halftype, vect_simple_var); >+ gassign *new_stmt >+ = gimple_build_assign (var, (j & 1) ? VEC_UNPACK_HI_EXPR >+ : VEC_UNPACK_LO_EXPR, >+ mask_op); >+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi); >+ mask_op = var; >+ } > src_op = mask_op; > } > > tree mask_arg = mask_op; > if (masktype != real_masktype) > { >- tree utype; >- if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype)) >+ tree utype, optype = TREE_TYPE (mask_op); >+ if (TYPE_MODE (real_masktype) == TYPE_MODE (optype)) > utype = real_masktype; > else >- utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1); >+ utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1); > var = vect_get_new_ssa_name (utype, vect_scalar_var); > mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op); > gassign *new_stmt >@@ -2801,7 +2815,7 @@ vect_build_gather_load_calls (stmt_vec_i > gcc_assert (TYPE_PRECISION (utype) > <= TYPE_PRECISION (real_masktype)); > var = vect_get_new_ssa_name (real_masktype, vect_scalar_var); >- new_stmt = gimple_build_assign (var, NOP_EXPR, utype); >+ new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg); > vect_finish_stmt_generation (stmt_info, new_stmt, gsi); > mask_arg = var; > } >@@ -6361,7 +6375,8 @@ vectorizable_store (stmt_vec_info stmt_i > return false; > } > else if (memory_access_type != VMAT_LOAD_STORE_LANES >- && (memory_access_type != VMAT_GATHER_SCATTER || >gs_info.decl)) >+ && (memory_access_type != VMAT_GATHER_SCATTER >+ || (gs_info.decl && !VECTOR_BOOLEAN_TYPE_P (mask_vectype)))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >@@ -6419,7 +6434,9 @@ vectorizable_store (stmt_vec_info stmt_i > tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, src; > tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); > tree 
rettype, srctype, ptrtype, idxtype, masktype, scaletype; >- tree ptr, mask, var, scale, perm_mask = NULL_TREE; >+ tree ptr, var, scale, vec_mask; >+ tree mask_arg = NULL_TREE, mask_op = NULL_TREE, perm_mask = >NULL_TREE; >+ tree mask_halfvectype = mask_vectype; > edge pe = loop_preheader_edge (loop); > gimple_seq seq; > basic_block new_bb; >@@ -6460,6 +6477,10 @@ vectorizable_store (stmt_vec_info stmt_i > perm_mask = vect_gen_perm_mask_checked (vectype, indices); > gcc_assert (perm_mask != NULL_TREE); > ncopies *= 2; >+ >+ if (mask) >+ mask_halfvectype >+ = build_same_sized_truth_vector_type (gs_info.offset_vectype); > } > else > gcc_unreachable (); >@@ -6482,10 +6503,11 @@ vectorizable_store (stmt_vec_info stmt_i > gcc_assert (!new_bb); > } > >- /* Currently we support only unconditional scatter stores, >- so mask should be all ones. */ >- mask = build_int_cst (masktype, -1); >- mask = vect_init_vector (stmt_info, mask, masktype, NULL); >+ if (mask == NULL_TREE) >+ { >+ mask_arg = build_int_cst (masktype, -1); >+ mask_arg = vect_init_vector (stmt_info, mask_arg, masktype, NULL); >+ } > > scale = build_int_cst (scaletype, gs_info.scale); > >@@ -6494,36 +6516,46 @@ vectorizable_store (stmt_vec_info stmt_i > { > if (j == 0) > { >- src = vec_oprnd1 >- = vect_get_vec_def_for_operand (op, stmt_info); >- op = vec_oprnd0 >- = vect_get_vec_def_for_operand (gs_info.offset, stmt_info); >+ src = vec_oprnd1 = vect_get_vec_def_for_operand (op, >stmt_info); >+ op = vec_oprnd0 = vect_get_vec_def_for_operand (gs_info.offset, >+ stmt_info); >+ if (mask) >+ mask_op = vec_mask = vect_get_vec_def_for_operand (mask, >+ stmt_info); > } > else if (modifier != NONE && (j & 1)) > { > if (modifier == WIDEN) > { >- src = vec_oprnd1 >- = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1); >+ src >+ = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, >+ vec_oprnd1); > op = permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask, > stmt_info, gsi); >+ if (mask) >+ mask_op >+ = vec_mask = 
vect_get_vec_def_for_stmt_copy (vinfo, >+ vec_mask); > } > else if (modifier == NARROW) > { > src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask, > stmt_info, gsi); >- op = vec_oprnd0 >- = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0); >+ op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo, >+ vec_oprnd0); > } > else > gcc_unreachable (); > } > else > { >- src = vec_oprnd1 >- = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1); >- op = vec_oprnd0 >- = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0); >+ src = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, >+ vec_oprnd1); >+ op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo, >+ vec_oprnd0); >+ if (mask) >+ mask_op = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo, >+ vec_mask); > } > > if (!useless_type_conversion_p (srctype, TREE_TYPE (src))) >@@ -6550,8 +6582,45 @@ vectorizable_store (stmt_vec_info stmt_i > op = var; > } > >+ if (mask) >+ { >+ tree utype; >+ mask_arg = mask_op; >+ if (modifier == NARROW) >+ { >+ var = vect_get_new_ssa_name (mask_halfvectype, >+ vect_simple_var); >+ gassign *new_stmt >+ = gimple_build_assign (var, (j & 1) ? 
VEC_UNPACK_HI_EXPR >+ : VEC_UNPACK_LO_EXPR, >+ mask_op); >+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi); >+ mask_arg = var; >+ } >+ tree optype = TREE_TYPE (mask_arg); >+ if (TYPE_MODE (masktype) == TYPE_MODE (optype)) >+ utype = masktype; >+ else >+ utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1); >+ var = vect_get_new_ssa_name (utype, vect_scalar_var); >+ mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_arg); >+ gassign *new_stmt >+ = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg); >+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi); >+ mask_arg = var; >+ if (!useless_type_conversion_p (masktype, utype)) >+ { >+ gcc_assert (TYPE_PRECISION (utype) >+ <= TYPE_PRECISION (masktype)); >+ var = vect_get_new_ssa_name (masktype, vect_scalar_var); >+ new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg); >+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi); >+ mask_arg = var; >+ } >+ } >+ > gcall *new_stmt >- = gimple_build_call (gs_info.decl, 5, ptr, mask, op, src, scale); >+ = gimple_build_call (gs_info.decl, 5, ptr, mask_arg, op, src, >scale); > stmt_vec_info new_stmt_info > = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); > >@@ -7284,7 +7353,7 @@ permute_vec_elements (tree x, tree y, tr > gimple *perm_stmt; > > tree scalar_dest = gimple_get_lhs (stmt_info->stmt); >- if (TREE_CODE (scalar_dest) == SSA_NAME) >+ if (scalar_dest && TREE_CODE (scalar_dest) == SSA_NAME) > perm_dest = vect_create_destination_var (scalar_dest, vectype); > else > perm_dest = vect_get_new_vect_var (vectype, vect_simple_var, NULL); >--- gcc/config/i386/i386.c.jj 2018-12-13 13:45:11.000000000 +0100 >+++ gcc/config/i386/i386.c 2018-12-14 17:34:11.131135056 +0100 >@@ -37605,13 +37605,7 @@ rdseed_step: > op0 = copy_to_mode_reg (GET_MODE (op0), op0); > emit_insn (gen (half, op0)); > op0 = half; >- if (GET_MODE (op3) != VOIDmode) >- { >- if (!nonimmediate_operand (op3, GET_MODE (op3))) >- op3 = copy_to_mode_reg (GET_MODE (op3), op3); >- 
emit_insn (gen (half, op3)); >- op3 = half; >- } >+ op3 = lowpart_subreg (QImode, op3, HImode); > break; > case IX86_BUILTIN_GATHER3ALTDIV8SF: > case IX86_BUILTIN_GATHER3ALTDIV8SI: >@@ -37628,6 +37622,7 @@ rdseed_step: > op0 = half; > if (GET_MODE (op3) != VOIDmode) > { >+ half = gen_reg_rtx (mode0); > if (!nonimmediate_operand (op3, GET_MODE (op3))) > op3 = copy_to_mode_reg (GET_MODE (op3), op3); > emit_insn (gen (half, op3)); >--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c.jj 2018-12-14 >16:34:55.361955571 +0100 >+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c 2018-12-14 >18:07:25.694686784 +0100 >@@ -0,0 +1,45 @@ >+/* PR tree-optimization/88464 */ >+/* { dg-do compile } */ >+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 >-mtune=skylake-avx512 -fdump-tree-vect-details" } */ >+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte >vectors" 4 "vect" } } */ >+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" >4 "vect" } } */ >+ >+__attribute__((noipa)) void >+f1 (double * __restrict__ a, const double * __restrict__ b, const int >* __restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (a[i] > 10.0) >+ a[i] = b[c[i]]; >+} >+ >+__attribute__((noipa)) void >+f2 (double * __restrict__ a, const double * __restrict__ b, const long >* __restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (a[i] > 10.0) >+ a[i] = b[c[i]]; >+} >+ >+__attribute__((noipa)) void >+f3 (float * __restrict__ a, const float * __restrict__ b, const int * >__restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (a[i] > 10.0f) >+ a[i] = b[c[i]]; >+} >+ >+__attribute__((noipa)) void >+f4 (float * __restrict__ a, const float * __restrict__ b, const long * >__restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (a[i] > 10.0f) >+ a[i] = b[c[i]]; >+} >--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c.jj 
2018-12-14 >16:35:00.681869029 +0100 >+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c 2018-12-14 >17:43:40.294876267 +0100 >@@ -0,0 +1,61 @@ >+/* PR tree-optimization/88464 */ >+/* { dg-do run { target { avx512f } } } */ >+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 >-mtune=skylake-avx512" } */ >+ >+#include "avx512f-check.h" >+ >+#include "avx512f-pr88464-1.c" >+ >+static void >+avx512f_test (void) >+{ >+ double a[1024], b[1024]; >+ float c[1024], f[1024]; >+ int d[1024]; >+ long e[1024]; >+ int i; >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ a[i] = (i % 3) != 0 ? 15.0 : -5.0; >+ b[i] = 2 * i; >+ d[i] = (i % 3) ? 1023 - i : __INT_MAX__; >+ } >+ f1 (a, b, d, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) >+ abort (); >+ a[i] = (i % 3) != 1 ? 15.0 : -5.0; >+ b[i] = 3 * i; >+ e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; >+ } >+ f2 (a, b, e, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0)) >+ abort (); >+ c[i] = (i % 3) != 2 ? 15.0f : -5.0f; >+ d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; >+ f[i] = 4 * i; >+ } >+ f3 (c, f, d, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f)) >+ abort (); >+ c[i] = (i % 3) != 0 ? 15.0f : -5.0f; >+ e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; >+ f[i] = 5 * i; >+ } >+ f4 (c, f, e, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (c[i] != ((i % 3) != 0 ? 
(1023 - i) * 5.0f : -5.0f)) >+ abort (); >+ } >+} >--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c.jj 2018-12-14 >18:01:19.297647800 +0100 >+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c 2018-12-14 >18:07:14.906862302 +0100 >@@ -0,0 +1,45 @@ >+/* PR tree-optimization/88464 */ >+/* { dg-do compile } */ >+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 >-mtune=skylake-avx512 -fdump-tree-vect-details" } */ >+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte >vectors" 4 "vect" } } */ >+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" >4 "vect" } } */ >+ >+__attribute__((noipa)) void >+f1 (double * __restrict__ a, const double * __restrict__ b, const int >* __restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (b[i] > -2.0) >+ a[c[i]] = b[i]; >+} >+ >+__attribute__((noipa)) void >+f2 (double * __restrict__ a, const double * __restrict__ b, const long >* __restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (b[i] > -2.0) >+ a[c[i]] = b[i]; >+} >+ >+__attribute__((noipa)) void >+f3 (float * __restrict__ a, const float * __restrict__ b, const int * >__restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (b[i] > -2.0f) >+ a[c[i]] = b[i]; >+} >+ >+__attribute__((noipa)) void >+f4 (float * __restrict__ a, const float * __restrict__ b, const long * >__restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (b[i] > -2.0f) >+ a[c[i]] = b[i]; >+} >--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c.jj 2018-12-14 >18:03:03.100958998 +0100 >+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c 2018-12-14 >18:12:32.209699741 +0100 >@@ -0,0 +1,61 @@ >+/* PR tree-optimization/88464 */ >+/* { dg-do run { target { avx512f } } } */ >+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 >-mtune=skylake-avx512" } */ >+ >+#include "avx512f-check.h" >+ >+#include "avx512f-pr88464-3.c" >+ 
>+static void >+avx512f_test (void) >+{ >+ double a[1024], b[1024]; >+ float c[1024], f[1024]; >+ int d[1024]; >+ long e[1024]; >+ int i; >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ a[i] = -5.0; >+ b[i] = (i % 3) != 0 ? 2.0 * i : -5.0; >+ d[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; >+ } >+ f1 (a, b, d, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) >+ abort (); >+ a[i] = -5.0; >+ b[i] = (i % 3) != 1 ? 3.0 * i : -5.0; >+ e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; >+ } >+ f2 (a, b, e, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (a[i] != ((i % 3) != 2 ? (1023 - i) * 3.0 : -5.0)) >+ abort (); >+ c[i] = -5.0f; >+ d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; >+ f[i] = (i % 3) != 2 ? 4.0f * i : -5.0f; >+ } >+ f3 (c, f, d, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (c[i] != ((i % 3) != 1 ? (1023 - i) * 4.0f : -5.0f)) >+ abort (); >+ c[i] = -5.0f; >+ e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; >+ f[i] = (i % 3) != 0 ? 5.0f * i : -5.0f; >+ } >+ f4 (c, f, e, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (c[i] != ((i % 3) != 0 ? 
(1023 - i) * 5.0f : -5.0f)) >+ abort (); >+ } >+} >--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj 2018-12-13 >18:01:13.913271190 +0100 >+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c 2018-11-06 >14:56:08.851174491 +0100 >@@ -1,35 +0,0 @@ >-/* PR tree-optimization/88462 */ >-/* { dg-do compile } */ >-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 >-mtune=skylake-avx512 -fdump-tree-vect-details" } */ >-/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte >vectors" 3 "vect" } } */ >-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" >3 "vect" } } */ >- >-__attribute__((noipa)) void >-f1 (double * __restrict__ a, const double * __restrict__ b, const int >* __restrict__ c, int n) >-{ >- int i; >-#pragma GCC ivdep >- for (i = 0; i < n; ++i) >- if (a[i] > 10.0) >- a[i] = b[c[i]]; >-} >- >-__attribute__((noipa)) void >-f2 (double * __restrict__ a, const double * __restrict__ b, const long >* __restrict__ c, int n) >-{ >- int i; >-#pragma GCC ivdep >- for (i = 0; i < n; ++i) >- if (a[i] > 10.0) >- a[i] = b[c[i]]; >-} >- >-__attribute__((noipa)) void >-f3 (float * __restrict__ a, const float * __restrict__ b, const int * >__restrict__ c, int n) >-{ >- int i; >-#pragma GCC ivdep >- for (i = 0; i < n; ++i) >- if (a[i] > 10.0f) >- a[i] = b[c[i]]; >-} >--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj 2018-12-13 >18:01:13.914271174 +0100 >+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c 2018-11-06 >14:56:08.851174491 +0100 >@@ -1,51 +0,0 @@ >-/* PR tree-optimization/88462 */ >-/* { dg-do run { target { avx512f } } } */ >-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 >-mtune=skylake-avx512" } */ >- >-#include "avx512f-check.h" >- >-#include "avx512f-pr88462-1.c" >- >-static void >-avx512f_test (void) >-{ >- double a[1024], b[1024]; >- float c[1024], f[1024]; >- int d[1024]; >- long e[1024]; >- int i; >- for (i = 0; i < 1024; i++) >- { >- asm volatile ("" : "+g" (i)); >- a[i] = (i % 3) != 0 ? 
15.0 : -5.0; >- b[i] = 2 * i; >- d[i] = (i % 3) ? 1023 - i : __INT_MAX__; >- } >- f1 (a, b, d, 1024); >- for (i = 0; i < 1024; i++) >- { >- asm volatile ("" : "+g" (i)); >- if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) >- abort (); >- a[i] = (i % 3) != 1 ? 15.0 : -5.0; >- b[i] = 3 * i; >- e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; >- } >- f2 (a, b, e, 1024); >- for (i = 0; i < 1024; i++) >- { >- asm volatile ("" : "+g" (i)); >- if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0)) >- abort (); >- c[i] = (i % 3) != 2 ? 15.0f : -5.0f; >- d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; >- f[i] = 4 * i; >- } >- f3 (c, f, d, 1024); >- for (i = 0; i < 1024; i++) >- { >- asm volatile ("" : "+g" (i)); >- if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f)) >- abort (); >- } >-} > > Jakub
--- gcc/tree-vect-stmts.c.jj 2018-12-13 18:01:13.000000000 +0100 +++ gcc/tree-vect-stmts.c 2018-12-14 17:10:42.079054458 +0100 @@ -2655,6 +2655,7 @@ vect_build_gather_load_calls (stmt_vec_i if (mask && TREE_CODE (masktype) == INTEGER_TYPE) masktype = build_same_sized_truth_vector_type (srctype); + tree mask_halftype = masktype; tree perm_mask = NULL_TREE; tree mask_perm_mask = NULL_TREE; if (known_eq (nunits, gather_off_nunits)) @@ -2690,13 +2691,16 @@ vect_build_gather_load_calls (stmt_vec_i ncopies *= 2; - if (mask) + if (mask && masktype == real_masktype) { for (int i = 0; i < count; ++i) sel[i] = i | (count / 2); indices.new_vector (sel, 2, count); mask_perm_mask = vect_gen_perm_mask_checked (masktype, indices); } + else if (mask) + mask_halftype + = build_same_sized_truth_vector_type (gs_info->offset_vectype); } else gcc_unreachable (); @@ -2761,7 +2765,7 @@ vect_build_gather_load_calls (stmt_vec_i { if (j == 0) vec_mask = vect_get_vec_def_for_operand (mask, stmt_info); - else + else if (modifier != NARROW || (j & 1) == 0) vec_mask = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_mask); @@ -2779,17 +2783,27 @@ vect_build_gather_load_calls (stmt_vec_i mask_op = var; } } + if (modifier == NARROW && masktype != real_masktype) + { + var = vect_get_new_ssa_name (mask_halftype, vect_simple_var); + gassign *new_stmt + = gimple_build_assign (var, (j & 1) ? 
VEC_UNPACK_HI_EXPR + : VEC_UNPACK_LO_EXPR, + mask_op); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + mask_op = var; + } src_op = mask_op; } tree mask_arg = mask_op; if (masktype != real_masktype) { - tree utype; - if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype)) + tree utype, optype = TREE_TYPE (mask_op); + if (TYPE_MODE (real_masktype) == TYPE_MODE (optype)) utype = real_masktype; else - utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1); + utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1); var = vect_get_new_ssa_name (utype, vect_scalar_var); mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op); gassign *new_stmt @@ -2801,7 +2815,7 @@ vect_build_gather_load_calls (stmt_vec_i gcc_assert (TYPE_PRECISION (utype) <= TYPE_PRECISION (real_masktype)); var = vect_get_new_ssa_name (real_masktype, vect_scalar_var); - new_stmt = gimple_build_assign (var, NOP_EXPR, utype); + new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg); vect_finish_stmt_generation (stmt_info, new_stmt, gsi); mask_arg = var; } @@ -6361,7 +6375,8 @@ vectorizable_store (stmt_vec_info stmt_i return false; } else if (memory_access_type != VMAT_LOAD_STORE_LANES - && (memory_access_type != VMAT_GATHER_SCATTER || gs_info.decl)) + && (memory_access_type != VMAT_GATHER_SCATTER + || (gs_info.decl && !VECTOR_BOOLEAN_TYPE_P (mask_vectype)))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -6419,7 +6434,9 @@ vectorizable_store (stmt_vec_info stmt_i tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, src; tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); tree rettype, srctype, ptrtype, idxtype, masktype, scaletype; - tree ptr, mask, var, scale, perm_mask = NULL_TREE; + tree ptr, var, scale, vec_mask; + tree mask_arg = NULL_TREE, mask_op = NULL_TREE, perm_mask = NULL_TREE; + tree mask_halfvectype = mask_vectype; edge pe = loop_preheader_edge (loop); gimple_seq seq; basic_block new_bb; @@ -6460,6 +6477,10 @@ 
vectorizable_store (stmt_vec_info stmt_i perm_mask = vect_gen_perm_mask_checked (vectype, indices); gcc_assert (perm_mask != NULL_TREE); ncopies *= 2; + + if (mask) + mask_halfvectype + = build_same_sized_truth_vector_type (gs_info.offset_vectype); } else gcc_unreachable (); @@ -6482,10 +6503,11 @@ vectorizable_store (stmt_vec_info stmt_i gcc_assert (!new_bb); } - /* Currently we support only unconditional scatter stores, - so mask should be all ones. */ - mask = build_int_cst (masktype, -1); - mask = vect_init_vector (stmt_info, mask, masktype, NULL); + if (mask == NULL_TREE) + { + mask_arg = build_int_cst (masktype, -1); + mask_arg = vect_init_vector (stmt_info, mask_arg, masktype, NULL); + } scale = build_int_cst (scaletype, gs_info.scale); @@ -6494,36 +6516,46 @@ vectorizable_store (stmt_vec_info stmt_i { if (j == 0) { - src = vec_oprnd1 - = vect_get_vec_def_for_operand (op, stmt_info); - op = vec_oprnd0 - = vect_get_vec_def_for_operand (gs_info.offset, stmt_info); + src = vec_oprnd1 = vect_get_vec_def_for_operand (op, stmt_info); + op = vec_oprnd0 = vect_get_vec_def_for_operand (gs_info.offset, + stmt_info); + if (mask) + mask_op = vec_mask = vect_get_vec_def_for_operand (mask, + stmt_info); } else if (modifier != NONE && (j & 1)) { if (modifier == WIDEN) { - src = vec_oprnd1 - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1); + src + = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, + vec_oprnd1); op = permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask, stmt_info, gsi); + if (mask) + mask_op + = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo, + vec_mask); } else if (modifier == NARROW) { src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask, stmt_info, gsi); - op = vec_oprnd0 - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0); + op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo, + vec_oprnd0); } else gcc_unreachable (); } else { - src = vec_oprnd1 - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1); - op = vec_oprnd0 - = 
vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0); + src = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, + vec_oprnd1); + op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo, + vec_oprnd0); + if (mask) + mask_op = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo, + vec_mask); } if (!useless_type_conversion_p (srctype, TREE_TYPE (src))) @@ -6550,8 +6582,45 @@ vectorizable_store (stmt_vec_info stmt_i op = var; } + if (mask) + { + tree utype; + mask_arg = mask_op; + if (modifier == NARROW) + { + var = vect_get_new_ssa_name (mask_halfvectype, + vect_simple_var); + gassign *new_stmt + = gimple_build_assign (var, (j & 1) ? VEC_UNPACK_HI_EXPR + : VEC_UNPACK_LO_EXPR, + mask_op); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + mask_arg = var; + } + tree optype = TREE_TYPE (mask_arg); + if (TYPE_MODE (masktype) == TYPE_MODE (optype)) + utype = masktype; + else + utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1); + var = vect_get_new_ssa_name (utype, vect_scalar_var); + mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_arg); + gassign *new_stmt + = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + mask_arg = var; + if (!useless_type_conversion_p (masktype, utype)) + { + gcc_assert (TYPE_PRECISION (utype) + <= TYPE_PRECISION (masktype)); + var = vect_get_new_ssa_name (masktype, vect_scalar_var); + new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + mask_arg = var; + } + } + gcall *new_stmt - = gimple_build_call (gs_info.decl, 5, ptr, mask, op, src, scale); + = gimple_build_call (gs_info.decl, 5, ptr, mask_arg, op, src, scale); stmt_vec_info new_stmt_info = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); @@ -7284,7 +7353,7 @@ permute_vec_elements (tree x, tree y, tr gimple *perm_stmt; tree scalar_dest = gimple_get_lhs (stmt_info->stmt); - if (TREE_CODE (scalar_dest) == SSA_NAME) + if (scalar_dest && 
TREE_CODE (scalar_dest) == SSA_NAME) perm_dest = vect_create_destination_var (scalar_dest, vectype); else perm_dest = vect_get_new_vect_var (vectype, vect_simple_var, NULL); --- gcc/config/i386/i386.c.jj 2018-12-13 13:45:11.000000000 +0100 +++ gcc/config/i386/i386.c 2018-12-14 17:34:11.131135056 +0100 @@ -37605,13 +37605,7 @@ rdseed_step: op0 = copy_to_mode_reg (GET_MODE (op0), op0); emit_insn (gen (half, op0)); op0 = half; - if (GET_MODE (op3) != VOIDmode) - { - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - } + op3 = lowpart_subreg (QImode, op3, HImode); break; case IX86_BUILTIN_GATHER3ALTDIV8SF: case IX86_BUILTIN_GATHER3ALTDIV8SI: @@ -37628,6 +37622,7 @@ rdseed_step: op0 = half; if (GET_MODE (op3) != VOIDmode) { + half = gen_reg_rtx (mode0); if (!nonimmediate_operand (op3, GET_MODE (op3))) op3 = copy_to_mode_reg (GET_MODE (op3), op3); emit_insn (gen (half, op3)); --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c.jj 2018-12-14 16:34:55.361955571 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c 2018-12-14 18:07:25.694686784 +0100 @@ -0,0 +1,45 @@ +/* PR tree-optimization/88464 */ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +__attribute__((noipa)) void +f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0) + a[i] = b[c[i]]; +} + +__attribute__((noipa)) void +f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0) + a[i] = b[c[i]]; +} + 
+__attribute__((noipa)) void +f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0f) + a[i] = b[c[i]]; +} + +__attribute__((noipa)) void +f4 (float * __restrict__ a, const float * __restrict__ b, const long * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0f) + a[i] = b[c[i]]; +} --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c.jj 2018-12-14 16:35:00.681869029 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c 2018-12-14 17:43:40.294876267 +0100 @@ -0,0 +1,61 @@ +/* PR tree-optimization/88464 */ +/* { dg-do run { target { avx512f } } } */ +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */ + +#include "avx512f-check.h" + +#include "avx512f-pr88464-1.c" + +static void +avx512f_test (void) +{ + double a[1024], b[1024]; + float c[1024], f[1024]; + int d[1024]; + long e[1024]; + int i; + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + a[i] = (i % 3) != 0 ? 15.0 : -5.0; + b[i] = 2 * i; + d[i] = (i % 3) ? 1023 - i : __INT_MAX__; + } + f1 (a, b, d, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) + abort (); + a[i] = (i % 3) != 1 ? 15.0 : -5.0; + b[i] = 3 * i; + e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; + } + f2 (a, b, e, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0)) + abort (); + c[i] = (i % 3) != 2 ? 15.0f : -5.0f; + d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; + f[i] = 4 * i; + } + f3 (c, f, d, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f)) + abort (); + c[i] = (i % 3) != 0 ? 15.0f : -5.0f; + e[i] = (i % 3) != 0 ? 
1023 - i : __INT_MAX__; + f[i] = 5 * i; + } + f4 (c, f, e, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (c[i] != ((i % 3) != 0 ? (1023 - i) * 5.0f : -5.0f)) + abort (); + } +} --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c.jj 2018-12-14 18:01:19.297647800 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c 2018-12-14 18:07:14.906862302 +0100 @@ -0,0 +1,45 @@ +/* PR tree-optimization/88464 */ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +__attribute__((noipa)) void +f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (b[i] > -2.0) + a[c[i]] = b[i]; +} + +__attribute__((noipa)) void +f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (b[i] > -2.0) + a[c[i]] = b[i]; +} + +__attribute__((noipa)) void +f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (b[i] > -2.0f) + a[c[i]] = b[i]; +} + +__attribute__((noipa)) void +f4 (float * __restrict__ a, const float * __restrict__ b, const long * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (b[i] > -2.0f) + a[c[i]] = b[i]; +} --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c.jj 2018-12-14 18:03:03.100958998 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c 2018-12-14 18:12:32.209699741 +0100 @@ -0,0 +1,61 @@ +/* PR tree-optimization/88464 */ +/* { dg-do run { target { avx512f } } } */ +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 
-mtune=skylake-avx512" } */ + +#include "avx512f-check.h" + +#include "avx512f-pr88464-3.c" + +static void +avx512f_test (void) +{ + double a[1024], b[1024]; + float c[1024], f[1024]; + int d[1024]; + long e[1024]; + int i; + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + a[i] = -5.0; + b[i] = (i % 3) != 0 ? 2.0 * i : -5.0; + d[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; + } + f1 (a, b, d, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) + abort (); + a[i] = -5.0; + b[i] = (i % 3) != 1 ? 3.0 * i : -5.0; + e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; + } + f2 (a, b, e, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (a[i] != ((i % 3) != 2 ? (1023 - i) * 3.0 : -5.0)) + abort (); + c[i] = -5.0f; + d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; + f[i] = (i % 3) != 2 ? 4.0f * i : -5.0f; + } + f3 (c, f, d, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (c[i] != ((i % 3) != 1 ? (1023 - i) * 4.0f : -5.0f)) + abort (); + c[i] = -5.0f; + e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; + f[i] = (i % 3) != 0 ? 5.0f * i : -5.0f; + } + f4 (c, f, e, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (c[i] != ((i % 3) != 0 ? 
(1023 - i) * 5.0f : -5.0f)) + abort (); + } +} --- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj 2018-12-13 18:01:13.913271190 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c 2018-11-06 14:56:08.851174491 +0100 @@ -1,35 +0,0 @@ -/* PR tree-optimization/88462 */ -/* { dg-do compile } */ -/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ -/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 3 "vect" } } */ -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 "vect" } } */ - -__attribute__((noipa)) void -f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n) -{ - int i; -#pragma GCC ivdep - for (i = 0; i < n; ++i) - if (a[i] > 10.0) - a[i] = b[c[i]]; -} - -__attribute__((noipa)) void -f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n) -{ - int i; -#pragma GCC ivdep - for (i = 0; i < n; ++i) - if (a[i] > 10.0) - a[i] = b[c[i]]; -} - -__attribute__((noipa)) void -f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n) -{ - int i; -#pragma GCC ivdep - for (i = 0; i < n; ++i) - if (a[i] > 10.0f) - a[i] = b[c[i]]; -} --- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj 2018-12-13 18:01:13.914271174 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c 2018-11-06 14:56:08.851174491 +0100 @@ -1,51 +0,0 @@ -/* PR tree-optimization/88462 */ -/* { dg-do run { target { avx512f } } } */ -/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */ - -#include "avx512f-check.h" - -#include "avx512f-pr88462-1.c" - -static void -avx512f_test (void) -{ - double a[1024], b[1024]; - float c[1024], f[1024]; - int d[1024]; - long e[1024]; - int i; - for (i = 0; i < 1024; i++) - { - asm volatile ("" : "+g" (i)); - a[i] = (i % 3) != 0 ? 15.0 : -5.0; - b[i] = 2 * i; - d[i] = (i % 3) ? 
1023 - i : __INT_MAX__; - } - f1 (a, b, d, 1024); - for (i = 0; i < 1024; i++) - { - asm volatile ("" : "+g" (i)); - if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) - abort (); - a[i] = (i % 3) != 1 ? 15.0 : -5.0; - b[i] = 3 * i; - e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; - } - f2 (a, b, e, 1024); - for (i = 0; i < 1024; i++) - { - asm volatile ("" : "+g" (i)); - if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0)) - abort (); - c[i] = (i % 3) != 2 ? 15.0f : -5.0f; - d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; - f[i] = 4 * i; - } - f3 (c, f, d, 1024); - for (i = 0; i < 1024; i++) - { - asm volatile ("" : "+g" (i)); - if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f)) - abort (); - } -}