Message ID | 20111104132231.GN1052@tyan-ft48-01.lab.bos.redhat.com |
---|---|
State | New |
Headers | show |
On Fri, Nov 4, 2011 at 2:22 PM, Jakub Jelinek <jakub@redhat.com> wrote: > Hi! > > On Fri, Nov 04, 2011 at 12:21:49PM +0100, Richard Guenther wrote: >> Ok. I guess it's ok to use builtins for now - I didn't think of >> the memory reference issue ;) > > Based on IRC discussion I'm posting an updated patch (both former > patches (base and incremental) in one). > > I'm now using expr_invariant_in_loop_p instead of chrec_contains*, > which nicely handles also the is_gimple_min_invariant case, > and I've added several comments and fixed the MEM_REF offset > folding. Smoke tested on the *gather* testcases, will do full > bootstrap/regtest soon. Ok for the vectorizer pieces, I'll defer to x86 maintainers for the target bits. Thanks, Richard. > 2011-11-04 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/50789 > * tree-vect-stmts.c (process_use): Add force argument, avoid > exist_non_indexing_operands_for_use_p check if true. > (vect_mark_stmts_to_be_vectorized): Adjust callers. Handle > STMT_VINFO_GATHER_P. > (gen_perm_mask): New function. > (perm_mask_for_reverse): Use it. > (reverse_vec_element): Rename to... > (permute_vec_elements): ... this. Add Y and MASK_VEC arguments, > generalize for any permutations. > (vectorizable_load): Adjust caller. Handle STMT_VINFO_GATHER_P. > * target.def (TARGET_VECTORIZE_BUILTIN_GATHER): New hook. > * doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_GATHER): Document it. > * doc/tm.texi: Regenerate. > * tree-data-ref.c (initialize_data_dependence_relation, > compute_self_dependence): No longer static. > * tree-data-ref.h (initialize_data_dependence_relation, > compute_self_dependence): New prototypes. > * tree-vect-data-refs.c (vect_check_gather): New function. > (vect_analyze_data_refs): Detect possible gather load data > refs. > * tree-vectorizer.h (struct _stmt_vec_info): Add gather_p field. > (STMT_VINFO_GATHER_P): Define. > (vect_check_gather): New prototype. 
> * config/i386/i386-builtin-types.def: Add types for alternate > gather builtins. > * config/i386/sse.md (AVXMODE48P_DI): Remove. > (VEC_GATHER_MODE): Rename mode_attr to... > (VEC_GATHER_IDXSI): ... this. > (VEC_GATHER_IDXDI, VEC_GATHER_SRCDI): New mode_attrs. > (avx2_gathersi<mode>, *avx2_gathersi<mode>): Use <VEC_GATHER_IDXSI> > instead of <VEC_GATHER_MODE>. > (avx2_gatherdi<mode>): Use <VEC_GATHER_IDXDI> instead of > <<AVXMODE48P_DI> and <VEC_GATHER_SRCDI> instead of VEC_GATHER_MODE > on src and mask operands. > (*avx2_gatherdi<mode>): Likewise. Use VEC_GATHER_MODE iterator > instead of AVXMODE48P_DI. > (avx2_gatherdi<mode>256, *avx2_gatherdi<mode>256): Removed. > * config/i386/i386.c (enum ix86_builtins): Add > IX86_BUILTIN_GATHERALTSIV4DF, IX86_BUILTIN_GATHERALTDIV8SF, > IX86_BUILTIN_GATHERALTSIV4DI and IX86_BUILTIN_GATHERALTDIV8SI. > (ix86_init_mmx_sse_builtins): Create those builtins. > (ix86_expand_builtin): Handle those builtins and adjust expansions > of other gather builtins. > (ix86_vectorize_builtin_gather): New function. > (TARGET_VECTORIZE_BUILTIN_GATHER): Define. > > * gcc.target/i386/avx2-gather-1.c: New test. > * gcc.target/i386/avx2-gather-2.c: New test. > * gcc.target/i386/avx2-gather-3.c: New test. > * gcc.target/i386/avx2-gather-4.c: New test. > > --- gcc/tree-vect-stmts.c.jj 2011-11-04 08:52:19.000000000 +0100 > +++ gcc/tree-vect-stmts.c 2011-11-04 08:54:11.000000000 +0100 > @@ -332,6 +332,8 @@ exist_non_indexing_operands_for_use_p (t > - LIVE_P, RELEVANT - enum values to be set in the STMT_VINFO of the stmt > that defined USE. This is done by calling mark_relevant and passing it > the WORKLIST (to add DEF_STMT to the WORKLIST in case it is relevant). > + - FORCE is true if exist_non_indexing_operands_for_use_p check shouldn't > + be performed. 
> > Outputs: > Generally, LIVE_P and RELEVANT are used to define the liveness and > @@ -351,7 +353,8 @@ exist_non_indexing_operands_for_use_p (t > > static bool > process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p, > - enum vect_relevant relevant, VEC(gimple,heap) **worklist) > + enum vect_relevant relevant, VEC(gimple,heap) **worklist, > + bool force) > { > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); > @@ -363,7 +366,7 @@ process_use (gimple stmt, tree use, loop > > /* case 1: we are only interested in uses that need to be vectorized. Uses > that are used for address computation are not considered relevant. */ > - if (!exist_non_indexing_operands_for_use_p (use, stmt)) > + if (!force && !exist_non_indexing_operands_for_use_p (use, stmt)) > return true; > > if (!vect_is_simple_use (use, loop_vinfo, NULL, &def_stmt, &def, &dt)) > @@ -646,7 +649,7 @@ vect_mark_stmts_to_be_vectorized (loop_v > break; > } > > - if (is_pattern_stmt_p (vinfo_for_stmt (stmt))) > + if (is_pattern_stmt_p (stmt_vinfo)) > { > /* Pattern statements are not inserted into the code, so > FOR_EACH_PHI_OR_STMT_USE optimizes their operands out, and we > @@ -660,9 +663,9 @@ vect_mark_stmts_to_be_vectorized (loop_v > if (rhs_code == COND_EXPR && COMPARISON_CLASS_P (op)) > { > if (!process_use (stmt, TREE_OPERAND (op, 0), loop_vinfo, > - live_p, relevant, &worklist) > + live_p, relevant, &worklist, false) > || !process_use (stmt, TREE_OPERAND (op, 1), loop_vinfo, > - live_p, relevant, &worklist)) > + live_p, relevant, &worklist, false)) > { > VEC_free (gimple, heap, worklist); > return false; > @@ -673,7 +676,7 @@ vect_mark_stmts_to_be_vectorized (loop_v > { > op = gimple_op (stmt, i); > if (!process_use (stmt, op, loop_vinfo, live_p, relevant, > - &worklist)) > + &worklist, false)) > { > VEC_free (gimple, heap, worklist); > return false; > @@ -686,7 +689,7 @@ vect_mark_stmts_to_be_vectorized (loop_v > { > tree arg = 
gimple_call_arg (stmt, i); > if (!process_use (stmt, arg, loop_vinfo, live_p, relevant, > - &worklist)) > + &worklist, false)) > { > VEC_free (gimple, heap, worklist); > return false; > @@ -699,12 +702,25 @@ vect_mark_stmts_to_be_vectorized (loop_v > { > tree op = USE_FROM_PTR (use_p); > if (!process_use (stmt, op, loop_vinfo, live_p, relevant, > - &worklist)) > + &worklist, false)) > { > VEC_free (gimple, heap, worklist); > return false; > } > } > + > + if (STMT_VINFO_GATHER_P (stmt_vinfo)) > + { > + tree off; > + tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL); > + gcc_assert (decl); > + if (!process_use (stmt, off, loop_vinfo, live_p, relevant, > + &worklist, true)) > + { > + VEC_free (gimple, heap, worklist); > + return false; > + } > + } > } /* while worklist */ > > VEC_free (gimple, heap, worklist); > @@ -4142,23 +4158,17 @@ vectorizable_store (gimple stmt, gimple_ > return true; > } > > -/* Given a vector type VECTYPE returns a builtin DECL to be used > - for vector permutation and returns the mask that implements > - reversal of the vector elements. If that is impossible to do, > - returns NULL. */ > +/* Given a vector type VECTYPE and permutation SEL returns > + the VECTOR_CST mask that implements the permutation of the > + vector elements. If that is impossible to do, returns NULL. 
*/ > > static tree > -perm_mask_for_reverse (tree vectype) > +gen_perm_mask (tree vectype, unsigned char *sel) > { > tree mask_elt_type, mask_type, mask_vec; > int i, nunits; > - unsigned char *sel; > > nunits = TYPE_VECTOR_SUBPARTS (vectype); > - sel = XALLOCAVEC (unsigned char, nunits); > - > - for (i = 0; i < nunits; ++i) > - sel[i] = nunits - 1 - i; > > if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) > return NULL; > @@ -4169,33 +4179,52 @@ perm_mask_for_reverse (tree vectype) > mask_type = get_vectype_for_scalar_type (mask_elt_type); > > mask_vec = NULL; > - for (i = 0; i < nunits; i++) > - mask_vec = tree_cons (NULL, build_int_cst (mask_elt_type, i), mask_vec); > + for (i = nunits - 1; i >= 0; i--) > + mask_vec = tree_cons (NULL, build_int_cst (mask_elt_type, sel[i]), > + mask_vec); > mask_vec = build_vector (mask_type, mask_vec); > > return mask_vec; > } > > -/* Given a vector variable X, that was generated for the scalar LHS of > - STMT, generate instructions to reverse the vector elements of X, > - insert them a *GSI and return the permuted vector variable. */ > +/* Given a vector type VECTYPE returns the VECTOR_CST mask that implements > + reversal of the vector elements. If that is impossible to do, > + returns NULL. */ > > static tree > -reverse_vec_elements (tree x, gimple stmt, gimple_stmt_iterator *gsi) > +perm_mask_for_reverse (tree vectype) > +{ > + int i, nunits; > + unsigned char *sel; > + > + nunits = TYPE_VECTOR_SUBPARTS (vectype); > + sel = XALLOCAVEC (unsigned char, nunits); > + > + for (i = 0; i < nunits; ++i) > + sel[i] = nunits - 1 - i; > + > + return gen_perm_mask (vectype, sel); > +} > + > +/* Given a vector variable X and Y, that was generated for the scalar > + STMT, generate instructions to permute the vector elements of X and Y > + using permutation mask MASK_VEC, insert them at *GSI and return the > + permuted vector variable. 
*/ > + > +static tree > +permute_vec_elements (tree x, tree y, tree mask_vec, gimple stmt, > + gimple_stmt_iterator *gsi) > { > tree vectype = TREE_TYPE (x); > - tree mask_vec, perm_dest, data_ref; > + tree perm_dest, data_ref; > gimple perm_stmt; > > - mask_vec = perm_mask_for_reverse (vectype); > - > perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype); > + data_ref = make_ssa_name (perm_dest, NULL); > > /* Generate the permute statement. */ > - perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, perm_dest, > - x, x, mask_vec); > - data_ref = make_ssa_name (perm_dest, perm_stmt); > - gimple_set_lhs (perm_stmt, data_ref); > + perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, data_ref, > + x, y, mask_vec); > vect_finish_stmt_generation (stmt, perm_stmt, gsi); > > return data_ref; > @@ -4254,6 +4283,10 @@ vectorizable_load (gimple stmt, gimple_s > bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); > int vf; > tree aggr_type; > + tree gather_base = NULL_TREE, gather_off = NULL_TREE; > + tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE; > + int gather_scale = 1; > + enum vect_def_type gather_dt = vect_unknown_def_type; > > if (loop_vinfo) > { > @@ -4334,7 +4367,7 @@ vectorizable_load (gimple stmt, gimple_s > { > strided_load = true; > /* FORNOW */ > - gcc_assert (! nested_in_vect_loop); > + gcc_assert (! 
nested_in_vect_loop && !STMT_VINFO_GATHER_P (stmt_info)); > > first_stmt = GROUP_FIRST_ELEMENT (stmt_info); > if (!slp && !PURE_SLP_STMT (stmt_info)) > @@ -4349,7 +4382,7 @@ vectorizable_load (gimple stmt, gimple_s > > if (negative) > { > - gcc_assert (!strided_load); > + gcc_assert (!strided_load && !STMT_VINFO_GATHER_P (stmt_info)); > alignment_support_scheme = vect_supportable_dr_alignment (dr, false); > if (alignment_support_scheme != dr_aligned > && alignment_support_scheme != dr_unaligned_supported) > @@ -4366,6 +4399,23 @@ vectorizable_load (gimple stmt, gimple_s > } > } > > + if (STMT_VINFO_GATHER_P (stmt_info)) > + { > + gimple def_stmt; > + tree def; > + gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base, > + &gather_off, &gather_scale); > + gcc_assert (gather_decl); > + if (!vect_is_simple_use_1 (gather_off, loop_vinfo, bb_vinfo, > + &def_stmt, &def, &gather_dt, > + &gather_off_vectype)) > + { > + if (vect_print_dump_info (REPORT_DETAILS)) > + fprintf (vect_dump, "gather index use not simple."); > + return false; > + } > + } > + > if (!vec_stmt) /* transformation not required. */ > { > STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; > @@ -4378,6 +4428,161 @@ vectorizable_load (gimple stmt, gimple_s > > /** Transform. 
**/ > > + if (STMT_VINFO_GATHER_P (stmt_info)) > + { > + tree vec_oprnd0 = NULL_TREE, op; > + tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl)); > + tree rettype, srctype, ptrtype, idxtype, masktype, scaletype; > + tree ptr, mask, var, scale, perm_mask = NULL_TREE, prev_res = NULL_TREE; > + edge pe = loop_preheader_edge (loop); > + gimple_seq seq; > + basic_block new_bb; > + enum { NARROW, NONE, WIDEN } modifier; > + int gather_off_nunits = TYPE_VECTOR_SUBPARTS (gather_off_vectype); > + > + if (nunits == gather_off_nunits) > + modifier = NONE; > + else if (nunits == gather_off_nunits / 2) > + { > + unsigned char *sel = XALLOCAVEC (unsigned char, gather_off_nunits); > + modifier = WIDEN; > + > + for (i = 0; i < gather_off_nunits; ++i) > + sel[i] = i | nunits; > + > + perm_mask = gen_perm_mask (gather_off_vectype, sel); > + gcc_assert (perm_mask != NULL_TREE); > + } > + else if (nunits == gather_off_nunits * 2) > + { > + unsigned char *sel = XALLOCAVEC (unsigned char, nunits); > + modifier = NARROW; > + > + for (i = 0; i < nunits; ++i) > + sel[i] = i < gather_off_nunits > + ? 
i : i + nunits - gather_off_nunits; > + > + perm_mask = gen_perm_mask (vectype, sel); > + gcc_assert (perm_mask != NULL_TREE); > + ncopies *= 2; > + } > + else > + gcc_unreachable (); > + > + rettype = TREE_TYPE (TREE_TYPE (gather_decl)); > + srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); > + ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); > + idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); > + masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); > + scaletype = TREE_VALUE (arglist); > + gcc_checking_assert (types_compatible_p (srctype, rettype) > + && types_compatible_p (srctype, masktype)); > + > + vec_dest = vect_create_destination_var (scalar_dest, vectype); > + > + ptr = fold_convert (ptrtype, gather_base); > + if (!is_gimple_min_invariant (ptr)) > + { > + ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE); > + new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); > + gcc_assert (!new_bb); > + } > + > + /* Currently we support only unconditional gather loads, > + so mask should be all ones. 
*/ > + if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE) > + mask = build_int_cst (TREE_TYPE (masktype), -1); > + else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype))) > + { > + REAL_VALUE_TYPE r; > + long tmp[6]; > + for (j = 0; j < 6; ++j) > + tmp[j] = -1; > + real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype))); > + mask = build_real (TREE_TYPE (masktype), r); > + } > + else > + gcc_unreachable (); > + mask = build_vector_from_val (masktype, mask); > + mask = vect_init_vector (stmt, mask, masktype, NULL); > + > + scale = build_int_cst (scaletype, gather_scale); > + > + prev_stmt_info = NULL; > + for (j = 0; j < ncopies; ++j) > + { > + if (modifier == WIDEN && (j & 1)) > + op = permute_vec_elements (vec_oprnd0, vec_oprnd0, > + perm_mask, stmt, gsi); > + else if (j == 0) > + op = vec_oprnd0 > + = vect_get_vec_def_for_operand (gather_off, stmt, NULL); > + else > + op = vec_oprnd0 > + = vect_get_vec_def_for_stmt_copy (gather_dt, vec_oprnd0); > + > + if (!useless_type_conversion_p (idxtype, TREE_TYPE (op))) > + { > + gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op)) > + == TYPE_VECTOR_SUBPARTS (idxtype)); > + var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL); > + add_referenced_var (var); > + var = make_ssa_name (var, NULL); > + op = build1 (VIEW_CONVERT_EXPR, idxtype, op); > + new_stmt > + = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, > + op, NULL_TREE); > + vect_finish_stmt_generation (stmt, new_stmt, gsi); > + op = var; > + } > + > + new_stmt > + = gimple_build_call (gather_decl, 5, mask, ptr, op, mask, scale); > + > + if (!useless_type_conversion_p (vectype, rettype)) > + { > + gcc_assert (TYPE_VECTOR_SUBPARTS (vectype) > + == TYPE_VECTOR_SUBPARTS (rettype)); > + var = vect_get_new_vect_var (rettype, vect_simple_var, NULL); > + add_referenced_var (var); > + op = make_ssa_name (var, new_stmt); > + gimple_call_set_lhs (new_stmt, op); > + vect_finish_stmt_generation (stmt, new_stmt, gsi); > + var = make_ssa_name (vec_dest, NULL); > + 
op = build1 (VIEW_CONVERT_EXPR, vectype, op); > + new_stmt > + = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, op, > + NULL_TREE); > + } > + else > + { > + var = make_ssa_name (vec_dest, new_stmt); > + gimple_call_set_lhs (new_stmt, var); > + } > + > + vect_finish_stmt_generation (stmt, new_stmt, gsi); > + > + if (modifier == NARROW) > + { > + if ((j & 1) == 0) > + { > + prev_res = var; > + continue; > + } > + var = permute_vec_elements (prev_res, var, > + perm_mask, stmt, gsi); > + new_stmt = SSA_NAME_DEF_STMT (var); > + } > + > + if (prev_stmt_info == NULL) > + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; > + else > + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; > + prev_stmt_info = vinfo_for_stmt (new_stmt); > + } > + return true; > + } > + > if (strided_load) > { > first_stmt = GROUP_FIRST_ELEMENT (stmt_info); > @@ -4769,7 +4974,9 @@ vectorizable_load (gimple stmt, gimple_s > > if (negative) > { > - new_temp = reverse_vec_elements (new_temp, stmt, gsi); > + tree perm_mask = perm_mask_for_reverse (vectype); > + new_temp = permute_vec_elements (new_temp, new_temp, > + perm_mask, stmt, gsi); > new_stmt = SSA_NAME_DEF_STMT (new_temp); > } > > --- gcc/target.def.jj 2011-11-04 08:52:19.000000000 +0100 > +++ gcc/target.def 2011-11-04 08:53:13.000000000 +0100 > @@ -1021,6 +1021,14 @@ DEFHOOK > (void), > default_autovectorize_vector_sizes) > > +/* Target builtin that implements vector gather operation. 
*/ > +DEFHOOK > +(builtin_gather, > + "", > + tree, > + (const_tree mem_vectype, const_tree index_type, int scale), > + NULL) > + > HOOK_VECTOR_END (vectorize) > > #undef HOOK_PREFIX > --- gcc/tree-data-ref.c.jj 2011-11-04 08:52:19.000000000 +0100 > +++ gcc/tree-data-ref.c 2011-11-04 08:53:13.000000000 +0100 > @@ -1351,13 +1351,11 @@ dr_may_alias_p (const struct data_refere > return refs_may_alias_p (addr_a, addr_b); > } > > -static void compute_self_dependence (struct data_dependence_relation *); > - > /* Initialize a data dependence relation between data accesses A and > B. NB_LOOPS is the number of loops surrounding the references: the > size of the classic distance/direction vectors. */ > > -static struct data_dependence_relation * > +struct data_dependence_relation * > initialize_data_dependence_relation (struct data_reference *a, > struct data_reference *b, > VEC (loop_p, heap) *loop_nest) > @@ -4121,7 +4119,7 @@ compute_affine_dependence (struct data_d > /* This computes the dependence relation for the same data > reference into DDR. */ > > -static void > +void > compute_self_dependence (struct data_dependence_relation *ddr) > { > unsigned int i; > --- gcc/tree-data-ref.h.jj 2011-11-04 08:52:19.000000000 +0100 > +++ gcc/tree-data-ref.h 2011-11-04 13:22:28.000000000 +0100 > @@ -1,5 +1,5 @@ > /* Data references and dependences detectors. > - Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 > + Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 > Free Software Foundation, Inc. 
> Contributed by Sebastian Pop <pop@cri.ensmp.fr> > > @@ -423,6 +423,9 @@ extern bool graphite_find_data_reference > VEC (data_reference_p, heap) **); > struct data_reference *create_data_ref (loop_p, loop_p, tree, gimple, bool); > extern bool find_loop_nest (struct loop *, VEC (loop_p, heap) **); > +extern struct data_dependence_relation *initialize_data_dependence_relation > + (struct data_reference *, struct data_reference *, VEC (loop_p, heap) *); > +extern void compute_self_dependence (struct data_dependence_relation *); > extern void compute_all_dependences (VEC (data_reference_p, heap) *, > VEC (ddr_p, heap) **, VEC (loop_p, heap) *, > bool); > --- gcc/doc/tm.texi.in.jj 2011-11-04 08:52:19.000000000 +0100 > +++ gcc/doc/tm.texi.in 2011-11-04 08:53:13.000000000 +0100 > @@ -5696,6 +5696,14 @@ mode returned by @code{TARGET_VECTORIZE_ > The default is zero which means to not iterate over other vector sizes. > @end deftypefn > > +@hook TARGET_VECTORIZE_BUILTIN_GATHER > +Target builtin that implements vector gather operation. @var{mem_vectype} > +is the vector type of the load and @var{index_type} is scalar type of > +the index, scaled by @var{scale}. > +The default is @code{NULL_TREE} which means to not vectorize gather > +loads. > +@end deftypefn > + > @node Anchored Addresses > @section Anchored Addresses > @cindex anchored addresses > --- gcc/doc/tm.texi.jj 2011-11-04 08:52:19.000000000 +0100 > +++ gcc/doc/tm.texi 2011-11-04 08:53:13.000000000 +0100 > @@ -5758,6 +5758,14 @@ mode returned by @code{TARGET_VECTORIZE_ > The default is zero which means to not iterate over other vector sizes. > @end deftypefn > > +@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_GATHER (const_tree @var{mem_vectype}, const_tree @var{index_type}, int @var{scale}) > +Target builtin that implements vector gather operation. @var{mem_vectype} > +is the vector type of the load and @var{index_type} is scalar type of > +the index, scaled by @var{scale}. 
> +The default is @code{NULL_TREE} which means to not vectorize gather > +loads. > +@end deftypefn > + > @node Anchored Addresses > @section Anchored Addresses > @cindex anchored addresses > --- gcc/tree-vect-data-refs.c.jj 2011-11-04 08:52:57.000000000 +0100 > +++ gcc/tree-vect-data-refs.c 2011-11-04 14:10:17.000000000 +0100 > @@ -2497,6 +2497,199 @@ vect_prune_runtime_alias_test_list (loop > return true; > } > > +/* Check whether a non-affine read in stmt is suitable for gather load > + and if so, return a builtin decl for that operation. */ > + > +tree > +vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep, > + tree *offp, int *scalep) > +{ > + HOST_WIDE_INT scale = 1, pbitpos, pbitsize; > + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > + struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); > + tree offtype = NULL_TREE; > + tree decl, base, off; > + enum machine_mode pmode; > + int punsignedp, pvolatilep; > + > + /* The gather builtins need address of the form > + loop_invariant + vector * {1, 2, 4, 8} > + or > + loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. > + Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture > + of loop invariants/SSA_NAMEs defined in the loop, with casts, > + multiplications and additions in it. To get a vector, we need > + a single SSA_NAME that will be defined in the loop and will > + contain everything that is not loop invariant and that can be > + vectorized. The following code attempts to find such a preexisting > + SSA_NAME OFF and put the loop invariants into a tree BASE > + that can be gimplified before the loop. 
*/ > + base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off, > + &pmode, &punsignedp, &pvolatilep, false); > + gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0); > + > + if (TREE_CODE (base) == MEM_REF) > + { > + if (!integer_zerop (TREE_OPERAND (base, 1))) > + { > + if (off == NULL_TREE) > + { > + double_int moff = mem_ref_offset (base); > + off = double_int_to_tree (sizetype, moff); > + } > + else > + off = size_binop (PLUS_EXPR, off, > + fold_convert (sizetype, TREE_OPERAND (base, 1))); > + } > + base = TREE_OPERAND (base, 0); > + } > + else > + base = build_fold_addr_expr (base); > + > + if (off == NULL_TREE) > + off = size_zero_node; > + > + /* If base is not loop invariant, either off is 0, then we start with just > + the constant offset in the loop invariant BASE and continue with base > + as OFF, otherwise give up. > + We could handle that case by gimplifying the addition of base + off > + into some SSA_NAME and use that as off, but for now punt. */ > + if (!expr_invariant_in_loop_p (loop, base)) > + { > + if (!integer_zerop (off)) > + return NULL_TREE; > + off = base; > + base = size_int (pbitpos / BITS_PER_UNIT); > + } > + /* Otherwise put base + constant offset into the loop invariant BASE > + and continue with OFF. */ > + else > + { > + base = fold_convert (sizetype, base); > + base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT)); > + } > + > + /* OFF at this point may be either a SSA_NAME or some tree expression > + from get_inner_reference. Try to peel off loop invariants from it > + into BASE as long as possible. 
*/ > + STRIP_NOPS (off); > + while (offtype == NULL_TREE) > + { > + enum tree_code code; > + tree op0, op1, add = NULL_TREE; > + > + if (TREE_CODE (off) == SSA_NAME) > + { > + gimple def_stmt = SSA_NAME_DEF_STMT (off); > + > + if (expr_invariant_in_loop_p (loop, off)) > + return NULL_TREE; > + > + if (gimple_code (def_stmt) != GIMPLE_ASSIGN) > + break; > + > + op0 = gimple_assign_rhs1 (def_stmt); > + code = gimple_assign_rhs_code (def_stmt); > + op1 = gimple_assign_rhs2 (def_stmt); > + } > + else > + { > + if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) > + return NULL_TREE; > + code = TREE_CODE (off); > + extract_ops_from_tree (off, &code, &op0, &op1); > + } > + switch (code) > + { > + case POINTER_PLUS_EXPR: > + case PLUS_EXPR: > + if (expr_invariant_in_loop_p (loop, op0)) > + { > + add = op0; > + off = op1; > + do_add: > + add = fold_convert (sizetype, add); > + if (scale != 1) > + add = size_binop (MULT_EXPR, add, size_int (scale)); > + base = size_binop (PLUS_EXPR, base, add); > + continue; > + } > + if (expr_invariant_in_loop_p (loop, op1)) > + { > + add = op1; > + off = op0; > + goto do_add; > + } > + break; > + case MINUS_EXPR: > + if (expr_invariant_in_loop_p (loop, op1)) > + { > + add = fold_convert (sizetype, op1); > + add = size_binop (MINUS_EXPR, size_zero_node, add); > + off = op0; > + goto do_add; > + } > + break; > + case MULT_EXPR: > + if (scale == 1 && host_integerp (op1, 0)) > + { > + scale = tree_low_cst (op1, 0); > + off = op0; > + continue; > + } > + break; > + case SSA_NAME: > + off = op0; > + continue; > + CASE_CONVERT: > + if (!POINTER_TYPE_P (TREE_TYPE (op0)) > + && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) > + break; > + if (TYPE_PRECISION (TREE_TYPE (op0)) > + == TYPE_PRECISION (TREE_TYPE (off))) > + { > + off = op0; > + continue; > + } > + if (TYPE_PRECISION (TREE_TYPE (op0)) > + < TYPE_PRECISION (TREE_TYPE (off))) > + { > + off = op0; > + offtype = TREE_TYPE (off); > + STRIP_NOPS (off); > + continue; > + } > + break; > + 
default: > + break; > + } > + break; > + } > + > + /* If at the end OFF still isn't a SSA_NAME or isn't > + defined in the loop, punt. */ > + if (TREE_CODE (off) != SSA_NAME > + || expr_invariant_in_loop_p (loop, off)) > + return NULL_TREE; > + > + if (offtype == NULL_TREE) > + offtype = TREE_TYPE (off); > + > + decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info), > + offtype, scale); > + if (decl == NULL_TREE) > + return NULL_TREE; > + > + if (basep) > + *basep = base; > + if (offp) > + *offp = off; > + if (scalep) > + *scalep = scale; > + return decl; > +} > + > > /* Function vect_analyze_data_refs. > > @@ -2573,6 +2766,7 @@ vect_analyze_data_refs (loop_vec_info lo > gimple stmt; > stmt_vec_info stmt_info; > tree base, offset, init; > + bool gather = false; > int vf; > > if (!dr || !DR_REF (dr)) > @@ -2594,22 +2788,51 @@ vect_analyze_data_refs (loop_vec_info lo > > /* Check that analysis of the data-ref succeeded. */ > if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) > - || !DR_STEP (dr)) > + || !DR_STEP (dr)) > { > - if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) > - { > - fprintf (vect_dump, "not vectorized: data ref analysis failed "); > - print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); > - } > + /* If target supports vector gather loads, see if they can't > + be used. 
*/ > + if (loop_vinfo > + && DR_IS_READ (dr) > + && !TREE_THIS_VOLATILE (DR_REF (dr)) > + && targetm.vectorize.builtin_gather != NULL > + && !nested_in_vect_loop_p (loop, stmt)) > + { > + struct data_reference *newdr > + = create_data_ref (NULL, loop_containing_stmt (stmt), > + DR_REF (dr), stmt, true); > + gcc_assert (newdr != NULL && DR_REF (newdr)); > + if (DR_BASE_ADDRESS (newdr) > + && DR_OFFSET (newdr) > + && DR_INIT (newdr) > + && DR_STEP (newdr) > + && integer_zerop (DR_STEP (newdr))) > + { > + dr = newdr; > + gather = true; > + } > + else > + free_data_ref (newdr); > + } > > - if (bb_vinfo) > - { > - STMT_VINFO_VECTORIZABLE (stmt_info) = false; > - stop_bb_analysis = true; > - continue; > - } > + if (!gather) > + { > + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) > + { > + fprintf (vect_dump, "not vectorized: data ref analysis " > + "failed "); > + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); > + } > + > + if (bb_vinfo) > + { > + STMT_VINFO_VECTORIZABLE (stmt_info) = false; > + stop_bb_analysis = true; > + continue; > + } > > - return false; > + return false; > + } > } > > if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) > @@ -2625,7 +2848,9 @@ vect_analyze_data_refs (loop_vec_info lo > continue; > } > > - return false; > + if (gather) > + free_data_ref (dr); > + return false; > } > > if (TREE_THIS_VOLATILE (DR_REF (dr))) > @@ -2666,6 +2891,8 @@ vect_analyze_data_refs (loop_vec_info lo > continue; > } > > + if (gather) > + free_data_ref (dr); > return false; > } > > @@ -2791,6 +3018,8 @@ vect_analyze_data_refs (loop_vec_info lo > continue; > } > > + if (gather) > + free_data_ref (dr); > return false; > } > > @@ -2818,8 +3047,13 @@ vect_analyze_data_refs (loop_vec_info lo > stop_bb_analysis = true; > continue; > } > - else > - return false; > + > + if (gather) > + { > + STMT_VINFO_DATA_REF (stmt_info) = NULL; > + free_data_ref (dr); > + } > + return false; > } > > /* Adjust the minimal vectorization factor according to the > @@ -2827,6 
+3061,86 @@ vect_analyze_data_refs (loop_vec_info lo > vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); > if (vf > *min_vf) > *min_vf = vf; > + > + if (gather) > + { > + unsigned int j, k, n; > + struct data_reference *olddr > + = VEC_index (data_reference_p, datarefs, i); > + VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo); > + struct data_dependence_relation *ddr, *newddr; > + bool bad = false; > + tree off; > + VEC (loop_p, heap) *nest = LOOP_VINFO_LOOP_NEST (loop_vinfo); > + > + if (!vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL) > + || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE) > + { > + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) > + { > + fprintf (vect_dump, > + "not vectorized: not suitable for gather "); > + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); > + } > + return false; > + } > + > + n = VEC_length (data_reference_p, datarefs) - 1; > + for (j = 0, k = i - 1; j < i; j++) > + { > + ddr = VEC_index (ddr_p, ddrs, k); > + gcc_assert (DDR_B (ddr) == olddr); > + newddr = initialize_data_dependence_relation (DDR_A (ddr), dr, > + nest); > + VEC_replace (ddr_p, ddrs, k, newddr); > + free_dependence_relation (ddr); > + if (!bad > + && DR_IS_WRITE (DDR_A (newddr)) > + && DDR_ARE_DEPENDENT (newddr) != chrec_known) > + bad = true; > + k += --n; > + } > + > + k++; > + n = k + VEC_length (data_reference_p, datarefs) - i - 1; > + for (; k < n; k++) > + { > + ddr = VEC_index (ddr_p, ddrs, k); > + gcc_assert (DDR_A (ddr) == olddr); > + newddr = initialize_data_dependence_relation (dr, DDR_B (ddr), > + nest); > + VEC_replace (ddr_p, ddrs, k, newddr); > + free_dependence_relation (ddr); > + if (!bad > + && DR_IS_WRITE (DDR_B (newddr)) > + && DDR_ARE_DEPENDENT (newddr) != chrec_known) > + bad = true; > + } > + > + k = VEC_length (ddr_p, ddrs) > + - VEC_length (data_reference_p, datarefs) + i; > + ddr = VEC_index (ddr_p, ddrs, k); > + gcc_assert (DDR_A (ddr) == olddr && DDR_B (ddr) == olddr); > + newddr = 
initialize_data_dependence_relation (dr, dr, nest); > + compute_self_dependence (newddr); > + VEC_replace (ddr_p, ddrs, k, newddr); > + free_dependence_relation (ddr); > + VEC_replace (data_reference_p, datarefs, i, dr); > + > + if (bad) > + { > + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) > + { > + fprintf (vect_dump, > + "not vectorized: data dependence conflict" > + " prevents gather"); > + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); > + } > + return false; > + } > + > + STMT_VINFO_GATHER_P (stmt_info) = true; > + } > } > > return true; > --- gcc/tree-vectorizer.h.jj 2011-11-04 08:52:19.000000000 +0100 > +++ gcc/tree-vectorizer.h 2011-11-04 08:53:13.000000000 +0100 > @@ -535,6 +535,9 @@ typedef struct _stmt_vec_info { > /* Is this statement vectorizable or should it be skipped in (partial) > vectorization. */ > bool vectorizable; > + > + /* For loads only, true if this is a gather load. */ > + bool gather_p; > } *stmt_vec_info; > > /* Access Functions. */ > @@ -548,6 +551,7 @@ typedef struct _stmt_vec_info { > #define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt > #define STMT_VINFO_VECTORIZABLE(S) (S)->vectorizable > #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info > +#define STMT_VINFO_GATHER_P(S) (S)->gather_p > > #define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_base_address > #define STMT_VINFO_DR_INIT(S) (S)->dr_init > @@ -858,6 +862,8 @@ extern bool vect_analyze_data_refs_align > extern bool vect_verify_datarefs_alignment (loop_vec_info, bb_vec_info); > extern bool vect_analyze_data_ref_accesses (loop_vec_info, bb_vec_info); > extern bool vect_prune_runtime_alias_test_list (loop_vec_info); > +extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *, > + int *); > extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *); > extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree, > tree *, gimple_stmt_iterator *, > --- gcc/config/i386/i386-builtin-types.def.jj 2011-11-04 08:52:19.000000000 +0100 > 
+++ gcc/config/i386/i386-builtin-types.def 2011-11-04 08:53:13.000000000 +0100 > @@ -432,20 +432,24 @@ DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, > > DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V4SI, V2DF, INT) > DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4SI, V4DF, INT) > +DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V8SI, V4DF, INT) > DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V2DI, V2DF, INT) > DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4DI, V4DF, INT) > DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4SI, V4SF, INT) > DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V8SI, V8SF, INT) > DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V2DI, V4SF, INT) > DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4DI, V4SF, INT) > +DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V4DI, V8SF, INT) > DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V4SI, V2DI, INT) > DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4SI, V4DI, INT) > +DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V8SI, V4DI, INT) > DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V2DI, V2DI, INT) > DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4DI, V4DI, INT) > DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4SI, V4SI, INT) > DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V8SI, V8SI, INT) > DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V2DI, V4SI, INT) > DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4DI, V4SI, INT) > +DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V4DI, V8SI, INT) > > DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND) > DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND) > --- gcc/config/i386/sse.md.jj 2011-11-04 08:52:19.000000000 +0100 > +++ gcc/config/i386/sse.md 2011-11-04 12:48:16.000000000 +0100 > @@ -316,14 +316,6 @@ (define_mode_attr i128 > ;; Mix-n-match > (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF]) > > -(define_mode_iterator AVXMODE48P_DI > - [V2DI V2DF V4DI V4DF V4SF V4SI]) > -(define_mode_attr AVXMODE48P_DI > - [(V2DI "V2DI") (V2DF "V2DI") > - (V4DI "V4DI") (V4DF "V4DI") > - (V4SI "V2DI") (V4SF "V2DI") > - (V8SI "V4DI") (V8SF "V4DI")]) > - > (define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF]) > > ;; 
Mapping of immediate bits for blend instructions > @@ -12516,11 +12508,21 @@ (define_insn "vcvtps2ph256" > ;; For gather* insn patterns > (define_mode_iterator VEC_GATHER_MODE > [V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF]) > -(define_mode_attr VEC_GATHER_MODE > +(define_mode_attr VEC_GATHER_IDXSI > [(V2DI "V4SI") (V2DF "V4SI") > (V4DI "V4SI") (V4DF "V4SI") > (V4SI "V4SI") (V4SF "V4SI") > (V8SI "V8SI") (V8SF "V8SI")]) > +(define_mode_attr VEC_GATHER_IDXDI > + [(V2DI "V2DI") (V2DF "V2DI") > + (V4DI "V4DI") (V4DF "V4DI") > + (V4SI "V2DI") (V4SF "V2DI") > + (V8SI "V4DI") (V8SF "V4DI")]) > +(define_mode_attr VEC_GATHER_SRCDI > + [(V2DI "V2DI") (V2DF "V2DF") > + (V4DI "V4DI") (V4DF "V4DF") > + (V4SI "V4SI") (V4SF "V4SF") > + (V8SI "V4SI") (V8SF "V4SF")]) > > (define_expand "avx2_gathersi<mode>" > [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "") > @@ -12529,7 +12531,8 @@ (define_expand "avx2_gathersi<mode>" > (mem:<ssescalarmode> > (match_par_dup 7 > [(match_operand 2 "vsib_address_operand" "") > - (match_operand:<VEC_GATHER_MODE> 3 "register_operand" "") > + (match_operand:<VEC_GATHER_IDXSI> > + 3 "register_operand" "") > (match_operand:SI 5 "const1248_operand " "")])) > (mem:BLK (scratch)) > (match_operand:VEC_GATHER_MODE 4 "register_operand" "")] > @@ -12549,7 +12552,7 @@ (define_insn "*avx2_gathersi<mode>" > (match_operator:<ssescalarmode> 7 "vsib_mem_operator" > [(unspec:P > [(match_operand:P 3 "vsib_address_operand" "p") > - (match_operand:<VEC_GATHER_MODE> 4 "register_operand" "x") > + (match_operand:<VEC_GATHER_IDXSI> 4 "register_operand" "x") > (match_operand:SI 6 "const1248_operand" "n")] > UNSPEC_VSIBADDR)]) > (mem:BLK (scratch)) > @@ -12565,14 +12568,16 @@ (define_insn "*avx2_gathersi<mode>" > (define_expand "avx2_gatherdi<mode>" > [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "") > (unspec:VEC_GATHER_MODE > - [(match_operand:VEC_GATHER_MODE 1 "register_operand" "") > + [(match_operand:<VEC_GATHER_SRCDI> 1 
"register_operand" "") > (mem:<ssescalarmode> > (match_par_dup 7 > [(match_operand 2 "vsib_address_operand" "") > - (match_operand:<AVXMODE48P_DI> 3 "register_operand" "") > + (match_operand:<VEC_GATHER_IDXDI> > + 3 "register_operand" "") > (match_operand:SI 5 "const1248_operand " "")])) > (mem:BLK (scratch)) > - (match_operand:VEC_GATHER_MODE 4 "register_operand" "")] > + (match_operand:<VEC_GATHER_SRCDI> > + 4 "register_operand" "")] > UNSPEC_GATHER)) > (clobber (match_scratch:VEC_GATHER_MODE 6 ""))])] > "TARGET_AVX2" > @@ -12583,63 +12588,21 @@ (define_expand "avx2_gatherdi<mode>" > }) > > (define_insn "*avx2_gatherdi<mode>" > - [(set (match_operand:AVXMODE48P_DI 0 "register_operand" "=&x") > - (unspec:AVXMODE48P_DI > - [(match_operand:AVXMODE48P_DI 2 "register_operand" "0") > + [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") > + (unspec:VEC_GATHER_MODE > + [(match_operand:<VEC_GATHER_SRCDI> 2 "register_operand" "0") > (match_operator:<ssescalarmode> 7 "vsib_mem_operator" > [(unspec:P > [(match_operand:P 3 "vsib_address_operand" "p") > - (match_operand:<AVXMODE48P_DI> 4 "register_operand" "x") > + (match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x") > (match_operand:SI 6 "const1248_operand" "n")] > UNSPEC_VSIBADDR)]) > (mem:BLK (scratch)) > - (match_operand:AVXMODE48P_DI 5 "register_operand" "1")] > + (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")] > UNSPEC_GATHER)) > - (clobber (match_scratch:AVXMODE48P_DI 1 "=&x"))] > - "TARGET_AVX2" > - "v<sseintprefix>gatherq<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}" > - [(set_attr "type" "ssemov") > - (set_attr "prefix" "vex") > - (set_attr "mode" "<sseinsnmode>")]) > - > -;; Special handling for VEX.256 with float arguments > -;; since there're still xmms as operands > -(define_expand "avx2_gatherdi<mode>256" > - [(parallel [(set (match_operand:VI4F_128 0 "register_operand" "") > - (unspec:VI4F_128 > - [(match_operand:VI4F_128 1 "register_operand" "") > - (mem:<ssescalarmode> > - 
(match_par_dup 7 > - [(match_operand 2 "vsib_address_operand" "") > - (match_operand:V4DI 3 "register_operand" "") > - (match_operand:SI 5 "const1248_operand " "")])) > - (mem:BLK (scratch)) > - (match_operand:VI4F_128 4 "register_operand" "")] > - UNSPEC_GATHER)) > - (clobber (match_scratch:VI4F_128 6 ""))])] > - "TARGET_AVX2" > -{ > - operands[7] > - = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3], > - operands[5]), UNSPEC_VSIBADDR); > -}) > - > -(define_insn "*avx2_gatherdi<mode>256" > - [(set (match_operand:VI4F_128 0 "register_operand" "=x") > - (unspec:VI4F_128 > - [(match_operand:VI4F_128 2 "register_operand" "0") > - (match_operator:<ssescalarmode> 7 "vsib_mem_operator" > - [(unspec:P > - [(match_operand:P 3 "vsib_address_operand" "p") > - (match_operand:V4DI 4 "register_operand" "x") > - (match_operand:SI 6 "const1248_operand" "n")] > - UNSPEC_VSIBADDR)]) > - (mem:BLK (scratch)) > - (match_operand:VI4F_128 5 "register_operand" "1")] > - UNSPEC_GATHER)) > - (clobber (match_scratch:VI4F_128 1 "=&x"))] > + (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] > "TARGET_AVX2" > - "v<sseintprefix>gatherq<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}" > + "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}" > [(set_attr "type" "ssemov") > (set_attr "prefix" "vex") > (set_attr "mode" "<sseinsnmode>")]) > --- gcc/config/i386/i386.c.jj 2011-11-04 08:52:19.000000000 +0100 > +++ gcc/config/i386/i386.c 2011-11-04 12:48:16.000000000 +0100 > @@ -25105,6 +25105,13 @@ enum ix86_builtins > IX86_BUILTIN_GATHERDIV4SI, > IX86_BUILTIN_GATHERDIV8SI, > > + /* Alternate 4 element gather for the vectorizer where > + all operands are 32-byte wide. */ > + IX86_BUILTIN_GATHERALTSIV4DF, > + IX86_BUILTIN_GATHERALTDIV8SF, > + IX86_BUILTIN_GATHERALTSIV4DI, > + IX86_BUILTIN_GATHERALTDIV8SI, > + > /* TFmode support builtins. 
*/ > IX86_BUILTIN_INFQ, > IX86_BUILTIN_HUGE_VALQ, > @@ -26883,6 +26890,22 @@ ix86_init_mmx_sse_builtins (void) > V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, > IX86_BUILTIN_GATHERDIV8SI); > + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df", > + V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, > + IX86_BUILTIN_GATHERALTSIV4DF); > + > + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256", > + V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, > + IX86_BUILTIN_GATHERALTDIV8SF); > + > + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di", > + V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, > + IX86_BUILTIN_GATHERALTSIV4DI); > + > + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256", > + V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, > + IX86_BUILTIN_GATHERALTDIV8SI); > + > /* MMX access to the vec_init patterns. */ > def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si", > V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); > @@ -28869,7 +28892,7 @@ rdrand_step: > icode = CODE_FOR_avx2_gatherdiv4sf; > goto gather_gen; > case IX86_BUILTIN_GATHERDIV8SF: > - icode = CODE_FOR_avx2_gatherdiv4sf256; > + icode = CODE_FOR_avx2_gatherdiv8sf; > goto gather_gen; > case IX86_BUILTIN_GATHERSIV2DI: > icode = CODE_FOR_avx2_gathersiv2di; > goto gather_gen; > @@ -28893,7 +28916,20 @@ rdrand_step: > icode = CODE_FOR_avx2_gatherdiv4si; > goto gather_gen; > case IX86_BUILTIN_GATHERDIV8SI: > - icode = CODE_FOR_avx2_gatherdiv4si256; > + icode = CODE_FOR_avx2_gatherdiv8si; > + goto gather_gen; > + case IX86_BUILTIN_GATHERALTSIV4DF: > + icode = CODE_FOR_avx2_gathersiv4df; > + goto gather_gen; > + case IX86_BUILTIN_GATHERALTDIV8SF: > + icode = CODE_FOR_avx2_gatherdiv8sf; > + goto gather_gen; > + case IX86_BUILTIN_GATHERALTSIV4DI: > + icode = CODE_FOR_avx2_gathersiv4di; > + goto gather_gen; > + case IX86_BUILTIN_GATHERALTDIV8SI: > + icode = CODE_FOR_avx2_gatherdiv8si; > + goto gather_gen; > > gather_gen: > arg0 = CALL_EXPR_ARG (exp, 0); > @@ -28912,8 +28948,39 @@ 
rdrand_step: > mode3 = insn_data[icode].operand[4].mode; > mode4 = insn_data[icode].operand[5].mode; > > - if (target == NULL_RTX) > - target = gen_reg_rtx (insn_data[icode].operand[0].mode); > + if (target == NULL_RTX > + || GET_MODE (target) != insn_data[icode].operand[0].mode) > + subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); > + else > + subtarget = target; > + > + if (fcode == IX86_BUILTIN_GATHERALTSIV4DF > + || fcode == IX86_BUILTIN_GATHERALTSIV4DI) > + { > + rtx half = gen_reg_rtx (V4SImode); > + if (!nonimmediate_operand (op2, V8SImode)) > + op2 = copy_to_mode_reg (V8SImode, op2); > + emit_insn (gen_vec_extract_lo_v8si (half, op2)); > + op2 = half; > + } > + else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF > + || fcode == IX86_BUILTIN_GATHERALTDIV8SI) > + { > + rtx (*gen) (rtx, rtx); > + rtx half = gen_reg_rtx (mode0); > + if (mode0 == V4SFmode) > + gen = gen_vec_extract_lo_v8sf; > + else > + gen = gen_vec_extract_lo_v8si; > + if (!nonimmediate_operand (op0, GET_MODE (op0))) > + op0 = copy_to_mode_reg (GET_MODE (op0), op0); > + emit_insn (gen (half, op0)); > + op0 = half; > + half = gen_reg_rtx (mode0); > + if (!nonimmediate_operand (op3, GET_MODE (op3))) > + op3 = copy_to_mode_reg (GET_MODE (op3), op3); > + emit_insn (gen (half, op3)); > + op3 = half; > + } > > /* Force memory operand only with base register here. But we > don't want to do it on memory operand for other builtin > @@ -28935,10 +29002,26 @@ rdrand_step: > error ("last argument must be scale 1, 2, 4, 8"); > return const0_rtx; > } > - pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4); > + pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); > if (! pat) > return const0_rtx; > emit_insn (pat); > + > + if (fcode == IX86_BUILTIN_GATHERDIV8SF > + || fcode == IX86_BUILTIN_GATHERDIV8SI) > + { > + enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode > + ? 
V4SFmode : V4SImode; > + if (target == NULL_RTX) > + target = gen_reg_rtx (tmode); > + if (tmode == V4SFmode) > + emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); > + else > + emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); > + } > + else > + target = subtarget; > + > return target; > > default: > @@ -29443,6 +29526,73 @@ ix86_veclibabi_acml (enum built_in_funct > return new_fndecl; > } > > +/* Returns a decl of a function that implements gather load with > + memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. > + Return NULL_TREE if it is not available. */ > + > +static tree > +ix86_vectorize_builtin_gather (const_tree mem_vectype, > + const_tree index_type, int scale) > +{ > + bool si; > + enum ix86_builtins code; > + > + if (! TARGET_AVX2) > + return NULL_TREE; > + > + if ((TREE_CODE (index_type) != INTEGER_TYPE > + && !POINTER_TYPE_P (index_type)) > + || (TYPE_MODE (index_type) != SImode > + && TYPE_MODE (index_type) != DImode)) > + return NULL_TREE; > + > + if (TYPE_PRECISION (index_type) > POINTER_SIZE) > + return NULL_TREE; > + > + /* v*gather* insn sign extends index to pointer mode. */ > + if (TYPE_PRECISION (index_type) < POINTER_SIZE > + && TYPE_UNSIGNED (index_type)) > + return NULL_TREE; > + > + if (scale <= 0 > + || scale > 8 > + || (scale & (scale - 1)) != 0) > + return NULL_TREE; > + > + si = TYPE_MODE (index_type) == SImode; > + switch (TYPE_MODE (mem_vectype)) > + { > + case V2DFmode: > + code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; > + break; > + case V4DFmode: > + code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; > + break; > + case V2DImode: > + code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; > + break; > + case V4DImode: > + code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; > + break; > + case V4SFmode: > + code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; > + break; > + case V8SFmode: > + code = si ? 
IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; > + break; > + case V4SImode: > + code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; > + break; > + case V8SImode: > + code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; > + break; > + default: > + return NULL_TREE; > + } > + > + return ix86_builtins[code]; > +} > + > /* Returns a code for a target-specific builtin that implements > reciprocal of the function, or NULL_TREE if not available. */ > > @@ -37642,6 +37792,9 @@ ix86_autovectorize_vector_sizes (void) > #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ > ix86_builtin_vectorized_function > > +#undef TARGET_VECTORIZE_BUILTIN_GATHER > +#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather > + > #undef TARGET_BUILTIN_RECIPROCAL > #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal > > --- gcc/testsuite/gcc.target/i386/avx2-gather-1.c.jj 2011-11-04 08:53:13.000000000 +0100 > +++ gcc/testsuite/gcc.target/i386/avx2-gather-1.c 2011-11-04 08:53:13.000000000 +0100 > @@ -0,0 +1,215 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target avx2 } */ > +/* { dg-options "-O3 -mavx2" } */ > + > +#include "avx2-check.h" > + > +#define N 1024 > +float vf1[N+16], vf2[N]; > +double vd1[N+16], vd2[N]; > +int k[N]; > +long l[N]; > +short n[N]; > + > +__attribute__((noinline, noclone)) void > +f1 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + vf2[i] = vf1[k[i]]; > +} > + > +__attribute__((noinline, noclone)) void > +f2 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + n[i] = (int) vf1[k[i]]; > +} > + > +__attribute__((noinline, noclone)) void > +f3 (int x) > +{ > + int i; > + for (i = 0; i < N; i++) > + vf2[i] = vf1[k[i] + x]; > +} > + > +__attribute__((noinline, noclone)) void > +f4 (int x) > +{ > + int i; > + for (i = 0; i < N; i++) > + n[i] = (int) vf1[k[i] + x]; > +} > + > +__attribute__((noinline, noclone)) void > +f5 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + vd2[i] = 
vd1[k[i]]; > +} > + > +__attribute__((noinline, noclone)) void > +f6 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + n[i] = (int) vd1[k[i]]; > +} > + > +__attribute__((noinline, noclone)) void > +f7 (int x) > +{ > + int i; > + for (i = 0; i < N; i++) > + vd2[i] = vd1[k[i] + x]; > +} > + > +__attribute__((noinline, noclone)) void > +f8 (int x) > +{ > + int i; > + for (i = 0; i < N; i++) > + n[i] = (int) vd1[k[i] + x]; > +} > + > +__attribute__((noinline, noclone)) void > +f9 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + vf2[i] = vf1[l[i]]; > +} > + > +__attribute__((noinline, noclone)) void > +f10 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + n[i] = (int) vf1[l[i]]; > +} > + > +__attribute__((noinline, noclone)) void > +f11 (long x) > +{ > + int i; > + for (i = 0; i < N; i++) > + vf2[i] = vf1[l[i] + x]; > +} > + > +__attribute__((noinline, noclone)) void > +f12 (long x) > +{ > + int i; > + for (i = 0; i < N; i++) > + n[i] = (int) vf1[l[i] + x]; > +} > + > +__attribute__((noinline, noclone)) void > +f13 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + vd2[i] = vd1[l[i]]; > +} > + > +__attribute__((noinline, noclone)) void > +f14 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + n[i] = (int) vd1[l[i]]; > +} > + > +__attribute__((noinline, noclone)) void > +f15 (long x) > +{ > + int i; > + for (i = 0; i < N; i++) > + vd2[i] = vd1[l[i] + x]; > +} > + > +__attribute__((noinline, noclone)) void > +f16 (long x) > +{ > + int i; > + for (i = 0; i < N; i++) > + n[i] = (int) vd1[l[i] + x]; > +} > + > +static void > +avx2_test (void) > +{ > + int i; > + > + for (i = 0; i < N + 16; i++) > + { > + asm (""); > + vf1[i] = 17.0f + i; > + vd1[i] = 19.0 + i; > + } > + for (i = 0; i < N; i++) > + { > + asm (""); > + k[i] = (i * 731) & (N - 1); > + l[i] = (i * 657) & (N - 1); > + } > + > + f1 (); > + f2 (); > + for (i = 0; i < N; i++) > + if (vf2[i] != ((i * 731) & (N - 1)) + 17 > + || n[i] != ((i * 731) & (N - 1)) + 17) > + abort (); > + > + f3 (12); 
> + f4 (14); > + for (i = 0; i < N; i++) > + if (vf2[i] != ((i * 731) & (N - 1)) + 17 + 12 > + || n[i] != ((i * 731) & (N - 1)) + 17 + 14) > + abort (); > + > + f5 (); > + f6 (); > + for (i = 0; i < N; i++) > + if (vd2[i] != ((i * 731) & (N - 1)) + 19 > + || n[i] != ((i * 731) & (N - 1)) + 19) > + abort (); > + > + f7 (7); > + f8 (9); > + for (i = 0; i < N; i++) > + if (vd2[i] != ((i * 731) & (N - 1)) + 19 + 7 > + || n[i] != ((i * 731) & (N - 1)) + 19 + 9) > + abort (); > + > + f9 (); > + f10 (); > + for (i = 0; i < N; i++) > + if (vf2[i] != ((i * 657) & (N - 1)) + 17 > + || n[i] != ((i * 657) & (N - 1)) + 17) > + abort (); > + > + f11 (2); > + f12 (4); > + for (i = 0; i < N; i++) > + if (vf2[i] != ((i * 657) & (N - 1)) + 17 + 2 > + || n[i] != ((i * 657) & (N - 1)) + 17 + 4) > + abort (); > + > + f13 (); > + f14 (); > + for (i = 0; i < N; i++) > + if (vd2[i] != ((i * 657) & (N - 1)) + 19 > + || n[i] != ((i * 657) & (N - 1)) + 19) > + abort (); > + > + f15 (13); > + f16 (15); > + for (i = 0; i < N; i++) > + if (vd2[i] != ((i * 657) & (N - 1)) + 19 + 13 > + || n[i] != ((i * 657) & (N - 1)) + 19 + 15) > + abort (); > +} > --- gcc/testsuite/gcc.target/i386/avx2-gather-2.c.jj 2011-11-04 08:53:13.000000000 +0100 > +++ gcc/testsuite/gcc.target/i386/avx2-gather-2.c 2011-11-04 08:53:13.000000000 +0100 > @@ -0,0 +1,7 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */ > + > +#include "avx2-gather-1.c" > + > +/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops in function" 16 "vect" } } */ > +/* { dg-final { cleanup-tree-dump "vect" } } */ > --- gcc/testsuite/gcc.target/i386/avx2-gather-3.c.jj 2011-11-04 08:53:13.000000000 +0100 > +++ gcc/testsuite/gcc.target/i386/avx2-gather-3.c 2011-11-04 08:53:13.000000000 +0100 > @@ -0,0 +1,167 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target avx2 } */ > +/* { dg-options "-O3 -mavx2 -ffast-math" } */ > + > +#include "avx2-check.h" > + > +#define N 1024 > +float f[N]; > 
+double d[N]; > +int k[N]; > +float *l[N]; > +double *n[N]; > +int **m[N]; > +long **o[N]; > +long q[N]; > +long *r[N]; > +int *s[N]; > + > +__attribute__((noinline, noclone)) float > +f1 (void) > +{ > + int i; > + float g = 0.0; > + for (i = 0; i < N / 2; i++) > + g += f[k[i]]; > + return g; > +} > + > +__attribute__((noinline, noclone)) float > +f2 (float *p) > +{ > + int i; > + float g = 0.0; > + for (i = 0; i < N / 2; i++) > + g += p[k[i]]; > + return g; > +} > + > +__attribute__((noinline, noclone)) float > +f3 (void) > +{ > + int i; > + float g = 0.0; > + for (i = 0; i < N / 2; i++) > + g += *l[i]; > + return g; > +} > + > +__attribute__((noinline, noclone)) int > +f4 (void) > +{ > + int i; > + int g = 0; > + for (i = 0; i < N / 2; i++) > + g += **m[i]; > + return g; > +} > + > +__attribute__((noinline, noclone)) double > +f5 (void) > +{ > + int i; > + double g = 0.0; > + for (i = 0; i < N / 2; i++) > + g += d[k[i]]; > + return g; > +} > + > +__attribute__((noinline, noclone)) double > +f6 (double *p) > +{ > + int i; > + double g = 0.0; > + for (i = 0; i < N / 2; i++) > + g += p[k[i]]; > + return g; > +} > + > +__attribute__((noinline, noclone)) double > +f7 (void) > +{ > + int i; > + double g = 0.0; > + for (i = 0; i < N / 2; i++) > + g += *n[i]; > + return g; > +} > + > +__attribute__((noinline, noclone)) int > +f8 (void) > +{ > + int i; > + int g = 0; > + for (i = 0; i < N / 2; i++) > + g += **o[i]; > + return g; > +} > + > +__attribute__((noinline, noclone)) float > +f9 (void) > +{ > + int i; > + float g = 0.0; > + for (i = 0; i < N / 2; i++) > + g += f[q[i]]; > + return g; > +} > + > +__attribute__((noinline, noclone)) float > +f10 (float *p) > +{ > + int i; > + float g = 0.0; > + for (i = 0; i < N / 2; i++) > + g += p[q[i]]; > + return g; > +} > + > +__attribute__((noinline, noclone)) double > +f11 (void) > +{ > + int i; > + double g = 0.0; > + for (i = 0; i < N / 2; i++) > + g += d[q[i]]; > + return g; > +} > + > +__attribute__((noinline, noclone)) 
double > +f12 (double *p) > +{ > + int i; > + double g = 0.0; > + for (i = 0; i < N / 2; i++) > + g += p[q[i]]; > + return g; > +} > + > +static void > +avx2_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + { > + asm (""); > + f[i] = -256.0f + i; > + d[i] = -258.0 + i; > + k[i] = (i * 731) & (N - 1); > + q[i] = (i * 657) & (N - 1); > + l[i] = &f[(i * 239) & (N - 1)]; > + n[i] = &d[(i * 271) & (N - 1)]; > + r[i] = &q[(i * 323) & (N - 1)]; > + s[i] = &k[(i * 565) & (N - 1)]; > + m[i] = &s[(i * 13) & (N - 1)]; > + o[i] = &r[(i * 19) & (N - 1)]; > + } > + > + if (f1 () != 136448.0f || f2 (f) != 136448.0f || f3 () != 130304.0) > + abort (); > + if (f4 () != 261376 || f5 () != 135424.0 || f6 (d) != 135424.0) > + abort (); > + if (f7 () != 129280.0 || f8 () != 259840L || f9 () != 130816.0f) > + abort (); > + if (f10 (f) != 130816.0f || f11 () != 129792.0 || f12 (d) != 129792.0) > + abort (); > +} > --- gcc/testsuite/gcc.target/i386/avx2-gather-4.c.jj 2011-11-04 08:54:11.000000000 +0100 > +++ gcc/testsuite/gcc.target/i386/avx2-gather-4.c 2011-11-04 08:54:11.000000000 +0100 > @@ -0,0 +1,38 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target avx2 } */ > +/* { dg-options "-O3 -mavx2" } */ > + > +#include "avx2-check.h" > + > +#define N 1024 > +int a[N], b[N], c[N], d[N]; > + > +__attribute__((noinline, noclone)) void > +foo (float *__restrict p, float *__restrict q, float *__restrict r, > + long s1, long s2, long s3) > +{ > + int i; > + for (i = 0; i < N; i++) > + p[i] = q[a[i] * s1 + b[i] * s2 + s3] * r[c[i] * s1 + d[i] * s2 + s3]; > +} > + > +static void > +avx2_test (void) > +{ > + int i; > + float e[N], f[N], g[N]; > + for (i = 0; i < N; i++) > + { > + a[i] = (i * 7) & (N / 8 - 1); > + b[i] = (i * 13) & (N / 8 - 1); > + c[i] = (i * 23) & (N / 8 - 1); > + d[i] = (i * 5) & (N / 8 - 1); > + e[i] = 16.5 + i; > + f[i] = 127.5 - i; > + } > + foo (g, e, f, 3, 2, 4); > + for (i = 0; i < N; i++) > + if (g[i] != (float) ((20.5 + a[i] * 3 + b[i] * 2) > + * 
(123.5 - c[i] * 3 - d[i] * 2))) > + abort (); > +} > > > Jakub >
On Fri, Nov 4, 2011 at 2:22 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Fri, Nov 04, 2011 at 12:21:49PM +0100, Richard Guenther wrote: >> Ok. I guess it's ok to use builtins for now - I didn't think of >> the memory reference issue ;) > > Based on IRC discussion I'm posting an updated patch (both former > patches (base and incremental) in one). > > I'm now using expr_invariant_in_loop_p instead of chrec_contains*, > which nicely handles also the is_gimple_min_invariant case, > and I've added several comments and fixed the MEM_REF offset > folding. Smoke tested on the *gather* testcases, will do full > bootstrap/regtest soon. > > 2011-11-04 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/50789 > * tree-vect-stmts.c (process_use): Add force argument, avoid > exist_non_indexing_operands_for_use_p check if true. > (vect_mark_stmts_to_be_vectorized): Adjust callers. Handle > STMT_VINFO_GATHER_P. > (gen_perm_mask): New function. > (perm_mask_for_reverse): Use it. > (reverse_vec_element): Rename to... > (permute_vec_elements): ... this. Add Y and MASK_VEC arguments, > generalize for any permutations. > (vectorizable_load): Adjust caller. Handle STMT_VINFO_GATHER_P. > * target.def (TARGET_VECTORIZE_BUILTIN_GATHER): New hook. > * doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_GATHER): Document it. > * doc/tm.texi: Regenerate. > * tree-data-ref.c (initialize_data_dependence_relation, > compute_self_dependence): No longer static. > * tree-data-ref.h (initialize_data_dependence_relation, > compute_self_dependence): New prototypes. > * tree-vect-data-refs.c (vect_check_gather): New function. > (vect_analyze_data_refs): Detect possible gather load data > refs. > * tree-vectorizer.h (struct _stmt_vec_info): Add gather_p field. > (STMT_VINFO_GATHER_P): Define. > (vect_check_gather): New prototype. > * config/i386/i386-builtin-types.def: Add types for alternate > gather builtins. > * config/i386/sse.md (AVXMODE48P_DI): Remove. > (VEC_GATHER_MODE): Rename mode_attr to... 
> (VEC_GATHER_IDXSI): ... this. > (VEC_GATHER_IDXDI, VEC_GATHER_SRCDI): New mode_attrs. > (avx2_gathersi<mode>, *avx2_gathersi<mode>): Use <VEC_GATHER_IDXSI> > instead of <VEC_GATHER_MODE>. > (avx2_gatherdi<mode>): Use <VEC_GATHER_IDXDI> instead of > <<AVXMODE48P_DI> and <VEC_GATHER_SRCDI> instead of VEC_GATHER_MODE > on src and mask operands. > (*avx2_gatherdi<mode>): Likewise. Use VEC_GATHER_MODE iterator > instead of AVXMODE48P_DI. > (avx2_gatherdi<mode>256, *avx2_gatherdi<mode>256): Removed. > * config/i386/i386.c (enum ix86_builtins): Add > IX86_BUILTIN_GATHERALTSIV4DF, IX86_BUILTIN_GATHERALTDIV8SF, > IX86_BUILTIN_GATHERALTSIV4DI and IX86_BUILTIN_GATHERALTDIV8SI. > (ix86_init_mmx_sse_builtins): Create those builtins. > (ix86_expand_builtin): Handle those builtins and adjust expansions > of other gather builtins. > (ix86_vectorize_builtin_gather): New function. > (TARGET_VECTORIZE_BUILTIN_GATHER): Define. > > * gcc.target/i386/avx2-gather-1.c: New test. > * gcc.target/i386/avx2-gather-2.c: New test. > * gcc.target/i386/avx2-gather-3.c: New test. > * gcc.target/i386/avx2-gather-4.c: New test. x86 parts are also OK. Thanks, Uros.
--- gcc/tree-vect-stmts.c.jj 2011-11-04 08:52:19.000000000 +0100 +++ gcc/tree-vect-stmts.c 2011-11-04 08:54:11.000000000 +0100 @@ -332,6 +332,8 @@ exist_non_indexing_operands_for_use_p (t - LIVE_P, RELEVANT - enum values to be set in the STMT_VINFO of the stmt that defined USE. This is done by calling mark_relevant and passing it the WORKLIST (to add DEF_STMT to the WORKLIST in case it is relevant). + - FORCE is true if exist_non_indexing_operands_for_use_p check shouldn't + be performed. Outputs: Generally, LIVE_P and RELEVANT are used to define the liveness and @@ -351,7 +353,8 @@ exist_non_indexing_operands_for_use_p (t static bool process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p, - enum vect_relevant relevant, VEC(gimple,heap) **worklist) + enum vect_relevant relevant, VEC(gimple,heap) **worklist, + bool force) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); @@ -363,7 +366,7 @@ process_use (gimple stmt, tree use, loop /* case 1: we are only interested in uses that need to be vectorized. Uses that are used for address computation are not considered relevant. 
*/ - if (!exist_non_indexing_operands_for_use_p (use, stmt)) + if (!force && !exist_non_indexing_operands_for_use_p (use, stmt)) return true; if (!vect_is_simple_use (use, loop_vinfo, NULL, &def_stmt, &def, &dt)) @@ -646,7 +649,7 @@ vect_mark_stmts_to_be_vectorized (loop_v break; } - if (is_pattern_stmt_p (vinfo_for_stmt (stmt))) + if (is_pattern_stmt_p (stmt_vinfo)) { /* Pattern statements are not inserted into the code, so FOR_EACH_PHI_OR_STMT_USE optimizes their operands out, and we @@ -660,9 +663,9 @@ vect_mark_stmts_to_be_vectorized (loop_v if (rhs_code == COND_EXPR && COMPARISON_CLASS_P (op)) { if (!process_use (stmt, TREE_OPERAND (op, 0), loop_vinfo, - live_p, relevant, &worklist) + live_p, relevant, &worklist, false) || !process_use (stmt, TREE_OPERAND (op, 1), loop_vinfo, - live_p, relevant, &worklist)) + live_p, relevant, &worklist, false)) { VEC_free (gimple, heap, worklist); return false; @@ -673,7 +676,7 @@ vect_mark_stmts_to_be_vectorized (loop_v { op = gimple_op (stmt, i); if (!process_use (stmt, op, loop_vinfo, live_p, relevant, - &worklist)) + &worklist, false)) { VEC_free (gimple, heap, worklist); return false; @@ -686,7 +689,7 @@ vect_mark_stmts_to_be_vectorized (loop_v { tree arg = gimple_call_arg (stmt, i); if (!process_use (stmt, arg, loop_vinfo, live_p, relevant, - &worklist)) + &worklist, false)) { VEC_free (gimple, heap, worklist); return false; @@ -699,12 +702,25 @@ vect_mark_stmts_to_be_vectorized (loop_v { tree op = USE_FROM_PTR (use_p); if (!process_use (stmt, op, loop_vinfo, live_p, relevant, - &worklist)) + &worklist, false)) { VEC_free (gimple, heap, worklist); return false; } } + + if (STMT_VINFO_GATHER_P (stmt_vinfo)) + { + tree off; + tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL); + gcc_assert (decl); + if (!process_use (stmt, off, loop_vinfo, live_p, relevant, + &worklist, true)) + { + VEC_free (gimple, heap, worklist); + return false; + } + } } /* while worklist */ VEC_free (gimple, heap, worklist); @@ 
-4142,23 +4158,17 @@ vectorizable_store (gimple stmt, gimple_ return true; } -/* Given a vector type VECTYPE returns a builtin DECL to be used - for vector permutation and returns the mask that implements - reversal of the vector elements. If that is impossible to do, - returns NULL. */ +/* Given a vector type VECTYPE and permutation SEL returns + the VECTOR_CST mask that implements the permutation of the + vector elements. If that is impossible to do, returns NULL. */ static tree -perm_mask_for_reverse (tree vectype) +gen_perm_mask (tree vectype, unsigned char *sel) { tree mask_elt_type, mask_type, mask_vec; int i, nunits; - unsigned char *sel; nunits = TYPE_VECTOR_SUBPARTS (vectype); - sel = XALLOCAVEC (unsigned char, nunits); - - for (i = 0; i < nunits; ++i) - sel[i] = nunits - 1 - i; if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) return NULL; @@ -4169,33 +4179,52 @@ perm_mask_for_reverse (tree vectype) mask_type = get_vectype_for_scalar_type (mask_elt_type); mask_vec = NULL; - for (i = 0; i < nunits; i++) - mask_vec = tree_cons (NULL, build_int_cst (mask_elt_type, i), mask_vec); + for (i = nunits - 1; i >= 0; i--) + mask_vec = tree_cons (NULL, build_int_cst (mask_elt_type, sel[i]), + mask_vec); mask_vec = build_vector (mask_type, mask_vec); return mask_vec; } -/* Given a vector variable X, that was generated for the scalar LHS of - STMT, generate instructions to reverse the vector elements of X, - insert them a *GSI and return the permuted vector variable. */ +/* Given a vector type VECTYPE returns the VECTOR_CST mask that implements + reversal of the vector elements. If that is impossible to do, + returns NULL. 
*/ static tree -reverse_vec_elements (tree x, gimple stmt, gimple_stmt_iterator *gsi) +perm_mask_for_reverse (tree vectype) +{ + int i, nunits; + unsigned char *sel; + + nunits = TYPE_VECTOR_SUBPARTS (vectype); + sel = XALLOCAVEC (unsigned char, nunits); + + for (i = 0; i < nunits; ++i) + sel[i] = nunits - 1 - i; + + return gen_perm_mask (vectype, sel); +} + +/* Given a vector variable X and Y, that was generated for the scalar + STMT, generate instructions to permute the vector elements of X and Y + using permutation mask MASK_VEC, insert them at *GSI and return the + permuted vector variable. */ + +static tree +permute_vec_elements (tree x, tree y, tree mask_vec, gimple stmt, + gimple_stmt_iterator *gsi) { tree vectype = TREE_TYPE (x); - tree mask_vec, perm_dest, data_ref; + tree perm_dest, data_ref; gimple perm_stmt; - mask_vec = perm_mask_for_reverse (vectype); - perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype); + data_ref = make_ssa_name (perm_dest, NULL); /* Generate the permute statement. */ - perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, perm_dest, - x, x, mask_vec); - data_ref = make_ssa_name (perm_dest, perm_stmt); - gimple_set_lhs (perm_stmt, data_ref); + perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, data_ref, + x, y, mask_vec); vect_finish_stmt_generation (stmt, perm_stmt, gsi); return data_ref; @@ -4254,6 +4283,10 @@ vectorizable_load (gimple stmt, gimple_s bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); int vf; tree aggr_type; + tree gather_base = NULL_TREE, gather_off = NULL_TREE; + tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE; + int gather_scale = 1; + enum vect_def_type gather_dt = vect_unknown_def_type; if (loop_vinfo) { @@ -4334,7 +4367,7 @@ vectorizable_load (gimple stmt, gimple_s { strided_load = true; /* FORNOW */ - gcc_assert (! nested_in_vect_loop); + gcc_assert (! 
nested_in_vect_loop && !STMT_VINFO_GATHER_P (stmt_info)); first_stmt = GROUP_FIRST_ELEMENT (stmt_info); if (!slp && !PURE_SLP_STMT (stmt_info)) @@ -4349,7 +4382,7 @@ vectorizable_load (gimple stmt, gimple_s if (negative) { - gcc_assert (!strided_load); + gcc_assert (!strided_load && !STMT_VINFO_GATHER_P (stmt_info)); alignment_support_scheme = vect_supportable_dr_alignment (dr, false); if (alignment_support_scheme != dr_aligned && alignment_support_scheme != dr_unaligned_supported) @@ -4366,6 +4399,23 @@ vectorizable_load (gimple stmt, gimple_s } } + if (STMT_VINFO_GATHER_P (stmt_info)) + { + gimple def_stmt; + tree def; + gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base, + &gather_off, &gather_scale); + gcc_assert (gather_decl); + if (!vect_is_simple_use_1 (gather_off, loop_vinfo, bb_vinfo, + &def_stmt, &def, &gather_dt, + &gather_off_vectype)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "gather index use not simple."); + return false; + } + } + if (!vec_stmt) /* transformation not required. */ { STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; @@ -4378,6 +4428,161 @@ vectorizable_load (gimple stmt, gimple_s /** Transform. 
**/ + if (STMT_VINFO_GATHER_P (stmt_info)) + { + tree vec_oprnd0 = NULL_TREE, op; + tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl)); + tree rettype, srctype, ptrtype, idxtype, masktype, scaletype; + tree ptr, mask, var, scale, perm_mask = NULL_TREE, prev_res = NULL_TREE; + edge pe = loop_preheader_edge (loop); + gimple_seq seq; + basic_block new_bb; + enum { NARROW, NONE, WIDEN } modifier; + int gather_off_nunits = TYPE_VECTOR_SUBPARTS (gather_off_vectype); + + if (nunits == gather_off_nunits) + modifier = NONE; + else if (nunits == gather_off_nunits / 2) + { + unsigned char *sel = XALLOCAVEC (unsigned char, gather_off_nunits); + modifier = WIDEN; + + for (i = 0; i < gather_off_nunits; ++i) + sel[i] = i | nunits; + + perm_mask = gen_perm_mask (gather_off_vectype, sel); + gcc_assert (perm_mask != NULL_TREE); + } + else if (nunits == gather_off_nunits * 2) + { + unsigned char *sel = XALLOCAVEC (unsigned char, nunits); + modifier = NARROW; + + for (i = 0; i < nunits; ++i) + sel[i] = i < gather_off_nunits + ? 
i : i + nunits - gather_off_nunits; + + perm_mask = gen_perm_mask (vectype, sel); + gcc_assert (perm_mask != NULL_TREE); + ncopies *= 2; + } + else + gcc_unreachable (); + + rettype = TREE_TYPE (TREE_TYPE (gather_decl)); + srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + scaletype = TREE_VALUE (arglist); + gcc_checking_assert (types_compatible_p (srctype, rettype) + && types_compatible_p (srctype, masktype)); + + vec_dest = vect_create_destination_var (scalar_dest, vectype); + + ptr = fold_convert (ptrtype, gather_base); + if (!is_gimple_min_invariant (ptr)) + { + ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE); + new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); + gcc_assert (!new_bb); + } + + /* Currently we support only unconditional gather loads, + so mask should be all ones. 
*/ + if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE) + mask = build_int_cst (TREE_TYPE (masktype), -1); + else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype))) + { + REAL_VALUE_TYPE r; + long tmp[6]; + for (j = 0; j < 6; ++j) + tmp[j] = -1; + real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype))); + mask = build_real (TREE_TYPE (masktype), r); + } + else + gcc_unreachable (); + mask = build_vector_from_val (masktype, mask); + mask = vect_init_vector (stmt, mask, masktype, NULL); + + scale = build_int_cst (scaletype, gather_scale); + + prev_stmt_info = NULL; + for (j = 0; j < ncopies; ++j) + { + if (modifier == WIDEN && (j & 1)) + op = permute_vec_elements (vec_oprnd0, vec_oprnd0, + perm_mask, stmt, gsi); + else if (j == 0) + op = vec_oprnd0 + = vect_get_vec_def_for_operand (gather_off, stmt, NULL); + else + op = vec_oprnd0 + = vect_get_vec_def_for_stmt_copy (gather_dt, vec_oprnd0); + + if (!useless_type_conversion_p (idxtype, TREE_TYPE (op))) + { + gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op)) + == TYPE_VECTOR_SUBPARTS (idxtype)); + var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL); + add_referenced_var (var); + var = make_ssa_name (var, NULL); + op = build1 (VIEW_CONVERT_EXPR, idxtype, op); + new_stmt + = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, + op, NULL_TREE); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + op = var; + } + + new_stmt + = gimple_build_call (gather_decl, 5, mask, ptr, op, mask, scale); + + if (!useless_type_conversion_p (vectype, rettype)) + { + gcc_assert (TYPE_VECTOR_SUBPARTS (vectype) + == TYPE_VECTOR_SUBPARTS (rettype)); + var = vect_get_new_vect_var (rettype, vect_simple_var, NULL); + add_referenced_var (var); + op = make_ssa_name (var, new_stmt); + gimple_call_set_lhs (new_stmt, op); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + var = make_ssa_name (vec_dest, NULL); + op = build1 (VIEW_CONVERT_EXPR, vectype, op); + new_stmt + = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, op, + 
NULL_TREE); + } + else + { + var = make_ssa_name (vec_dest, new_stmt); + gimple_call_set_lhs (new_stmt, var); + } + + vect_finish_stmt_generation (stmt, new_stmt, gsi); + + if (modifier == NARROW) + { + if ((j & 1) == 0) + { + prev_res = var; + continue; + } + var = permute_vec_elements (prev_res, var, + perm_mask, stmt, gsi); + new_stmt = SSA_NAME_DEF_STMT (var); + } + + if (prev_stmt_info == NULL) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; + else + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; + prev_stmt_info = vinfo_for_stmt (new_stmt); + } + return true; + } + if (strided_load) { first_stmt = GROUP_FIRST_ELEMENT (stmt_info); @@ -4769,7 +4974,9 @@ vectorizable_load (gimple stmt, gimple_s if (negative) { - new_temp = reverse_vec_elements (new_temp, stmt, gsi); + tree perm_mask = perm_mask_for_reverse (vectype); + new_temp = permute_vec_elements (new_temp, new_temp, + perm_mask, stmt, gsi); new_stmt = SSA_NAME_DEF_STMT (new_temp); } --- gcc/target.def.jj 2011-11-04 08:52:19.000000000 +0100 +++ gcc/target.def 2011-11-04 08:53:13.000000000 +0100 @@ -1021,6 +1021,14 @@ DEFHOOK (void), default_autovectorize_vector_sizes) +/* Target builtin that implements vector gather operation. */ +DEFHOOK +(builtin_gather, + "", + tree, + (const_tree mem_vectype, const_tree index_type, int scale), + NULL) + HOOK_VECTOR_END (vectorize) #undef HOOK_PREFIX --- gcc/tree-data-ref.c.jj 2011-11-04 08:52:19.000000000 +0100 +++ gcc/tree-data-ref.c 2011-11-04 08:53:13.000000000 +0100 @@ -1351,13 +1351,11 @@ dr_may_alias_p (const struct data_refere return refs_may_alias_p (addr_a, addr_b); } -static void compute_self_dependence (struct data_dependence_relation *); - /* Initialize a data dependence relation between data accesses A and B. NB_LOOPS is the number of loops surrounding the references: the size of the classic distance/direction vectors. 
*/ -static struct data_dependence_relation * +struct data_dependence_relation * initialize_data_dependence_relation (struct data_reference *a, struct data_reference *b, VEC (loop_p, heap) *loop_nest) @@ -4121,7 +4119,7 @@ compute_affine_dependence (struct data_d /* This computes the dependence relation for the same data reference into DDR. */ -static void +void compute_self_dependence (struct data_dependence_relation *ddr) { unsigned int i; --- gcc/tree-data-ref.h.jj 2011-11-04 08:52:19.000000000 +0100 +++ gcc/tree-data-ref.h 2011-11-04 13:22:28.000000000 +0100 @@ -1,5 +1,5 @@ /* Data references and dependences detectors. - Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. Contributed by Sebastian Pop <pop@cri.ensmp.fr> @@ -423,6 +423,9 @@ extern bool graphite_find_data_reference VEC (data_reference_p, heap) **); struct data_reference *create_data_ref (loop_p, loop_p, tree, gimple, bool); extern bool find_loop_nest (struct loop *, VEC (loop_p, heap) **); +extern struct data_dependence_relation *initialize_data_dependence_relation + (struct data_reference *, struct data_reference *, VEC (loop_p, heap) *); +extern void compute_self_dependence (struct data_dependence_relation *); extern void compute_all_dependences (VEC (data_reference_p, heap) *, VEC (ddr_p, heap) **, VEC (loop_p, heap) *, bool); --- gcc/doc/tm.texi.in.jj 2011-11-04 08:52:19.000000000 +0100 +++ gcc/doc/tm.texi.in 2011-11-04 08:53:13.000000000 +0100 @@ -5696,6 +5696,14 @@ mode returned by @code{TARGET_VECTORIZE_ The default is zero which means to not iterate over other vector sizes. @end deftypefn +@hook TARGET_VECTORIZE_BUILTIN_GATHER +Target builtin that implements vector gather operation. @var{mem_vectype} +is the vector type of the load and @var{index_type} is scalar type of +the index, scaled by @var{scale}. +The default is @code{NULL_TREE} which means to not vectorize gather +loads. 
+@end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses --- gcc/doc/tm.texi.jj 2011-11-04 08:52:19.000000000 +0100 +++ gcc/doc/tm.texi 2011-11-04 08:53:13.000000000 +0100 @@ -5758,6 +5758,14 @@ mode returned by @code{TARGET_VECTORIZE_ The default is zero which means to not iterate over other vector sizes. @end deftypefn +@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_GATHER (const_tree @var{mem_vectype}, const_tree @var{index_type}, int @var{scale}) +Target builtin that implements vector gather operation. @var{mem_vectype} +is the vector type of the load and @var{index_type} is scalar type of +the index, scaled by @var{scale}. +The default is @code{NULL_TREE} which means to not vectorize gather +loads. +@end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses --- gcc/tree-vect-data-refs.c.jj 2011-11-04 08:52:57.000000000 +0100 +++ gcc/tree-vect-data-refs.c 2011-11-04 14:10:17.000000000 +0100 @@ -2497,6 +2497,199 @@ vect_prune_runtime_alias_test_list (loop return true; } +/* Check whether a non-affine read in stmt is suitable for gather load + and if so, return a builtin decl for that operation. */ + +tree +vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep, + tree *offp, int *scalep) +{ + HOST_WIDE_INT scale = 1, pbitpos, pbitsize; + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); + tree offtype = NULL_TREE; + tree decl, base, off; + enum machine_mode pmode; + int punsignedp, pvolatilep; + + /* The gather builtins need address of the form + loop_invariant + vector * {1, 2, 4, 8} + or + loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. + Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture + of loop invariants/SSA_NAMEs defined in the loop, with casts, + multiplications and additions in it. 
To get a vector, we need + a single SSA_NAME that will be defined in the loop and will + contain everything that is not loop invariant and that can be + vectorized. The following code attempts to find such a preexistng + SSA_NAME OFF and put the loop invariants into a tree BASE + that can be gimplified before the loop. */ + base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off, + &pmode, &punsignedp, &pvolatilep, false); + gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0); + + if (TREE_CODE (base) == MEM_REF) + { + if (!integer_zerop (TREE_OPERAND (base, 1))) + { + if (off == NULL_TREE) + { + double_int moff = mem_ref_offset (base); + off = double_int_to_tree (sizetype, moff); + } + else + off = size_binop (PLUS_EXPR, off, + fold_convert (sizetype, TREE_OPERAND (base, 1))); + } + base = TREE_OPERAND (base, 0); + } + else + base = build_fold_addr_expr (base); + + if (off == NULL_TREE) + off = size_zero_node; + + /* If base is not loop invariant, either off is 0, then we start with just + the constant offset in the loop invariant BASE and continue with base + as OFF, otherwise give up. + We could handle that case by gimplifying the addition of base + off + into some SSA_NAME and use that as off, but for now punt. */ + if (!expr_invariant_in_loop_p (loop, base)) + { + if (!integer_zerop (off)) + return NULL_TREE; + off = base; + base = size_int (pbitpos / BITS_PER_UNIT); + } + /* Otherwise put base + constant offset into the loop invariant BASE + and continue with OFF. */ + else + { + base = fold_convert (sizetype, base); + base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT)); + } + + /* OFF at this point may be either a SSA_NAME or some tree expression + from get_inner_reference. Try to peel off loop invariants from it + into BASE as long as possible. 
*/ + STRIP_NOPS (off); + while (offtype == NULL_TREE) + { + enum tree_code code; + tree op0, op1, add = NULL_TREE; + + if (TREE_CODE (off) == SSA_NAME) + { + gimple def_stmt = SSA_NAME_DEF_STMT (off); + + if (expr_invariant_in_loop_p (loop, off)) + return NULL_TREE; + + if (gimple_code (def_stmt) != GIMPLE_ASSIGN) + break; + + op0 = gimple_assign_rhs1 (def_stmt); + code = gimple_assign_rhs_code (def_stmt); + op1 = gimple_assign_rhs2 (def_stmt); + } + else + { + if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) + return NULL_TREE; + code = TREE_CODE (off); + extract_ops_from_tree (off, &code, &op0, &op1); + } + switch (code) + { + case POINTER_PLUS_EXPR: + case PLUS_EXPR: + if (expr_invariant_in_loop_p (loop, op0)) + { + add = op0; + off = op1; + do_add: + add = fold_convert (sizetype, add); + if (scale != 1) + add = size_binop (MULT_EXPR, add, size_int (scale)); + base = size_binop (PLUS_EXPR, base, add); + continue; + } + if (expr_invariant_in_loop_p (loop, op1)) + { + add = op1; + off = op0; + goto do_add; + } + break; + case MINUS_EXPR: + if (expr_invariant_in_loop_p (loop, op1)) + { + add = fold_convert (sizetype, op1); + add = size_binop (MINUS_EXPR, size_zero_node, add); + off = op0; + goto do_add; + } + break; + case MULT_EXPR: + if (scale == 1 && host_integerp (op1, 0)) + { + scale = tree_low_cst (op1, 0); + off = op0; + continue; + } + break; + case SSA_NAME: + off = op0; + continue; + CASE_CONVERT: + if (!POINTER_TYPE_P (TREE_TYPE (op0)) + && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) + break; + if (TYPE_PRECISION (TREE_TYPE (op0)) + == TYPE_PRECISION (TREE_TYPE (off))) + { + off = op0; + continue; + } + if (TYPE_PRECISION (TREE_TYPE (op0)) + < TYPE_PRECISION (TREE_TYPE (off))) + { + off = op0; + offtype = TREE_TYPE (off); + STRIP_NOPS (off); + continue; + } + break; + default: + break; + } + break; + } + + /* If at the end OFF still isn't a SSA_NAME or isn't + defined in the loop, punt. 
*/ + if (TREE_CODE (off) != SSA_NAME + || expr_invariant_in_loop_p (loop, off)) + return NULL_TREE; + + if (offtype == NULL_TREE) + offtype = TREE_TYPE (off); + + decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info), + offtype, scale); + if (decl == NULL_TREE) + return NULL_TREE; + + if (basep) + *basep = base; + if (offp) + *offp = off; + if (scalep) + *scalep = scale; + return decl; +} + /* Function vect_analyze_data_refs. @@ -2573,6 +2766,7 @@ vect_analyze_data_refs (loop_vec_info lo gimple stmt; stmt_vec_info stmt_info; tree base, offset, init; + bool gather = false; int vf; if (!dr || !DR_REF (dr)) @@ -2594,22 +2788,51 @@ vect_analyze_data_refs (loop_vec_info lo /* Check that analysis of the data-ref succeeded. */ if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) - || !DR_STEP (dr)) + || !DR_STEP (dr)) { - if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) - { - fprintf (vect_dump, "not vectorized: data ref analysis failed "); - print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); - } + /* If target supports vector gather loads, see if they can't + be used. 
*/ + if (loop_vinfo + && DR_IS_READ (dr) + && !TREE_THIS_VOLATILE (DR_REF (dr)) + && targetm.vectorize.builtin_gather != NULL + && !nested_in_vect_loop_p (loop, stmt)) + { + struct data_reference *newdr + = create_data_ref (NULL, loop_containing_stmt (stmt), + DR_REF (dr), stmt, true); + gcc_assert (newdr != NULL && DR_REF (newdr)); + if (DR_BASE_ADDRESS (newdr) + && DR_OFFSET (newdr) + && DR_INIT (newdr) + && DR_STEP (newdr) + && integer_zerop (DR_STEP (newdr))) + { + dr = newdr; + gather = true; + } + else + free_data_ref (newdr); + } - if (bb_vinfo) - { - STMT_VINFO_VECTORIZABLE (stmt_info) = false; - stop_bb_analysis = true; - continue; - } + if (!gather) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + { + fprintf (vect_dump, "not vectorized: data ref analysis " + "failed "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + if (bb_vinfo) + { + STMT_VINFO_VECTORIZABLE (stmt_info) = false; + stop_bb_analysis = true; + continue; + } - return false; + return false; + } } if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) @@ -2625,7 +2848,9 @@ vect_analyze_data_refs (loop_vec_info lo continue; } - return false; + if (gather) + free_data_ref (dr); + return false; } if (TREE_THIS_VOLATILE (DR_REF (dr))) @@ -2666,6 +2891,8 @@ vect_analyze_data_refs (loop_vec_info lo continue; } + if (gather) + free_data_ref (dr); return false; } @@ -2791,6 +3018,8 @@ vect_analyze_data_refs (loop_vec_info lo continue; } + if (gather) + free_data_ref (dr); return false; } @@ -2818,8 +3047,13 @@ vect_analyze_data_refs (loop_vec_info lo stop_bb_analysis = true; continue; } - else - return false; + + if (gather) + { + STMT_VINFO_DATA_REF (stmt_info) = NULL; + free_data_ref (dr); + } + return false; } /* Adjust the minimal vectorization factor according to the @@ -2827,6 +3061,86 @@ vect_analyze_data_refs (loop_vec_info lo vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); if (vf > *min_vf) *min_vf = vf; + + if (gather) + { + unsigned int j, k, n; + 
struct data_reference *olddr + = VEC_index (data_reference_p, datarefs, i); + VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo); + struct data_dependence_relation *ddr, *newddr; + bool bad = false; + tree off; + VEC (loop_p, heap) *nest = LOOP_VINFO_LOOP_NEST (loop_vinfo); + + if (!vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL) + || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + { + fprintf (vect_dump, + "not vectorized: not suitable for gather "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + return false; + } + + n = VEC_length (data_reference_p, datarefs) - 1; + for (j = 0, k = i - 1; j < i; j++) + { + ddr = VEC_index (ddr_p, ddrs, k); + gcc_assert (DDR_B (ddr) == olddr); + newddr = initialize_data_dependence_relation (DDR_A (ddr), dr, + nest); + VEC_replace (ddr_p, ddrs, k, newddr); + free_dependence_relation (ddr); + if (!bad + && DR_IS_WRITE (DDR_A (newddr)) + && DDR_ARE_DEPENDENT (newddr) != chrec_known) + bad = true; + k += --n; + } + + k++; + n = k + VEC_length (data_reference_p, datarefs) - i - 1; + for (; k < n; k++) + { + ddr = VEC_index (ddr_p, ddrs, k); + gcc_assert (DDR_A (ddr) == olddr); + newddr = initialize_data_dependence_relation (dr, DDR_B (ddr), + nest); + VEC_replace (ddr_p, ddrs, k, newddr); + free_dependence_relation (ddr); + if (!bad + && DR_IS_WRITE (DDR_B (newddr)) + && DDR_ARE_DEPENDENT (newddr) != chrec_known) + bad = true; + } + + k = VEC_length (ddr_p, ddrs) + - VEC_length (data_reference_p, datarefs) + i; + ddr = VEC_index (ddr_p, ddrs, k); + gcc_assert (DDR_A (ddr) == olddr && DDR_B (ddr) == olddr); + newddr = initialize_data_dependence_relation (dr, dr, nest); + compute_self_dependence (newddr); + VEC_replace (ddr_p, ddrs, k, newddr); + free_dependence_relation (ddr); + VEC_replace (data_reference_p, datarefs, i, dr); + + if (bad) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + { + fprintf (vect_dump, + "not 
vectorized: data dependence conflict" + " prevents gather"); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + return false; + } + + STMT_VINFO_GATHER_P (stmt_info) = true; + } } return true; --- gcc/tree-vectorizer.h.jj 2011-11-04 08:52:19.000000000 +0100 +++ gcc/tree-vectorizer.h 2011-11-04 08:53:13.000000000 +0100 @@ -535,6 +535,9 @@ typedef struct _stmt_vec_info { /* Is this statement vectorizable or should it be skipped in (partial) vectorization. */ bool vectorizable; + + /* For loads only, true if this is a gather load. */ + bool gather_p; } *stmt_vec_info; /* Access Functions. */ @@ -548,6 +551,7 @@ typedef struct _stmt_vec_info { #define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt #define STMT_VINFO_VECTORIZABLE(S) (S)->vectorizable #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info +#define STMT_VINFO_GATHER_P(S) (S)->gather_p #define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_base_address #define STMT_VINFO_DR_INIT(S) (S)->dr_init @@ -858,6 +862,8 @@ extern bool vect_analyze_data_refs_align extern bool vect_verify_datarefs_alignment (loop_vec_info, bb_vec_info); extern bool vect_analyze_data_ref_accesses (loop_vec_info, bb_vec_info); extern bool vect_prune_runtime_alias_test_list (loop_vec_info); +extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *, + int *); extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *); extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree, tree *, gimple_stmt_iterator *, --- gcc/config/i386/i386-builtin-types.def.jj 2011-11-04 08:52:19.000000000 +0100 +++ gcc/config/i386/i386-builtin-types.def 2011-11-04 08:53:13.000000000 +0100 @@ -432,20 +432,24 @@ DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V4SI, V2DF, INT) DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4SI, V4DF, INT) +DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V8SI, V4DF, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V2DI, V2DF, INT) DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4DI, V4DF, 
INT) DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4SI, V4SF, INT) DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V8SI, V8SF, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V2DI, V4SF, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4DI, V4SF, INT) +DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V4DI, V8SF, INT) DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V4SI, V2DI, INT) DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4SI, V4DI, INT) +DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V8SI, V4DI, INT) DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V2DI, V2DI, INT) DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4DI, V4DI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4SI, V4SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V8SI, V8SI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V2DI, V4SI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4DI, V4SI, INT) +DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V4DI, V8SI, INT) DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND) DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND) --- gcc/config/i386/sse.md.jj 2011-11-04 08:52:19.000000000 +0100 +++ gcc/config/i386/sse.md 2011-11-04 12:48:16.000000000 +0100 @@ -316,14 +316,6 @@ (define_mode_attr i128 ;; Mix-n-match (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF]) -(define_mode_iterator AVXMODE48P_DI - [V2DI V2DF V4DI V4DF V4SF V4SI]) -(define_mode_attr AVXMODE48P_DI - [(V2DI "V2DI") (V2DF "V2DI") - (V4DI "V4DI") (V4DF "V4DI") - (V4SI "V2DI") (V4SF "V2DI") - (V8SI "V4DI") (V8SF "V4DI")]) - (define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF]) ;; Mapping of immediate bits for blend instructions @@ -12516,11 +12508,21 @@ (define_insn "vcvtps2ph256" ;; For gather* insn patterns (define_mode_iterator VEC_GATHER_MODE [V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF]) -(define_mode_attr VEC_GATHER_MODE +(define_mode_attr VEC_GATHER_IDXSI [(V2DI "V4SI") (V2DF "V4SI") (V4DI "V4SI") (V4DF "V4SI") (V4SI "V4SI") (V4SF "V4SI") (V8SI "V8SI") (V8SF "V8SI")]) +(define_mode_attr VEC_GATHER_IDXDI + [(V2DI "V2DI") (V2DF "V2DI") + (V4DI "V4DI") (V4DF "V4DI") + (V4SI "V2DI") 
(V4SF "V2DI") + (V8SI "V4DI") (V8SF "V4DI")]) +(define_mode_attr VEC_GATHER_SRCDI + [(V2DI "V2DI") (V2DF "V2DF") + (V4DI "V4DI") (V4DF "V4DF") + (V4SI "V4SI") (V4SF "V4SF") + (V8SI "V4SI") (V8SF "V4SF")]) (define_expand "avx2_gathersi<mode>" [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "") @@ -12529,7 +12531,8 @@ (define_expand "avx2_gathersi<mode>" (mem:<ssescalarmode> (match_par_dup 7 [(match_operand 2 "vsib_address_operand" "") - (match_operand:<VEC_GATHER_MODE> 3 "register_operand" "") + (match_operand:<VEC_GATHER_IDXSI> + 3 "register_operand" "") (match_operand:SI 5 "const1248_operand " "")])) (mem:BLK (scratch)) (match_operand:VEC_GATHER_MODE 4 "register_operand" "")] @@ -12549,7 +12552,7 @@ (define_insn "*avx2_gathersi<mode>" (match_operator:<ssescalarmode> 7 "vsib_mem_operator" [(unspec:P [(match_operand:P 3 "vsib_address_operand" "p") - (match_operand:<VEC_GATHER_MODE> 4 "register_operand" "x") + (match_operand:<VEC_GATHER_IDXSI> 4 "register_operand" "x") (match_operand:SI 6 "const1248_operand" "n")] UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) @@ -12565,14 +12568,16 @@ (define_insn "*avx2_gathersi<mode>" (define_expand "avx2_gatherdi<mode>" [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "") (unspec:VEC_GATHER_MODE - [(match_operand:VEC_GATHER_MODE 1 "register_operand" "") + [(match_operand:<VEC_GATHER_SRCDI> 1 "register_operand" "") (mem:<ssescalarmode> (match_par_dup 7 [(match_operand 2 "vsib_address_operand" "") - (match_operand:<AVXMODE48P_DI> 3 "register_operand" "") + (match_operand:<VEC_GATHER_IDXDI> + 3 "register_operand" "") (match_operand:SI 5 "const1248_operand " "")])) (mem:BLK (scratch)) - (match_operand:VEC_GATHER_MODE 4 "register_operand" "")] + (match_operand:<VEC_GATHER_SRCDI> + 4 "register_operand" "")] UNSPEC_GATHER)) (clobber (match_scratch:VEC_GATHER_MODE 6 ""))])] "TARGET_AVX2" @@ -12583,63 +12588,21 @@ (define_expand "avx2_gatherdi<mode>" }) (define_insn "*avx2_gatherdi<mode>" - [(set 
(match_operand:AVXMODE48P_DI 0 "register_operand" "=&x") - (unspec:AVXMODE48P_DI - [(match_operand:AVXMODE48P_DI 2 "register_operand" "0") + [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") + (unspec:VEC_GATHER_MODE + [(match_operand:<VEC_GATHER_SRCDI> 2 "register_operand" "0") (match_operator:<ssescalarmode> 7 "vsib_mem_operator" [(unspec:P [(match_operand:P 3 "vsib_address_operand" "p") - (match_operand:<AVXMODE48P_DI> 4 "register_operand" "x") + (match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x") (match_operand:SI 6 "const1248_operand" "n")] UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) - (match_operand:AVXMODE48P_DI 5 "register_operand" "1")] + (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")] UNSPEC_GATHER)) - (clobber (match_scratch:AVXMODE48P_DI 1 "=&x"))] - "TARGET_AVX2" - "v<sseintprefix>gatherq<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "vex") - (set_attr "mode" "<sseinsnmode>")]) - -;; Special handling for VEX.256 with float arguments -;; since there're still xmms as operands -(define_expand "avx2_gatherdi<mode>256" - [(parallel [(set (match_operand:VI4F_128 0 "register_operand" "") - (unspec:VI4F_128 - [(match_operand:VI4F_128 1 "register_operand" "") - (mem:<ssescalarmode> - (match_par_dup 7 - [(match_operand 2 "vsib_address_operand" "") - (match_operand:V4DI 3 "register_operand" "") - (match_operand:SI 5 "const1248_operand " "")])) - (mem:BLK (scratch)) - (match_operand:VI4F_128 4 "register_operand" "")] - UNSPEC_GATHER)) - (clobber (match_scratch:VI4F_128 6 ""))])] - "TARGET_AVX2" -{ - operands[7] - = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3], - operands[5]), UNSPEC_VSIBADDR); -}) - -(define_insn "*avx2_gatherdi<mode>256" - [(set (match_operand:VI4F_128 0 "register_operand" "=x") - (unspec:VI4F_128 - [(match_operand:VI4F_128 2 "register_operand" "0") - (match_operator:<ssescalarmode> 7 "vsib_mem_operator" - [(unspec:P - [(match_operand:P 3 
"vsib_address_operand" "p") - (match_operand:V4DI 4 "register_operand" "x") - (match_operand:SI 6 "const1248_operand" "n")] - UNSPEC_VSIBADDR)]) - (mem:BLK (scratch)) - (match_operand:VI4F_128 5 "register_operand" "1")] - UNSPEC_GATHER)) - (clobber (match_scratch:VI4F_128 1 "=&x"))] + (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] "TARGET_AVX2" - "v<sseintprefix>gatherq<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}" + "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}" [(set_attr "type" "ssemov") (set_attr "prefix" "vex") (set_attr "mode" "<sseinsnmode>")]) --- gcc/config/i386/i386.c.jj 2011-11-04 08:52:19.000000000 +0100 +++ gcc/config/i386/i386.c 2011-11-04 12:48:16.000000000 +0100 @@ -25105,6 +25105,13 @@ enum ix86_builtins IX86_BUILTIN_GATHERDIV4SI, IX86_BUILTIN_GATHERDIV8SI, + /* Alternate 4 element gather for the vectorizer where + all operands are 32-byte wide. */ + IX86_BUILTIN_GATHERALTSIV4DF, + IX86_BUILTIN_GATHERALTDIV8SF, + IX86_BUILTIN_GATHERALTSIV4DI, + IX86_BUILTIN_GATHERALTDIV8SI, + /* TFmode support builtins. */ IX86_BUILTIN_INFQ, IX86_BUILTIN_HUGE_VALQ, @@ -26883,6 +26890,22 @@ ix86_init_mmx_sse_builtins (void) V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, IX86_BUILTIN_GATHERDIV8SI); + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ", + V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, + IX86_BUILTIN_GATHERALTSIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ", + V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, + IX86_BUILTIN_GATHERALTDIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ", + V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, + IX86_BUILTIN_GATHERALTSIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ", + V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, + IX86_BUILTIN_GATHERALTDIV8SI); + /* MMX access to the vec_init patterns. 
*/ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si", V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); @@ -28869,7 +28892,7 @@ rdrand_step: icode = CODE_FOR_avx2_gatherdiv4sf; goto gather_gen; case IX86_BUILTIN_GATHERDIV8SF: - icode = CODE_FOR_avx2_gatherdiv4sf256; + icode = CODE_FOR_avx2_gatherdiv8sf; goto gather_gen; case IX86_BUILTIN_GATHERSIV2DI: icode = CODE_FOR_avx2_gathersiv2di; @@ -28893,7 +28916,20 @@ rdrand_step: icode = CODE_FOR_avx2_gatherdiv4si; goto gather_gen; case IX86_BUILTIN_GATHERDIV8SI: - icode = CODE_FOR_avx2_gatherdiv4si256; + icode = CODE_FOR_avx2_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHERALTSIV4DF: + icode = CODE_FOR_avx2_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERALTDIV8SF: + icode = CODE_FOR_avx2_gatherdiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHERALTSIV4DI: + icode = CODE_FOR_avx2_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHERALTDIV8SI: + icode = CODE_FOR_avx2_gatherdiv8si; + goto gather_gen; gather_gen: arg0 = CALL_EXPR_ARG (exp, 0); @@ -28912,8 +28948,39 @@ rdrand_step: mode3 = insn_data[icode].operand[4].mode; mode4 = insn_data[icode].operand[5].mode; - if (target == NULL_RTX) - target = gen_reg_rtx (insn_data[icode].operand[0].mode); + if (target == NULL_RTX + || GET_MODE (target) != insn_data[icode].operand[0].mode) + subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); + else + subtarget = target; + + if (fcode == IX86_BUILTIN_GATHERALTSIV4DF + || fcode == IX86_BUILTIN_GATHERALTSIV4DI) + { + rtx half = gen_reg_rtx (V4SImode); + if (!nonimmediate_operand (op2, V8SImode)) + op2 = copy_to_mode_reg (V8SImode, op2); + emit_insn (gen_vec_extract_lo_v8si (half, op2)); + op2 = half; + } + else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF + || fcode == IX86_BUILTIN_GATHERALTDIV8SI) + { + rtx (*gen) (rtx, rtx); + rtx half = gen_reg_rtx (mode0); + if (mode0 == V4SFmode) + gen = gen_vec_extract_lo_v8sf; + else + gen = gen_vec_extract_lo_v8si; + if 
(!nonimmediate_operand (op0, GET_MODE (op0))) + op0 = copy_to_mode_reg (GET_MODE (op0), op0); + emit_insn (gen (half, op0)); + op0 = half; + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + } /* Force memory operand only with base register here. But we don't want to do it on memory operand for other builtin @@ -28935,10 +29002,26 @@ rdrand_step: error ("last argument must be scale 1, 2, 4, 8"); return const0_rtx; } - pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4); + pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); if (! pat) return const0_rtx; emit_insn (pat); + + if (fcode == IX86_BUILTIN_GATHERDIV8SF + || fcode == IX86_BUILTIN_GATHERDIV8SI) + { + enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode + ? V4SFmode : V4SImode; + if (target == NULL_RTX) + target = gen_reg_rtx (tmode); + if (tmode == V4SFmode) + emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); + else + emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); + } + else + target = subtarget; + return target; default: @@ -29443,6 +29526,73 @@ ix86_veclibabi_acml (enum built_in_funct return new_fndecl; } +/* Returns a decl of a function that implements gather load with + memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. + Return NULL_TREE if it is not available. */ + +static tree +ix86_vectorize_builtin_gather (const_tree mem_vectype, + const_tree index_type, int scale) +{ + bool si; + enum ix86_builtins code; + + if (! TARGET_AVX2) + return NULL_TREE; + + if ((TREE_CODE (index_type) != INTEGER_TYPE + && !POINTER_TYPE_P (index_type)) + || (TYPE_MODE (index_type) != SImode + && TYPE_MODE (index_type) != DImode)) + return NULL_TREE; + + if (TYPE_PRECISION (index_type) > POINTER_SIZE) + return NULL_TREE; + + /* v*gather* insn sign extends index to pointer mode. 
*/ + if (TYPE_PRECISION (index_type) < POINTER_SIZE + && TYPE_UNSIGNED (index_type)) + return NULL_TREE; + + if (scale <= 0 + || scale > 8 + || (scale & (scale - 1)) != 0) + return NULL_TREE; + + si = TYPE_MODE (index_type) == SImode; + switch (TYPE_MODE (mem_vectype)) + { + case V2DFmode: + code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; + break; + case V4DFmode: + code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; + break; + case V2DImode: + code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; + break; + case V4DImode: + code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; + break; + case V4SFmode: + code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; + break; + case V8SFmode: + code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; + break; + case V4SImode: + code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; + break; + case V8SImode: + code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; + break; + default: + return NULL_TREE; + } + + return ix86_builtins[code]; +} + /* Returns a code for a target-specific builtin that implements reciprocal of the function, or NULL_TREE if not available. 
*/ @@ -37642,6 +37792,9 @@ ix86_autovectorize_vector_sizes (void) #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ ix86_builtin_vectorized_function +#undef TARGET_VECTORIZE_BUILTIN_GATHER +#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather + #undef TARGET_BUILTIN_RECIPROCAL #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal --- gcc/testsuite/gcc.target/i386/avx2-gather-1.c.jj 2011-11-04 08:53:13.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/avx2-gather-1.c 2011-11-04 08:53:13.000000000 +0100 @@ -0,0 +1,215 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-options "-O3 -mavx2" } */ + +#include "avx2-check.h" + +#define N 1024 +float vf1[N+16], vf2[N]; +double vd1[N+16], vd2[N]; +int k[N]; +long l[N]; +short n[N]; + +__attribute__((noinline, noclone)) void +f1 (void) +{ + int i; + for (i = 0; i < N; i++) + vf2[i] = vf1[k[i]]; +} + +__attribute__((noinline, noclone)) void +f2 (void) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vf1[k[i]]; +} + +__attribute__((noinline, noclone)) void +f3 (int x) +{ + int i; + for (i = 0; i < N; i++) + vf2[i] = vf1[k[i] + x]; +} + +__attribute__((noinline, noclone)) void +f4 (int x) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vf1[k[i] + x]; +} + +__attribute__((noinline, noclone)) void +f5 (void) +{ + int i; + for (i = 0; i < N; i++) + vd2[i] = vd1[k[i]]; +} + +__attribute__((noinline, noclone)) void +f6 (void) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vd1[k[i]]; +} + +__attribute__((noinline, noclone)) void +f7 (int x) +{ + int i; + for (i = 0; i < N; i++) + vd2[i] = vd1[k[i] + x]; +} + +__attribute__((noinline, noclone)) void +f8 (int x) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vd1[k[i] + x]; +} + +__attribute__((noinline, noclone)) void +f9 (void) +{ + int i; + for (i = 0; i < N; i++) + vf2[i] = vf1[l[i]]; +} + +__attribute__((noinline, noclone)) void +f10 (void) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vf1[l[i]]; +} + 
+__attribute__((noinline, noclone)) void +f11 (long x) +{ + int i; + for (i = 0; i < N; i++) + vf2[i] = vf1[l[i] + x]; +} + +__attribute__((noinline, noclone)) void +f12 (long x) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vf1[l[i] + x]; +} + +__attribute__((noinline, noclone)) void +f13 (void) +{ + int i; + for (i = 0; i < N; i++) + vd2[i] = vd1[l[i]]; +} + +__attribute__((noinline, noclone)) void +f14 (void) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vd1[l[i]]; +} + +__attribute__((noinline, noclone)) void +f15 (long x) +{ + int i; + for (i = 0; i < N; i++) + vd2[i] = vd1[l[i] + x]; +} + +__attribute__((noinline, noclone)) void +f16 (long x) +{ + int i; + for (i = 0; i < N; i++) + n[i] = (int) vd1[l[i] + x]; +} + +static void +avx2_test (void) +{ + int i; + + for (i = 0; i < N + 16; i++) + { + asm (""); + vf1[i] = 17.0f + i; + vd1[i] = 19.0 + i; + } + for (i = 0; i < N; i++) + { + asm (""); + k[i] = (i * 731) & (N - 1); + l[i] = (i * 657) & (N - 1); + } + + f1 (); + f2 (); + for (i = 0; i < N; i++) + if (vf2[i] != ((i * 731) & (N - 1)) + 17 + || n[i] != ((i * 731) & (N - 1)) + 17) + abort (); + + f3 (12); + f4 (14); + for (i = 0; i < N; i++) + if (vf2[i] != ((i * 731) & (N - 1)) + 17 + 12 + || n[i] != ((i * 731) & (N - 1)) + 17 + 14) + abort (); + + f5 (); + f6 (); + for (i = 0; i < N; i++) + if (vd2[i] != ((i * 731) & (N - 1)) + 19 + || n[i] != ((i * 731) & (N - 1)) + 19) + abort (); + + f7 (7); + f8 (9); + for (i = 0; i < N; i++) + if (vd2[i] != ((i * 731) & (N - 1)) + 19 + 7 + || n[i] != ((i * 731) & (N - 1)) + 19 + 9) + abort (); + + f9 (); + f10 (); + for (i = 0; i < N; i++) + if (vf2[i] != ((i * 657) & (N - 1)) + 17 + || n[i] != ((i * 657) & (N - 1)) + 17) + abort (); + + f11 (2); + f12 (4); + for (i = 0; i < N; i++) + if (vf2[i] != ((i * 657) & (N - 1)) + 17 + 2 + || n[i] != ((i * 657) & (N - 1)) + 17 + 4) + abort (); + + f13 (); + f14 (); + for (i = 0; i < N; i++) + if (vd2[i] != ((i * 657) & (N - 1)) + 19 + || n[i] != ((i * 657) & (N - 
1)) + 19) + abort (); + + f15 (13); + f16 (15); + for (i = 0; i < N; i++) + if (vd2[i] != ((i * 657) & (N - 1)) + 19 + 13 + || n[i] != ((i * 657) & (N - 1)) + 19 + 15) + abort (); +} --- gcc/testsuite/gcc.target/i386/avx2-gather-2.c.jj 2011-11-04 08:53:13.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/avx2-gather-2.c 2011-11-04 08:53:13.000000000 +0100 @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */ + +#include "avx2-gather-1.c" + +/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops in function" 16 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ --- gcc/testsuite/gcc.target/i386/avx2-gather-3.c.jj 2011-11-04 08:53:13.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/avx2-gather-3.c 2011-11-04 08:53:13.000000000 +0100 @@ -0,0 +1,167 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-options "-O3 -mavx2 -ffast-math" } */ + +#include "avx2-check.h" + +#define N 1024 +float f[N]; +double d[N]; +int k[N]; +float *l[N]; +double *n[N]; +int **m[N]; +long **o[N]; +long q[N]; +long *r[N]; +int *s[N]; + +__attribute__((noinline, noclone)) float +f1 (void) +{ + int i; + float g = 0.0; + for (i = 0; i < N / 2; i++) + g += f[k[i]]; + return g; +} + +__attribute__((noinline, noclone)) float +f2 (float *p) +{ + int i; + float g = 0.0; + for (i = 0; i < N / 2; i++) + g += p[k[i]]; + return g; +} + +__attribute__((noinline, noclone)) float +f3 (void) +{ + int i; + float g = 0.0; + for (i = 0; i < N / 2; i++) + g += *l[i]; + return g; +} + +__attribute__((noinline, noclone)) int +f4 (void) +{ + int i; + int g = 0; + for (i = 0; i < N / 2; i++) + g += **m[i]; + return g; +} + +__attribute__((noinline, noclone)) double +f5 (void) +{ + int i; + double g = 0.0; + for (i = 0; i < N / 2; i++) + g += d[k[i]]; + return g; +} + +__attribute__((noinline, noclone)) double +f6 (double *p) +{ + int i; + double g = 0.0; + for (i = 0; i < N / 2; i++) + g += p[k[i]]; + return g; +} + 
+__attribute__((noinline, noclone)) double +f7 (void) +{ + int i; + double g = 0.0; + for (i = 0; i < N / 2; i++) + g += *n[i]; + return g; +} + +__attribute__((noinline, noclone)) int +f8 (void) +{ + int i; + int g = 0; + for (i = 0; i < N / 2; i++) + g += **o[i]; + return g; +} + +__attribute__((noinline, noclone)) float +f9 (void) +{ + int i; + float g = 0.0; + for (i = 0; i < N / 2; i++) + g += f[q[i]]; + return g; +} + +__attribute__((noinline, noclone)) float +f10 (float *p) +{ + int i; + float g = 0.0; + for (i = 0; i < N / 2; i++) + g += p[q[i]]; + return g; +} + +__attribute__((noinline, noclone)) double +f11 (void) +{ + int i; + double g = 0.0; + for (i = 0; i < N / 2; i++) + g += d[q[i]]; + return g; +} + +__attribute__((noinline, noclone)) double +f12 (double *p) +{ + int i; + double g = 0.0; + for (i = 0; i < N / 2; i++) + g += p[q[i]]; + return g; +} + +static void +avx2_test (void) +{ + int i; + + for (i = 0; i < N; i++) + { + asm (""); + f[i] = -256.0f + i; + d[i] = -258.0 + i; + k[i] = (i * 731) & (N - 1); + q[i] = (i * 657) & (N - 1); + l[i] = &f[(i * 239) & (N - 1)]; + n[i] = &d[(i * 271) & (N - 1)]; + r[i] = &q[(i * 323) & (N - 1)]; + s[i] = &k[(i * 565) & (N - 1)]; + m[i] = &s[(i * 13) & (N - 1)]; + o[i] = &r[(i * 19) & (N - 1)]; + } + + if (f1 () != 136448.0f || f2 (f) != 136448.0f || f3 () != 130304.0) + abort (); + if (f4 () != 261376 || f5 () != 135424.0 || f6 (d) != 135424.0) + abort (); + if (f7 () != 129280.0 || f8 () != 259840L || f9 () != 130816.0f) + abort (); + if (f10 (f) != 130816.0f || f11 () != 129792.0 || f12 (d) != 129792.0) + abort (); +} --- gcc/testsuite/gcc.target/i386/avx2-gather-4.c.jj 2011-11-04 08:54:11.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/avx2-gather-4.c 2011-11-04 08:54:11.000000000 +0100 @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-options "-O3 -mavx2" } */ + +#include "avx2-check.h" + +#define N 1024 +int a[N], b[N], c[N], d[N]; + +__attribute__((noinline, 
noclone)) void +foo (float *__restrict p, float *__restrict q, float *__restrict r, + long s1, long s2, long s3) +{ + int i; + for (i = 0; i < N; i++) + p[i] = q[a[i] * s1 + b[i] * s2 + s3] * r[c[i] * s1 + d[i] * s2 + s3]; +} + +static void +avx2_test (void) +{ + int i; + float e[N], f[N], g[N]; + for (i = 0; i < N; i++) + { + a[i] = (i * 7) & (N / 8 - 1); + b[i] = (i * 13) & (N / 8 - 1); + c[i] = (i * 23) & (N / 8 - 1); + d[i] = (i * 5) & (N / 8 - 1); + e[i] = 16.5 + i; + f[i] = 127.5 - i; + } + foo (g, e, f, 3, 2, 4); + for (i = 0; i < N; i++) + if (g[i] != (float) ((20.5 + a[i] * 3 + b[i] * 2) + * (123.5 - c[i] * 3 - d[i] * 2))) + abort (); +}