Message ID: 20230704095406.3518145-1-juzhe.zhong@rivai.ai
State: New
Series: [V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
Oh, sorry for the typo in the commit log:

>> Address comments from Richard.

Please read it as "Address comments from Richi." :)

Thanks.

juzhe.zhong@rivai.ai

From: juzhe.zhong
Date: 2023-07-04 17:54
To: gcc-patches
CC: richard.sandiford; rguenther; Ju-Zhe Zhong
Subject: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer

From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.

Address comments from Richard.

Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.

Since:

/* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE take different
   arguments before and after vectorization.

   Before vectorization:
     LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);

   After vectorization:
     LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);  */

I add a "vectorized_p" default argument to internal_fn_mask_index
so that we can simplify the code.

The len_mask_gather_load/len_mask_scatter_store patterns have already
been added; this patch makes the vectorizer use them.

Here is an example:

void
f (int *restrict a,
   int *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
    {
      if (cond[i])
        a[i * 4] = b[i];
    }
}

Gimple IR:

  <bb 3> [local count: 105119324]:
  _58 = (unsigned long) n_13(D);

  <bb 4> [local count: 630715945]:
  # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
  # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
  # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
  # ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
  ivtmp_44 = _61 * 4;
  vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
  mask__24.10_49 = vect__4.9_47 != { 0, ... };
  vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
  ivtmp_54 = _61 * 16;
  .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, _61, 0, mask__24.10_49);
  vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
  vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
  vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
  ivtmp_60 = ivtmp_59 - _61;
  if (ivtmp_60 != 0)
    goto <bb 4>; [83.33%]
  else
    goto <bb 5>; [16.67%]

gcc/ChangeLog:

	* internal-fn.cc (internal_fn_mask_index): Apply
	LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
	* internal-fn.h (internal_fn_mask_index): Ditto.
	* optabs-query.cc (supports_vec_gather_load_p): Ditto.
	(supports_vec_scatter_store_p): Ditto.
	* tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
	* tree-vect-stmts.cc (exist_non_indexing_operands_for_use_p): Ditto.
	(check_load_store_for_partial_vectors): Ditto.
	(vect_get_strided_load_store_ops): Ditto.
	(vectorizable_store): Ditto.
	(vectorizable_load): Ditto.

---
 gcc/internal-fn.cc         |  16 ++++-
 gcc/internal-fn.h          |   2 +-
 gcc/optabs-query.cc        |   2 +
 gcc/tree-vect-data-refs.cc |  18 ++++-
 gcc/tree-vect-stmts.cc     | 135 +++++++++++++++++++++++++++++++------
 5 files changed, 150 insertions(+), 23 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 303df102d81..2c78c870de8 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4483,7 +4483,7 @@ internal_fn_len_index (internal_fn fn)
    otherwise return -1.  */
 
 int
-internal_fn_mask_index (internal_fn fn)
+internal_fn_mask_index (internal_fn fn, bool vectorized_p)
 {
   switch (fn)
     {
@@ -4499,9 +4499,21 @@ internal_fn_mask_index (internal_fn fn)
     case IFN_LEN_MASK_STORE:
       return 4;
 
+    /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE take different
+       arguments before and after vectorization.
+
+       Before vectorization:
+         LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
+
+       After vectorization:
+         LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);  */
     case IFN_LEN_MASK_GATHER_LOAD:
     case IFN_LEN_MASK_SCATTER_STORE:
-      return 6;
+      if (vectorized_p)
+        return 6;
+      else
+        return 4;
 
     default:
       return (conditional_internal_fn_code (fn) != ERROR_MARK
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 4234bbfed87..e9168c16297 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -233,7 +233,7 @@ extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
 extern bool internal_load_fn_p (internal_fn);
 extern bool internal_store_fn_p (internal_fn);
 extern bool internal_gather_scatter_fn_p (internal_fn);
-extern int internal_fn_mask_index (internal_fn);
+extern int internal_fn_mask_index (internal_fn, bool = true);
 extern int internal_fn_len_index (internal_fn);
 extern int internal_fn_stored_value_index (internal_fn);
 extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 2fdd0d34354..bf1f484e874 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
          || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+         || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
          ? 1 : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
@@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
          || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+         || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
          ? 1 : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..ab2af103cb4 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn;
+  internal_fn ifn, alt_ifn, alt_ifn2;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
       alt_ifn = IFN_MASK_GATHER_LOAD;
+      /* When the target supports LEN_MASK_GATHER_LOAD, we always
+         use LEN_MASK_GATHER_LOAD regardless of whether len and
+         mask are valid or not.  */
+      alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
     }
   else
     {
       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
       alt_ifn = IFN_MASK_SCATTER_STORE;
+      /* When the target supports LEN_MASK_SCATTER_STORE, we always
+         use LEN_MASK_SCATTER_STORE regardless of whether len and
+         mask are valid or not.  */
+      alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
     }
 
   for (;;)
@@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
          *offset_vectype_out = offset_vectype;
          return true;
        }
+      else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
+                                                       memory_type,
+                                                       offset_vectype, scale))
+       {
+         *ifn_out = alt_ifn2;
+         *offset_vectype_out = offset_vectype;
+         return true;
+       }
 
       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
          && TYPE_PRECISION (offset_type) >= element_bits)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a0c39268bf0..33ec33f8b8d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -433,7 +433,7 @@ exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
   if (call && gimple_call_internal_p (call))
     {
       internal_fn ifn = gimple_call_internal_fn (call);
-      int mask_index = internal_fn_mask_index (ifn);
+      int mask_index = internal_fn_mask_index (ifn, false);
       if (mask_index >= 0
          && use == gimple_call_arg (call, mask_index))
        return true;
@@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
                                             gs_info->offset_vectype,
                                             gs_info->scale))
        {
+         ifn = (is_load
+                ? IFN_LEN_MASK_GATHER_LOAD
+                : IFN_LEN_MASK_SCATTER_STORE);
+         if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+                                                     gs_info->memory_type,
+                                                     gs_info->offset_vectype,
+                                                     gs_info->scale))
+           {
+             vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+             vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+             return;
+           }
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "can't operate on partial vectors because"
@@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
 static void
 vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
                                 loop_vec_info loop_vinfo,
+                                gimple_stmt_iterator *gsi,
                                 gather_scatter_info *gs_info,
-                                tree *dataref_bump, tree *vec_offset)
+                                tree *dataref_bump, tree *vec_offset,
+                                vec_loop_lens *loop_lens)
 {
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 
-  tree bump = size_binop (MULT_EXPR,
-                         fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
-                         size_int (TYPE_VECTOR_SUBPARTS (vectype)));
-  *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
+  if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+    {
+      /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
+         ivtmp_8 = _31 * 16 (step in bytes);
+         .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
+         vectp_a.9_26 = vectp_a.9_7 + ivtmp_8;  */
+      tree loop_len
+       = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
+      tree tmp
+       = fold_build2 (MULT_EXPR, sizetype,
+                      fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+                      loop_len);
+      tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
+      gassign *assign = gimple_build_assign (bump, tmp);
+      gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+      *dataref_bump = bump;
+    }
+  else
+    {
+      tree bump
+       = size_binop (MULT_EXPR,
+                     fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+                     size_int (TYPE_VECTOR_SUBPARTS (vectype)));
+      *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
+    }
 
   /* The offset given in GS_INFO can have pointer type, so use the element
      type of the vector instead.  */
@@ -8013,7 +8048,7 @@ vectorizable_store (vec_info *vinfo,
       return false;
     }
 
-  int mask_index = internal_fn_mask_index (ifn);
+  int mask_index = internal_fn_mask_index (ifn, false);
   if (mask_index >= 0
       && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
                                  &mask, NULL, &mask_dt, &mask_vectype))
@@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
   else if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       aggr_type = elem_type;
-      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
-                                      &bump, &vec_offset);
+      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
+                                      &bump, &vec_offset, loop_lens);
     }
   else
     {
@@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
          unsigned HOST_WIDE_INT align;
 
          tree final_mask = NULL_TREE;
+         tree final_len = NULL_TREE;
+         tree bias = NULL_TREE;
          if (loop_masks)
            final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
                                             vec_num * ncopies,
@@ -8929,8 +8966,41 @@ vectorizable_store (vec_info *vinfo,
                  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
                    vec_offset = vec_offsets[vec_num * j + i];
                  tree scale = size_int (gs_info.scale);
+
+                 if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
+                   {
+                     if (loop_lens)
+                       {
+                         final_len
+                           = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+                                                vec_num * ncopies, vectype,
+                                                vec_num * j + i, 1);
+                       }
+                     else
+                       {
+                         tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+                         final_len
+                           = build_int_cst (iv_type,
+                                            TYPE_VECTOR_SUBPARTS (vectype));
+                       }
+                     signed char biasval
+                       = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+                     bias = build_int_cst (intQI_type_node, biasval);
+                     if (!final_mask)
+                       {
+                         mask_vectype = truth_type_for (vectype);
+                         final_mask = build_minus_one_cst (mask_vectype);
+                       }
+                   }
+
                  gcall *call;
-                 if (final_mask)
+                 if (final_len && final_mask)
+                   call
+                     = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
+                                                   7, dataref_ptr, vec_offset,
+                                                   scale, vec_oprnd, final_len,
+                                                   bias, final_mask);
+                 else if (final_mask)
                    call = gimple_build_call_internal
                      (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
                       scale, vec_oprnd, final_mask);
@@ -9047,9 +9117,6 @@ vectorizable_store (vec_info *vinfo,
              machine_mode vmode = TYPE_MODE (vectype);
              machine_mode new_vmode = vmode;
              internal_fn partial_ifn = IFN_LAST;
-             /* Produce 'len' and 'bias' argument.  */
-             tree final_len = NULL_TREE;
-             tree bias = NULL_TREE;
              if (loop_lens)
                {
                  opt_machine_mode new_ovmode
@@ -10177,8 +10244,8 @@ vectorizable_load (vec_info *vinfo,
   else if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       aggr_type = elem_type;
-      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
-                                      &bump, &vec_offset);
+      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
+                                      &bump, &vec_offset, loop_lens);
     }
   else
     {
@@ -10339,6 +10406,8 @@ vectorizable_load (vec_info *vinfo,
          for (i = 0; i < vec_num; i++)
            {
              tree final_mask = NULL_TREE;
+             tree final_len = NULL_TREE;
+             tree bias = NULL_TREE;
              if (loop_masks
                  && memory_access_type != VMAT_INVARIANT)
                final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
@@ -10368,8 +10437,39 @@ vectorizable_load (vec_info *vinfo,
                        vec_offset = vec_offsets[vec_num * j + i];
                      tree zero = build_zero_cst (vectype);
                      tree scale = size_int (gs_info.scale);
+
+                     if (gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
+                       {
+                         if (loop_lens)
+                           {
+                             final_len = vect_get_loop_len (
+                               loop_vinfo, gsi, loop_lens, vec_num * ncopies,
+                               vectype, vec_num * j + i, 1);
+                           }
+                         else
+                           {
+                             tree iv_type
+                               = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+                             final_len = build_int_cst (
+                               iv_type, TYPE_VECTOR_SUBPARTS (vectype));
+                           }
+                         signed char biasval
+                           = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+                         bias = build_int_cst (intQI_type_node, biasval);
+                         if (!final_mask)
+                           {
+                             mask_vectype = truth_type_for (vectype);
+                             final_mask = build_minus_one_cst (mask_vectype);
+                           }
+                       }
+
                      gcall *call;
-                     if (final_mask)
+                     if (final_len && final_mask)
+                       call = gimple_build_call_internal (
+                         IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
+                         vec_offset, scale, zero, final_len, bias,
+                         final_mask);
+                     else if (final_mask)
                        call = gimple_build_call_internal
                          (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
                           vec_offset, scale, zero, final_mask);
@@ -10462,9 +10562,6 @@ vectorizable_load (vec_info *vinfo,
              machine_mode vmode = TYPE_MODE (vectype);
              machine_mode new_vmode = vmode;
              internal_fn partial_ifn = IFN_LAST;
-             /* Produce 'len' and 'bias' argument.  */
-             tree final_len = NULL_TREE;
-             tree bias = NULL_TREE;
              if (loop_lens)
                {
                  opt_machine_mode new_ovmode
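A note on the fallback path in the hunks above: when the target advertises the LEN_MASK_ optabs but the loop has no live length (loop_lens is null), the patch pins len to the full lane count (TYPE_VECTOR_SUBPARTS), takes bias from the target, and substitutes an all-ones mask when no mask is live, so the call degenerates to an unconditional gather/scatter. A minimal standalone sketch of that convention (plain C++, not GCC internals; all names here are illustrative):

#include <cstdio>
#include <vector>

struct LenMaskArgs
{
  unsigned len;            // active lane count, passed as 'len'
  signed char bias;        // target's partial load/store bias (assumed 0)
  std::vector<bool> mask;  // per-lane predicate
};

// Mirrors the fallback in vectorizable_load/store: no live loop length
// -> len = full number of lanes; no live mask -> all-true mask.
static LenMaskArgs
make_full_vector_args (unsigned subparts)
{
  return { subparts, 0, std::vector<bool> (subparts, true) };
}

int
main ()
{
  // E.g. a V4SI access: len = 4, bias = 0, mask = { -1, -1, -1, -1 }.
  LenMaskArgs a = make_full_vector_args (4);
  std::printf ("len=%u bias=%d all-true=%d\n", a.len, (int) a.bias,
               (int) a.mask[0]);
  return 0;
}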
On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, Richard and Richi.
>
> Address comments from Richard.
>
> Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
>
> Since:
>
> /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE take different
>    arguments before and after vectorization.
>
>    Before vectorization:
>      LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
>
>    After vectorization:
>      LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);  */
>
> I add a "vectorized_p" default argument to internal_fn_mask_index
> so that we can simplify the code.

Eh, it's obvious that we should have the "vectorized" form
also in the 'scalar' variant.  If you think there's no reasonable
way to add a value for len or bias then instead re-order the
arguments so 'mask' comes first and the len/bias pair last.

But IMHO "any" len/bias value should do here.

The rest looks OK now.

Thanks,
Richard.

> [...]
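For intuition on Richard's point that "any" len/bias value should do: under the semantics of the LEN_MASK_ functions, a lane participates only when it is both below len + bias and selected by the mask, so choosing the full vector length makes the length test vacuous and reduces the call to its mask-only counterpart. A scalar reference model of a gather along those lines (a sketch only, assuming inactive lanes yield zero as in the vectorizer's use of the `zero` operand; plain C++, not GCC code):

#include <cstdio>

// result[i] = (i < len + bias && mask[i]) ? base[offset[i] * scale] : 0
static void
len_mask_gather_ref (int *dst, const int *base, const long *offset,
                     long scale, int nunits, int len, int bias,
                     const bool *mask)
{
  for (int i = 0; i < nunits; ++i)
    dst[i] = (i < len + bias && mask[i]) ? base[offset[i] * scale] : 0;
}

int
main ()
{
  int mem[16];
  for (int i = 0; i < 16; ++i)
    mem[i] = 100 + i;
  long off[4] = { 0, 4, 8, 12 };
  bool mask[4] = { true, false, true, true };
  int out[4];
  // len = nunits and bias = 0: the length test never fires, so only the
  // mask decides, exactly as for MASK_GATHER_LOAD.
  len_mask_gather_ref (out, mem, off, 1, 4, 4, 0, mask);
  std::printf ("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  // prints: 100 0 108 112
  return 0;
}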
Hi, Richi.

>> Eh, it's obvious that we should have the "vectorized" form
>> also in the 'scalar' variant.  If you think there's no reasonable
>> way to add a value for len or bias then instead re-order the
>> arguments so 'mask' comes first and the len/bias pair last.

I found this patch is buggy when I am testing LEN_MASK_GATHER_LOAD,
and reordering so that 'mask' comes first cannot help.

Here we have both 'GATHER_LOAD' and 'MASK_GATHER_LOAD'.

For GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
I change it into:

LEN_MASK_GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);

In this situation, internal_fn_mask_index should return -1.

Whereas for MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
I change it into:

LEN_MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);

and it should return index = 4.

I can't differentiate these two cases from the LEN_MASK_GATHER_LOAD
name alone.  Could I revise internal_fn_mask_index as follows?

int
internal_fn_mask_index (internal_fn fn, int nargs)
{
  switch (fn)
    {
    case IFN_MASK_LOAD:
    case IFN_MASK_LOAD_LANES:
    case IFN_MASK_STORE:
    case IFN_MASK_STORE_LANES:
      return 2;

    case IFN_MASK_GATHER_LOAD:
    case IFN_MASK_SCATTER_STORE:
    case IFN_LEN_MASK_LOAD:
    case IFN_LEN_MASK_STORE:
      return 4;

    /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE take different
       arguments before and after vectorization.

       Before vectorization:
         LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);

       After vectorization:
         LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);  */
    case IFN_LEN_MASK_GATHER_LOAD:
    case IFN_LEN_MASK_SCATTER_STORE:
      return nargs == 4 ? -1 : nargs == 5 ? 4 : 6;

    default:
      return (conditional_internal_fn_code (fn) != ERROR_MARK
              || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
    }
}

Thanks.

juzhe.zhong@rivai.ai

From: Richard Biener
Date: 2023-07-04 19:05
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer

> [...]
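As a quick sanity check of the arity-based dispatch proposed above — a 4-argument scalar call has no mask, a 5-argument one carries the mask last at index 4, and the 7-argument vectorized form puts it at index 6 — here is a standalone mirror of just that logic (plain C++; the helper name is a stand-in, not GCC's internal_fn_mask_index):

#include <cassert>

// Models only the LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE arm of the
// proposal: the arity of the call disambiguates the three forms.
static int
len_mask_gather_mask_index (int nargs)
{
  // 4 args: GATHER_LOAD-shaped scalar call, no mask                  -> -1
  // 5 args: MASK_GATHER_LOAD-shaped scalar call, mask last           ->  4
  // 7 args: vectorized (ptr, offset, scale, else, len, bias, mask)   ->  6
  return nargs == 4 ? -1 : nargs == 5 ? 4 : 6;
}

int
main ()
{
  assert (len_mask_gather_mask_index (4) == -1); // LEN_MASK_GATHER_LOAD (p, o, 1, 0)
  assert (len_mask_gather_mask_index (5) == 4);  // LEN_MASK_GATHER_LOAD (p, o, 1, 0, _33)
  assert (len_mask_gather_mask_index (7) == 6);  // vectorized form
  return 0;
}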
On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.
>
> >> Eh, it's obvious that we should have the "vectorized" form
> >> also in the 'scalar' variant.  If you think there's no reasonable
> >> way to add a value for len or bias then instead re-order the
> >> arguments so 'mask' comes first and the len/bias pair last.
>
> I found this patch is buggy when I am testing LEN_MASK_GATHER_LOAD,
> and reordering so that 'mask' comes first cannot help.
>
> Here we have both 'GATHER_LOAD' and 'MASK_GATHER_LOAD'.
>
> For GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
> I change it into:
>
> LEN_MASK_GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
>
> In this situation, internal_fn_mask_index should return -1.
>
> Whereas for MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
> I change it into:
>
> LEN_MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
>
> and it should return index = 4.
>
> I can't differentiate these two cases from the LEN_MASK_GATHER_LOAD
> name alone.  Could I revise internal_fn_mask_index as follows?

No, please adjust the gather pattern recognition to produce either
appropriate LEN_ variant IFNs or simply keep only the unconditional
and conditional mask variants from patterns but code generate the
len_ variants.  I don't really see what the problem is.  Maybe you
fail to specify the appropriate ifn when you inspect the scalar
internal fn call?

> [...]
Hi, Richi.

For GATHER_LOAD, which doesn't have len and mask:

Should I keep it as gather_load and support both gather_load and
len_mask_gather_load?

Or should I normalize it into len_mask_gather_load with length = vf and
mask = {1,1,1,1,1,...}, so that I only need to support
len_mask_gather_load in the RISC-V port?

Thanks.


juzhe.zhong@rivai.ai

From: Richard Biener
Date: 2023-07-04 19:17
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford
Subject: Re: Re: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.
>
> >> Eh, it's obvious that we should have the "vectorized" form
> >> also in the 'scalar' variant.  If you think there's no reasonable
> >> way to add a value for len or bias then instead re-order the
> >> arguments so 'mask' comes first and the len/bias pair last.
>
> I found this patch is buggy while testing LEN_MASK_GATHER_LOAD,
> and reordering 'mask' to come first does not help.
>
> Here, we have 'GATHER_LOAD' and 'MASK_GATHER_LOAD'.
> For GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
> I change it into:
>
> LEN_MASK_GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
> In this situation, internal_fn_mask_index should return -1.
>
> Whereas for MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
> I change it into LEN_MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
> and it should return index = 4.
>
> I can't differentiate them by only using LEN_MASK_GATHER_LOAD.
> Could I revise internal_fn_mask_index as follows?

No, please adjust the gather pattern recognition to produce either
appropriate LEN_ variant IFNs or simply keep only the unconditional
and conditional mask variants from patterns but code generate
the len_ variants.  I don't really see what the problem is.
Maybe you fail to specify the appropriate ifn when you inspect
the scalar internal fn call?

> int
> internal_fn_mask_index (internal_fn fn, int nargs)
> {
>   switch (fn)
>     {
>     case IFN_MASK_LOAD:
>     case IFN_MASK_LOAD_LANES:
>     case IFN_MASK_STORE:
>     case IFN_MASK_STORE_LANES:
>       return 2;
>
>     case IFN_MASK_GATHER_LOAD:
>     case IFN_MASK_SCATTER_STORE:
>     case IFN_LEN_MASK_LOAD:
>     case IFN_LEN_MASK_STORE:
>       return 4;
>
>     /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE take different
>        arguments before and after vectorization.
>
>        Before vectorization:
>          LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
>
>        After vectorization:
>          LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);  */
>     case IFN_LEN_MASK_GATHER_LOAD:
>     case IFN_LEN_MASK_SCATTER_STORE:
>       return nargs == 4 ? -1 : nargs == 5 ? 4 : 6;
>
>     default:
>       return (conditional_internal_fn_code (fn) != ERROR_MARK
>               || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
>     }
> }
>
> Thanks.
>
> juzhe.zhong@rivai.ai
>
> From: Richard Biener
> Date: 2023-07-04 19:05
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford
> Subject: Re: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
> On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:
>
> [... quoted patch description snipped; the full patch appears below ...]
>
> Eh, it's obvious that we should have the "vectorized" form
> also in the 'scalar' variant.  If you think there's no reasonable
> way to add a value for len or bias then instead re-order the
> arguments so 'mask' comes first and the len/bias pair last.
>
> But IMHO "any" len/bias value should do here.
>
> The rest looks OK now.
>
> Thanks,
> Richard.
>
> [... full quoted patch snipped; see the diff below ...]
On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.
>
> For GATHER_LOAD, which doesn't have len and mask:
>
> Should I keep it as gather_load and support both gather_load and
> len_mask_gather_load?
>
> Or should I normalize it into len_mask_gather_load with length = vf and
> mask = {1,1,1,1,1,...}, so that I only need to support
> len_mask_gather_load in the RISC-V port?

I think that pattern recog should keep it as GATHER_LOAD or
MASK_GATHER_LOAD depending on what if-conversion did.  The later
handling should then use the appropriate vector IFN, which could add
a dummy -1 mask or a dummy len, and handle partial vectors.

Richard.

> Thanks.
>
> juzhe.zhong@rivai.ai
>
> [... rest of the quoted thread snipped ...]
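As a concrete sketch of this suggestion (not part of the patch; the
helper name and its argument list are invented for illustration, while
the individual tree-building calls are the ones the patch itself uses),
code generation could normalize a plain GATHER_LOAD to the len/mask
variant by supplying a dummy all-ones mask and a dummy full-vector
length:

static gcall *
build_len_mask_gather_from_gather (loop_vec_info loop_vinfo, tree vectype,
                                   tree dataref_ptr, tree vec_offset,
                                   tree scale)
{
  /* Dummy length: cover every lane of the vector.  */
  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
  tree dummy_len = build_int_cst (iv_type, TYPE_VECTOR_SUBPARTS (vectype));
  /* Bias as selected by the target for partial accesses.  */
  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
  tree bias = build_int_cst (intQI_type_node, biasval);
  /* Dummy mask: all lanes enabled.  */
  tree dummy_mask = build_minus_one_cst (truth_type_for (vectype));
  /* Else value for masked-off lanes (there are none here).  */
  tree zero = build_zero_cst (vectype);
  return gimple_build_call_internal (IFN_LEN_MASK_GATHER_LOAD, 7,
                                     dataref_ptr, vec_offset, scale, zero,
                                     dummy_len, bias, dummy_mask);
}

This is essentially what the patch's !final_mask fallback in
vectorizable_load and vectorizable_store already does when only the
len_mask variant is supported.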
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 303df102d81..2c78c870de8 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4483,7 +4483,7 @@ internal_fn_len_index (internal_fn fn)
    otherwise return -1.  */
 
 int
-internal_fn_mask_index (internal_fn fn)
+internal_fn_mask_index (internal_fn fn, bool vectorized_p)
 {
   switch (fn)
     {
@@ -4499,9 +4499,21 @@ internal_fn_mask_index (internal_fn fn)
     case IFN_LEN_MASK_STORE:
       return 4;
 
+    /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE take different
+       arguments before and after vectorization.
+
+       Before vectorization:
+         LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
+
+       After vectorization:
+         LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
+    */
     case IFN_LEN_MASK_GATHER_LOAD:
     case IFN_LEN_MASK_SCATTER_STORE:
-      return 6;
+      if (vectorized_p)
+        return 6;
+      else
+        return 4;
 
     default:
       return (conditional_internal_fn_code (fn) != ERROR_MARK
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 4234bbfed87..e9168c16297 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -233,7 +233,7 @@ extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
 extern bool internal_load_fn_p (internal_fn);
 extern bool internal_store_fn_p (internal_fn);
 extern bool internal_gather_scatter_fn_p (internal_fn);
-extern int internal_fn_mask_index (internal_fn);
+extern int internal_fn_mask_index (internal_fn, bool = true);
 extern int internal_fn_len_index (internal_fn);
 extern int internal_fn_stored_value_index (internal_fn);
 extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 2fdd0d34354..bf1f484e874 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
          || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+         || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
          ? 1 : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
@@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
          || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+         || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
          ? 1 : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..ab2af103cb4 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn;
+  internal_fn ifn, alt_ifn, alt_ifn2;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
       alt_ifn = IFN_MASK_GATHER_LOAD;
+      /* When the target supports LEN_MASK_GATHER_LOAD, we always
+         use LEN_MASK_GATHER_LOAD regardless of whether len and
+         mask are valid or not.  */
+      alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
     }
   else
     {
       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
       alt_ifn = IFN_MASK_SCATTER_STORE;
+      /* When the target supports LEN_MASK_SCATTER_STORE, we always
+         use LEN_MASK_SCATTER_STORE regardless of whether len and
+         mask are valid or not.  */
+      alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
     }
 
   for (;;)
@@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
          *offset_vectype_out = offset_vectype;
          return true;
        }
+      else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
+                                                       memory_type,
+                                                       offset_vectype, scale))
+       {
+         *ifn_out = alt_ifn2;
+         *offset_vectype_out = offset_vectype;
+         return true;
+       }
 
       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
          && TYPE_PRECISION (offset_type) >= element_bits)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a0c39268bf0..33ec33f8b8d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -433,7 +433,7 @@ exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
   if (call && gimple_call_internal_p (call))
     {
       internal_fn ifn = gimple_call_internal_fn (call);
-      int mask_index = internal_fn_mask_index (ifn);
+      int mask_index = internal_fn_mask_index (ifn, false);
       if (mask_index >= 0
          && use == gimple_call_arg (call, mask_index))
        return true;
@@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
                                           gs_info->offset_vectype,
                                           gs_info->scale))
        {
+         ifn = (is_load
+                ? IFN_LEN_MASK_GATHER_LOAD
+                : IFN_LEN_MASK_SCATTER_STORE);
+         if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+                                                     gs_info->memory_type,
+                                                     gs_info->offset_vectype,
+                                                     gs_info->scale))
+           {
+             vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+             vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+             return;
+           }
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "can't operate on partial vectors because"
@@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
 static void
 vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
                                 loop_vec_info loop_vinfo,
+                                gimple_stmt_iterator *gsi,
                                 gather_scatter_info *gs_info,
-                                tree *dataref_bump, tree *vec_offset)
+                                tree *dataref_bump, tree *vec_offset,
+                                vec_loop_lens *loop_lens)
 {
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 
-  tree bump = size_binop (MULT_EXPR,
-                         fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
-                         size_int (TYPE_VECTOR_SUBPARTS (vectype)));
-  *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
+  if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+    {
+      /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
+         ivtmp_8 = _31 * 16 (step in bytes);
+         .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
+         vectp_a.9_26 = vectp_a.9_7 + ivtmp_8;  */
+      tree loop_len
+        = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
+      tree tmp
+        = fold_build2 (MULT_EXPR, sizetype,
+                       fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+                       loop_len);
+      tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
+      gassign *assign = gimple_build_assign (bump, tmp);
+      gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+      *dataref_bump = bump;
+    }
+  else
+    {
+      tree bump
+        = size_binop (MULT_EXPR,
+                      fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+                      size_int (TYPE_VECTOR_SUBPARTS (vectype)));
+      *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
+    }
 
   /* The offset given in GS_INFO can have pointer type, so use the element
      type of the vector instead.  */
@@ -8013,7 +8048,7 @@ vectorizable_store (vec_info *vinfo,
          return false;
        }
 
-      int mask_index = internal_fn_mask_index (ifn);
+      int mask_index = internal_fn_mask_index (ifn, false);
       if (mask_index >= 0
          && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
                                      &mask, NULL, &mask_dt, &mask_vectype))
@@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
   else if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       aggr_type = elem_type;
-      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
-                                      &bump, &vec_offset);
+      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
+                                      &bump, &vec_offset, loop_lens);
     }
   else
     {
@@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
          unsigned HOST_WIDE_INT align;
 
          tree final_mask = NULL_TREE;
+         tree final_len = NULL_TREE;
+         tree bias = NULL_TREE;
          if (loop_masks)
            final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
                                             vec_num * ncopies,
@@ -8929,8 +8966,41 @@ vectorizable_store (vec_info *vinfo,
              if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
                vec_offset = vec_offsets[vec_num * j + i];
              tree scale = size_int (gs_info.scale);
+
+             if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
+               {
+                 if (loop_lens)
+                   {
+                     final_len
+                       = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+                                            vec_num * ncopies, vectype,
+                                            vec_num * j + i, 1);
+                   }
+                 else
+                   {
+                     tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+                     final_len
+                       = build_int_cst (iv_type,
+                                        TYPE_VECTOR_SUBPARTS (vectype));
+                   }
+                 signed char biasval
+                   = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+                 bias = build_int_cst (intQI_type_node, biasval);
+                 if (!final_mask)
+                   {
+                     mask_vectype = truth_type_for (vectype);
+                     final_mask = build_minus_one_cst (mask_vectype);
+                   }
+               }
+
              gcall *call;
-             if (final_mask)
+             if (final_len && final_mask)
+               call
+                 = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
+                                               7, dataref_ptr, vec_offset,
+                                               scale, vec_oprnd, final_len,
+                                               bias, final_mask);
+             else if (final_mask)
                call = gimple_build_call_internal
                  (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
                   scale, vec_oprnd, final_mask);
@@ -9047,9 +9117,6 @@ vectorizable_store (vec_info *vinfo,
          machine_mode vmode = TYPE_MODE (vectype);
          machine_mode new_vmode = vmode;
          internal_fn partial_ifn = IFN_LAST;
-         /* Produce 'len' and 'bias' argument.  */
-         tree final_len = NULL_TREE;
-         tree bias = NULL_TREE;
          if (loop_lens)
            {
              opt_machine_mode new_ovmode
@@ -10177,8 +10244,8 @@ vectorizable_load (vec_info *vinfo,
   else if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       aggr_type = elem_type;
-      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
-                                      &bump, &vec_offset);
+      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
+                                      &bump, &vec_offset, loop_lens);
     }
   else
     {
@@ -10339,6 +10406,8 @@ vectorizable_load (vec_info *vinfo,
          for (i = 0; i < vec_num; i++)
            {
              tree final_mask = NULL_TREE;
+             tree final_len = NULL_TREE;
+             tree bias = NULL_TREE;
              if (loop_masks
                  && memory_access_type != VMAT_INVARIANT)
                final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
@@ -10368,8 +10437,39 @@ vectorizable_load (vec_info *vinfo,
                      vec_offset = vec_offsets[vec_num * j + i];
                    tree zero = build_zero_cst (vectype);
                    tree scale = size_int (gs_info.scale);
+
+                   if (gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
+                     {
+                       if (loop_lens)
+                         {
+                           final_len = vect_get_loop_len (
+                             loop_vinfo, gsi, loop_lens, vec_num * ncopies,
+                             vectype, vec_num * j + i, 1);
+                         }
+                       else
+                         {
+                           tree iv_type
+                             = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+                           final_len = build_int_cst (
+                             iv_type, TYPE_VECTOR_SUBPARTS (vectype));
+                         }
+                       signed char biasval
+                         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+                       bias = build_int_cst (intQI_type_node, biasval);
+                       if (!final_mask)
+                         {
+                           mask_vectype = truth_type_for (vectype);
+                           final_mask = build_minus_one_cst (mask_vectype);
+                         }
+                     }
+
                    gcall *call;
-                   if (final_mask)
+                   if (final_len && final_mask)
+                     call = gimple_build_call_internal (
+                       IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
+                       vec_offset, scale, zero, final_len, bias,
+                       final_mask);
+                   else if (final_mask)
                      call = gimple_build_call_internal
                        (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
                         vec_offset, scale, zero, final_mask);
@@ -10462,9 +10562,6 @@ vectorizable_load (vec_info *vinfo,
          machine_mode vmode = TYPE_MODE (vectype);
          machine_mode new_vmode = vmode;
          internal_fn partial_ifn = IFN_LAST;
-         /* Produce 'len' and 'bias' argument.  */
-         tree final_len = NULL_TREE;
-         tree bias = NULL_TREE;
          if (loop_lens)
            {
              opt_machine_mode new_ovmode
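As a reading aid, here is a rough scalar model (an illustration, not
code from the patch) of what the vectorized call
.LEN_MASK_GATHER_LOAD (ptr, offset, scale, zero, len, bias, mask) built
above computes, assuming the usual GCC convention that a len/bias pair
covers the first len + bias lanes and that bias is 0 or -1 depending on
the target; all names below are illustrative:

/* TYPE is the vector element type; nunits is TYPE_VECTOR_SUBPARTS.  */
for (int i = 0; i < nunits; i++)
  {
    if (i < len + bias && mask[i])
      /* Address = base pointer plus the offset element scaled into
         bytes by 'scale'.  */
      result[i] = *(TYPE *) ((char *) ptr + offset[i] * scale);
    else
      result[i] = 0;  /* Inactive lanes; the loop never inspects them.  */
  }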