Message ID | 20200831090647.152432-1-luoxhu@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | rs6000: Expand vec_insert in expander instead of gimple [PR79251] | expand |
On Mon, Aug 31, 2020 at 11:09 AM Xiong Hu Luo via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value > to be insert, arg2 is the place to insert arg1 to arg0. This patch adds > __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to > not expand too early in gimple stage if arg2 is variable, to avoid generate > store hit load instructions. > > For Power9 V4SI: > addi 9,1,-16 > rldic 6,6,2,60 > stxv 34,-16(1) > stwx 5,9,6 > lxv 34,-16(1) > => > addis 9,2,.LC0@toc@ha > addi 9,9,.LC0@toc@l > mtvsrwz 33,5 > lxv 32,0(9) > sradi 9,6,2 > addze 9,9 > sldi 9,9,2 > subf 9,9,6 > subfic 9,9,3 > sldi 9,9,2 > subfic 9,9,20 > lvsl 13,0,9 > xxperm 33,33,45 > xxperm 32,32,45 > xxsel 34,34,33,32 > > Though instructions increase from 5 to 15, the performance is improved > 60% in typical cases. Not sure if it is already possible but maybe use internal functions for those purely internal builtins instead? That makes it possible to overload with a single IFN. Richard. > gcc/ChangeLog: > > * config/rs6000/altivec.md (altivec_lvsl_reg_<mode>2): Extend to > SDI mode. > * config/rs6000/rs6000-builtin.def (BU_VSX_X): Add support > macros for vec_insert built-in functions. > * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): > Generate built-in calls for vec_insert. > * config/rs6000/rs6000-call.c (altivec_expand_vec_insert_builtin): > New function. > (altivec_expand_builtin): Add case entry for > VSX_BUILTIN_VEC_INSERT_V16QI, VSX_BUILTIN_VEC_INSERT_V8HI, > VSX_BUILTIN_VEC_INSERT_V4SF, VSX_BUILTIN_VEC_INSERT_V4SI, > VSX_BUILTIN_VEC_INSERT_V2DF, VSX_BUILTIN_VEC_INSERT_V2DI. > (altivec_init_builtins): > * config/rs6000/rs6000-protos.h (rs6000_expand_vector_insert): > New declear. > * config/rs6000/rs6000.c (rs6000_expand_vector_insert): > New function. > * config/rs6000/rs6000.md (FQHS): New mode iterator. > (FD): New mode iterator. > p8_mtvsrwz_v16qi<mode>2: New define_insn. 
> p8_mtvsrd_v16qi<mode>2: New define_insn. > * config/rs6000/vsx.md: Call gen_altivec_lvsl_reg_di2. > > gcc/testsuite/ChangeLog: > > * gcc.target/powerpc/pr79251.c: New test. > --- > gcc/config/rs6000/altivec.md | 4 +- > gcc/config/rs6000/rs6000-builtin.def | 6 + > gcc/config/rs6000/rs6000-c.c | 61 +++++++++ > gcc/config/rs6000/rs6000-call.c | 74 +++++++++++ > gcc/config/rs6000/rs6000-protos.h | 1 + > gcc/config/rs6000/rs6000.c | 146 +++++++++++++++++++++ > gcc/config/rs6000/rs6000.md | 19 +++ > gcc/config/rs6000/vsx.md | 2 +- > gcc/testsuite/gcc.target/powerpc/pr79251.c | 23 ++++ > 9 files changed, 333 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.c > > diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md > index 0a2e634d6b0..66b636059a6 100644 > --- a/gcc/config/rs6000/altivec.md > +++ b/gcc/config/rs6000/altivec.md > @@ -2772,10 +2772,10 @@ > DONE; > }) > > -(define_insn "altivec_lvsl_reg" > +(define_insn "altivec_lvsl_reg_<mode>2" > [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") > (unspec:V16QI > - [(match_operand:DI 1 "gpc_reg_operand" "b")] > + [(match_operand:SDI 1 "gpc_reg_operand" "b")] > UNSPEC_LVSL_REG))] > "TARGET_ALTIVEC" > "lvsl %0,0,%1" > diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def > index f9f0fece549..d095b365c14 100644 > --- a/gcc/config/rs6000/rs6000-builtin.def > +++ b/gcc/config/rs6000/rs6000-builtin.def > @@ -2047,6 +2047,12 @@ BU_VSX_X (VEC_INIT_V2DI, "vec_init_v2di", CONST) > BU_VSX_X (VEC_SET_V1TI, "vec_set_v1ti", CONST) > BU_VSX_X (VEC_SET_V2DF, "vec_set_v2df", CONST) > BU_VSX_X (VEC_SET_V2DI, "vec_set_v2di", CONST) > +BU_VSX_X (VEC_INSERT_V16QI, "vec_insert_v16qi", CONST) > +BU_VSX_X (VEC_INSERT_V8HI, "vec_insert_v8hi", CONST) > +BU_VSX_X (VEC_INSERT_V4SI, "vec_insert_v4si", CONST) > +BU_VSX_X (VEC_INSERT_V4SF, "vec_insert_v4sf", CONST) > +BU_VSX_X (VEC_INSERT_V2DI, "vec_insert_v2di", CONST) > +BU_VSX_X 
(VEC_INSERT_V2DF, "vec_insert_v2df", CONST) > BU_VSX_X (VEC_EXT_V1TI, "vec_ext_v1ti", CONST) > BU_VSX_X (VEC_EXT_V2DF, "vec_ext_v2df", CONST) > BU_VSX_X (VEC_EXT_V2DI, "vec_ext_v2di", CONST) > diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c > index 2fad3d94706..03b00738a5e 100644 > --- a/gcc/config/rs6000/rs6000-c.c > +++ b/gcc/config/rs6000/rs6000-c.c > @@ -1563,6 +1563,67 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, > return build_call_expr (call, 3, arg1, arg0, arg2); > } > > + else if (VECTOR_MEM_VSX_P (mode)) > + { > + tree call = NULL_TREE; > + > + arg2 = fold_for_warn (arg2); > + > + /* If the second argument is variable, we can optimize it if we are > + generating 64-bit code on a machine with direct move. */ > + if (TREE_CODE (arg2) != INTEGER_CST && TARGET_DIRECT_MOVE_64BIT) > + { > + switch (mode) > + { > + default: > + break; > + > + case E_V2DImode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V2DI]; > + break; > + > + case E_V2DFmode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V2DF]; > + break; > + > + case E_V4SFmode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V4SF]; > + break; > + > + case E_V4SImode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V4SI]; > + break; > + > + case E_V8HImode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V8HI]; > + break; > + > + case E_V16QImode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V16QI]; > + break; > + } > + } > + > + if (call) > + { > + if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) > + arg2 = build_int_cst (TREE_TYPE (arg2), 0); > + else > + arg2 = build_binary_op ( > + loc, BIT_AND_EXPR, arg2, > + build_int_cst (TREE_TYPE (arg2), > + TYPE_VECTOR_SUBPARTS (arg1_type) - 1), > + 0); > + tree result > + = build_call_expr (call, 3, arg1, > + convert (TREE_TYPE (arg1_type), arg0), > + convert (integer_type_node, arg2)); > + /* Coerce the result to vector element type. May be no-op. 
*/ > + result = fold_convert (TREE_TYPE (arg1), result); > + return result; > + } > + } > + > /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */ > arg1_inner_type = TREE_TYPE (arg1_type); > if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) > diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c > index e39cfcf672b..339e9ae87e3 100644 > --- a/gcc/config/rs6000/rs6000-call.c > +++ b/gcc/config/rs6000/rs6000-call.c > @@ -10660,6 +10660,40 @@ altivec_expand_vec_set_builtin (tree exp) > return op0; > } > > +/* Expand vec_insert builtin. */ > +static rtx > +altivec_expand_vec_insert_builtin (tree exp, rtx target) > +{ > + machine_mode tmode, mode1, mode2; > + tree arg0, arg1, arg2; > + rtx op0 = NULL_RTX, op1, op2; > + > + arg0 = CALL_EXPR_ARG (exp, 0); > + arg1 = CALL_EXPR_ARG (exp, 1); > + arg2 = CALL_EXPR_ARG (exp, 2); > + > + tmode = TYPE_MODE (TREE_TYPE (arg0)); > + mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); > + mode2 = TYPE_MODE ((TREE_TYPE (arg2))); > + gcc_assert (VECTOR_MODE_P (tmode)); > + > + op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); > + op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); > + op2 = expand_expr (arg2, NULL_RTX, mode2, EXPAND_NORMAL); > + > + if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) > + op1 = convert_modes (mode1, GET_MODE (op1), op1, true); > + > + op0 = force_reg (tmode, op0); > + op1 = force_reg (mode1, op1); > + op2 = force_reg (mode2, op2); > + > + target = gen_reg_rtx (V16QImode); > + rs6000_expand_vector_insert (target, op0, op1, op2); > + > + return target; > +} > + > /* Expand vec_ext builtin. 
*/ > static rtx > altivec_expand_vec_ext_builtin (tree exp, rtx target) > @@ -10922,6 +10956,14 @@ altivec_expand_builtin (tree exp, rtx target, bool *expandedp) > case VSX_BUILTIN_VEC_SET_V1TI: > return altivec_expand_vec_set_builtin (exp); > > + case VSX_BUILTIN_VEC_INSERT_V16QI: > + case VSX_BUILTIN_VEC_INSERT_V8HI: > + case VSX_BUILTIN_VEC_INSERT_V4SF: > + case VSX_BUILTIN_VEC_INSERT_V4SI: > + case VSX_BUILTIN_VEC_INSERT_V2DF: > + case VSX_BUILTIN_VEC_INSERT_V2DI: > + return altivec_expand_vec_insert_builtin (exp, target); > + > case ALTIVEC_BUILTIN_VEC_EXT_V4SI: > case ALTIVEC_BUILTIN_VEC_EXT_V8HI: > case ALTIVEC_BUILTIN_VEC_EXT_V16QI: > @@ -13681,6 +13723,38 @@ altivec_init_builtins (void) > integer_type_node, NULL_TREE); > def_builtin ("__builtin_vec_set_v2di", ftype, VSX_BUILTIN_VEC_SET_V2DI); > > + /* Access to the vec_insert patterns. */ > + ftype = build_function_type_list (V16QI_type_node, V16QI_type_node, > + intQI_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v16qi", ftype, > + VSX_BUILTIN_VEC_INSERT_V16QI); > + > + ftype = build_function_type_list (V8HI_type_node, V8HI_type_node, > + intHI_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v8hi", ftype, VSX_BUILTIN_VEC_INSERT_V8HI); > + > + ftype = build_function_type_list (V4SI_type_node, V4SI_type_node, > + integer_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v4si", ftype, VSX_BUILTIN_VEC_INSERT_V4SI); > + > + ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, > + float_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v4sf", ftype, VSX_BUILTIN_VEC_INSERT_V4SF); > + > + ftype = build_function_type_list (V2DI_type_node, V2DI_type_node, > + intDI_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v2di", ftype, VSX_BUILTIN_VEC_INSERT_V2DI); > + > + ftype = build_function_type_list (V2DF_type_node, 
V2DF_type_node, > + double_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v2df", ftype, VSX_BUILTIN_VEC_INSERT_V2DF); > + > /* Access to the vec_extract patterns. */ > ftype = build_function_type_list (intSI_type_node, V4SI_type_node, > integer_type_node, NULL_TREE); > diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h > index 28e859f4381..78b5b31d79f 100644 > --- a/gcc/config/rs6000/rs6000-protos.h > +++ b/gcc/config/rs6000/rs6000-protos.h > @@ -58,6 +58,7 @@ extern bool rs6000_split_128bit_ok_p (rtx []); > extern void rs6000_expand_float128_convert (rtx, rtx, bool); > extern void rs6000_expand_vector_init (rtx, rtx); > extern void rs6000_expand_vector_set (rtx, rtx, int); > +extern void rs6000_expand_vector_insert (rtx, rtx, rtx, rtx); > extern void rs6000_expand_vector_extract (rtx, rtx, rtx); > extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); > extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c > index fe93cf6ff2b..afa845f3dff 100644 > --- a/gcc/config/rs6000/rs6000.c > +++ b/gcc/config/rs6000/rs6000.c > @@ -6788,6 +6788,152 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) > emit_insn (gen_rtx_SET (target, x)); > } > > +/* Insert value from VEC into idx of TARGET. */ > + > +void > +rs6000_expand_vector_insert (rtx target, rtx vec, rtx val, rtx idx) > +{ > + machine_mode mode = GET_MODE (vec); > + > + if (VECTOR_MEM_VSX_P (mode) && CONST_INT_P (idx)) > + gcc_unreachable (); > + else if (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx) > + && TARGET_DIRECT_MOVE_64BIT) > + { > + gcc_assert (GET_MODE (idx) == E_SImode); > + machine_mode inner_mode = GET_MODE (val); > + HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); > + > + rtx tmp = gen_reg_rtx (GET_MODE (idx)); > + if (GET_MODE_SIZE (inner_mode) == 8) > + { > + if (!BYTES_BIG_ENDIAN) > + { > + /* idx = 1 - idx. 
*/ > + emit_insn (gen_subsi3 (tmp, GEN_INT (1), idx)); > + /* idx = idx * 8. */ > + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (3))); > + /* idx = 16 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); > + } > + else > + { > + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (3))); > + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); > + } > + } > + else if (GET_MODE_SIZE (inner_mode) == 4) > + { > + if (!BYTES_BIG_ENDIAN) > + { > + /* idx = 3 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (3), idx)); > + /* idx = idx * 4. */ > + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (2))); > + /* idx = 20 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); > + } > + else > + { > + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (2))); > + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); > + } > + } > + else if (GET_MODE_SIZE (inner_mode) == 2) > + { > + if (!BYTES_BIG_ENDIAN) > + { > + /* idx = 7 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (7), idx)); > + /* idx = idx * 2. */ > + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); > + /* idx = 22 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (22), tmp)); > + } > + else > + { > + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); > + emit_insn (gen_subsi3 (tmp, GEN_INT (22), idx)); > + } > + } > + else if (GET_MODE_SIZE (inner_mode) == 1) > + if (!BYTES_BIG_ENDIAN) > + emit_insn (gen_addsi3 (tmp, idx, GEN_INT (8))); > + else > + emit_insn (gen_subsi3 (tmp, GEN_INT (23), idx)); > + else > + gcc_unreachable (); > + > + /* lxv vs32, mask. > + DImode: 0xffffffffffffffff0000000000000000 > + SImode: 0x00000000ffffffff0000000000000000 > + HImode: 0x000000000000ffff0000000000000000. > + QImode: 0x00000000000000ff0000000000000000. 
*/ > + rtx mask = gen_reg_rtx (V16QImode); > + rtx mask_v2di = gen_reg_rtx (V2DImode); > + rtvec v = rtvec_alloc (2); > + if (!BYTES_BIG_ENDIAN) > + { > + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0); > + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask); > + } > + else > + { > + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask); > + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0); > + } > + emit_insn ( > + gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v))); > + rtx sub_mask = simplify_gen_subreg (V16QImode, mask_v2di, V2DImode, 0); > + emit_insn (gen_rtx_SET (mask, sub_mask)); > + > + /* mtvsrd[wz] f0,val. */ > + rtx val_v16qi = gen_reg_rtx (V16QImode); > + switch (inner_mode) > + { > + default: > + gcc_unreachable (); > + break; > + case E_QImode: > + emit_insn (gen_p8_mtvsrwz_v16qiqi2 (val_v16qi, val)); > + break; > + case E_HImode: > + emit_insn (gen_p8_mtvsrwz_v16qihi2 (val_v16qi, val)); > + break; > + case E_SImode: > + emit_insn (gen_p8_mtvsrwz_v16qisi2 (val_v16qi, val)); > + break; > + case E_SFmode: > + emit_insn (gen_p8_mtvsrwz_v16qisf2 (val_v16qi, val)); > + break; > + case E_DImode: > + emit_insn (gen_p8_mtvsrd_v16qidi2 (val_v16qi, val)); > + break; > + case E_DFmode: > + emit_insn (gen_p8_mtvsrd_v16qidf2 (val_v16qi, val)); > + break; > + } > + > + /* lvsl v1,0,idx. */ > + rtx pcv = gen_reg_rtx (V16QImode); > + emit_insn (gen_altivec_lvsl_reg_si2 (pcv, tmp)); > + > + /* xxperm vs0,vs0,vs33. */ > + /* xxperm vs32,vs32,vs33. */ > + rtx val_perm = gen_reg_rtx (V16QImode); > + rtx mask_perm = gen_reg_rtx (V16QImode); > + emit_insn ( > + gen_altivec_vperm_v8hiv16qi (val_perm, val_v16qi, val_v16qi, pcv)); > + emit_insn (gen_altivec_vperm_v8hiv16qi (mask_perm, mask, mask, pcv)); > + > + rtx sub_target = simplify_gen_subreg (V16QImode, vec, mode, 0); > + emit_insn (gen_rtx_SET (target, sub_target)); > + > + /* xxsel vs34,vs34,vs0,vs32. 
*/ > + emit_insn (gen_vector_select_v16qi (target, target, val_perm, mask_perm)); > + } > +} > + > /* Extract field ELT from VEC into TARGET. */ > > void > diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md > index 43b620ae1c0..b02fda836d4 100644 > --- a/gcc/config/rs6000/rs6000.md > +++ b/gcc/config/rs6000/rs6000.md > @@ -8713,6 +8713,25 @@ > "mtvsrwz %x0,%1" > [(set_attr "type" "mftgpr")]) > > +(define_mode_iterator FQHS [SF QI HI SI]) > +(define_mode_iterator FD [DF DI]) > + > +(define_insn "p8_mtvsrwz_v16qi<mode>2" > + [(set (match_operand:V16QI 0 "register_operand" "=wa") > + (unspec:V16QI [(match_operand:FQHS 1 "register_operand" "r")] > + UNSPEC_P8V_MTVSRWZ))] > + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" > + "mtvsrwz %x0,%1" > + [(set_attr "type" "mftgpr")]) > + > +(define_insn "p8_mtvsrd_v16qi<mode>2" > + [(set (match_operand:V16QI 0 "register_operand" "=wa") > + (unspec:V16QI [(match_operand:FD 1 "register_operand" "r")] > + UNSPEC_P8V_MTVSRD))] > + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" > + "mtvsrd %x0,%1" > + [(set_attr "type" "mftgpr")]) > + > (define_insn_and_split "reload_fpr_from_gpr<mode>" > [(set (match_operand:FMOVE64X 0 "register_operand" "=d") > (unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")] > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md > index dd750210758..7e82690d12d 100644 > --- a/gcc/config/rs6000/vsx.md > +++ b/gcc/config/rs6000/vsx.md > @@ -5349,7 +5349,7 @@ > rtx rtx_vtmp = gen_reg_rtx (V16QImode); > rtx tmp = gen_reg_rtx (DImode); > > - emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2])); > + emit_insn (gen_altivec_lvsl_reg_di2 (shift_mask, operands[2])); > emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56))); > emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp)); > emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp, > diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.c b/gcc/testsuite/gcc.target/powerpc/pr79251.c > new file mode 100644 > 
index 00000000000..877659a0146 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target powerpc_p9vector_ok } */ > +/* { dg-require-effective-target lp64 } */ > +/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */ > + > +#include <stddef.h> > +#include <altivec.h> > + > +#define TYPE int > + > +__attribute__ ((noinline)) > +vector TYPE test (vector TYPE v, TYPE i, size_t n) > +{ > + vector TYPE v1 = v; > + v1 = vec_insert (i, v, n); > + > + return v1; > +} > + > +/* { dg-final { scan-assembler-not {\mstxw\M} } } */ > +/* { dg-final { scan-assembler-times {\mlvsl\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mxxperm\M} 2 } } */ > +/* { dg-final { scan-assembler-times {\mxxsel\M} 1 } } */ > -- > 2.27.0.90.geebb51ba8c >
On Mon, 2020-08-31 at 04:06 -0500, Xiong Hu Luo via Gcc-patches wrote: > vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value > to be insert, arg2 is the place to insert arg1 to arg0. This patch adds > __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to > not expand too early in gimple stage if arg2 is variable, to avoid generate > store hit load instructions. > > For Power9 V4SI: > addi 9,1,-16 > rldic 6,6,2,60 > stxv 34,-16(1) > stwx 5,9,6 > lxv 34,-16(1) > => > addis 9,2,.LC0@toc@ha > addi 9,9,.LC0@toc@l > mtvsrwz 33,5 > lxv 32,0(9) > sradi 9,6,2 > addze 9,9 > sldi 9,9,2 > subf 9,9,6 > subfic 9,9,3 > sldi 9,9,2 > subfic 9,9,20 > lvsl 13,0,9 > xxperm 33,33,45 > xxperm 32,32,45 > xxsel 34,34,33,32 > > Though instructions increase from 5 to 15, the performance is improved > 60% in typical cases. Ok. :-) (bunch of nits below, no issues with the gist of the patch). > gcc/ChangeLog: > > * config/rs6000/altivec.md (altivec_lvsl_reg_<mode>2): Extend to > SDI mode. (altivec_lvsl_reg): Rename to (altivec_lvsl_reg_<mode>2) and extend to SDI mode. > * config/rs6000/rs6000-builtin.def (BU_VSX_X): Add support > macros for vec_insert built-in functions. Should that list the VEC_INSERT_V16QI, VEC_INSERT_V8HI, ... values instead of the BU_VSX_X ? (need second opinion.. ) > * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): > Generate built-in calls for vec_insert. > * config/rs6000/rs6000-call.c (altivec_expand_vec_insert_builtin): > New function. > (altivec_expand_builtin): Add case entry for > VSX_BUILTIN_VEC_INSERT_V16QI, VSX_BUILTIN_VEC_INSERT_V8HI, > VSX_BUILTIN_VEC_INSERT_V4SF, VSX_BUILTIN_VEC_INSERT_V4SI, > VSX_BUILTIN_VEC_INSERT_V2DF, VSX_BUILTIN_VEC_INSERT_V2DI. plural entries :-) > (altivec_init_builtins): Add defines for __builtin_vec_insert_v16qi, __builtin_vec_insert_v8hi, ... > * config/rs6000/rs6000-protos.h (rs6000_expand_vector_insert): > New declear. 
declare > * config/rs6000/rs6000.c (rs6000_expand_vector_insert): > New function. > * config/rs6000/rs6000.md (FQHS): New mode iterator. > (FD): New mode iterator. > p8_mtvsrwz_v16qi<mode>2: New define_insn. > p8_mtvsrd_v16qi<mode>2: New define_insn. > * config/rs6000/vsx.md: Call gen_altivec_lvsl_reg_di2. ok > > gcc/testsuite/ChangeLog: > > * gcc.target/powerpc/pr79251.c: New test. > --- > gcc/config/rs6000/altivec.md | 4 +- > gcc/config/rs6000/rs6000-builtin.def | 6 + > gcc/config/rs6000/rs6000-c.c | 61 +++++++++ > gcc/config/rs6000/rs6000-call.c | 74 +++++++++++ > gcc/config/rs6000/rs6000-protos.h | 1 + > gcc/config/rs6000/rs6000.c | 146 +++++++++++++++++++++ > gcc/config/rs6000/rs6000.md | 19 +++ > gcc/config/rs6000/vsx.md | 2 +- > gcc/testsuite/gcc.target/powerpc/pr79251.c | 23 ++++ > 9 files changed, 333 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.c > > diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md > index 0a2e634d6b0..66b636059a6 100644 > --- a/gcc/config/rs6000/altivec.md > +++ b/gcc/config/rs6000/altivec.md > @@ -2772,10 +2772,10 @@ > DONE; > }) > > -(define_insn "altivec_lvsl_reg" > +(define_insn "altivec_lvsl_reg_<mode>2" > [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") > (unspec:V16QI > - [(match_operand:DI 1 "gpc_reg_operand" "b")] > + [(match_operand:SDI 1 "gpc_reg_operand" "b")] > UNSPEC_LVSL_REG))] > "TARGET_ALTIVEC" > "lvsl %0,0,%1" > diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def > index f9f0fece549..d095b365c14 100644 > --- a/gcc/config/rs6000/rs6000-builtin.def > +++ b/gcc/config/rs6000/rs6000-builtin.def > @@ -2047,6 +2047,12 @@ BU_VSX_X (VEC_INIT_V2DI, "vec_init_v2di", CONST) > BU_VSX_X (VEC_SET_V1TI, "vec_set_v1ti", CONST) > BU_VSX_X (VEC_SET_V2DF, "vec_set_v2df", CONST) > BU_VSX_X (VEC_SET_V2DI, "vec_set_v2di", CONST) > +BU_VSX_X (VEC_INSERT_V16QI, "vec_insert_v16qi", CONST) > +BU_VSX_X (VEC_INSERT_V8HI, 
"vec_insert_v8hi", CONST) > +BU_VSX_X (VEC_INSERT_V4SI, "vec_insert_v4si", CONST) > +BU_VSX_X (VEC_INSERT_V4SF, "vec_insert_v4sf", CONST) > +BU_VSX_X (VEC_INSERT_V2DI, "vec_insert_v2di", CONST) > +BU_VSX_X (VEC_INSERT_V2DF, "vec_insert_v2df", CONST) > BU_VSX_X (VEC_EXT_V1TI, "vec_ext_v1ti", CONST) > BU_VSX_X (VEC_EXT_V2DF, "vec_ext_v2df", CONST) > BU_VSX_X (VEC_EXT_V2DI, "vec_ext_v2di", CONST) > diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c > index 2fad3d94706..03b00738a5e 100644 > --- a/gcc/config/rs6000/rs6000-c.c > +++ b/gcc/config/rs6000/rs6000-c.c > @@ -1563,6 +1563,67 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, > return build_call_expr (call, 3, arg1, arg0, arg2); > } > > + else if (VECTOR_MEM_VSX_P (mode)) > + { > + tree call = NULL_TREE; > + > + arg2 = fold_for_warn (arg2); > + > + /* If the second argument is variable, we can optimize it if we are > + generating 64-bit code on a machine with direct move. */ > + if (TREE_CODE (arg2) != INTEGER_CST && TARGET_DIRECT_MOVE_64BIT) > + { > + switch (mode) > + { > + default: > + break; > + > + case E_V2DImode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V2DI]; > + break; > + > + case E_V2DFmode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V2DF]; > + break; > + > + case E_V4SFmode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V4SF]; > + break; > + > + case E_V4SImode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V4SI]; > + break; > + > + case E_V8HImode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V8HI]; > + break; > + > + case E_V16QImode: > + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V16QI]; > + break; > + } > + } > + > + if (call) > + { > + if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) > + arg2 = build_int_cst (TREE_TYPE (arg2), 0); > + else > + arg2 = build_binary_op ( > + loc, BIT_AND_EXPR, arg2, > + build_int_cst (TREE_TYPE (arg2), > + TYPE_VECTOR_SUBPARTS (arg1_type) - 1), > + 0); Indentation nit 
there, the "loc, BIT_AND_EXPR, ..." line should go on the previous line. If that greatly messes up the indentation of the rest of the statement, use your judgement. > + tree result > + = build_call_expr (call, 3, arg1, > + convert (TREE_TYPE (arg1_type), arg0), > + convert (integer_type_node, arg2)); > + /* Coerce the result to vector element type. May be no-op. */ > + result = fold_convert (TREE_TYPE (arg1), result); > + return result; > + } > + } > + > /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */ > arg1_inner_type = TREE_TYPE (arg1_type); > if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) > diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c > index e39cfcf672b..339e9ae87e3 100644 > --- a/gcc/config/rs6000/rs6000-call.c > +++ b/gcc/config/rs6000/rs6000-call.c > @@ -10660,6 +10660,40 @@ altivec_expand_vec_set_builtin (tree exp) > return op0; > } > > +/* Expand vec_insert builtin. */ > +static rtx > +altivec_expand_vec_insert_builtin (tree exp, rtx target) > +{ > + machine_mode tmode, mode1, mode2; > + tree arg0, arg1, arg2; > + rtx op0 = NULL_RTX, op1, op2; > + > + arg0 = CALL_EXPR_ARG (exp, 0); > + arg1 = CALL_EXPR_ARG (exp, 1); > + arg2 = CALL_EXPR_ARG (exp, 2); > + > + tmode = TYPE_MODE (TREE_TYPE (arg0)); > + mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); > + mode2 = TYPE_MODE ((TREE_TYPE (arg2))); > + gcc_assert (VECTOR_MODE_P (tmode)); > + > + op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); > + op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); > + op2 = expand_expr (arg2, NULL_RTX, mode2, EXPAND_NORMAL); > + > + if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) > + op1 = convert_modes (mode1, GET_MODE (op1), op1, true); > + > + op0 = force_reg (tmode, op0); > + op1 = force_reg (mode1, op1); > + op2 = force_reg (mode2, op2); > + > + target = gen_reg_rtx (V16QImode); Should that be tmode, or is V16QImode always correct here? 
> + rs6000_expand_vector_insert (target, op0, op1, op2); > + > + return target; > +} > + > /* Expand vec_ext builtin. */ > static rtx > altivec_expand_vec_ext_builtin (tree exp, rtx target) > @@ -10922,6 +10956,14 @@ altivec_expand_builtin (tree exp, rtx target, bool *expandedp) > case VSX_BUILTIN_VEC_SET_V1TI: > return altivec_expand_vec_set_builtin (exp); > > + case VSX_BUILTIN_VEC_INSERT_V16QI: > + case VSX_BUILTIN_VEC_INSERT_V8HI: > + case VSX_BUILTIN_VEC_INSERT_V4SF: > + case VSX_BUILTIN_VEC_INSERT_V4SI: > + case VSX_BUILTIN_VEC_INSERT_V2DF: > + case VSX_BUILTIN_VEC_INSERT_V2DI: > + return altivec_expand_vec_insert_builtin (exp, target); > + > case ALTIVEC_BUILTIN_VEC_EXT_V4SI: > case ALTIVEC_BUILTIN_VEC_EXT_V8HI: > case ALTIVEC_BUILTIN_VEC_EXT_V16QI: > @@ -13681,6 +13723,38 @@ altivec_init_builtins (void) > integer_type_node, NULL_TREE); > def_builtin ("__builtin_vec_set_v2di", ftype, VSX_BUILTIN_VEC_SET_V2DI); > > + /* Access to the vec_insert patterns. */ > + ftype = build_function_type_list (V16QI_type_node, V16QI_type_node, > + intQI_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v16qi", ftype, > + VSX_BUILTIN_VEC_INSERT_V16QI); > + > + ftype = build_function_type_list (V8HI_type_node, V8HI_type_node, > + intHI_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v8hi", ftype, VSX_BUILTIN_VEC_INSERT_V8HI); > + > + ftype = build_function_type_list (V4SI_type_node, V4SI_type_node, > + integer_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v4si", ftype, VSX_BUILTIN_VEC_INSERT_V4SI); > + > + ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, > + float_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v4sf", ftype, VSX_BUILTIN_VEC_INSERT_V4SF); > + > + ftype = build_function_type_list (V2DI_type_node, V2DI_type_node, > + intDI_type_node, > + integer_type_node, NULL_TREE); > + def_builtin 
("__builtin_vec_insert_v2di", ftype, VSX_BUILTIN_VEC_INSERT_V2DI); > + > + ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, > + double_type_node, > + integer_type_node, NULL_TREE); > + def_builtin ("__builtin_vec_insert_v2df", ftype, VSX_BUILTIN_VEC_INSERT_V2DF); > + > /* Access to the vec_extract patterns. */ > ftype = build_function_type_list (intSI_type_node, V4SI_type_node, > integer_type_node, NULL_TREE); > diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h > index 28e859f4381..78b5b31d79f 100644 > --- a/gcc/config/rs6000/rs6000-protos.h > +++ b/gcc/config/rs6000/rs6000-protos.h > @@ -58,6 +58,7 @@ extern bool rs6000_split_128bit_ok_p (rtx []); > extern void rs6000_expand_float128_convert (rtx, rtx, bool); > extern void rs6000_expand_vector_init (rtx, rtx); > extern void rs6000_expand_vector_set (rtx, rtx, int); > +extern void rs6000_expand_vector_insert (rtx, rtx, rtx, rtx); > extern void rs6000_expand_vector_extract (rtx, rtx, rtx); > extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); > extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c > index fe93cf6ff2b..afa845f3dff 100644 > --- a/gcc/config/rs6000/rs6000.c > +++ b/gcc/config/rs6000/rs6000.c > @@ -6788,6 +6788,152 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) > emit_insn (gen_rtx_SET (target, x)); > } > > +/* Insert value from VEC into idx of TARGET. */ > + > +void > +rs6000_expand_vector_insert (rtx target, rtx vec, rtx val, rtx idx) > +{ > + machine_mode mode = GET_MODE (vec); > + > + if (VECTOR_MEM_VSX_P (mode) && CONST_INT_P (idx)) > + gcc_unreachable (); only 2 spaces indent. (My mailer has suddenly gotten confused with tabs and spaces,.. here and below may need spaces replaced with tabs, or may just be a problem on my end.. 
) > + else if (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx) > + && TARGET_DIRECT_MOVE_64BIT) > + { > + gcc_assert (GET_MODE (idx) == E_SImode); > + machine_mode inner_mode = GET_MODE (val); > + HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); > + > + rtx tmp = gen_reg_rtx (GET_MODE (idx)); > + if (GET_MODE_SIZE (inner_mode) == 8) > + { > + if (!BYTES_BIG_ENDIAN) > + { > + /* idx = 1 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (1), idx)); > + /* idx = idx * 8. */ > + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (3))); > + /* idx = 16 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); > + } > + else > + { > + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (3))); > + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); > + } > + } > + else if (GET_MODE_SIZE (inner_mode) == 4) > + { > + if (!BYTES_BIG_ENDIAN) > + { > + /* idx = 3 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (3), idx)); > + /* idx = idx * 4. */ > + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (2))); > + /* idx = 20 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); > + } > + else > + { > + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (2))); > + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); > + } > + } > + else if (GET_MODE_SIZE (inner_mode) == 2) > + { > + if (!BYTES_BIG_ENDIAN) > + { > + /* idx = 7 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (7), idx)); > + /* idx = idx * 2. */ > + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); > + /* idx = 22 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (22), tmp)); > + } > + else > + { > + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); > + emit_insn (gen_subsi3 (tmp, GEN_INT (22), idx)); > + } > + } > + else if (GET_MODE_SIZE (inner_mode) == 1) > + if (!BYTES_BIG_ENDIAN) > + emit_insn (gen_addsi3 (tmp, idx, GEN_INT (8))); > + else > + emit_insn (gen_subsi3 (tmp, GEN_INT (23), idx)); > + else > + gcc_unreachable (); > + > + /* lxv vs32, mask. 
> + DImode: 0xffffffffffffffff0000000000000000 > + SImode: 0x00000000ffffffff0000000000000000 > + HImode: 0x000000000000ffff0000000000000000. > + QImode: 0x00000000000000ff0000000000000000. */ good. :-) > + rtx mask = gen_reg_rtx (V16QImode); > + rtx mask_v2di = gen_reg_rtx (V2DImode); > + rtvec v = rtvec_alloc (2); > + if (!BYTES_BIG_ENDIAN) > + { > + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0); > + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask); > + } > + else > + { > + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask); > + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0); > + } > + emit_insn ( > + gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v))); > + rtx sub_mask = simplify_gen_subreg (V16QImode, mask_v2di, V2DImode, 0); > + emit_insn (gen_rtx_SET (mask, sub_mask)); > + > + /* mtvsrd[wz] f0,val. */ > + rtx val_v16qi = gen_reg_rtx (V16QImode); > + switch (inner_mode) > + { > + default: > + gcc_unreachable (); > + break; > + case E_QImode: > + emit_insn (gen_p8_mtvsrwz_v16qiqi2 (val_v16qi, val)); > + break; > + case E_HImode: > + emit_insn (gen_p8_mtvsrwz_v16qihi2 (val_v16qi, val)); > + break; > + case E_SImode: > + emit_insn (gen_p8_mtvsrwz_v16qisi2 (val_v16qi, val)); > + break; > + case E_SFmode: > + emit_insn (gen_p8_mtvsrwz_v16qisf2 (val_v16qi, val)); > + break; > + case E_DImode: > + emit_insn (gen_p8_mtvsrd_v16qidi2 (val_v16qi, val)); > + break; > + case E_DFmode: > + emit_insn (gen_p8_mtvsrd_v16qidf2 (val_v16qi, val)); > + break; > + } > + > + /* lvsl v1,0,idx. */ > + rtx pcv = gen_reg_rtx (V16QImode); > + emit_insn (gen_altivec_lvsl_reg_si2 (pcv, tmp)); > + > + /* xxperm vs0,vs0,vs33. */ > + /* xxperm vs32,vs32,vs33. 
*/ > + rtx val_perm = gen_reg_rtx (V16QImode); > + rtx mask_perm = gen_reg_rtx (V16QImode); > + emit_insn ( > + gen_altivec_vperm_v8hiv16qi (val_perm, val_v16qi, val_v16qi, pcv)); > + emit_insn (gen_altivec_vperm_v8hiv16qi (mask_perm, mask, mask, pcv)); > + > + rtx sub_target = simplify_gen_subreg (V16QImode, vec, mode, 0); > + emit_insn (gen_rtx_SET (target, sub_target)); > + > + /* xxsel vs34,vs34,vs0,vs32. */ > + emit_insn (gen_vector_select_v16qi (target, target, val_perm, mask_perm)); > + } > +} > + > /* Extract field ELT from VEC into TARGET. */ > > void > diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md > index 43b620ae1c0..b02fda836d4 100644 > --- a/gcc/config/rs6000/rs6000.md > +++ b/gcc/config/rs6000/rs6000.md > @@ -8713,6 +8713,25 @@ > "mtvsrwz %x0,%1" > [(set_attr "type" "mftgpr")]) > > +(define_mode_iterator FQHS [SF QI HI SI]) > +(define_mode_iterator FD [DF DI]) > + > +(define_insn "p8_mtvsrwz_v16qi<mode>2" > + [(set (match_operand:V16QI 0 "register_operand" "=wa") > + (unspec:V16QI [(match_operand:FQHS 1 "register_operand" "r")] > + UNSPEC_P8V_MTVSRWZ))] > + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" > + "mtvsrwz %x0,%1" > + [(set_attr "type" "mftgpr")]) > + > +(define_insn "p8_mtvsrd_v16qi<mode>2" > + [(set (match_operand:V16QI 0 "register_operand" "=wa") > + (unspec:V16QI [(match_operand:FD 1 "register_operand" "r")] > + UNSPEC_P8V_MTVSRD))] > + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" > + "mtvsrd %x0,%1" > + [(set_attr "type" "mftgpr")]) > + > (define_insn_and_split "reload_fpr_from_gpr<mode>" > [(set (match_operand:FMOVE64X 0 "register_operand" "=d") > (unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")] > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md > index dd750210758..7e82690d12d 100644 > --- a/gcc/config/rs6000/vsx.md > +++ b/gcc/config/rs6000/vsx.md > @@ -5349,7 +5349,7 @@ > rtx rtx_vtmp = gen_reg_rtx (V16QImode); > rtx tmp = gen_reg_rtx (DImode); > > - emit_insn (gen_altivec_lvsl_reg 
(shift_mask, operands[2])); > + emit_insn (gen_altivec_lvsl_reg_di2 (shift_mask, operands[2])); > emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56))); > emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp)); > emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp, > diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.c b/gcc/testsuite/gcc.target/powerpc/pr79251.c > new file mode 100644 > index 00000000000..877659a0146 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target powerpc_p9vector_ok } */ > +/* { dg-require-effective-target lp64 } */ > +/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */ > + > +#include <stddef.h> > +#include <altivec.h> > + > +#define TYPE int Is testing against int types sufficient coverage? (are there other existing tests?) thanks, -Will > + > +__attribute__ ((noinline)) > +vector TYPE test (vector TYPE v, TYPE i, size_t n) > +{ > + vector TYPE v1 = v; > + v1 = vec_insert (i, v, n); > + > + return v1; > +} > + > +/* { dg-final { scan-assembler-not {\mstxw\M} } } */ > +/* { dg-final { scan-assembler-times {\mlvsl\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mxxperm\M} 2 } } */ > +/* { dg-final { scan-assembler-times {\mxxsel\M} 1 } } */
Hi! On Mon, Aug 31, 2020 at 04:06:47AM -0500, Xiong Hu Luo wrote: > vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value > to be insert, arg2 is the place to insert arg1 to arg0. This patch adds > __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to > not expand too early in gimple stage if arg2 is variable, to avoid generate > store hit load instructions. > > For Power9 V4SI: > addi 9,1,-16 > rldic 6,6,2,60 > stxv 34,-16(1) > stwx 5,9,6 > lxv 34,-16(1) > => > addis 9,2,.LC0@toc@ha > addi 9,9,.LC0@toc@l > mtvsrwz 33,5 > lxv 32,0(9) > sradi 9,6,2 > addze 9,9 > sldi 9,9,2 > subf 9,9,6 > subfic 9,9,3 > sldi 9,9,2 > subfic 9,9,20 > lvsl 13,0,9 > xxperm 33,33,45 > xxperm 32,32,45 > xxsel 34,34,33,32 For v a V4SI, x a SI, j some int, what do we generate for v[j&3] = x; ? This should be exactly the same as we generate for vec_insert(x, v, j); (the builtin does a modulo 4 automatically). Segher
Hi, On 2020/9/1 01:04, Segher Boessenkool wrote: > Hi! > > On Mon, Aug 31, 2020 at 04:06:47AM -0500, Xiong Hu Luo wrote: >> vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value >> to be insert, arg2 is the place to insert arg1 to arg0. This patch adds >> __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to >> not expand too early in gimple stage if arg2 is variable, to avoid generate >> store hit load instructions. >> >> For Power9 V4SI: >> addi 9,1,-16 >> rldic 6,6,2,60 >> stxv 34,-16(1) >> stwx 5,9,6 >> lxv 34,-16(1) >> => >> addis 9,2,.LC0@toc@ha >> addi 9,9,.LC0@toc@l >> mtvsrwz 33,5 >> lxv 32,0(9) >> sradi 9,6,2 >> addze 9,9 >> sldi 9,9,2 >> subf 9,9,6 >> subfic 9,9,3 >> sldi 9,9,2 >> subfic 9,9,20 >> lvsl 13,0,9 >> xxperm 33,33,45 >> xxperm 32,32,45 >> xxsel 34,34,33,32 > > For v a V4SI, x a SI, j some int, what do we generate for > v[j&3] = x; > ? > This should be exactly the same as we generate for > vec_insert(x, v, j); > (the builtin does a modulo 4 automatically). No, even with my patch "stxv 34,-16(1);stwx 5,9,6;lxv 34,-16(1)" generated currently. Is it feasible and acceptable to expand some kind of pattern in expander directly without builtin transition? I borrowed some of implementation from vec_extract. For vec_extract, the issue also exists: source: gimple: expand: asm: 1) i = vec_extract (v, n); => i = __builtin_vec_ext_v4si (v, n); => {r120:SI=unspec[r118:V4SI,r119:DI] 134;...} => slwi 9,6,2 vextuwrx 3,9,2 2) i = vec_extract (v, 3); => i = __builtin_vec_ext_v4si (v, 3); => {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12 vextuwrx 3,9,2 3) i = v[n%4]; => _1 = n & 3; i = VIEW_CONVERT_EXPR<int[4]>(v)[_1]; => ... 
=> stxv 34,-16(1);addi 9,1,-16; rldic 5,5,2,60; lwax 3,9,5 4) i = v[3]; => i = BIT_FIELD_REF <v, 32, 96>; => {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12; vextuwrx 3,9,2 Case 3) also couldn't handle the similar usage, and case 4) doesn't generate builtin as expected, it just expand to vec_select by coincidence. So does this mean both vec_insert and vec_extract and all other similar vector builtins should use IFN as suggested by Richard Biener, to match the pattern in gimple and expand both constant and variable index in expander? Will this also be beneficial for other targets except power? Or we should do that gradually after this patch approved as it seems another independent issue? Thanks:) Xionghu
Hi, On 2020/9/1 00:47, will schmidt wrote: >> + tmode = TYPE_MODE (TREE_TYPE (arg0)); >> + mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); >> + mode2 = TYPE_MODE ((TREE_TYPE (arg2))); >> + gcc_assert (VECTOR_MODE_P (tmode)); >> + >> + op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); >> + op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); >> + op2 = expand_expr (arg2, NULL_RTX, mode2, EXPAND_NORMAL); >> + >> + if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) >> + op1 = convert_modes (mode1, GET_MODE (op1), op1, true); >> + >> + op0 = force_reg (tmode, op0); >> + op1 = force_reg (mode1, op1); >> + op2 = force_reg (mode2, op2); >> + >> + target = gen_reg_rtx (V16QImode); > Should that be tmode, or is V16QImode always correct here? Thanks for the review. Yes, the target should be TImode here, but the following call to rs6000_expand_vector_insert needs a lot of emit_insns in it; using V16QI lets it reuse most of the patterns in the existing md files. After returning from this function, there will be a conversion from V16QImode to TImode to make the types the same: expr.c: convert_move (target, temp, TYPE_UNSIGNED (TREE_TYPE (exp))); and I've tested this with V2DI, V2DF V4SI, V4SF, V8HI, V16QI on Power9-LE and Power8-BE, and the correctness of the results is ensured. Other comments are addressed. Will update it later if there are no disagreements about the implementation. Thanks, Xionghu > >> + rs6000_expand_vector_insert (target, op0, op1, op2); >> + >> + return target; >> +}
On Tue, Sep 1, 2020 at 10:11 AM luoxhu via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Hi, > > On 2020/9/1 01:04, Segher Boessenkool wrote: > > Hi! > > > > On Mon, Aug 31, 2020 at 04:06:47AM -0500, Xiong Hu Luo wrote: > >> vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value > >> to be insert, arg2 is the place to insert arg1 to arg0. This patch adds > >> __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to > >> not expand too early in gimple stage if arg2 is variable, to avoid generate > >> store hit load instructions. > >> > >> For Power9 V4SI: > >> addi 9,1,-16 > >> rldic 6,6,2,60 > >> stxv 34,-16(1) > >> stwx 5,9,6 > >> lxv 34,-16(1) > >> => > >> addis 9,2,.LC0@toc@ha > >> addi 9,9,.LC0@toc@l > >> mtvsrwz 33,5 > >> lxv 32,0(9) > >> sradi 9,6,2 > >> addze 9,9 > >> sldi 9,9,2 > >> subf 9,9,6 > >> subfic 9,9,3 > >> sldi 9,9,2 > >> subfic 9,9,20 > >> lvsl 13,0,9 > >> xxperm 33,33,45 > >> xxperm 32,32,45 > >> xxsel 34,34,33,32 > > > > For v a V4SI, x a SI, j some int, what do we generate for > > v[j&3] = x; > > ? > > This should be exactly the same as we generate for > > vec_insert(x, v, j); > > (the builtin does a modulo 4 automatically). > > No, even with my patch "stxv 34,-16(1);stwx 5,9,6;lxv 34,-16(1)" generated currently. > Is it feasible and acceptable to expand some kind of pattern in expander directly without > builtin transition? > > I borrowed some of implementation from vec_extract. For vec_extract, the issue also exists: > > source: gimple: expand: asm: > 1) i = vec_extract (v, n); => i = __builtin_vec_ext_v4si (v, n); => {r120:SI=unspec[r118:V4SI,r119:DI] 134;...} => slwi 9,6,2 vextuwrx 3,9,2 > 2) i = vec_extract (v, 3); => i = __builtin_vec_ext_v4si (v, 3); => {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12 vextuwrx 3,9,2 > 3) i = v[n%4]; => _1 = n & 3; i = VIEW_CONVERT_EXPR<int[4]>(v)[_1]; => ... 
=> stxv 34,-16(1);addi 9,1,-16; rldic 5,5,2,60; lwax 3,9,5 > 4) i = v[3]; => i = BIT_FIELD_REF <v, 32, 96>; => {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12; vextuwrx 3,9,2 Why are 1) and 2) handled differently than 3)/4)? > Case 3) also couldn't handle the similar usage, and case 4) doesn't generate builtin as expected, > it just expand to vec_select by coincidence. So does this mean both vec_insert and vec_extract > and all other similar vector builtins should use IFN as suggested by Richard Biener, to match the > pattern in gimple and expand both constant and variable index in expander? Will this also be > beneficial for other targets except power? Or we should do that gradually after this patch > approved as it seems another independent issue? Thanks:) If the code generated for 3)/4) isn't optimal you have to figure why by tracing the RTL expansion code and looking for missing optabs. Consider the amount of backend code you need to write if ever using those in constexpr context ... Richard. > > > Xionghu
On Tue, Sep 01, 2020 at 04:09:53PM +0800, luoxhu wrote: > On 2020/9/1 01:04, Segher Boessenkool wrote: > > For v a V4SI, x a SI, j some int, what do we generate for > > v[j&3] = x; > > ? > > This should be exactly the same as we generate for > > vec_insert(x, v, j); > > (the builtin does a modulo 4 automatically). > > No, even with my patch "stxv 34,-16(1);stwx 5,9,6;lxv 34,-16(1)" generated currently. I think you should solve the problem in the generic case, then, since it is (presumably) much more frequent. > Is it feasible and acceptable to expand some kind of pattern in expander directly without > builtin transition? I don't understand what you mean? > I borrowed some of implementation from vec_extract. For vec_extract, the issue also exists: > > source: gimple: expand: asm: > 1) i = vec_extract (v, n); => i = __builtin_vec_ext_v4si (v, n); => {r120:SI=unspec[r118:V4SI,r119:DI] 134;...} => slwi 9,6,2 vextuwrx 3,9,2 > 2) i = vec_extract (v, 3); => i = __builtin_vec_ext_v4si (v, 3); => {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12 vextuwrx 3,9,2 > 3) i = v[n%4]; => _1 = n & 3; i = VIEW_CONVERT_EXPR<int[4]>(v)[_1]; => ... => stxv 34,-16(1);addi 9,1,-16; rldic 5,5,2,60; lwax 3,9,5 > 4) i = v[3]; => i = BIT_FIELD_REF <v, 32, 96>; => {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12; vextuwrx 3,9,2 > > Case 3) also couldn't handle the similar usage, and case 4) doesn't generate builtin as expected, > it just expand to vec_select by coincidence. So does this mean both vec_insert and vec_extract > and all other similar vector builtins should use IFN as suggested by Richard Biener, to match the > pattern in gimple and expand both constant and variable index in expander? Will this also be > beneficial for other targets except power? Or we should do that gradually after this patch > approved as it seems another independent issue? Thanks:) I don't think we should do that at all? 
IFNs just complicate everything, there is no added value here: this is just data movement! We need to avoid the data aliasing some generic way. Segher
Hi, On 2020/9/1 21:07, Richard Biener wrote: > On Tue, Sep 1, 2020 at 10:11 AM luoxhu via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: >> >> Hi, >> >> On 2020/9/1 01:04, Segher Boessenkool wrote: >>> Hi! >>> >>> On Mon, Aug 31, 2020 at 04:06:47AM -0500, Xiong Hu Luo wrote: >>>> vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value >>>> to be insert, arg2 is the place to insert arg1 to arg0. This patch adds >>>> __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to >>>> not expand too early in gimple stage if arg2 is variable, to avoid generate >>>> store hit load instructions. >>>> >>>> For Power9 V4SI: >>>> addi 9,1,-16 >>>> rldic 6,6,2,60 >>>> stxv 34,-16(1) >>>> stwx 5,9,6 >>>> lxv 34,-16(1) >>>> => >>>> addis 9,2,.LC0@toc@ha >>>> addi 9,9,.LC0@toc@l >>>> mtvsrwz 33,5 >>>> lxv 32,0(9) >>>> sradi 9,6,2 >>>> addze 9,9 >>>> sldi 9,9,2 >>>> subf 9,9,6 >>>> subfic 9,9,3 >>>> sldi 9,9,2 >>>> subfic 9,9,20 >>>> lvsl 13,0,9 >>>> xxperm 33,33,45 >>>> xxperm 32,32,45 >>>> xxsel 34,34,33,32 >>> >>> For v a V4SI, x a SI, j some int, what do we generate for >>> v[j&3] = x; >>> ? >>> This should be exactly the same as we generate for >>> vec_insert(x, v, j); >>> (the builtin does a modulo 4 automatically). >> >> No, even with my patch "stxv 34,-16(1);stwx 5,9,6;lxv 34,-16(1)" generated currently. >> Is it feasible and acceptable to expand some kind of pattern in expander directly without >> builtin transition? >> >> I borrowed some of implementation from vec_extract. For vec_extract, the issue also exists: >> >> source: gimple: expand: asm: >> 1) i = vec_extract (v, n); => i = __builtin_vec_ext_v4si (v, n); => {r120:SI=unspec[r118:V4SI,r119:DI] 134;...} => slwi 9,6,2 vextuwrx 3,9,2 >> 2) i = vec_extract (v, 3); => i = __builtin_vec_ext_v4si (v, 3); => {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12 vextuwrx 3,9,2 >> 3) i = v[n%4]; => _1 = n & 3; i = VIEW_CONVERT_EXPR<int[4]>(v)[_1]; => ... 
=> stxv 34,-16(1);addi 9,1,-16; rldic 5,5,2,60; lwax 3,9,5 >> 4) i = v[3]; => i = BIT_FIELD_REF <v, 32, 96>; => {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12; vextuwrx 3,9,2 > > Why are 1) and 2) handled differently than 3)/4)? 1) and 2) are calling builtin function vec_extract, it is defined to __builtin_vec_extract and will be resolved to ALTIVEC_BUILTIN_VEC_EXTRACT by resolve_overloaded_builtin, to generate a call __builtin_vec_ext_v4si to be expanded only in RTL. 3) is access variable v as array type with opcode VIEW_CONVERT_EXPR, I guess we should also generate builtin call instead of calling convert_vector_to_array_for_subscript to generate VIEW_CONVERT_EXPR expression for such kind of usage. 4) is translated to BIT_FIELD_REF with constant bitstart and bitsize, variable v could also be accessed by register instead of stack, so optabs could match the rs6000_expand_vector_insert to generate expected instruction through extract_bit_field. > >> Case 3) also couldn't handle the similar usage, and case 4) doesn't generate builtin as expected, >> it just expand to vec_select by coincidence. So does this mean both vec_insert and vec_extract >> and all other similar vector builtins should use IFN as suggested by Richard Biener, to match the >> pattern in gimple and expand both constant and variable index in expander? Will this also be >> beneficial for other targets except power? Or we should do that gradually after this patch >> approved as it seems another independent issue? Thanks:) > > If the code generated for 3)/4) isn't optimal you have to figure why > by tracing the RTL > expansion code and looking for missing optabs. > > Consider the amount of backend code you need to write if ever using > those in constexpr > context ... 
It seems too complicated to expand the "i = VIEW_CONVERT_EXPR<int[4]>(v)[_1];" or "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i_6(D);" to rs6000_expand_vector_insert/rs6000_expand_vector_extract in RTL, as: 1) Vector v is stored to stack with array type; need extra load and store operation. 2) Requires amount of code to decompose VIEW_CONVERT_EXPR to extract the vector and index then call rs6000_expand_vector_insert/rs6000_expand_vector_extract. which means replace following instructions #9~#12 to new instruction sequences: 1: NOTE_INSN_DELETED 6: NOTE_INSN_BASIC_BLOCK 2 2: r119:V4SI=%2:V4SI 3: r120:DI=%5:DI 4: r121:DI=%6:DI 5: NOTE_INSN_FUNCTION_BEG 8: [r112:DI]=r119:V4SI 9: r122:DI=r121:DI&0x3 10: r123:DI=r122:DI<<0x2 11: r124:DI=r112:DI+r123:DI 12: [r124:DI]=r120:DI#0 13: r118:V4SI=[r112:DI] 17: %2:V4SI=r118:V4SI 18: use %2:V4SI => 1: NOTE_INSN_DELETED 6: NOTE_INSN_BASIC_BLOCK 2 2: r119:V4SI=%2:V4SI 3: r120:DI=%5:DI 4: r121:DI=%6:DI 5: NOTE_INSN_FUNCTION_BEG 8: [r112:DI]=r119:V4SI r130:V4SI=[r112:DI] rs6000_expand_vector_insert (r130, r121:DI&0x3, r120:DI#0) [r112:DI]=r130:V4SI 13: r118:V4SI=[r112:DI] 17: %2:V4SI=r118:V4SI 18: use %2:V4SI so maybe bypass convert_vector_to_array_for_subscript for special circumstance like "i = v[n%4]" or "v[n&3]=i" to generate vec_extract or vec_insert builtin call a relative simpler method? Xionghu
On Wed, Sep 2, 2020 at 11:26 AM luoxhu <luoxhu@linux.ibm.com> wrote: > > Hi, > > On 2020/9/1 21:07, Richard Biener wrote: > > On Tue, Sep 1, 2020 at 10:11 AM luoxhu via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > >> > >> Hi, > >> > >> On 2020/9/1 01:04, Segher Boessenkool wrote: > >>> Hi! > >>> > >>> On Mon, Aug 31, 2020 at 04:06:47AM -0500, Xiong Hu Luo wrote: > >>>> vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value > >>>> to be insert, arg2 is the place to insert arg1 to arg0. This patch adds > >>>> __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to > >>>> not expand too early in gimple stage if arg2 is variable, to avoid generate > >>>> store hit load instructions. > >>>> > >>>> For Power9 V4SI: > >>>> addi 9,1,-16 > >>>> rldic 6,6,2,60 > >>>> stxv 34,-16(1) > >>>> stwx 5,9,6 > >>>> lxv 34,-16(1) > >>>> => > >>>> addis 9,2,.LC0@toc@ha > >>>> addi 9,9,.LC0@toc@l > >>>> mtvsrwz 33,5 > >>>> lxv 32,0(9) > >>>> sradi 9,6,2 > >>>> addze 9,9 > >>>> sldi 9,9,2 > >>>> subf 9,9,6 > >>>> subfic 9,9,3 > >>>> sldi 9,9,2 > >>>> subfic 9,9,20 > >>>> lvsl 13,0,9 > >>>> xxperm 33,33,45 > >>>> xxperm 32,32,45 > >>>> xxsel 34,34,33,32 > >>> > >>> For v a V4SI, x a SI, j some int, what do we generate for > >>> v[j&3] = x; > >>> ? > >>> This should be exactly the same as we generate for > >>> vec_insert(x, v, j); > >>> (the builtin does a modulo 4 automatically). > >> > >> No, even with my patch "stxv 34,-16(1);stwx 5,9,6;lxv 34,-16(1)" generated currently. > >> Is it feasible and acceptable to expand some kind of pattern in expander directly without > >> builtin transition? > >> > >> I borrowed some of implementation from vec_extract. 
For vec_extract, the issue also exists: > >> > >> source: gimple: expand: asm: > >> 1) i = vec_extract (v, n); => i = __builtin_vec_ext_v4si (v, n); => {r120:SI=unspec[r118:V4SI,r119:DI] 134;...} => slwi 9,6,2 vextuwrx 3,9,2 > >> 2) i = vec_extract (v, 3); => i = __builtin_vec_ext_v4si (v, 3); => {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12 vextuwrx 3,9,2 > >> 3) i = v[n%4]; => _1 = n & 3; i = VIEW_CONVERT_EXPR<int[4]>(v)[_1]; => ... => stxv 34,-16(1);addi 9,1,-16; rldic 5,5,2,60; lwax 3,9,5 > >> 4) i = v[3]; => i = BIT_FIELD_REF <v, 32, 96>; => {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12; vextuwrx 3,9,2 > > > > Why are 1) and 2) handled differently than 3)/4)? > > 1) and 2) are calling builtin function vec_extract, it is defined to > __builtin_vec_extract and will be resolved to ALTIVEC_BUILTIN_VEC_EXTRACT > by resolve_overloaded_builtin, to generate a call __builtin_vec_ext_v4si > to be expanded only in RTL. > 3) is access variable v as array type with opcode VIEW_CONVERT_EXPR, I > guess we should also generate builtin call instead of calling > convert_vector_to_array_for_subscript to generate VIEW_CONVERT_EXPR > expression for such kind of usage. > 4) is translated to BIT_FIELD_REF with constant bitstart and bitsize, > variable v could also be accessed by register instead of stack, so optabs > could match the rs6000_expand_vector_insert to generate expected instruction > through extract_bit_field. > > > > >> Case 3) also couldn't handle the similar usage, and case 4) doesn't generate builtin as expected, > >> it just expand to vec_select by coincidence. So does this mean both vec_insert and vec_extract > >> and all other similar vector builtins should use IFN as suggested by Richard Biener, to match the > >> pattern in gimple and expand both constant and variable index in expander? Will this also be > >> beneficial for other targets except power? 
Or we should do that gradually after this patch > >> approved as it seems another independent issue? Thanks:) > > > > If the code generated for 3)/4) isn't optimal you have to figure why > > by tracing the RTL > > expansion code and looking for missing optabs. > > > > Consider the amount of backend code you need to write if ever using > > those in constexpr > > context ... > > It seems too complicated to expand the "i = VIEW_CONVERT_EXPR<int[4]>(v)[_1];" > or "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i_6(D);" to > rs6000_expand_vector_insert/rs6000_expand_vector_extract in RTL, as: > 1) Vector v is stored to stack with array type; need extra load and store operation. > 2) Requires amount of code to decompose VIEW_CONVERT_EXPR to extract the vector and > index then call rs6000_expand_vector_insert/rs6000_expand_vector_extract. > > which means replace following instructions #9~#12 to new instruction sequences: > 1: NOTE_INSN_DELETED > 6: NOTE_INSN_BASIC_BLOCK 2 > 2: r119:V4SI=%2:V4SI > 3: r120:DI=%5:DI > 4: r121:DI=%6:DI > 5: NOTE_INSN_FUNCTION_BEG > 8: [r112:DI]=r119:V4SI > > 9: r122:DI=r121:DI&0x3 > 10: r123:DI=r122:DI<<0x2 > 11: r124:DI=r112:DI+r123:DI > 12: [r124:DI]=r120:DI#0 > > 13: r118:V4SI=[r112:DI] > 17: %2:V4SI=r118:V4SI > 18: use %2:V4SI > > => > > 1: NOTE_INSN_DELETED > 6: NOTE_INSN_BASIC_BLOCK 2 > 2: r119:V4SI=%2:V4SI > 3: r120:DI=%5:DI > 4: r121:DI=%6:DI > 5: NOTE_INSN_FUNCTION_BEG > 8: [r112:DI]=r119:V4SI > > r130:V4SI=[r112:DI] > rs6000_expand_vector_insert (r130, r121:DI&0x3, r120:DI#0) > [r112:DI]=r130:V4SI > > 13: r118:V4SI=[r112:DI] > 17: %2:V4SI=r118:V4SI > 18: use %2:V4SI > > so maybe bypass convert_vector_to_array_for_subscript for special circumstance > like "i = v[n%4]" or "v[n&3]=i" to generate vec_extract or vec_insert builtin > call a relative simpler method? I think you have it backward. 
You need to work with what convert_vector_to_array_for_subscript gives and deal with it during RTL expansion / optimization to generate more optimal code for power. The goal is to have as little target specific builtins during the GIMPLE optimization phase (because we cannot work out its semantics in optimizers). Richard. > > Xionghu
On 2020/9/2 17:30, Richard Biener wrote: >> so maybe bypass convert_vector_to_array_for_subscript for special circumstance >> like "i = v[n%4]" or "v[n&3]=i" to generate vec_extract or vec_insert builtin >> call a relative simpler method? > I think you have it backward. You need to work with what > convert_vector_to_array_for_subscript > gives and deal with it during RTL expansion / optimization to generate > more optimal > code for power. The goal is to have as little target specific > builtins during the GIMPLE > optimization phase (because we cannot work out its semantics in optimizers). OK, got it. I will add a vec_insert optab and expand "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i_6(D);" expressions through rs6000_expand_vector_insert instead of a builtin call. vec_extract already has an optab, and "i = v[n%4]" should be handled in another patch after this one. Thanks, Xionghu
On Thu, Sep 3, 2020 at 11:20 AM luoxhu <luoxhu@linux.ibm.com> wrote: > > > > On 2020/9/2 17:30, Richard Biener wrote: > >> so maybe bypass convert_vector_to_array_for_subscript for special circumstance > >> like "i = v[n%4]" or "v[n&3]=i" to generate vec_extract or vec_insert builtin > >> call a relative simpler method? > > I think you have it backward. You need to work with what > > convert_vector_to_array_for_subscript > > gives and deal with it during RTL expansion / optimization to generate > > more optimal > > code for power. The goal is to have as little target specific > > builtins during the GIMPLE > > optimization phase (because we cannot work out its semantics in optimizers). > > OK, got it, will add optabs vec_insert and expand "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i_6(D);" > expressions to rs6000_expand_vector_insert instead of builtin call. > vec_extract already has optabs and "i = v[n%4]" should be in another patch > after this. There is already vec_set and vec_extract - the question is whether the expander tries those for variable index. Richard. > > Thanks, > Xionghu >
Hi, On 2020/9/3 18:29, Richard Biener wrote: > On Thu, Sep 3, 2020 at 11:20 AM luoxhu <luoxhu@linux.ibm.com> wrote: >> >> >> >> On 2020/9/2 17:30, Richard Biener wrote: >>>> so maybe bypass convert_vector_to_array_for_subscript for special circumstance >>>> like "i = v[n%4]" or "v[n&3]=i" to generate vec_extract or vec_insert builtin >>>> call a relative simpler method? >>> I think you have it backward. You need to work with what >>> convert_vector_to_array_for_subscript >>> gives and deal with it during RTL expansion / optimization to generate >>> more optimal >>> code for power. The goal is to have as little target specific >>> builtins during the GIMPLE >>> optimization phase (because we cannot work out its semantics in optimizers). >> >> OK, got it, will add optabs vec_insert and expand "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i_6(D);" >> expressions to rs6000_expand_vector_insert instead of builtin call. >> vec_extract already has optabs and "i = v[n%4]" should be in another patch >> after this. > > There is already vec_set and vec_extract - the question is whether the expander > tries those for variable index. > Yes, I checked and found that both vec_set and vec_extract doesn't support variable index for most targets, store_bit_field_1 and extract_bit_field_1 would only consider use optabs when index is integer value. Anyway, it shouldn't be hard to extend depending on target requirements. 
Another problem is v[n&3]=i and vec_insert(v, i, n) are generating with different gimple code: { _1 = n & 3; VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i; } vs: { __vector signed int v1; __vector signed int D.3192; long unsigned int _1; long unsigned int _2; int * _3; <bb 2> [local count: 1073741824]: D.3192 = v_4(D); _1 = n_7(D) & 3; _2 = _1 * 4; _3 = &D.3192 + _2; *_3 = i_8(D); v1_10 = D.3192; return v1_10; } If not use builtin for "vec_insert(v, i, n)", the pointer is "int*" instead of vector type, will this be difficult for expander to capture so many statements then call the optabs? So shall we still keep the builtin style for "vec_insert(v, i, n)" and expand "v[n&3]=i" with optabs or expand both with optabs??? Drafted a fast patch to expand "v[n&3]=i" with optabs as below, sorry that not using existed vec_set yet as not quite sure, together with the first patch, both cases could be handled as expected: [PATCH] Expander: expand VIEW_CONVERT_EXPR to vec_insert with variable index v[n%4] = i has same semantic with vec_insert (i, v, n), but it will be optimized to "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" in gimple, this patch tries to recognize the pattern in expander and use optabs to expand it to fast instructions like vec_insert: lvsl+xxperm+xxsel. 
gcc/ChangeLog: * config/rs6000/vector.md: * expr.c (expand_assignment): * optabs.def (OPTAB_CD): --- gcc/config/rs6000/vector.md | 13 +++++++++++ gcc/expr.c | 46 +++++++++++++++++++++++++++++++++++++ gcc/optabs.def | 1 + 3 files changed, 60 insertions(+) diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index 796345c80d3..46d21271e17 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -1244,6 +1244,19 @@ (define_expand "vec_extract<mode><VEC_base_l>" DONE; }) +(define_expand "vec_insert<VEC_base_l><mode>" + [(match_operand:VEC_E 0 "vlogical_operand") + (match_operand:<VEC_base> 1 "register_operand") + (match_operand 2 "register_operand")] + "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" +{ + rtx target = gen_reg_rtx (V16QImode); + rs6000_expand_vector_insert (target, operands[0], operands[1], operands[2]); + rtx sub_target = simplify_gen_subreg (GET_MODE(operands[0]), target, V16QImode, 0); + emit_insn (gen_rtx_SET (operands[0], sub_target)); + DONE; +}) + ;; Convert double word types to single word types (define_expand "vec_pack_trunc_v2df" [(match_operand:V4SF 0 "vfloat_operand") diff --git a/gcc/expr.c b/gcc/expr.c index dd2200ddea8..ce2890c1a2d 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -5237,6 +5237,52 @@ expand_assignment (tree to, tree from, bool nontemporal) to_rtx = expand_expr (tem, NULL_RTX, VOIDmode, EXPAND_WRITE); + tree type = TREE_TYPE (to); + if (TREE_CODE (to) == ARRAY_REF && tree_fits_uhwi_p (TYPE_SIZE (type)) + && tree_fits_uhwi_p (TYPE_SIZE_UNIT (type)) + && tree_to_uhwi (TYPE_SIZE (type)) + * tree_to_uhwi (TYPE_SIZE_UNIT (type)) + == 128) + { + tree op0 = TREE_OPERAND (to, 0); + tree op1 = TREE_OPERAND (to, 1); + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR) + { + tree view_op0 = TREE_OPERAND (op0, 0); + mode = TYPE_MODE (TREE_TYPE (view_op0)); + if (TREE_CODE (TREE_TYPE (view_op0)) == VECTOR_TYPE) + { + rtx value + = expand_expr (from, NULL_RTX, VOIDmode, EXPAND_NORMAL); + rtx pos + = expand_expr (op1, 
NULL_RTX, VOIDmode, EXPAND_NORMAL); + rtx temp_target = gen_reg_rtx (mode); + emit_move_insn (temp_target, to_rtx); + + machine_mode outermode = mode; + scalar_mode innermode = GET_MODE_INNER (outermode); + class expand_operand ops[3]; + enum insn_code icode + = convert_optab_handler (vec_insert_optab, innermode, + outermode); + + if (icode != CODE_FOR_nothing) + { + pos = convert_to_mode (E_SImode, pos, 0); + + create_fixed_operand (&ops[0], temp_target); + create_input_operand (&ops[1], value, innermode); + create_input_operand (&ops[2], pos, GET_MODE (pos)); + if (maybe_expand_insn (icode, 3, ops)) + { + emit_move_insn (to_rtx, temp_target); + pop_temp_slots (); + return; + } + } + } + } + } /* If the field has a mode, we want to access it in the field's mode, not the computed mode. If a MEM has VOIDmode (external with incomplete type), diff --git a/gcc/optabs.def b/gcc/optabs.def index 78409aa1453..21b163a969e 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -96,6 +96,7 @@ OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") OPTAB_CD(scatter_store_optab, "scatter_store$a$b") OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") OPTAB_CD(vec_extract_optab, "vec_extract$a$b") +OPTAB_CD(vec_insert_optab, "vec_insert$a$b") OPTAB_CD(vec_init_optab, "vec_init$a$b") OPTAB_CD (while_ult_optab, "while_ult$a$b")
On 2020/9/4 14:16, luoxhu via Gcc-patches wrote: > Hi, > > > Yes, I checked and found that both vec_set and vec_extract doesn't support > variable index for most targets, store_bit_field_1 and extract_bit_field_1 > would only consider use optabs when index is integer value. Anyway, it > shouldn't be hard to extend depending on target requirements. > > Another problem is v[n&3]=i and vec_insert(v, i, n) are generating with > different gimple code: > > { > _1 = n & 3; > VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i; > } > > vs: > > { > __vector signed int v1; > __vector signed int D.3192; > long unsigned int _1; > long unsigned int _2; > int * _3; > > <bb 2> [local count: 1073741824]: > D.3192 = v_4(D); > _1 = n_7(D) & 3; > _2 = _1 * 4; > _3 = &D.3192 + _2; > *_3 = i_8(D); > v1_10 = D.3192; > return v1_10; > } Just realized use convert_vector_to_array_for_subscript would generate "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" before produce those instructions, your confirmation and comments will be highly appreciated... Thanks in advance. :) Xionghu > > If not use builtin for "vec_insert(v, i, n)", the pointer is "int*" instead > of vector type, will this be difficult for expander to capture so many > statements then call the optabs? So shall we still keep the builtin style > for "vec_insert(v, i, n)" and expand "v[n&3]=i" with optabs or expand both > with optabs??? > > Drafted a fast patch to expand "v[n&3]=i" with optabs as below, sorry that not > using existed vec_set yet as not quite sure, together with the first patch, both > cases could be handled as expected: > > > [PATCH] Expander: expand VIEW_CONVERT_EXPR to vec_insert with variable index > > v[n%4] = i has same semantic with vec_insert (i, v, n), but it will be > optimized to "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" in gimple, this > patch tries to recognize the pattern in expander and use optabs to > expand it to fast instructions like vec_insert: lvsl+xxperm+xxsel. 
> > gcc/ChangeLog: > > * config/rs6000/vector.md: > * expr.c (expand_assignment): > * optabs.def (OPTAB_CD): > --- > gcc/config/rs6000/vector.md | 13 +++++++++++ > gcc/expr.c | 46 +++++++++++++++++++++++++++++++++++++ > gcc/optabs.def | 1 + > 3 files changed, 60 insertions(+) > > diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md > index 796345c80d3..46d21271e17 100644 > --- a/gcc/config/rs6000/vector.md > +++ b/gcc/config/rs6000/vector.md > @@ -1244,6 +1244,19 @@ (define_expand "vec_extract<mode><VEC_base_l>" > DONE; > }) > > +(define_expand "vec_insert<VEC_base_l><mode>" > + [(match_operand:VEC_E 0 "vlogical_operand") > + (match_operand:<VEC_base> 1 "register_operand") > + (match_operand 2 "register_operand")] > + "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" > +{ > + rtx target = gen_reg_rtx (V16QImode); > + rs6000_expand_vector_insert (target, operands[0], operands[1], operands[2]); > + rtx sub_target = simplify_gen_subreg (GET_MODE(operands[0]), target, V16QImode, 0); > + emit_insn (gen_rtx_SET (operands[0], sub_target)); > + DONE; > +}) > + > ;; Convert double word types to single word types > (define_expand "vec_pack_trunc_v2df" > [(match_operand:V4SF 0 "vfloat_operand") > diff --git a/gcc/expr.c b/gcc/expr.c > index dd2200ddea8..ce2890c1a2d 100644 > --- a/gcc/expr.c > +++ b/gcc/expr.c > @@ -5237,6 +5237,52 @@ expand_assignment (tree to, tree from, bool nontemporal) > > to_rtx = expand_expr (tem, NULL_RTX, VOIDmode, EXPAND_WRITE); > > + tree type = TREE_TYPE (to); > + if (TREE_CODE (to) == ARRAY_REF && tree_fits_uhwi_p (TYPE_SIZE (type)) > + && tree_fits_uhwi_p (TYPE_SIZE_UNIT (type)) > + && tree_to_uhwi (TYPE_SIZE (type)) > + * tree_to_uhwi (TYPE_SIZE_UNIT (type)) > + == 128) > + { > + tree op0 = TREE_OPERAND (to, 0); > + tree op1 = TREE_OPERAND (to, 1); > + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR) > + { > + tree view_op0 = TREE_OPERAND (op0, 0); > + mode = TYPE_MODE (TREE_TYPE (view_op0)); > + if (TREE_CODE (TREE_TYPE (view_op0)) == 
VECTOR_TYPE) > + { > + rtx value > + = expand_expr (from, NULL_RTX, VOIDmode, EXPAND_NORMAL); > + rtx pos > + = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); > + rtx temp_target = gen_reg_rtx (mode); > + emit_move_insn (temp_target, to_rtx); > + > + machine_mode outermode = mode; > + scalar_mode innermode = GET_MODE_INNER (outermode); > + class expand_operand ops[3]; > + enum insn_code icode > + = convert_optab_handler (vec_insert_optab, innermode, > + outermode); > + > + if (icode != CODE_FOR_nothing) > + { > + pos = convert_to_mode (E_SImode, pos, 0); > + > + create_fixed_operand (&ops[0], temp_target); > + create_input_operand (&ops[1], value, innermode); > + create_input_operand (&ops[2], pos, GET_MODE (pos)); > + if (maybe_expand_insn (icode, 3, ops)) > + { > + emit_move_insn (to_rtx, temp_target); > + pop_temp_slots (); > + return; > + } > + } > + } > + } > + } > /* If the field has a mode, we want to access it in the > field's mode, not the computed mode. > If a MEM has VOIDmode (external with incomplete type), > diff --git a/gcc/optabs.def b/gcc/optabs.def > index 78409aa1453..21b163a969e 100644 > --- a/gcc/optabs.def > +++ b/gcc/optabs.def > @@ -96,6 +96,7 @@ OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") > OPTAB_CD(scatter_store_optab, "scatter_store$a$b") > OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") > OPTAB_CD(vec_extract_optab, "vec_extract$a$b") > +OPTAB_CD(vec_insert_optab, "vec_insert$a$b") > OPTAB_CD(vec_init_optab, "vec_init$a$b") > > OPTAB_CD (while_ult_optab, "while_ult$a$b") >
On Fri, Sep 4, 2020 at 8:38 AM luoxhu <luoxhu@linux.ibm.com> wrote: > > > > On 2020/9/4 14:16, luoxhu via Gcc-patches wrote: > > Hi, > > > > > > Yes, I checked and found that both vec_set and vec_extract doesn't support > > variable index for most targets, store_bit_field_1 and extract_bit_field_1 > > would only consider use optabs when index is integer value. Anyway, it > > shouldn't be hard to extend depending on target requirements. > > > > Another problem is v[n&3]=i and vec_insert(v, i, n) are generating with > > different gimple code: > > > > { > > _1 = n & 3; > > VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i; > > } > > > > vs: > > > > { > > __vector signed int v1; > > __vector signed int D.3192; > > long unsigned int _1; > > long unsigned int _2; > > int * _3; > > > > <bb 2> [local count: 1073741824]: > > D.3192 = v_4(D); > > _1 = n_7(D) & 3; > > _2 = _1 * 4; > > _3 = &D.3192 + _2; > > *_3 = i_8(D); > > v1_10 = D.3192; > > return v1_10; > > } > > Just realized use convert_vector_to_array_for_subscript would generate > "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" before produce those instructions, > your confirmation and comments will be highly appreciated... Thanks in > advance. :) I think what the GCC vector extensions produce is generally better so wherever "code generation" for vec_insert resides it should be adjusted to produce the same code. Same for vec_extract. > Xionghu > > > > > If not use builtin for "vec_insert(v, i, n)", the pointer is "int*" instead > > of vector type, will this be difficult for expander to capture so many > > statements then call the optabs? So shall we still keep the builtin style > > for "vec_insert(v, i, n)" and expand "v[n&3]=i" with optabs or expand both > > with optabs??? 
> > > > Drafted a fast patch to expand "v[n&3]=i" with optabs as below, sorry that not > > using existed vec_set yet as not quite sure, together with the first patch, both > > cases could be handled as expected: > > > > > > [PATCH] Expander: expand VIEW_CONVERT_EXPR to vec_insert with variable index > > > > v[n%4] = i has same semantic with vec_insert (i, v, n), but it will be > > optimized to "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" in gimple, this > > patch tries to recognize the pattern in expander and use optabs to > > expand it to fast instructions like vec_insert: lvsl+xxperm+xxsel. > > > > gcc/ChangeLog: > > > > * config/rs6000/vector.md: > > * expr.c (expand_assignment): > > * optabs.def (OPTAB_CD): > > --- > > gcc/config/rs6000/vector.md | 13 +++++++++++ > > gcc/expr.c | 46 +++++++++++++++++++++++++++++++++++++ > > gcc/optabs.def | 1 + > > 3 files changed, 60 insertions(+) > > > > diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md > > index 796345c80d3..46d21271e17 100644 > > --- a/gcc/config/rs6000/vector.md > > +++ b/gcc/config/rs6000/vector.md > > @@ -1244,6 +1244,19 @@ (define_expand "vec_extract<mode><VEC_base_l>" > > DONE; > > }) > > > > +(define_expand "vec_insert<VEC_base_l><mode>" > > + [(match_operand:VEC_E 0 "vlogical_operand") > > + (match_operand:<VEC_base> 1 "register_operand") > > + (match_operand 2 "register_operand")] > > + "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" > > +{ > > + rtx target = gen_reg_rtx (V16QImode); > > + rs6000_expand_vector_insert (target, operands[0], operands[1], operands[2]); > > + rtx sub_target = simplify_gen_subreg (GET_MODE(operands[0]), target, V16QImode, 0); > > + emit_insn (gen_rtx_SET (operands[0], sub_target)); > > + DONE; > > +}) > > + > > ;; Convert double word types to single word types > > (define_expand "vec_pack_trunc_v2df" > > [(match_operand:V4SF 0 "vfloat_operand") > > diff --git a/gcc/expr.c b/gcc/expr.c > > index dd2200ddea8..ce2890c1a2d 100644 > > --- a/gcc/expr.c > > +++ 
b/gcc/expr.c > > @@ -5237,6 +5237,52 @@ expand_assignment (tree to, tree from, bool nontemporal) > > > > to_rtx = expand_expr (tem, NULL_RTX, VOIDmode, EXPAND_WRITE); > > > > + tree type = TREE_TYPE (to); > > + if (TREE_CODE (to) == ARRAY_REF && tree_fits_uhwi_p (TYPE_SIZE (type)) > > + && tree_fits_uhwi_p (TYPE_SIZE_UNIT (type)) > > + && tree_to_uhwi (TYPE_SIZE (type)) > > + * tree_to_uhwi (TYPE_SIZE_UNIT (type)) > > + == 128) > > + { > > + tree op0 = TREE_OPERAND (to, 0); > > + tree op1 = TREE_OPERAND (to, 1); > > + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR) > > + { > > + tree view_op0 = TREE_OPERAND (op0, 0); > > + mode = TYPE_MODE (TREE_TYPE (view_op0)); > > + if (TREE_CODE (TREE_TYPE (view_op0)) == VECTOR_TYPE) > > + { > > + rtx value > > + = expand_expr (from, NULL_RTX, VOIDmode, EXPAND_NORMAL); > > + rtx pos > > + = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); > > + rtx temp_target = gen_reg_rtx (mode); > > + emit_move_insn (temp_target, to_rtx); > > + > > + machine_mode outermode = mode; > > + scalar_mode innermode = GET_MODE_INNER (outermode); > > + class expand_operand ops[3]; > > + enum insn_code icode > > + = convert_optab_handler (vec_insert_optab, innermode, > > + outermode); > > + > > + if (icode != CODE_FOR_nothing) > > + { > > + pos = convert_to_mode (E_SImode, pos, 0); > > + > > + create_fixed_operand (&ops[0], temp_target); > > + create_input_operand (&ops[1], value, innermode); > > + create_input_operand (&ops[2], pos, GET_MODE (pos)); > > + if (maybe_expand_insn (icode, 3, ops)) > > + { > > + emit_move_insn (to_rtx, temp_target); > > + pop_temp_slots (); > > + return; > > + } > > + } > > + } > > + } > > + } > > /* If the field has a mode, we want to access it in the > > field's mode, not the computed mode. 
> > If a MEM has VOIDmode (external with incomplete type), > > diff --git a/gcc/optabs.def b/gcc/optabs.def > > index 78409aa1453..21b163a969e 100644 > > --- a/gcc/optabs.def > > +++ b/gcc/optabs.def > > @@ -96,6 +96,7 @@ OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") > > OPTAB_CD(scatter_store_optab, "scatter_store$a$b") > > OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") > > OPTAB_CD(vec_extract_optab, "vec_extract$a$b") > > +OPTAB_CD(vec_insert_optab, "vec_insert$a$b") > > OPTAB_CD(vec_init_optab, "vec_init$a$b") > > > > OPTAB_CD (while_ult_optab, "while_ult$a$b") > >
On Fri, Sep 4, 2020 at 9:19 AM Richard Biener <richard.guenther@gmail.com> wrote: > > On Fri, Sep 4, 2020 at 8:38 AM luoxhu <luoxhu@linux.ibm.com> wrote: > > > > > > > > On 2020/9/4 14:16, luoxhu via Gcc-patches wrote: > > > Hi, > > > > > > > > > Yes, I checked and found that both vec_set and vec_extract doesn't support > > > variable index for most targets, store_bit_field_1 and extract_bit_field_1 > > > would only consider use optabs when index is integer value. Anyway, it > > > shouldn't be hard to extend depending on target requirements. > > > > > > Another problem is v[n&3]=i and vec_insert(v, i, n) are generating with > > > different gimple code: > > > > > > { > > > _1 = n & 3; > > > VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i; > > > } > > > > > > vs: > > > > > > { > > > __vector signed int v1; > > > __vector signed int D.3192; > > > long unsigned int _1; > > > long unsigned int _2; > > > int * _3; > > > > > > <bb 2> [local count: 1073741824]: > > > D.3192 = v_4(D); > > > _1 = n_7(D) & 3; > > > _2 = _1 * 4; > > > _3 = &D.3192 + _2; > > > *_3 = i_8(D); > > > v1_10 = D.3192; > > > return v1_10; > > > } > > > > Just realized use convert_vector_to_array_for_subscript would generate > > "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" before produce those instructions, > > your confirmation and comments will be highly appreciated... Thanks in > > advance. :) > > I think what the GCC vector extensions produce is generally better > so wherever "code generation" for vec_insert resides it should be > adjusted to produce the same code. Same for vec_extract. Guess altivec.h, dispatching to __builtin_vec_insert. Wonder why it wasn't #define vec_insert(a,b,c) (a)[c]=(b) anyway, you obviously have some lowering of the builtin somewhere in rs6000.c and thus can adjust that. 
> > Xionghu > > > > > > > > If not use builtin for "vec_insert(v, i, n)", the pointer is "int*" instead > > > of vector type, will this be difficult for expander to capture so many > > > statements then call the optabs? So shall we still keep the builtin style > > > for "vec_insert(v, i, n)" and expand "v[n&3]=i" with optabs or expand both > > > with optabs??? > > > > > > Drafted a fast patch to expand "v[n&3]=i" with optabs as below, sorry that not > > > using existed vec_set yet as not quite sure, together with the first patch, both > > > cases could be handled as expected: > > > > > > > > > [PATCH] Expander: expand VIEW_CONVERT_EXPR to vec_insert with variable index > > > > > > v[n%4] = i has same semantic with vec_insert (i, v, n), but it will be > > > optimized to "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" in gimple, this > > > patch tries to recognize the pattern in expander and use optabs to > > > expand it to fast instructions like vec_insert: lvsl+xxperm+xxsel. > > > > > > gcc/ChangeLog: > > > > > > * config/rs6000/vector.md: > > > * expr.c (expand_assignment): > > > * optabs.def (OPTAB_CD): > > > --- > > > gcc/config/rs6000/vector.md | 13 +++++++++++ > > > gcc/expr.c | 46 +++++++++++++++++++++++++++++++++++++ > > > gcc/optabs.def | 1 + > > > 3 files changed, 60 insertions(+) > > > > > > diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md > > > index 796345c80d3..46d21271e17 100644 > > > --- a/gcc/config/rs6000/vector.md > > > +++ b/gcc/config/rs6000/vector.md > > > @@ -1244,6 +1244,19 @@ (define_expand "vec_extract<mode><VEC_base_l>" > > > DONE; > > > }) > > > > > > +(define_expand "vec_insert<VEC_base_l><mode>" > > > + [(match_operand:VEC_E 0 "vlogical_operand") > > > + (match_operand:<VEC_base> 1 "register_operand") > > > + (match_operand 2 "register_operand")] > > > + "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" > > > +{ > > > + rtx target = gen_reg_rtx (V16QImode); > > > + rs6000_expand_vector_insert (target, operands[0], operands[1], 
operands[2]); > > > + rtx sub_target = simplify_gen_subreg (GET_MODE(operands[0]), target, V16QImode, 0); > > > + emit_insn (gen_rtx_SET (operands[0], sub_target)); > > > + DONE; > > > +}) > > > + > > > ;; Convert double word types to single word types > > > (define_expand "vec_pack_trunc_v2df" > > > [(match_operand:V4SF 0 "vfloat_operand") > > > diff --git a/gcc/expr.c b/gcc/expr.c > > > index dd2200ddea8..ce2890c1a2d 100644 > > > --- a/gcc/expr.c > > > +++ b/gcc/expr.c > > > @@ -5237,6 +5237,52 @@ expand_assignment (tree to, tree from, bool nontemporal) > > > > > > to_rtx = expand_expr (tem, NULL_RTX, VOIDmode, EXPAND_WRITE); > > > > > > + tree type = TREE_TYPE (to); > > > + if (TREE_CODE (to) == ARRAY_REF && tree_fits_uhwi_p (TYPE_SIZE (type)) > > > + && tree_fits_uhwi_p (TYPE_SIZE_UNIT (type)) > > > + && tree_to_uhwi (TYPE_SIZE (type)) > > > + * tree_to_uhwi (TYPE_SIZE_UNIT (type)) > > > + == 128) > > > + { > > > + tree op0 = TREE_OPERAND (to, 0); > > > + tree op1 = TREE_OPERAND (to, 1); > > > + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR) > > > + { > > > + tree view_op0 = TREE_OPERAND (op0, 0); > > > + mode = TYPE_MODE (TREE_TYPE (view_op0)); > > > + if (TREE_CODE (TREE_TYPE (view_op0)) == VECTOR_TYPE) > > > + { > > > + rtx value > > > + = expand_expr (from, NULL_RTX, VOIDmode, EXPAND_NORMAL); > > > + rtx pos > > > + = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); > > > + rtx temp_target = gen_reg_rtx (mode); > > > + emit_move_insn (temp_target, to_rtx); > > > + > > > + machine_mode outermode = mode; > > > + scalar_mode innermode = GET_MODE_INNER (outermode); > > > + class expand_operand ops[3]; > > > + enum insn_code icode > > > + = convert_optab_handler (vec_insert_optab, innermode, > > > + outermode); > > > + > > > + if (icode != CODE_FOR_nothing) > > > + { > > > + pos = convert_to_mode (E_SImode, pos, 0); > > > + > > > + create_fixed_operand (&ops[0], temp_target); > > > + create_input_operand (&ops[1], value, innermode); > > > + create_input_operand 
(&ops[2], pos, GET_MODE (pos)); > > > + if (maybe_expand_insn (icode, 3, ops)) > > > + { > > > + emit_move_insn (to_rtx, temp_target); > > > + pop_temp_slots (); > > > + return; > > > + } > > > + } > > > + } > > > + } > > > + } > > > /* If the field has a mode, we want to access it in the > > > field's mode, not the computed mode. > > > If a MEM has VOIDmode (external with incomplete type), > > > diff --git a/gcc/optabs.def b/gcc/optabs.def > > > index 78409aa1453..21b163a969e 100644 > > > --- a/gcc/optabs.def > > > +++ b/gcc/optabs.def > > > @@ -96,6 +96,7 @@ OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") > > > OPTAB_CD(scatter_store_optab, "scatter_store$a$b") > > > OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") > > > OPTAB_CD(vec_extract_optab, "vec_extract$a$b") > > > +OPTAB_CD(vec_insert_optab, "vec_insert$a$b") > > > OPTAB_CD(vec_init_optab, "vec_init$a$b") > > > > > > OPTAB_CD (while_ult_optab, "while_ult$a$b") > > >
On 2020/9/4 15:23, Richard Biener wrote: > On Fri, Sep 4, 2020 at 9:19 AM Richard Biener > <richard.guenther@gmail.com> wrote: >> >> On Fri, Sep 4, 2020 at 8:38 AM luoxhu <luoxhu@linux.ibm.com> wrote: >>> >>> >>> >>> On 2020/9/4 14:16, luoxhu via Gcc-patches wrote: >>>> Hi, >>>> >>>> >>>> Yes, I checked and found that both vec_set and vec_extract doesn't support >>>> variable index for most targets, store_bit_field_1 and extract_bit_field_1 >>>> would only consider use optabs when index is integer value. Anyway, it >>>> shouldn't be hard to extend depending on target requirements. >>>> >>>> Another problem is v[n&3]=i and vec_insert(v, i, n) are generating with >>>> different gimple code: >>>> >>>> { >>>> _1 = n & 3; >>>> VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i; >>>> } >>>> >>>> vs: >>>> >>>> { >>>> __vector signed int v1; >>>> __vector signed int D.3192; >>>> long unsigned int _1; >>>> long unsigned int _2; >>>> int * _3; >>>> >>>> <bb 2> [local count: 1073741824]: >>>> D.3192 = v_4(D); >>>> _1 = n_7(D) & 3; >>>> _2 = _1 * 4; >>>> _3 = &D.3192 + _2; >>>> *_3 = i_8(D); >>>> v1_10 = D.3192; >>>> return v1_10; >>>> } >>> >>> Just realized use convert_vector_to_array_for_subscript would generate >>> "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" before produce those instructions, >>> your confirmation and comments will be highly appreciated... Thanks in >>> advance. :) >> >> I think what the GCC vector extensions produce is generally better >> so wherever "code generation" for vec_insert resides it should be >> adjusted to produce the same code. Same for vec_extract. > > Guess altivec.h, dispatching to __builtin_vec_insert. Wonder why it wasn't > > #define vec_insert(a,b,c) (a)[c]=(b) > > anyway, you obviously have some lowering of the builtin somewhere in rs6000.c > and thus can adjust that. > Yes, altivec.h use that style for all vector functions, not sure why. 
But this could be adjusted by the patch below during front-end parsing, which could also generate "VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i;" in gimple; then both v[n&3]=i and vec_insert(v, i, n) could use optabs in the expander: diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index 03b00738a5e..00c65311f76 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */ @@ -1654,15 +1656,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, SET_EXPR_LOCATION (stmt, loc); stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - - innerptrtype = build_pointer_type (arg1_inner_type); - - stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); - stmt = convert (innerptrtype, stmt); - stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); - stmt = build_indirect_ref (loc, stmt, RO_NULL); - stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, - convert (TREE_TYPE (stmt), arg0)); + stmt = build_array_ref (loc, stmt, arg2); + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0); stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); return stmt; }
Hi! On Fri, Sep 04, 2020 at 05:18:49PM +0800, luoxhu wrote: > On 2020/9/4 15:23, Richard Biener wrote: > > On Fri, Sep 4, 2020 at 9:19 AM Richard Biener > > <richard.guenther@gmail.com> wrote: > >> On Fri, Sep 4, 2020 at 8:38 AM luoxhu <luoxhu@linux.ibm.com> wrote: > >>> On 2020/9/4 14:16, luoxhu via Gcc-patches wrote: > >>>> Another problem is v[n&3]=i and vec_insert(v, i, n) are generating with > >>>> different gimple code: > >>>> > >>>> { > >>>> _1 = n & 3; > >>>> VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i; > >>>> } > >>>> > >>>> vs: > >>>> > >>>> { > >>>> __vector signed int v1; > >>>> __vector signed int D.3192; > >>>> long unsigned int _1; > >>>> long unsigned int _2; > >>>> int * _3; > >>>> > >>>> <bb 2> [local count: 1073741824]: > >>>> D.3192 = v_4(D); > >>>> _1 = n_7(D) & 3; > >>>> _2 = _1 * 4; > >>>> _3 = &D.3192 + _2; > >>>> *_3 = i_8(D); > >>>> v1_10 = D.3192; > >>>> return v1_10; > >>>> } No, the semantics of vec_insert aren't exactly that. It doesn't modify the vector in place; it returns a copy with the modification. But yes, it could/should just use this same VIEW_CONVERT_EXPR(...)[...] thing for that. > >> I think what the GCC vector extensions produce is generally better > >> so wherever "code generation" for vec_insert resides it should be > >> adjusted to produce the same code. Same for vec_extract. Yup. > > Guess altivec.h, dispatching to __builtin_vec_insert. Wonder why it wasn't > > > > #define vec_insert(a,b,c) (a)[c]=(b) > > > > anyway, you obviously have some lowering of the builtin somewhere in rs6000.c > > and thus can adjust that. > > > > Yes, altivec.h use that style for all vector functions, not sure why. Probably simply because pretty much everything in there is just calling builtins, so everything new follows suit. 
It is contagious ;-) > But this could be adjusted by below patch during front end parsing, > which could also generate "VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i;" > in gimple, then both v[n&3]=i and vec_insert(v, i, n) could use optabs > in expander: > > > diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c > index 03b00738a5e..00c65311f76 100644 > --- a/gcc/config/rs6000/rs6000-c.c > +++ b/gcc/config/rs6000/rs6000-c.c > /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */ > @@ -1654,15 +1656,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, > SET_EXPR_LOCATION (stmt, loc); > stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); > } > - > - innerptrtype = build_pointer_type (arg1_inner_type); > - > - stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); > - stmt = convert (innerptrtype, stmt); > - stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); > - stmt = build_indirect_ref (loc, stmt, RO_NULL); > - stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, > - convert (TREE_TYPE (stmt), arg0)); > + stmt = build_array_ref (loc, stmt, arg2); > + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0); > stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); > return stmt; > } You should make a copy of the vector, not modify the original one in place? (If I read that correctly, heh.) Looks good otherwise. Segher
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 0a2e634d6b0..66b636059a6 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -2772,10 +2772,10 @@ DONE; }) -(define_insn "altivec_lvsl_reg" +(define_insn "altivec_lvsl_reg_<mode>2" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:SDI 1 "gpc_reg_operand" "b")] UNSPEC_LVSL_REG))] "TARGET_ALTIVEC" "lvsl %0,0,%1" diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index f9f0fece549..d095b365c14 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -2047,6 +2047,12 @@ BU_VSX_X (VEC_INIT_V2DI, "vec_init_v2di", CONST) BU_VSX_X (VEC_SET_V1TI, "vec_set_v1ti", CONST) BU_VSX_X (VEC_SET_V2DF, "vec_set_v2df", CONST) BU_VSX_X (VEC_SET_V2DI, "vec_set_v2di", CONST) +BU_VSX_X (VEC_INSERT_V16QI, "vec_insert_v16qi", CONST) +BU_VSX_X (VEC_INSERT_V8HI, "vec_insert_v8hi", CONST) +BU_VSX_X (VEC_INSERT_V4SI, "vec_insert_v4si", CONST) +BU_VSX_X (VEC_INSERT_V4SF, "vec_insert_v4sf", CONST) +BU_VSX_X (VEC_INSERT_V2DI, "vec_insert_v2di", CONST) +BU_VSX_X (VEC_INSERT_V2DF, "vec_insert_v2df", CONST) BU_VSX_X (VEC_EXT_V1TI, "vec_ext_v1ti", CONST) BU_VSX_X (VEC_EXT_V2DF, "vec_ext_v2df", CONST) BU_VSX_X (VEC_EXT_V2DI, "vec_ext_v2di", CONST) diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index 2fad3d94706..03b00738a5e 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1563,6 +1563,67 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, return build_call_expr (call, 3, arg1, arg0, arg2); } + else if (VECTOR_MEM_VSX_P (mode)) + { + tree call = NULL_TREE; + + arg2 = fold_for_warn (arg2); + + /* If the second argument is variable, we can optimize it if we are + generating 64-bit code on a machine with direct move. 
*/ + if (TREE_CODE (arg2) != INTEGER_CST && TARGET_DIRECT_MOVE_64BIT) + { + switch (mode) + { + default: + break; + + case E_V2DImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V2DI]; + break; + + case E_V2DFmode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V2DF]; + break; + + case E_V4SFmode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V4SF]; + break; + + case E_V4SImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V4SI]; + break; + + case E_V8HImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V8HI]; + break; + + case E_V16QImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V16QI]; + break; + } + } + + if (call) + { + if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) + arg2 = build_int_cst (TREE_TYPE (arg2), 0); + else + arg2 = build_binary_op ( + loc, BIT_AND_EXPR, arg2, + build_int_cst (TREE_TYPE (arg2), + TYPE_VECTOR_SUBPARTS (arg1_type) - 1), + 0); + tree result + = build_call_expr (call, 3, arg1, + convert (TREE_TYPE (arg1_type), arg0), + convert (integer_type_node, arg2)); + /* Coerce the result to vector element type. May be no-op. */ + result = fold_convert (TREE_TYPE (arg1), result); + return result; + } + } + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */ arg1_inner_type = TREE_TYPE (arg1_type); if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index e39cfcf672b..339e9ae87e3 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -10660,6 +10660,40 @@ altivec_expand_vec_set_builtin (tree exp) return op0; } +/* Expand vec_insert builtin. 
*/ +static rtx +altivec_expand_vec_insert_builtin (tree exp, rtx target) +{ + machine_mode tmode, mode1, mode2; + tree arg0, arg1, arg2; + rtx op0 = NULL_RTX, op1, op2; + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + + tmode = TYPE_MODE (TREE_TYPE (arg0)); + mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); + mode2 = TYPE_MODE ((TREE_TYPE (arg2))); + gcc_assert (VECTOR_MODE_P (tmode)); + + op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); + op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); + op2 = expand_expr (arg2, NULL_RTX, mode2, EXPAND_NORMAL); + + if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) + op1 = convert_modes (mode1, GET_MODE (op1), op1, true); + + op0 = force_reg (tmode, op0); + op1 = force_reg (mode1, op1); + op2 = force_reg (mode2, op2); + + target = gen_reg_rtx (V16QImode); + rs6000_expand_vector_insert (target, op0, op1, op2); + + return target; +} + /* Expand vec_ext builtin. */ static rtx altivec_expand_vec_ext_builtin (tree exp, rtx target) @@ -10922,6 +10956,14 @@ altivec_expand_builtin (tree exp, rtx target, bool *expandedp) case VSX_BUILTIN_VEC_SET_V1TI: return altivec_expand_vec_set_builtin (exp); + case VSX_BUILTIN_VEC_INSERT_V16QI: + case VSX_BUILTIN_VEC_INSERT_V8HI: + case VSX_BUILTIN_VEC_INSERT_V4SF: + case VSX_BUILTIN_VEC_INSERT_V4SI: + case VSX_BUILTIN_VEC_INSERT_V2DF: + case VSX_BUILTIN_VEC_INSERT_V2DI: + return altivec_expand_vec_insert_builtin (exp, target); + case ALTIVEC_BUILTIN_VEC_EXT_V4SI: case ALTIVEC_BUILTIN_VEC_EXT_V8HI: case ALTIVEC_BUILTIN_VEC_EXT_V16QI: @@ -13681,6 +13723,38 @@ altivec_init_builtins (void) integer_type_node, NULL_TREE); def_builtin ("__builtin_vec_set_v2di", ftype, VSX_BUILTIN_VEC_SET_V2DI); + /* Access to the vec_insert patterns. 
*/ + ftype = build_function_type_list (V16QI_type_node, V16QI_type_node, + intQI_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v16qi", ftype, + VSX_BUILTIN_VEC_INSERT_V16QI); + + ftype = build_function_type_list (V8HI_type_node, V8HI_type_node, + intHI_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v8hi", ftype, VSX_BUILTIN_VEC_INSERT_V8HI); + + ftype = build_function_type_list (V4SI_type_node, V4SI_type_node, + integer_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v4si", ftype, VSX_BUILTIN_VEC_INSERT_V4SI); + + ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, + float_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v4sf", ftype, VSX_BUILTIN_VEC_INSERT_V4SF); + + ftype = build_function_type_list (V2DI_type_node, V2DI_type_node, + intDI_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v2di", ftype, VSX_BUILTIN_VEC_INSERT_V2DI); + + ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, + double_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v2df", ftype, VSX_BUILTIN_VEC_INSERT_V2DF); + /* Access to the vec_extract patterns. 
*/ ftype = build_function_type_list (intSI_type_node, V4SI_type_node, integer_type_node, NULL_TREE); diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 28e859f4381..78b5b31d79f 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -58,6 +58,7 @@ extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); extern void rs6000_expand_vector_set (rtx, rtx, int); +extern void rs6000_expand_vector_insert (rtx, rtx, rtx, rtx); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index fe93cf6ff2b..afa845f3dff 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6788,6 +6788,152 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) emit_insn (gen_rtx_SET (target, x)); } +/* Insert value from VEC into idx of TARGET. */ + +void +rs6000_expand_vector_insert (rtx target, rtx vec, rtx val, rtx idx) +{ + machine_mode mode = GET_MODE (vec); + + if (VECTOR_MEM_VSX_P (mode) && CONST_INT_P (idx)) + gcc_unreachable (); + else if (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx) + && TARGET_DIRECT_MOVE_64BIT) + { + gcc_assert (GET_MODE (idx) == E_SImode); + machine_mode inner_mode = GET_MODE (val); + HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); + + rtx tmp = gen_reg_rtx (GET_MODE (idx)); + if (GET_MODE_SIZE (inner_mode) == 8) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 1 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (1), idx)); + /* idx = idx * 8. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (3))); + /* idx = 16 - idx. 
*/ + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (3))); + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 4) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 3 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (3), idx)); + /* idx = idx * 4. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (2))); + /* idx = 20 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (2))); + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 2) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 7 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (7), idx)); + /* idx = idx * 2. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); + /* idx = 22 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (22), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); + emit_insn (gen_subsi3 (tmp, GEN_INT (22), idx)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 1) + if (!BYTES_BIG_ENDIAN) + emit_insn (gen_addsi3 (tmp, idx, GEN_INT (8))); + else + emit_insn (gen_subsi3 (tmp, GEN_INT (23), idx)); + else + gcc_unreachable (); + + /* lxv vs32, mask. + DImode: 0xffffffffffffffff0000000000000000 + SImode: 0x00000000ffffffff0000000000000000 + HImode: 0x000000000000ffff0000000000000000. + QImode: 0x00000000000000ff0000000000000000. 
*/ + rtx mask = gen_reg_rtx (V16QImode); + rtx mask_v2di = gen_reg_rtx (V2DImode); + rtvec v = rtvec_alloc (2); + if (!BYTES_BIG_ENDIAN) + { + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask); + } + else + { + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0); + } + emit_insn ( + gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v))); + rtx sub_mask = simplify_gen_subreg (V16QImode, mask_v2di, V2DImode, 0); + emit_insn (gen_rtx_SET (mask, sub_mask)); + + /* mtvsrd[wz] f0,val. */ + rtx val_v16qi = gen_reg_rtx (V16QImode); + switch (inner_mode) + { + default: + gcc_unreachable (); + break; + case E_QImode: + emit_insn (gen_p8_mtvsrwz_v16qiqi2 (val_v16qi, val)); + break; + case E_HImode: + emit_insn (gen_p8_mtvsrwz_v16qihi2 (val_v16qi, val)); + break; + case E_SImode: + emit_insn (gen_p8_mtvsrwz_v16qisi2 (val_v16qi, val)); + break; + case E_SFmode: + emit_insn (gen_p8_mtvsrwz_v16qisf2 (val_v16qi, val)); + break; + case E_DImode: + emit_insn (gen_p8_mtvsrd_v16qidi2 (val_v16qi, val)); + break; + case E_DFmode: + emit_insn (gen_p8_mtvsrd_v16qidf2 (val_v16qi, val)); + break; + } + + /* lvsl v1,0,idx. */ + rtx pcv = gen_reg_rtx (V16QImode); + emit_insn (gen_altivec_lvsl_reg_si2 (pcv, tmp)); + + /* xxperm vs0,vs0,vs33. */ + /* xxperm vs32,vs32,vs33. */ + rtx val_perm = gen_reg_rtx (V16QImode); + rtx mask_perm = gen_reg_rtx (V16QImode); + emit_insn ( + gen_altivec_vperm_v8hiv16qi (val_perm, val_v16qi, val_v16qi, pcv)); + emit_insn (gen_altivec_vperm_v8hiv16qi (mask_perm, mask, mask, pcv)); + + rtx sub_target = simplify_gen_subreg (V16QImode, vec, mode, 0); + emit_insn (gen_rtx_SET (target, sub_target)); + + /* xxsel vs34,vs34,vs0,vs32. */ + emit_insn (gen_vector_select_v16qi (target, target, val_perm, mask_perm)); + } +} + /* Extract field ELT from VEC into TARGET. 
*/ void diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 43b620ae1c0..b02fda836d4 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -8713,6 +8713,25 @@ "mtvsrwz %x0,%1" [(set_attr "type" "mftgpr")]) +(define_mode_iterator FQHS [SF QI HI SI]) +(define_mode_iterator FD [DF DI]) + +(define_insn "p8_mtvsrwz_v16qi<mode>2" + [(set (match_operand:V16QI 0 "register_operand" "=wa") + (unspec:V16QI [(match_operand:FQHS 1 "register_operand" "r")] + UNSPEC_P8V_MTVSRWZ))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrwz %x0,%1" + [(set_attr "type" "mftgpr")]) + +(define_insn "p8_mtvsrd_v16qi<mode>2" + [(set (match_operand:V16QI 0 "register_operand" "=wa") + (unspec:V16QI [(match_operand:FD 1 "register_operand" "r")] + UNSPEC_P8V_MTVSRD))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrd %x0,%1" + [(set_attr "type" "mftgpr")]) + (define_insn_and_split "reload_fpr_from_gpr<mode>" [(set (match_operand:FMOVE64X 0 "register_operand" "=d") (unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")] diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index dd750210758..7e82690d12d 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -5349,7 +5349,7 @@ rtx rtx_vtmp = gen_reg_rtx (V16QImode); rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2])); + emit_insn (gen_altivec_lvsl_reg_di2 (shift_mask, operands[2])); emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56))); emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp)); emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp, diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.c b/gcc/testsuite/gcc.target/powerpc/pr79251.c new file mode 100644 index 00000000000..877659a0146 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-require-effective-target lp64 } */ 
+/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */
+
+#include <stddef.h>
+#include <altivec.h>
+
+#define TYPE int
+
+__attribute__ ((noinline))
+vector TYPE test (vector TYPE v, TYPE i, size_t n)
+{
+  vector TYPE v1 = v;
+  v1 = vec_insert (i, v, n);
+
+  return v1;
+}
+
+/* The store-hit-load sequence uses stwx ("stxw" is not a mnemonic, so the
+   old pattern could never match and the check was vacuous).  */
+/* { dg-final { scan-assembler-not {\mstwx\M} } } */
+/* { dg-final { scan-assembler-times {\mlvsl\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxxperm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxsel\M} 1 } } */