Message ID | 20230529083530.495654-1-pan2.li@intel.com |
---|---|
State | New |
Headers | show |
Series | [v7] RISC-V: Using merge approach to optimize repeating sequence in vec_init | expand |
LGTM, thanks On Mon, May 29, 2023 at 4:54 PM Pan Li via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > From: Pan Li <pan2.li@intel.com> > > This patch would like to optimize the VLS vector initialization like > repeating sequence. From the vslide1down to the vmerge with a simple > cost model, aka every instruction only has 1 cost. > > Given code with -march=rv64gcv_zvl256b --param riscv-autovec-preference=fixed-vlmax > typedef int64_t vnx32di __attribute__ ((vector_size (256))); > > __attribute__ ((noipa)) void > f_vnx32di (int64_t a, int64_t b, int64_t *out) > { > vnx32di v = { > a, b, a, b, a, b, a, b, > a, b, a, b, a, b, a, b, > a, b, a, b, a, b, a, b, > a, b, a, b, a, b, a, b, > }; > *(vnx32di *) out = v; > } > > Before this patch: > vslide1down.vx (x31 times) > > After this patch: > li a5,-1431654400 > addi a5,a5,-1365 > li a3,-1431654400 > addi a3,a3,-1366 > slli a5,a5,32 > add a5,a5,a3 > vsetvli a4,zero,e64,m8,ta,ma > vmv.v.x v8,a0 > vmv.s.x v0,a5 > vmerge.vxm v8,v8,a1,v0 > vs8r.v v8,0(a2) > > Since we don't have SEW = 128 in vec_duplicate, we can't combine ab into > SEW = 128 element and then broadcast this big element. > > Signed-off-by: Pan Li <pan2.li@intel.com> > Co-Authored by: Juzhe-Zhong <juzhe.zhong@rivai.ai> > > gcc/ChangeLog: > > * config/riscv/riscv-protos.h (enum insn_type): New type. > * config/riscv/riscv-v.cc (RVV_INSN_OPERANDS_MAX): New macro. > (rvv_builder::can_duplicate_repeating_sequence_p): Align the referenced > class member. > (rvv_builder::get_merged_repeating_sequence): Ditto. > (rvv_builder::repeating_sequence_use_merge_profitable_p): New function > to evaluate the optimization cost. > (rvv_builder::get_merge_scalar_mask): New function to get the merge > mask. > (emit_scalar_move_insn): New function to emit vmv.s.x. > (emit_vlmax_integer_move_insn): New function to emit vlmax vmv.v.x. > (emit_nonvlmax_integer_move_insn): New function to emit nonvlmax > vmv.v.x. 
> (get_repeating_sequence_dup_machine_mode): New function to get the dup > machine mode. > (expand_vector_init_merge_repeating_sequence): New function to perform > the optimization. > (expand_vec_init): Add this vector init optimization. > * config/riscv/riscv.h (BITS_PER_WORD): New macro. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c: New test. > > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/config/riscv/riscv-protos.h | 1 + > gcc/config/riscv/riscv-v.cc | 225 +++++++++++++++++- > gcc/config/riscv/riscv.h | 1 + > .../vls-vlmax/init-repeat-sequence-1.c | 21 ++ > .../vls-vlmax/init-repeat-sequence-2.c | 24 ++ > .../vls-vlmax/init-repeat-sequence-3.c | 25 ++ > .../vls-vlmax/init-repeat-sequence-4.c | 15 ++ > .../vls-vlmax/init-repeat-sequence-5.c | 17 ++ > .../vls-vlmax/init-repeat-sequence-run-1.c | 47 ++++ > .../vls-vlmax/init-repeat-sequence-run-2.c | 46 ++++ > .../vls-vlmax/init-repeat-sequence-run-3.c | 41 ++++ > 11 files changed, 457 insertions(+), 6 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c > create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h > index 0462f96c8d5..277845673d4 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -142,6 +142,7 @@ enum insn_type > RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand. */ > RVV_UNOP_MU = RVV_UNOP + 2, /* Likewise. */ > RVV_TERNOP = 5, > + RVV_SCALAR_MOV_OP = 4, /* +1 for VUNDEF according to vector.md. */ > }; > enum vlmul_type > { > diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc > index a5715bb466c..8c920532549 100644 > --- a/gcc/config/riscv/riscv-v.cc > +++ b/gcc/config/riscv/riscv-v.cc > @@ -21,6 +21,10 @@ > > #define IN_TARGET_CODE 1 > > +/* We have a maximum of 11 operands for RVV instruction patterns according to > + the vector.md. 
*/ > +#define RVV_INSN_OPERANDS_MAX 11 > + > #include "config.h" > #include "system.h" > #include "coretypes.h" > @@ -1286,19 +1290,32 @@ public: > : rtx_vector_builder (mode, npatterns, nelts_per_pattern) > { > m_inner_mode = GET_MODE_INNER (mode); > - m_inner_size = GET_MODE_BITSIZE (m_inner_mode).to_constant (); > + m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode); > + m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode); > + > + gcc_assert ( > + int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode)); > } > > bool can_duplicate_repeating_sequence_p (); > rtx get_merged_repeating_sequence (); > > + bool repeating_sequence_use_merge_profitable_p (); > + rtx get_merge_scalar_mask (unsigned int) const; > + > machine_mode new_mode () const { return m_new_mode; } > + scalar_mode inner_mode () const { return m_inner_mode; } > + scalar_int_mode inner_int_mode () const { return m_inner_int_mode; } > + unsigned int inner_bits_size () const { return m_inner_bits_size; } > + unsigned int inner_bytes_size () const { return m_inner_bytes_size; } > > private: > - machine_mode m_inner_mode; > + scalar_mode m_inner_mode; > + scalar_int_mode m_inner_int_mode; > machine_mode m_new_mode; > scalar_int_mode m_new_inner_mode; > - unsigned int m_inner_size; > + unsigned int m_inner_bits_size; > + unsigned int m_inner_bytes_size; > }; > > /* Return true if the vector duplicated by a super element which is the fusion > @@ -1309,7 +1326,7 @@ bool > rvv_builder::can_duplicate_repeating_sequence_p () > { > poly_uint64 new_size = exact_div (full_nelts (), npatterns ()); > - unsigned int new_inner_size = m_inner_size * npatterns (); > + unsigned int new_inner_size = m_inner_bits_size * npatterns (); > if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode) > || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD > || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode)) > @@ -1317,6 +1334,61 @@ rvv_builder::can_duplicate_repeating_sequence_p () > 
return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ()); > } > > +/* Return true if it is a repeating sequence for which the > + merge approach has better codegen than the default > + approach (slide1down). > + > + Sequence A: > + {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b} > + > + nelts = 16 > + npatterns = 2 > + > + for merging a we need mask 101010.... > + for merging b we need mask 010101.... > + > + For each element in the npattern, we need to build a mask in a scalar register. > + Mostly we need 3 instructions (aka COST = 3), which consist of 2 scalar > + instructions and 1 scalar move to v0 register. Finally we need vector merge > + to merge them. > + > + lui a5, #imm > + add a5, #imm > + vmv.s.x v0, a5 > + vmerge.vxm v9, v9, a1, v0 > + > + So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8. > + If we use slide1down, the COST = nelts = 16 > 8 (COST of merge). > + So return true in this case as it is profitable. > + > + Sequence B: > + {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h} > + > + nelts = 16 > + npatterns = 8 > + > + COST of merge approach = (3 + 1) * npatterns = 32 > + COST of slide1down approach = nelts = 16 > + Return false in this case as it is NOT profitable in merge approach. > +*/ > +bool > +rvv_builder::repeating_sequence_use_merge_profitable_p () > +{ > + if (inner_bytes_size () > UNITS_PER_WORD) > + return false; > + > + unsigned int nelts = full_nelts ().to_constant (); > + > + if (!repeating_sequence_p (0, nelts, npatterns ())) > + return false; > + > + unsigned int merge_cost = 1; > + unsigned int build_merge_mask_cost = 3; > + unsigned int slide1down_cost = nelts; > + > + return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost; > +} > + > /* Merge the repeating sequence into a single element and return the RTX. 
*/ > rtx > rvv_builder::get_merged_repeating_sequence () > @@ -1324,11 +1396,11 @@ rvv_builder::get_merged_repeating_sequence () > scalar_int_mode mode = Pmode; > rtx target = gen_reg_rtx (mode); > emit_move_insn (target, const0_rtx); > - rtx imm = gen_int_mode ((1ULL << m_inner_size) - 1, mode); > + rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode); > /* { a, b, a, b }: Generate duplicate element = b << bits | a. */ > for (unsigned int i = 0; i < npatterns (); i++) > { > - unsigned int loc = m_inner_size * i; > + unsigned int loc = m_inner_bits_size * i; > rtx shift = gen_int_mode (loc, mode); > rtx ele = gen_lowpart (mode, elt (i)); > rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false, > @@ -1344,6 +1416,29 @@ rvv_builder::get_merged_repeating_sequence () > return target; > } > > +/* Get the mask for merge approach. > + > + Consider such following case: > + {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b} > + To merge "a", the mask should be 1010.... > + To merge "b", the mask should be 0101.... > +*/ > +rtx > +rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const > +{ > + unsigned HOST_WIDE_INT mask = 0; > + unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern); > + > + gcc_assert (BITS_PER_WORD % npatterns () == 0); > + > + int limit = BITS_PER_WORD / npatterns (); > + > + for (int i = 0; i < limit; i++) > + mask |= base_mask << (i * npatterns ()); > + > + return gen_int_mode (mask, inner_int_mode ()); > +} > + > /* Subroutine of riscv_vector_expand_vector_init. > Works as follows: > (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. > @@ -1371,6 +1466,111 @@ expand_vector_init_insert_elems (rtx target, const rvv_builder &builder, > } > } > > +/* Emit vmv.s.x instruction. 
*/ > + > +static void > +emit_scalar_move_insn (unsigned icode, rtx *ops) > +{ > + machine_mode data_mode = GET_MODE (ops[0]); > + machine_mode mask_mode = get_mask_mode (data_mode).require (); > + insn_expander<RVV_INSN_OPERANDS_MAX> e (riscv_vector::RVV_SCALAR_MOV_OP, > + /* HAS_DEST_P */ true, > + /* FULLY_UNMASKED_P */ false, > + /* USE_REAL_MERGE_P */ true, > + /* HAS_AVL_P */ true, > + /* VLMAX_P */ false, > + data_mode, mask_mode); > + e.set_policy (TAIL_ANY); > + e.set_policy (MASK_ANY); > + e.set_vl (CONST1_RTX (Pmode)); > + e.emit_insn ((enum insn_code) icode, ops); > +} > + > +/* Emit vmv.v.x instruction with vlmax. */ > + > +static void > +emit_vlmax_integer_move_insn (unsigned icode, rtx *ops, rtx vl) > +{ > + emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ops, vl); > +} > + > +/* Emit vmv.v.x instruction with nonvlmax. */ > + > +static void > +emit_nonvlmax_integer_move_insn (unsigned icode, rtx *ops, rtx avl) > +{ > + emit_nonvlmax_insn (icode, riscv_vector::RVV_UNOP, ops, avl); > +} > + > +/* Emit merge instruction. */ > + > +static machine_mode > +get_repeating_sequence_dup_machine_mode (const rvv_builder &builder) > +{ > + poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ()); > + > + if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR)) > + { > + dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR, > + builder.inner_bytes_size ()); > + } > + > + return get_vector_mode (builder.inner_int_mode (), dup_nunits).require (); > +} > + > +/* Use merge approach to initialize the vector with repeating sequence. > + v = {a, b, a, b, a, b, a, b}. > + > + v = broadcast (a). > + mask = 0b01010101.... 
> + v = merge (v, b, mask) > +*/ > +static void > +expand_vector_init_merge_repeating_sequence (rtx target, > + const rvv_builder &builder) > +{ > + machine_mode dup_mode = get_repeating_sequence_dup_machine_mode (builder); > + machine_mode dup_mask_mode = get_mask_mode (dup_mode).require (); > + machine_mode mask_mode = get_mask_mode (builder.mode ()).require (); > + uint64_t full_nelts = builder.full_nelts ().to_constant (); > + > + /* Step 1: Broadcast the first pattern. */ > + rtx ops[] = {target, force_reg (GET_MODE_INNER (dup_mode), builder.elt (0))}; > + emit_vlmax_integer_move_insn (code_for_pred_broadcast (builder.mode ()), > + ops, NULL_RTX); > + > + /* Step 2: Merge the rest iteration of pattern. */ > + for (unsigned int i = 1; i < builder.npatterns (); i++) > + { > + /* Step 2-1: Generate mask register v0 for each merge. */ > + rtx merge_mask = builder.get_merge_scalar_mask (i); > + rtx mask = gen_reg_rtx (mask_mode); > + rtx dup = gen_reg_rtx (dup_mode); > + > + if (full_nelts <= BITS_PER_WORD) /* vmv.s.x. */ > + { > + rtx ops[] = {dup, gen_scalar_move_mask (dup_mask_mode), > + RVV_VUNDEF (dup_mode), merge_mask}; > + emit_scalar_move_insn (code_for_pred_broadcast (GET_MODE (dup)), > + ops); > + } > + else /* vmv.v.x. */ > + { > + rtx ops[] = {dup, force_reg (GET_MODE_INNER (dup_mode), merge_mask)}; > + rtx vl = gen_int_mode (CEIL (full_nelts, BITS_PER_WORD), Pmode); > + emit_nonvlmax_integer_move_insn (code_for_pred_broadcast (dup_mode), > + ops, vl); > + } > + > + emit_move_insn (mask, gen_lowpart (mask_mode, dup)); > + > + /* Step 2-2: Merge pattern according to the mask. */ > + rtx ops[] = {target, target, builder.elt (i), mask}; > + emit_vlmax_merge_insn (code_for_pred_merge_scalar (GET_MODE (target)), > + riscv_vector::RVV_MERGE_OP, ops); > + } > +} > + > /* Initialize register TARGET from the elements in PARALLEL rtx VALS. 
*/ > > void > @@ -1394,6 +1594,19 @@ expand_vec_init (rtx target, rtx vals) > emit_move_insn (target, gen_lowpart (mode, dup)); > return; > } > + > + /* Case 2: Optimize repeating sequence cases that Case 1 can > + not handle and it is profitable. For example: > + ELEMENT BITSIZE = 64. > + v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}. > + We can't find a vector mode for "ab" which will be combined into > + 128-bit element to duplicate. */ > + if (v.repeating_sequence_use_merge_profitable_p ()) > + { > + expand_vector_init_merge_repeating_sequence (target, v); > + return; > + } > + > /* TODO: We will support more Initialization of vector in the future. */ > } > > diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h > index 807b0bccc18..4541255a8ae 100644 > --- a/gcc/config/riscv/riscv.h > +++ b/gcc/config/riscv/riscv.h > @@ -150,6 +150,7 @@ ASM_MISA_SPEC > > /* Width of a word, in units (bytes). */ > #define UNITS_PER_WORD (TARGET_64BIT ? 8 : 4) > +#define BITS_PER_WORD (BITS_PER_UNIT * UNITS_PER_WORD) > #ifndef IN_LIBGCC2 > #define MIN_UNITS_PER_WORD 4 > #endif > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c > new file mode 100644 > index 00000000000..59ad49cf795 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c > @@ -0,0 +1,21 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d" } */ > + > +#include <stdint-gcc.h> > + > +typedef int64_t vnx16di __attribute__ ((vector_size (1024))); > + > +__attribute__ ((noipa)) void > +f_vnx16di (int64_t a, int64_t b, int64_t *out) > +{ > + vnx16di v = { > + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, > + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, > + a, b, a, b, a, b, a, b, a, b, a, b, a, 
b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, > + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, > + }; > + *(vnx16di *) out = v; > +} > + > +/* { dg-final { scan-assembler-times {vmv\.v\.x\s+v[0-9]+,\s*[a-x0-9]+} 2 } } */ > +/* { dg-final { scan-assembler-times {vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c > new file mode 100644 > index 00000000000..fe3741e3be7 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c > @@ -0,0 +1,24 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ > + > +#include <stdint-gcc.h> > + > +typedef double vnx8df __attribute__ ((vector_size (64))); > +typedef double vnx16df __attribute__ ((vector_size (128))); > + > +__attribute__ ((noipa)) void > +f_vnx8df (double a, double b, double *out) > +{ > + vnx8df v = {a, b, a, b, a, b, a, b}; > + *(vnx8df *) out = v; > +} > + > +__attribute__ ((noipa)) void > +f_vnx16df (double a, double b, double *out) > +{ > + vnx16df v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}; > + *(vnx16df *) out = v; > +} > + > +/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */ > +/* { dg-final { scan-assembler-times {vfmerge\.vfm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c > new file mode 100644 > index 00000000000..74776def963 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ > + > +#include <stdint-gcc.h> > + > 
+typedef int64_t vnx8di __attribute__ ((vector_size (64))); > +typedef int64_t vnx16di __attribute__ ((vector_size (128))); > + > +__attribute__ ((noipa)) void > +f_vnx8di (int64_t a, int64_t b, int64_t *out) > +{ > + vnx8di v = {a, b, a, b, a, b, a, b}; > + *(vnx8di *) out = v; > +} > + > +__attribute__ ((noipa)) void > +f_vnx16di (int64_t a, int64_t b, int64_t *out) > +{ > + vnx16di v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}; > + *(vnx16di *) out = v; > +} > + > + > +/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */ > +/* { dg-final { scan-assembler-times {vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c > new file mode 100644 > index 00000000000..2f61465e84f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ > + > +#include <stdint-gcc.h> > + > +typedef int64_t vnx8di __attribute__ ((vector_size (64))); > + > +__attribute__ ((noipa)) void > +f_vnx8di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out) > +{ > + vnx8di v = {a, b, c, d, a, b, c, d}; > + *(vnx8di *) out = v; > +} > + > +/* { dg-final { scan-assembler-times {vslide1down\.vx\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+} 7 } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c > new file mode 100644 > index 00000000000..7f4e6783f8e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ > + > +#include <stdint-gcc.h> > + > +typedef int64_t vnx16di 
__attribute__ ((vector_size (128))); > + > +__attribute__ ((noipa)) void > +f_vnx16di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out) > +{ > + vnx16di v = {a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d,}; > + *(vnx16di *) out = v; > +} > + > +/* { dg-final { scan-assembler-times {vmv\.v\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */ > +/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 0 } } */ > +/* { dg-final { scan-assembler-times {vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 0 } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c > new file mode 100644 > index 00000000000..1931d3f5fa0 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c > @@ -0,0 +1,47 @@ > +/* { dg-do run { target { riscv_vector } } } */ > +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */ > + > +#include "init-repeat-sequence-2.c" > + > +int > +main () > +{ > + double a = -1789089.23423; > + double b = -8916156.45644; > + > + double v_vnx8df[sizeof (vnx8df) / sizeof (double)]; > + f_vnx8df (a, b, v_vnx8df); > + > + return 0; > + for (int i = 0; i < sizeof (vnx8df) / sizeof (double); i++) > + { > + if (i % 2 == 0) > + { > + if (v_vnx8df[i] != a) > + __builtin_abort (); > + } > + else > + { > + if (v_vnx8df[i] != b) > + __builtin_abort (); > + } > + } > + > + double v_vnx16df[sizeof (vnx16df) / sizeof (double)]; > + f_vnx16df (a, b, v_vnx16df); > + for (int i = 0; i < sizeof (vnx16df) / sizeof (double); i++) > + { > + if (i % 2 == 0) > + { > + if (v_vnx16df[i] != a) > + __builtin_abort (); > + } > + else > + { > + if (v_vnx16df[i] != b) > + __builtin_abort (); > + } > + } > + > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c > new file mode 100644 > index 00000000000..5564dd4a05a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c > @@ -0,0 +1,46 @@ > +/* { dg-do run { target { riscv_vector } } } */ > +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */ > + > +#include "init-repeat-sequence-3.c" > + > +int > +main () > +{ > + int64_t a = -178908923423; > + int64_t b = -891615645644; > + > + int64_t v_vnx8di[sizeof (vnx8di) / sizeof (int64_t)]; > + f_vnx8di (a, b, v_vnx8di); > + for (int i = 0; i < sizeof (vnx8di) / sizeof (int64_t); i++) > + { > + if (i % 2 == 0) > + { > + if (v_vnx8di[i] != a) > + __builtin_abort (); > + } > + else > + { > + if (v_vnx8di[i] != b) > + __builtin_abort (); > + } > + } > + > + int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)]; > + f_vnx16di (a, b, v_vnx16di); > + > + for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++) > + { > + if (i % 2 == 0) > + { > + if (v_vnx16di[i] != a) > + __builtin_abort (); > + } > + else > + { > + if (v_vnx16di[i] != b) > + __builtin_abort (); > + } > + } > + > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c > new file mode 100644 > index 00000000000..fec5adc56de > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c > @@ -0,0 +1,41 @@ > +/* { dg-do run { target { riscv_vector } } } */ > +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */ > + > +#include "init-repeat-sequence-5.c" > + > +int > +main () > +{ > + int64_t a = -178908923423; > + int64_t b = -891615645644; > + int64_t c = 78908923423; > + int64_t d = 81615645644; > + > + int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)]; > + f_vnx16di (a, b, c, d, v_vnx16di); > + for (int i 
= 0; i < sizeof (vnx16di) / sizeof (int64_t); i++) > + { > + if (i % 4 == 0) > + { > + if (v_vnx16di[i] != a) > + __builtin_abort (); > + } > + else if (i % 4 == 1) > + { > + if (v_vnx16di[i] != b) > + __builtin_abort (); > + } > + else if (i % 4 == 2) > + { > + if (v_vnx16di[i] != c) > + __builtin_abort (); > + } > + else > + { > + if (v_vnx16di[i] != d) > + __builtin_abort (); > + } > + } > + > + return 0; > +} > -- > 2.34.1 >
Committed, thanks Kito. Pan -----Original Message----- From: Kito Cheng <kito.cheng@gmail.com> Sent: Monday, May 29, 2023 5:33 PM To: Li, Pan2 <pan2.li@intel.com> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@sifive.com; Wang, Yanzhang <yanzhang.wang@intel.com> Subject: Re: [PATCH v7] RISC-V: Using merge approach to optimize repeating sequence in vec_init LGTM, thanks On Mon, May 29, 2023 at 4:54 PM Pan Li via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > From: Pan Li <pan2.li@intel.com> > > This patch would like to optimize the VLS vector initialization like > repeating sequence. From the vslide1down to the vmerge with a simple > cost model, aka every instruction only has 1 cost. > > Given code with -march=rv64gcv_zvl256b --param > riscv-autovec-preference=fixed-vlmax > typedef int64_t vnx32di __attribute__ ((vector_size (256))); > > __attribute__ ((noipa)) void > f_vnx32di (int64_t a, int64_t b, int64_t *out) { > vnx32di v = { > a, b, a, b, a, b, a, b, > a, b, a, b, a, b, a, b, > a, b, a, b, a, b, a, b, > a, b, a, b, a, b, a, b, > }; > *(vnx32di *) out = v; > } > > Before this patch: > vslide1down.vx (x31 times) > > After this patch: > li a5,-1431654400 > addi a5,a5,-1365 > li a3,-1431654400 > addi a3,a3,-1366 > slli a5,a5,32 > add a5,a5,a3 > vsetvli a4,zero,e64,m8,ta,ma > vmv.v.x v8,a0 > vmv.s.x v0,a5 > vmerge.vxm v8,v8,a1,v0 > vs8r.v v8,0(a2) > > Since we don't have SEW = 128 in vec_duplicate, we can't combine ab > into SEW = 128 element and then broadcast this big element. > > Signed-off-by: Pan Li <pan2.li@intel.com> Co-Authored by: Juzhe-Zhong > <juzhe.zhong@rivai.ai> > > gcc/ChangeLog: > > * config/riscv/riscv-protos.h (enum insn_type): New type. > * config/riscv/riscv-v.cc (RVV_INSN_OPERANDS_MAX): New macro. > (rvv_builder::can_duplicate_repeating_sequence_p): Align the referenced > class member. > (rvv_builder::get_merged_repeating_sequence): Ditto. 
> (rvv_builder::repeating_sequence_use_merge_profitable_p): New function > to evaluate the optimization cost. > (rvv_builder::get_merge_scalar_mask): New function to get the merge > mask. > (emit_scalar_move_insn): New function to emit vmv.s.x. > (emit_vlmax_integer_move_insn): New function to emit vlmax vmv.v.x. > (emit_nonvlmax_integer_move_insn): New function to emit nonvlmax > vmv.v.x. > (get_repeating_sequence_dup_machine_mode): New function to get the dup > machine mode. > (expand_vector_init_merge_repeating_sequence): New function to perform > the optimization. > (expand_vec_init): Add this vector init optimization. > * config/riscv/riscv.h (BITS_PER_WORD): New macro. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c: New test. > * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c: New test. 
> > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/config/riscv/riscv-protos.h | 1 + > gcc/config/riscv/riscv-v.cc | 225 +++++++++++++++++- > gcc/config/riscv/riscv.h | 1 + > .../vls-vlmax/init-repeat-sequence-1.c | 21 ++ > .../vls-vlmax/init-repeat-sequence-2.c | 24 ++ > .../vls-vlmax/init-repeat-sequence-3.c | 25 ++ > .../vls-vlmax/init-repeat-sequence-4.c | 15 ++ > .../vls-vlmax/init-repeat-sequence-5.c | 17 ++ > .../vls-vlmax/init-repeat-sequence-run-1.c | 47 ++++ > .../vls-vlmax/init-repeat-sequence-run-2.c | 46 ++++ > .../vls-vlmax/init-repeat-sequence-run-3.c | 41 ++++ > 11 files changed, 457 insertions(+), 6 deletions(-) create mode > 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque > nce-1.c create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque > nce-2.c create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque > nce-3.c create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque > nce-4.c create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque > nce-5.c create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque > nce-run-1.c create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque > nce-run-2.c create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque > nce-run-3.c > > diff --git a/gcc/config/riscv/riscv-protos.h > b/gcc/config/riscv/riscv-protos.h index 0462f96c8d5..277845673d4 > 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -142,6 +142,7 @@ enum insn_type > RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand. */ > RVV_UNOP_MU = RVV_UNOP + 2, /* Likewise. */ > RVV_TERNOP = 5, > + RVV_SCALAR_MOV_OP = 4, /* +1 for VUNDEF according to vector.md. 
*/ > }; > enum vlmul_type > { > diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc > index a5715bb466c..8c920532549 100644 > --- a/gcc/config/riscv/riscv-v.cc > +++ b/gcc/config/riscv/riscv-v.cc > @@ -21,6 +21,10 @@ > > #define IN_TARGET_CODE 1 > > +/* We have a maximum of 11 operands for RVV instruction patterns according to > + the vector.md. */ > +#define RVV_INSN_OPERANDS_MAX 11 > + > #include "config.h" > #include "system.h" > #include "coretypes.h" > @@ -1286,19 +1290,32 @@ public: > : rtx_vector_builder (mode, npatterns, nelts_per_pattern) > { > m_inner_mode = GET_MODE_INNER (mode); > - m_inner_size = GET_MODE_BITSIZE (m_inner_mode).to_constant (); > + m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode); > + m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode); > + > + gcc_assert ( > + int_mode_for_size (inner_bits_size (), 0).exists > + (&m_inner_int_mode)); > } > > bool can_duplicate_repeating_sequence_p (); > rtx get_merged_repeating_sequence (); > > + bool repeating_sequence_use_merge_profitable_p (); rtx > + get_merge_scalar_mask (unsigned int) const; > + > machine_mode new_mode () const { return m_new_mode; } > + scalar_mode inner_mode () const { return m_inner_mode; } > + scalar_int_mode inner_int_mode () const { return m_inner_int_mode; } > + unsigned int inner_bits_size () const { return m_inner_bits_size; } > + unsigned int inner_bytes_size () const { return m_inner_bytes_size; > + } > > private: > - machine_mode m_inner_mode; > + scalar_mode m_inner_mode; > + scalar_int_mode m_inner_int_mode; > machine_mode m_new_mode; > scalar_int_mode m_new_inner_mode; > - unsigned int m_inner_size; > + unsigned int m_inner_bits_size; > + unsigned int m_inner_bytes_size; > }; > > /* Return true if the vector duplicated by a super element which is > the fusion @@ -1309,7 +1326,7 @@ bool > rvv_builder::can_duplicate_repeating_sequence_p () { > poly_uint64 new_size = exact_div (full_nelts (), npatterns ()); > - unsigned int new_inner_size = 
m_inner_size * npatterns (); > + unsigned int new_inner_size = m_inner_bits_size * npatterns (); > if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode) > || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD > || !get_vector_mode (m_new_inner_mode, new_size).exists > (&m_new_mode)) @@ -1317,6 +1334,61 @@ rvv_builder::can_duplicate_repeating_sequence_p () > return repeating_sequence_p (0, full_nelts ().to_constant (), > npatterns ()); } > > +/* Return true if it is a repeating sequence for which the > + merge approach has better codegen than the default > + approach (slide1down). > + > + Sequence A: > + {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b} > + > + nelts = 16 > + npatterns = 2 > + > + for merging a we need mask 101010.... > + for merging b we need mask 010101.... > + > + For each element of the pattern, we need to build a mask in a scalar register. > + Mostly we need 3 instructions (aka COST = 3), which consist of 2 scalar > + instructions and 1 scalar move to v0 register. Finally we need vector merge > + to merge them. > + > + lui a5, #imm > + add a5, #imm > + vmv.s.x v0, a5 > + vmerge.vxm v9, v9, a1, v0 > + > + So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8. > + If we use slide1down, the COST = nelts = 16 > 8 (COST of merge). > + So return true in this case as it is profitable. > + > + Sequence B: > + {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h} > + > + nelts = 16 > + npatterns = 8 > + > + COST of merge approach = (3 + 1) * npatterns = 32 > + COST of slide1down approach = nelts = 16 > + Return false in this case as it is NOT profitable in merge approach. 
> +*/ > +bool > +rvv_builder::repeating_sequence_use_merge_profitable_p () { > + if (inner_bytes_size () > UNITS_PER_WORD) > + return false; > + > + unsigned int nelts = full_nelts ().to_constant (); > + > + if (!repeating_sequence_p (0, nelts, npatterns ())) > + return false; > + > + unsigned int merge_cost = 1; > + unsigned int build_merge_mask_cost = 3; unsigned int > + slide1down_cost = nelts; > + > + return (build_merge_mask_cost + merge_cost) * npatterns () < > +slide1down_cost; } > + > /* Merge the repeating sequence into a single element and return the > RTX. */ rtx rvv_builder::get_merged_repeating_sequence () @@ > -1324,11 +1396,11 @@ rvv_builder::get_merged_repeating_sequence () > scalar_int_mode mode = Pmode; > rtx target = gen_reg_rtx (mode); > emit_move_insn (target, const0_rtx); > - rtx imm = gen_int_mode ((1ULL << m_inner_size) - 1, mode); > + rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode); > /* { a, b, a, b }: Generate duplicate element = b << bits | a. */ > for (unsigned int i = 0; i < npatterns (); i++) > { > - unsigned int loc = m_inner_size * i; > + unsigned int loc = m_inner_bits_size * i; > rtx shift = gen_int_mode (loc, mode); > rtx ele = gen_lowpart (mode, elt (i)); > rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, > false, @@ -1344,6 +1416,29 @@ rvv_builder::get_merged_repeating_sequence () > return target; > } > > +/* Get the mask for merge approach. > + > + Consider such following case: > + {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b} > + To merge "a", the mask should be 1010.... > + To merge "b", the mask should be 0101.... 
> +*/ > +rtx > +rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) > +const { > + unsigned HOST_WIDE_INT mask = 0; > + unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern); > + > + gcc_assert (BITS_PER_WORD % npatterns () == 0); > + > + int limit = BITS_PER_WORD / npatterns (); > + > + for (int i = 0; i < limit; i++) > + mask |= base_mask << (i * npatterns ()); > + > + return gen_int_mode (mask, inner_int_mode ()); } > + > /* Subroutine of riscv_vector_expand_vector_init. > Works as follows: > (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. > @@ -1371,6 +1466,111 @@ expand_vector_init_insert_elems (rtx target, const rvv_builder &builder, > } > } > > +/* Emit vmv.s.x instruction. */ > + > +static void > +emit_scalar_move_insn (unsigned icode, rtx *ops) { > + machine_mode data_mode = GET_MODE (ops[0]); > + machine_mode mask_mode = get_mask_mode (data_mode).require (); > + insn_expander<RVV_INSN_OPERANDS_MAX> e (riscv_vector::RVV_SCALAR_MOV_OP, > + /* HAS_DEST_P */ true, > + /* FULLY_UNMASKED_P */ false, > + /* USE_REAL_MERGE_P */ true, > + /* HAS_AVL_P */ true, > + /* VLMAX_P */ false, > + data_mode, mask_mode); > + e.set_policy (TAIL_ANY); > + e.set_policy (MASK_ANY); > + e.set_vl (CONST1_RTX (Pmode)); > + e.emit_insn ((enum insn_code) icode, ops); } > + > +/* Emit vmv.v.x instruction with vlmax. */ > + > +static void > +emit_vlmax_integer_move_insn (unsigned icode, rtx *ops, rtx vl) { > + emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ops, vl); } > + > +/* Emit vmv.v.x instruction with nonvlmax. */ > + > +static void > +emit_nonvlmax_integer_move_insn (unsigned icode, rtx *ops, rtx avl) { > + emit_nonvlmax_insn (icode, riscv_vector::RVV_UNOP, ops, avl); } > + > +/* Emit merge instruction. 
*/ > + > +static machine_mode > +get_repeating_sequence_dup_machine_mode (const rvv_builder &builder) > +{ > + poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ()); > + > + if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR)) > + { > + dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR, > + builder.inner_bytes_size ()); > + } > + > + return get_vector_mode (builder.inner_int_mode (), > +dup_nunits).require (); } > + > +/* Use merge approach to initialize the vector with repeating sequence. > + v = {a, b, a, b, a, b, a, b}. > + > + v = broadcast (a). > + mask = 0b01010101.... > + v = merge (v, b, mask) > +*/ > +static void > +expand_vector_init_merge_repeating_sequence (rtx target, > + const rvv_builder > +&builder) { > + machine_mode dup_mode = get_repeating_sequence_dup_machine_mode > +(builder); > + machine_mode dup_mask_mode = get_mask_mode (dup_mode).require (); > + machine_mode mask_mode = get_mask_mode (builder.mode ()).require > +(); > + uint64_t full_nelts = builder.full_nelts ().to_constant (); > + > + /* Step 1: Broadcast the first pattern. */ rtx ops[] = {target, > + force_reg (GET_MODE_INNER (dup_mode), builder.elt (0))}; > + emit_vlmax_integer_move_insn (code_for_pred_broadcast (builder.mode ()), > + ops, NULL_RTX); > + > + /* Step 2: Merge the rest iteration of pattern. */ for (unsigned > + int i = 1; i < builder.npatterns (); i++) > + { > + /* Step 2-1: Generate mask register v0 for each merge. */ > + rtx merge_mask = builder.get_merge_scalar_mask (i); > + rtx mask = gen_reg_rtx (mask_mode); > + rtx dup = gen_reg_rtx (dup_mode); > + > + if (full_nelts <= BITS_PER_WORD) /* vmv.s.x. */ > + { > + rtx ops[] = {dup, gen_scalar_move_mask (dup_mask_mode), > + RVV_VUNDEF (dup_mode), merge_mask}; > + emit_scalar_move_insn (code_for_pred_broadcast (GET_MODE (dup)), > + ops); > + } > + else /* vmv.v.x. 
*/ > + { > + rtx ops[] = {dup, force_reg (GET_MODE_INNER (dup_mode), merge_mask)}; > + rtx vl = gen_int_mode (CEIL (full_nelts, BITS_PER_WORD), Pmode); > + emit_nonvlmax_integer_move_insn (code_for_pred_broadcast (dup_mode), > + ops, vl); > + } > + > + emit_move_insn (mask, gen_lowpart (mask_mode, dup)); > + > + /* Step 2-2: Merge pattern according to the mask. */ > + rtx ops[] = {target, target, builder.elt (i), mask}; > + emit_vlmax_merge_insn (code_for_pred_merge_scalar (GET_MODE (target)), > + riscv_vector::RVV_MERGE_OP, ops); > + } > +} > + > /* Initialize register TARGET from the elements in PARALLEL rtx VALS. > */ > > void > @@ -1394,6 +1594,19 @@ expand_vec_init (rtx target, rtx vals) > emit_move_insn (target, gen_lowpart (mode, dup)); > return; > } > + > + /* Case 2: Optimize repeating sequence cases that Case 1 can > + not handle and it is profitable. For example: > + ELEMENT BITSIZE = 64. > + v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}. > + We can't find a vector mode for "ab" which will be combined into > + 128-bit element to duplicate. */ > + if (v.repeating_sequence_use_merge_profitable_p ()) > + { > + expand_vector_init_merge_repeating_sequence (target, v); > + return; > + } > + > /* TODO: We will support more Initialization of vector in the future. */ > } > > diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index > 807b0bccc18..4541255a8ae 100644 > --- a/gcc/config/riscv/riscv.h > +++ b/gcc/config/riscv/riscv.h > @@ -150,6 +150,7 @@ ASM_MISA_SPEC > > /* Width of a word, in units (bytes). */ #define UNITS_PER_WORD > (TARGET_64BIT ? 
8 : 4) > +#define BITS_PER_WORD (BITS_PER_UNIT * UNITS_PER_WORD) > #ifndef IN_LIBGCC2 > #define MIN_UNITS_PER_WORD 4 > #endif > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-1.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-1.c > new file mode 100644 > index 00000000000..59ad49cf795 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat > +++ -sequence-1.c > @@ -0,0 +1,21 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d" } */ > + > +#include <stdint-gcc.h> > + > +typedef int64_t vnx16di __attribute__ ((vector_size (1024))); > + > +__attribute__ ((noipa)) void > +f_vnx16di (int64_t a, int64_t b, int64_t *out) { > + vnx16di v = { > + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, > + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, > + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, > + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, > +a, b, a, b, a, b, a, b, a, b, > + }; > + *(vnx16di *) out = v; > +} > + > +/* { dg-final { scan-assembler-times > +{vmv\.v\.x\s+v[0-9]+,\s*[a-x0-9]+} 2 } } */ > +/* { dg-final { scan-assembler-times > +{vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */ > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-2.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-2.c > new file mode 100644 > index 00000000000..fe3741e3be7 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat > +++ -sequence-2.c > @@ -0,0 +1,24 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ > + > +#include <stdint-gcc.h> > + > +typedef double vnx8df __attribute__ ((vector_size (64))); typedef 
> +double vnx16df __attribute__ ((vector_size (128))); > + > +__attribute__ ((noipa)) void > +f_vnx8df (double a, double b, double *out) { > + vnx8df v = {a, b, a, b, a, b, a, b}; > + *(vnx8df *) out = v; > +} > + > +__attribute__ ((noipa)) void > +f_vnx16df (double a, double b, double *out) { > + vnx16df v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}; > + *(vnx16df *) out = v; > +} > + > +/* { dg-final { scan-assembler-times > +{vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */ > +/* { dg-final { scan-assembler-times > +{vfmerge\.vfm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */ > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-3.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-3.c > new file mode 100644 > index 00000000000..74776def963 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat > +++ -sequence-3.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ > + > +#include <stdint-gcc.h> > + > +typedef int64_t vnx8di __attribute__ ((vector_size (64))); typedef > +int64_t vnx16di __attribute__ ((vector_size (128))); > + > +__attribute__ ((noipa)) void > +f_vnx8di (int64_t a, int64_t b, int64_t *out) { > + vnx8di v = {a, b, a, b, a, b, a, b}; > + *(vnx8di *) out = v; > +} > + > +__attribute__ ((noipa)) void > +f_vnx16di (int64_t a, int64_t b, int64_t *out) { > + vnx16di v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}; > + *(vnx16di *) out = v; > +} > + > + > +/* { dg-final { scan-assembler-times > +{vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */ > +/* { dg-final { scan-assembler-times > +{vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */ > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-4.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-4.c > new file mode 100644 > index 00000000000..2f61465e84f > --- 
/dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat > +++ -sequence-4.c > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ > + > +#include <stdint-gcc.h> > + > +typedef int64_t vnx8di __attribute__ ((vector_size (64))); > + > +__attribute__ ((noipa)) void > +f_vnx8di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out) { > + vnx8di v = {a, b, c, d, a, b, c, d}; > + *(vnx8di *) out = v; > +} > + > +/* { dg-final { scan-assembler-times > +{vslide1down\.vx\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+} 7 } } */ > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-5.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-5.c > new file mode 100644 > index 00000000000..7f4e6783f8e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat > +++ -sequence-5.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ > + > +#include <stdint-gcc.h> > + > +typedef int64_t vnx16di __attribute__ ((vector_size (128))); > + > +__attribute__ ((noipa)) void > +f_vnx16di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out) > +{ > + vnx16di v = {a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d,}; > + *(vnx16di *) out = v; > +} > + > +/* { dg-final { scan-assembler-times > +{vmv\.v\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */ > +/* { dg-final { scan-assembler-times > +{vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 0 } } */ > +/* { dg-final { scan-assembler-times > +{vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 0 } } */ > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-run-1.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-run-1.c > new file mode 100644 > index 00000000000..1931d3f5fa0 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat > +++ 
-sequence-run-1.c > @@ -0,0 +1,47 @@ > +/* { dg-do run { target { riscv_vector } } } */ > +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } > +*/ > + > +#include "init-repeat-sequence-2.c" > + > +int > +main () > +{ > + double a = -1789089.23423; > + double b = -8916156.45644; > + > + double v_vnx8df[sizeof (vnx8df) / sizeof (double)]; f_vnx8df (a, > + b, v_vnx8df); > + > + /* Even elements must equal a and odd elements must equal b. */ > + for (int i = 0; i < sizeof (vnx8df) / sizeof (double); i++) > + { > + if (i % 2 == 0) > + { > + if (v_vnx8df[i] != a) > + __builtin_abort (); > + } > + else > + { > + if (v_vnx8df[i] != b) > + __builtin_abort (); > + } > + } > + > + double v_vnx16df[sizeof (vnx16df) / sizeof (double)]; f_vnx16df > + (a, b, v_vnx16df); for (int i = 0; i < sizeof (vnx16df) / sizeof > + (double); i++) > + { > + if (i % 2 == 0) > + { > + if (v_vnx16df[i] != a) > + __builtin_abort (); > + } > + else > + { > + if (v_vnx16df[i] != b) > + __builtin_abort (); > + } > + } > + > + return 0; > +} > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-run-2.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-run-2.c > new file mode 100644 > index 00000000000..5564dd4a05a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat > +++ -sequence-run-2.c > @@ -0,0 +1,46 @@ > +/* { dg-do run { target { riscv_vector } } } */ > +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } > +*/ > + > +#include "init-repeat-sequence-3.c" > + > +int > +main () > +{ > + int64_t a = -178908923423; > + int64_t b = -891615645644; > + > + int64_t v_vnx8di[sizeof (vnx8di) / sizeof (int64_t)]; f_vnx8di (a, > + b, v_vnx8di); for (int i = 0; i < sizeof (vnx8di) / sizeof > + (int64_t); i++) > + { > + if (i % 2 == 0) > + { > + if (v_vnx8di[i] != a) > + __builtin_abort (); > + } > + else > + { > + if (v_vnx8di[i] != b) > + __builtin_abort (); > + } > + } > + > + int64_t v_vnx16di[sizeof (vnx16di) / 
sizeof (int64_t)]; f_vnx16di > + (a, b, v_vnx16di); > + > + for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++) > + { > + if (i % 2 == 0) > + { > + if (v_vnx16di[i] != a) > + __builtin_abort (); > + } > + else > + { > + if (v_vnx16di[i] != b) > + __builtin_abort (); > + } > + } > + > + return 0; > +} > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-run-3.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq > uence-run-3.c > new file mode 100644 > index 00000000000..fec5adc56de > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat > +++ -sequence-run-3.c > @@ -0,0 +1,41 @@ > +/* { dg-do run { target { riscv_vector } } } */ > +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } > +*/ > + > +#include "init-repeat-sequence-5.c" > + > +int > +main () > +{ > + int64_t a = -178908923423; > + int64_t b = -891615645644; > + int64_t c = 78908923423; > + int64_t d = 81615645644; > + > + int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)]; f_vnx16di > + (a, b, c, d, v_vnx16di); for (int i = 0; i < sizeof (vnx16di) / > + sizeof (int64_t); i++) > + { > + if (i % 4 == 0) > + { > + if (v_vnx16di[i] != a) > + __builtin_abort (); > + } > + else if (i % 4 == 1) > + { > + if (v_vnx16di[i] != b) > + __builtin_abort (); > + } > + else if (i % 4 == 2) > + { > + if (v_vnx16di[i] != c) > + __builtin_abort (); > + } > + else > + { > + if (v_vnx16di[i] != d) > + __builtin_abort (); > + } > + } > + > + return 0; > +} > -- > 2.34.1 >
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 0462f96c8d5..277845673d4 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -142,6 +142,7 @@ enum insn_type RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand. */ RVV_UNOP_MU = RVV_UNOP + 2, /* Likewise. */ RVV_TERNOP = 5, + RVV_SCALAR_MOV_OP = 4, /* +1 for VUNDEF according to vector.md. */ }; enum vlmul_type { diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index a5715bb466c..8c920532549 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -21,6 +21,10 @@ #define IN_TARGET_CODE 1 +/* We have a maximum of 11 operands for RVV instruction patterns according to + the vector.md. */ +#define RVV_INSN_OPERANDS_MAX 11 + #include "config.h" #include "system.h" #include "coretypes.h" @@ -1286,19 +1290,32 @@ public: : rtx_vector_builder (mode, npatterns, nelts_per_pattern) { m_inner_mode = GET_MODE_INNER (mode); - m_inner_size = GET_MODE_BITSIZE (m_inner_mode).to_constant (); + m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode); + m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode); + + gcc_assert ( + int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode)); } bool can_duplicate_repeating_sequence_p (); rtx get_merged_repeating_sequence (); + bool repeating_sequence_use_merge_profitable_p (); + rtx get_merge_scalar_mask (unsigned int) const; + machine_mode new_mode () const { return m_new_mode; } + scalar_mode inner_mode () const { return m_inner_mode; } + scalar_int_mode inner_int_mode () const { return m_inner_int_mode; } + unsigned int inner_bits_size () const { return m_inner_bits_size; } + unsigned int inner_bytes_size () const { return m_inner_bytes_size; } private: - machine_mode m_inner_mode; + scalar_mode m_inner_mode; + scalar_int_mode m_inner_int_mode; machine_mode m_new_mode; scalar_int_mode m_new_inner_mode; - unsigned int m_inner_size; + unsigned int 
m_inner_bits_size; + unsigned int m_inner_bytes_size; }; /* Return true if the vector duplicated by a super element which is the fusion @@ -1309,7 +1326,7 @@ bool rvv_builder::can_duplicate_repeating_sequence_p () { poly_uint64 new_size = exact_div (full_nelts (), npatterns ()); - unsigned int new_inner_size = m_inner_size * npatterns (); + unsigned int new_inner_size = m_inner_bits_size * npatterns (); if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode) || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode)) @@ -1317,6 +1334,61 @@ rvv_builder::can_duplicate_repeating_sequence_p () return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ()); } +/* Return true if it is a repeating sequence for which the + merge approach has better codegen than the default + approach (slide1down). + + Sequence A: + {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b} + + nelts = 16 + npatterns = 2 + + for merging a we need mask 101010.... + for merging b we need mask 010101.... + + For each element of the pattern, we need to build a mask in a scalar register. + Mostly we need 3 instructions (aka COST = 3), which consist of 2 scalar + instructions and 1 scalar move to v0 register. Finally we need vector merge + to merge them. + + lui a5, #imm + add a5, #imm + vmv.s.x v0, a5 + vmerge.vxm v9, v9, a1, v0 + + So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8. + If we use slide1down, the COST = nelts = 16 > 8 (COST of merge). + So return true in this case as it is profitable. + + Sequence B: + {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h} + + nelts = 16 + npatterns = 8 + + COST of merge approach = (3 + 1) * npatterns = 32 + COST of slide1down approach = nelts = 16 + Return false in this case as it is NOT profitable in merge approach.
+*/ +bool +rvv_builder::repeating_sequence_use_merge_profitable_p () +{ + if (inner_bytes_size () > UNITS_PER_WORD) + return false; + + unsigned int nelts = full_nelts ().to_constant (); + + if (!repeating_sequence_p (0, nelts, npatterns ())) + return false; + + unsigned int merge_cost = 1; + unsigned int build_merge_mask_cost = 3; + unsigned int slide1down_cost = nelts; + + return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost; +} + /* Merge the repeating sequence into a single element and return the RTX. */ rtx rvv_builder::get_merged_repeating_sequence () @@ -1324,11 +1396,11 @@ rvv_builder::get_merged_repeating_sequence () scalar_int_mode mode = Pmode; rtx target = gen_reg_rtx (mode); emit_move_insn (target, const0_rtx); - rtx imm = gen_int_mode ((1ULL << m_inner_size) - 1, mode); + rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode); /* { a, b, a, b }: Generate duplicate element = b << bits | a. */ for (unsigned int i = 0; i < npatterns (); i++) { - unsigned int loc = m_inner_size * i; + unsigned int loc = m_inner_bits_size * i; rtx shift = gen_int_mode (loc, mode); rtx ele = gen_lowpart (mode, elt (i)); rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false, @@ -1344,6 +1416,29 @@ rvv_builder::get_merged_repeating_sequence () return target; } +/* Get the mask for merge approach. + + Consider such following case: + {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b} + To merge "a", the mask should be 1010.... + To merge "b", the mask should be 0101.... +*/ +rtx +rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const +{ + unsigned HOST_WIDE_INT mask = 0; + unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern); + + gcc_assert (BITS_PER_WORD % npatterns () == 0); + + int limit = BITS_PER_WORD / npatterns (); + + for (int i = 0; i < limit; i++) + mask |= base_mask << (i * npatterns ()); + + return gen_int_mode (mask, inner_int_mode ()); +} + /* Subroutine of riscv_vector_expand_vector_init. 
Works as follows: (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. @@ -1371,6 +1466,111 @@ expand_vector_init_insert_elems (rtx target, const rvv_builder &builder, } } +/* Emit vmv.s.x instruction. */ + +static void +emit_scalar_move_insn (unsigned icode, rtx *ops) +{ + machine_mode data_mode = GET_MODE (ops[0]); + machine_mode mask_mode = get_mask_mode (data_mode).require (); + insn_expander<RVV_INSN_OPERANDS_MAX> e (riscv_vector::RVV_SCALAR_MOV_OP, + /* HAS_DEST_P */ true, + /* FULLY_UNMASKED_P */ false, + /* USE_REAL_MERGE_P */ true, + /* HAS_AVL_P */ true, + /* VLMAX_P */ false, + data_mode, mask_mode); + e.set_policy (TAIL_ANY); + e.set_policy (MASK_ANY); + e.set_vl (CONST1_RTX (Pmode)); + e.emit_insn ((enum insn_code) icode, ops); +} + +/* Emit vmv.v.x instruction with vlmax. */ + +static void +emit_vlmax_integer_move_insn (unsigned icode, rtx *ops, rtx vl) +{ + emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ops, vl); +} + +/* Emit vmv.v.x instruction with nonvlmax. */ + +static void +emit_nonvlmax_integer_move_insn (unsigned icode, rtx *ops, rtx avl) +{ + emit_nonvlmax_insn (icode, riscv_vector::RVV_UNOP, ops, avl); +} + +/* Return the integer vector mode used for the temporary that holds the merge mask. */ + +static machine_mode +get_repeating_sequence_dup_machine_mode (const rvv_builder &builder) +{ + poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ()); + + if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR)) + { + dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR, + builder.inner_bytes_size ()); + } + + return get_vector_mode (builder.inner_int_mode (), dup_nunits).require (); +} + +/* Use merge approach to initialize the vector with repeating sequence. + v = {a, b, a, b, a, b, a, b}. + + v = broadcast (a). + mask = 0b01010101....
+ v = merge (v, b, mask) +*/ +static void +expand_vector_init_merge_repeating_sequence (rtx target, + const rvv_builder &builder) +{ + machine_mode dup_mode = get_repeating_sequence_dup_machine_mode (builder); + machine_mode dup_mask_mode = get_mask_mode (dup_mode).require (); + machine_mode mask_mode = get_mask_mode (builder.mode ()).require (); + uint64_t full_nelts = builder.full_nelts ().to_constant (); + + /* Step 1: Broadcast the first pattern. */ + rtx ops[] = {target, force_reg (GET_MODE_INNER (dup_mode), builder.elt (0))}; + emit_vlmax_integer_move_insn (code_for_pred_broadcast (builder.mode ()), + ops, NULL_RTX); + + /* Step 2: Merge the rest iteration of pattern. */ + for (unsigned int i = 1; i < builder.npatterns (); i++) + { + /* Step 2-1: Generate mask register v0 for each merge. */ + rtx merge_mask = builder.get_merge_scalar_mask (i); + rtx mask = gen_reg_rtx (mask_mode); + rtx dup = gen_reg_rtx (dup_mode); + + if (full_nelts <= BITS_PER_WORD) /* vmv.s.x. */ + { + rtx ops[] = {dup, gen_scalar_move_mask (dup_mask_mode), + RVV_VUNDEF (dup_mode), merge_mask}; + emit_scalar_move_insn (code_for_pred_broadcast (GET_MODE (dup)), + ops); + } + else /* vmv.v.x. */ + { + rtx ops[] = {dup, force_reg (GET_MODE_INNER (dup_mode), merge_mask)}; + rtx vl = gen_int_mode (CEIL (full_nelts, BITS_PER_WORD), Pmode); + emit_nonvlmax_integer_move_insn (code_for_pred_broadcast (dup_mode), + ops, vl); + } + + emit_move_insn (mask, gen_lowpart (mask_mode, dup)); + + /* Step 2-2: Merge pattern according to the mask. */ + rtx ops[] = {target, target, builder.elt (i), mask}; + emit_vlmax_merge_insn (code_for_pred_merge_scalar (GET_MODE (target)), + riscv_vector::RVV_MERGE_OP, ops); + } +} + /* Initialize register TARGET from the elements in PARALLEL rtx VALS. 
*/ void @@ -1394,6 +1594,19 @@ expand_vec_init (rtx target, rtx vals) emit_move_insn (target, gen_lowpart (mode, dup)); return; } + + /* Case 2: Optimize repeating sequence cases that Case 1 can + not handle and it is profitable. For example: + ELEMENT BITSIZE = 64. + v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}. + We can't find a vector mode for "ab" which will be combined into + 128-bit element to duplicate. */ + if (v.repeating_sequence_use_merge_profitable_p ()) + { + expand_vector_init_merge_repeating_sequence (target, v); + return; + } + /* TODO: We will support more Initialization of vector in the future. */ } diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 807b0bccc18..4541255a8ae 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -150,6 +150,7 @@ ASM_MISA_SPEC /* Width of a word, in units (bytes). */ #define UNITS_PER_WORD (TARGET_64BIT ? 8 : 4) +#define BITS_PER_WORD (BITS_PER_UNIT * UNITS_PER_WORD) #ifndef IN_LIBGCC2 #define MIN_UNITS_PER_WORD 4 #endif diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c new file mode 100644 index 00000000000..59ad49cf795 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d" } */ + +#include <stdint-gcc.h> + +typedef int64_t vnx16di __attribute__ ((vector_size (1024))); + +__attribute__ ((noipa)) void +f_vnx16di (int64_t a, int64_t b, int64_t *out) +{ + vnx16di v = { + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, 
a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + }; + *(vnx16di *) out = v; +} + +/* { dg-final { scan-assembler-times {vmv\.v\.x\s+v[0-9]+,\s*[a-x0-9]+} 2 } } */ +/* { dg-final { scan-assembler-times {vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c new file mode 100644 index 00000000000..fe3741e3be7 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ + +#include <stdint-gcc.h> + +typedef double vnx8df __attribute__ ((vector_size (64))); +typedef double vnx16df __attribute__ ((vector_size (128))); + +__attribute__ ((noipa)) void +f_vnx8df (double a, double b, double *out) +{ + vnx8df v = {a, b, a, b, a, b, a, b}; + *(vnx8df *) out = v; +} + +__attribute__ ((noipa)) void +f_vnx16df (double a, double b, double *out) +{ + vnx16df v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}; + *(vnx16df *) out = v; +} + +/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vfmerge\.vfm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c new file mode 100644 index 00000000000..74776def963 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ + +#include <stdint-gcc.h> + +typedef int64_t vnx8di __attribute__ ((vector_size (64))); +typedef int64_t vnx16di __attribute__ ((vector_size (128))); + +__attribute__ ((noipa)) void +f_vnx8di (int64_t a, int64_t b, int64_t *out) +{ + 
vnx8di v = {a, b, a, b, a, b, a, b}; + *(vnx8di *) out = v; +} + +__attribute__ ((noipa)) void +f_vnx16di (int64_t a, int64_t b, int64_t *out) +{ + vnx16di v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}; + *(vnx16di *) out = v; +} + + +/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c new file mode 100644 index 00000000000..2f61465e84f --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ + +#include <stdint-gcc.h> + +typedef int64_t vnx8di __attribute__ ((vector_size (64))); + +__attribute__ ((noipa)) void +f_vnx8di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out) +{ + vnx8di v = {a, b, c, d, a, b, c, d}; + *(vnx8di *) out = v; +} + +/* { dg-final { scan-assembler-times {vslide1down\.vx\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+} 7 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c new file mode 100644 index 00000000000..7f4e6783f8e --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */ + +#include <stdint-gcc.h> + +typedef int64_t vnx16di __attribute__ ((vector_size (128))); + +__attribute__ ((noipa)) void +f_vnx16di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out) +{ + vnx16di v = {a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d,}; + *(vnx16di *) out = v; +} + +/* { dg-final { scan-assembler-times {vmv\.v\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } 
*/ +/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 0 } } */ +/* { dg-final { scan-assembler-times {vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 0 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c new file mode 100644 index 00000000000..1931d3f5fa0 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c @@ -0,0 +1,47 @@ +/* { dg-do run { target { riscv_vector } } } */ +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */ + +#include "init-repeat-sequence-2.c" + +int +main () +{ + double a = -1789089.23423; + double b = -8916156.45644; + + double v_vnx8df[sizeof (vnx8df) / sizeof (double)]; + f_vnx8df (a, b, v_vnx8df); + + /* Even-indexed elements must equal a and odd-indexed elements must equal b. */ + for (int i = 0; i < sizeof (vnx8df) / sizeof (double); i++) + { + if (i % 2 == 0) + { + if (v_vnx8df[i] != a) + __builtin_abort (); + } + else + { + if (v_vnx8df[i] != b) + __builtin_abort (); + } + } + + double v_vnx16df[sizeof (vnx16df) / sizeof (double)]; + f_vnx16df (a, b, v_vnx16df); + for (int i = 0; i < sizeof (vnx16df) / sizeof (double); i++) + { + if (i % 2 == 0) + { + if (v_vnx16df[i] != a) + __builtin_abort (); + } + else + { + if (v_vnx16df[i] != b) + __builtin_abort (); + } + } + + return 0; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c new file mode 100644 index 00000000000..5564dd4a05a --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c @@ -0,0 +1,46 @@ +/* { dg-do run { target { riscv_vector } } } */ +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */ + +#include "init-repeat-sequence-3.c" + +int +main () +{ + int64_t a = -178908923423; + int64_t b = -891615645644; + + int64_t 
v_vnx8di[sizeof (vnx8di) / sizeof (int64_t)]; + f_vnx8di (a, b, v_vnx8di); + for (int i = 0; i < sizeof (vnx8di) / sizeof (int64_t); i++) + { + if (i % 2 == 0) + { + if (v_vnx8di[i] != a) + __builtin_abort (); + } + else + { + if (v_vnx8di[i] != b) + __builtin_abort (); + } + } + + int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)]; + f_vnx16di (a, b, v_vnx16di); + + for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++) + { + if (i % 2 == 0) + { + if (v_vnx16di[i] != a) + __builtin_abort (); + } + else + { + if (v_vnx16di[i] != b) + __builtin_abort (); + } + } + + return 0; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c new file mode 100644 index 00000000000..fec5adc56de --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c @@ -0,0 +1,41 @@ +/* { dg-do run { target { riscv_vector } } } */ +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */ + +#include "init-repeat-sequence-5.c" + +int +main () +{ + int64_t a = -178908923423; + int64_t b = -891615645644; + int64_t c = 78908923423; + int64_t d = 81615645644; + + int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)]; + f_vnx16di (a, b, c, d, v_vnx16di); + for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++) + { + if (i % 4 == 0) + { + if (v_vnx16di[i] != a) + __builtin_abort (); + } + else if (i % 4 == 1) + { + if (v_vnx16di[i] != b) + __builtin_abort (); + } + else if (i % 4 == 2) + { + if (v_vnx16di[i] != c) + __builtin_abort (); + } + else + { + if (v_vnx16di[i] != d) + __builtin_abort (); + } + } + + return 0; +}