
RISC-V: Fix inferior codegen for vse intrinsics.

Message ID 20221229153402.40958-1-juzhe.zhong@rivai.ai
State New
Series RISC-V: Fix inferior codegen for vse intrinsics.

Commit Message

juzhe.zhong@rivai.ai Dec. 29, 2022, 3:34 p.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Currently we use pred_mov to do the codegen for vse intrinsics. However, it
generates inferior code, which I noticed while testing the AVL model of the VSETVL pass
with vse intrinsics.

Consider the following code:
void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
{
  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
  vbool64_t mask = *(vbool64_t*)mask_in;
  for (int i = 0; i < n; i++)
    {
      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);

      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);

      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);

      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);

      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);

      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);

      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
    }
}

Before this patch:
	csrr	t2,vlenb
	srli	t2,t2,1
	slli	s0,t2,2
	vsetvli	zero,19,e16,mf2,ta,ma
	sub	s0,s0,t2
	csrr	t2,vlenb
	vle16.v	v24,0(a3)
	mv	a4,a3
	vse16.v	v24,0(a1)
	srli	t2,t2,1
	add	a2,a3,t6
	add	s0,s0,sp
	vsetvli	zero,19,e32,mf2,ta,ma
	addi	a3,a3,4
	vle32.v	v24,0(a3)
	vsetvli	zero,t0,e32,mf2,ta,ma
	vse32.v	v24,0(s0)
	slli	s0,t2,2
	sub	s0,s0,t2
	add	s0,s0,sp
	vsetvli	t0,zero,e32,mf2,ta,ma
	vle32.v	v24,0(s0)
	mv	s0,t2
	slli	t2,t2,2
	mv	a5,a1
	vsetvli	zero,19,e32,mf2,ta,ma
	addi	a1,a1,4
	sub	t2,t2,s0
	vse32.v	v24,0(a1)
	add	t2,t2,sp
	vsetvli	t0,zero,e32,mf2,ta,ma
	addi	t1,a5,796
	vle32.v	v24,0(t2)
	addi	t5,a4,1196
	addi	a7,a5,1196
	addi	t4,a4,1996
	addi	a6,a5,1996
	vsetvli	zero,13,e32,mf2,ta,ma
	add	a4,a4,t3
	vse32.v	v24,0(t1)
	add	a5,a5,t3
	vsetvli	zero,11,e64,m1,tu,mu
	vle64.v	v24,0(t5),v0.t
	vse64.v	v24,0(a7)
	vle64.v	v24,0(t4),v0.t
	vse64.v	v24,0(a6)
	vle64.v	v24,0(a4),v0.t
	vse64.v	v24,0(a5),v0.t
	vsetvli	zero,11,e8,mf4,ta,ma
	vle8.v	v24,0(a2)
	vse8.v	v24,0(a2)
	bne	a0,a3,.L8
	csrr	t0,vlenb
	slli	t1,t0,1
	add	sp,sp,t1
	lw	s0,12(sp)
	addi	sp,sp,16
	jr	ra

We are generating redundant spill code.
Here we introduce a dedicated pred_store pattern for vse intrinsics, analogous to
the maskstore patterns in ARM SVE.

After this patch:
	vsetvli	zero,19,e16,mf2,ta,ma
	mv	a5,a4
	vle16.v	v24,0(a0)
	mv	a3,a0
	vse16.v	19,0(a4)
	addi	t1,a4,796
	vsetvli	zero,19,e32,mf2,ta,ma
	addi	a0,a0,4
	addi	a4,a4,4
	vle32.v	v24,0(a0)
	addi	t0,a3,1196
	vse32.v	19,0(a4)
	addi	a7,a5,1196
	addi	t6,a3,1996
	addi	a6,a5,1996
	add	t5,a3,t4
	vsetvli	zero,13,e32,mf2,ta,ma
	add	a2,a5,t4
	vse32.v	13,0(t1)
	add	a3,a3,t3
	vsetvli	zero,11,e64,m1,tu,mu
	add	a5,a5,t3
	vle64.v	v24,0(t0),v0.t
	vse64.v	11,0(a7)
	vle64.v	v24,0(t6),v0.t
	vse64.v	11,0(a6)
	vle64.v	v24,0(t5),v0.t
	vse64.v	11,0(a2),v0.t
	vsetvli	zero,11,e8,mf4,ta,ma
	vle8.v	v24,0(a3)
	vse8.v	11,0(a5)
	bne	a1,a4,.L8
.L6:
	ret
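
For reference, a minimal store-only kernel (a hypothetical example, not part of the patch
or of the new test) goes through the same path. With the dedicated pred_store pattern the
stored value is consumed directly from a vector register; there is no "+m" memory input
left to model, so nothing forces a stack slot:

#include <stdint.h>
#include <stddef.h>
#include "riscv_vector.h"

/* Hypothetical example: copy vl int32 elements from in to out.  */
void copy_vl (int32_t * restrict out, const int32_t * restrict in, size_t vl)
{
  vint32m1_t v = __riscv_vle32_v_i32m1 (in, vl);
  /* The store expands through the new @pred_store<mode> pattern; the memory
     operand is a pure output, so v is stored straight from a vector register
     without a round trip through the stack.  */
  __riscv_vse32_v_i32m1 (out, v, vl);
}

Compiled with -march=rv32gcv -mabi=ilp32d -O3 (the options used by the new test), one
would expect a single vsetvli followed by a vle32.v/vse32.v pair and no csrr vlenb
stack adjustment.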

gcc/ChangeLog:

        * config/riscv/riscv-vector-builtins-bases.cc (class loadstore): Use pred_store for vse.
        * config/riscv/riscv-vector-builtins.cc (function_expander::add_mem_operand): Refine function.
        (function_expander::use_contiguous_load_insn): Adjust to the new implementation.
        (function_expander::use_contiguous_store_insn): Ditto.
        * config/riscv/riscv-vector-builtins.h: Refine function declarations.
        * config/riscv/vector.md (@pred_store<mode>): New pattern.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/base/vse-constraint-1.c: New test.

---
 .../riscv/riscv-vector-builtins-bases.cc      |  2 +-
 gcc/config/riscv/riscv-vector-builtins.cc     | 22 +----
 gcc/config/riscv/riscv-vector-builtins.h      |  8 +-
 gcc/config/riscv/vector.md                    | 23 ++++-
 .../riscv/rvv/base/vse-constraint-1.c         | 97 +++++++++++++++++++
 5 files changed, 128 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c

Comments

Kito Cheng Jan. 26, 2023, 7:12 p.m. UTC | #1
committed, thanks.


Patch

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 10373e5ccf2..af66b016b49 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -106,7 +106,7 @@  class loadstore : public function_base
   rtx expand (function_expander &e) const override
   {
     if (STORE_P)
-      return e.use_contiguous_store_insn (code_for_pred_mov (e.vector_mode ()));
+      return e.use_contiguous_store_insn (code_for_pred_store (e.vector_mode ()));
     else
       return e.use_contiguous_load_insn (code_for_pred_mov (e.vector_mode ()));
   }
diff --git a/gcc/config/riscv/riscv-vector-builtins.cc b/gcc/config/riscv/riscv-vector-builtins.cc
index e39bfea9636..47e01b647f8 100644
--- a/gcc/config/riscv/riscv-vector-builtins.cc
+++ b/gcc/config/riscv/riscv-vector-builtins.cc
@@ -845,15 +845,15 @@  function_expander::add_vundef_operand (machine_mode mode)
 }
 
 /* Add a memory operand with mode MODE and address ADDR.  */
-rtx
-function_expander::add_mem_operand (machine_mode mode, rtx addr)
+void
+function_expander::add_mem_operand (machine_mode mode, unsigned argno)
 {
   gcc_assert (VECTOR_MODE_P (mode));
+  rtx addr = expand_normal (CALL_EXPR_ARG (exp, argno));
   rtx mem = gen_rtx_MEM (mode, memory_address (mode, addr));
   /* The memory is only guaranteed to be element-aligned.  */
   set_mem_align (mem, GET_MODE_ALIGNMENT (GET_MODE_INNER (mode)));
   add_fixed_operand (mem);
-  return mem;
 }
 
 /* Use contiguous load INSN.  */
@@ -878,9 +878,7 @@  function_expander::use_contiguous_load_insn (insn_code icode)
   else
     add_vundef_operand (mode);
 
-  tree addr_arg = CALL_EXPR_ARG (exp, arg_offset++);
-  rtx addr = expand_normal (addr_arg);
-  add_mem_operand (mode, addr);
+  add_mem_operand (mode, arg_offset++);
 
   for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
     add_input_operand (argno);
@@ -904,27 +902,17 @@  function_expander::use_contiguous_store_insn (insn_code icode)
   /* Record the offset to get the argument.  */
   int arg_offset = 0;
 
-  int addr_loc = use_real_mask_p (pred) ? 1 : 0;
-  tree addr_arg = CALL_EXPR_ARG (exp, addr_loc);
-  rtx addr = expand_normal (addr_arg);
-  rtx mem = add_mem_operand (mode, addr);
+  add_mem_operand (mode, use_real_mask_p (pred) ? 1 : 0);
 
   if (use_real_mask_p (pred))
     add_input_operand (arg_offset++);
   else
     add_all_one_mask_operand (mask_mode);
 
-  /* To model "+m" constraint, we include memory operand into input.  */
-  add_input_operand (mode, mem);
-
   arg_offset++;
   for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
     add_input_operand (argno);
 
-  add_input_operand (Pmode, get_tail_policy_for_pred (pred));
-  add_input_operand (Pmode, get_mask_policy_for_pred (pred));
-  add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
-
   return generate_insn (icode);
 }
 
diff --git a/gcc/config/riscv/riscv-vector-builtins.h b/gcc/config/riscv/riscv-vector-builtins.h
index c13df99cb5b..58d8d78043c 100644
--- a/gcc/config/riscv/riscv-vector-builtins.h
+++ b/gcc/config/riscv/riscv-vector-builtins.h
@@ -317,12 +317,12 @@  public:
   rtx expand ();
 
   void add_input_operand (machine_mode, rtx);
-  void add_input_operand (unsigned argno);
+  void add_input_operand (unsigned);
   void add_output_operand (machine_mode, rtx);
-  void add_all_one_mask_operand (machine_mode mode);
-  void add_vundef_operand (machine_mode mode);
+  void add_all_one_mask_operand (machine_mode);
+  void add_vundef_operand (machine_mode);
   void add_fixed_operand (rtx);
-  rtx add_mem_operand (machine_mode, rtx);
+  void add_mem_operand (machine_mode, unsigned);
 
   machine_mode vector_mode (void) const;
 
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 89810b183fc..3d0174f98a2 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -209,7 +209,7 @@ 
 
 ;; The index of operand[] to get the merge op.
 (define_attr "merge_op_idx" ""
-	(cond [(eq_attr "type" "vlde,vste,vimov,vfmov,vldm,vstm,vlds,vmalu")
+	(cond [(eq_attr "type" "vlde,vimov,vfmov,vldm,vstm,vlds,vmalu")
 	 (const_int 2)]
 	(const_int INVALID_ATTRIBUTE)))
 
@@ -647,7 +647,7 @@ 
 	     (reg:SI VL_REGNUM)
 	     (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
 	  (match_operand:V 3 "vector_move_operand"       "    m,     m,    vr,    vr, viWc0")
-	  (match_operand:V 2 "vector_merge_operand"      "    0,    vu,   vu0,   vu0,   vu0")))]
+	  (match_operand:V 2 "vector_merge_operand"      "    0,    vu,    vu,   vu0,   vu0")))]
   "TARGET_VECTOR"
   "@
    vle<sew>.v\t%0,%3%p1
@@ -663,6 +663,25 @@ 
   [(set_attr "type" "vlde,vlde,vste,vimov,vimov")
    (set_attr "mode" "<MODE>")])
 
+;; Dedicated pattern for vse.v instruction since we can't reuse pred_mov pattern to include
+;; memory operand as input which will produce inferior codegen.
+(define_insn "@pred_store<mode>"
+  [(set (match_operand:V 0 "memory_operand"                 "+m")
+	(if_then_else:V
+	  (unspec:<VM>
+	    [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1")
+	     (match_operand 3 "vector_length_operand"    "   rK")
+	     (reg:SI VL_REGNUM)
+	     (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+	  (match_operand:V 2 "register_operand"         "    vr")
+	  (match_dup 0)))]
+  "TARGET_VECTOR"
+  "vse<sew>.v\t%2,%0%p1"
+  [(set_attr "type" "vste")
+   (set_attr "mode" "<MODE>")
+   (set (attr "avl_type") (symbol_ref "riscv_vector::NONVLMAX"))
+   (set_attr "vl_op_idx" "3")])
+
 ;; vlm.v/vsm.v/vmclr.m/vmset.m.
 ;; constraint alternative 0 match vlm.v.
 ;; constraint alternative 1 match vsm.v.
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
new file mode 100644
index 00000000000..5b8b9b41c7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
@@ -0,0 +1,97 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3" } */
+
+#include "riscv_vector.h"
+
+void f (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+  vbool64_t mask = *(vbool64_t*)mask_in;
+  for (int i = 0; i < n; i++)
+    {
+      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v3, 13);
+
+      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+    }
+}
+
+void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+  vbool64_t mask = *(vbool64_t*)mask_in;
+  for (int i = 0; i < n; i++)
+    {
+      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);
+
+      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+    }
+}
+
+void f3 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+  vbool64_t mask = *(vbool64_t*)mask_in;
+  for (int i = 0; i < n; i++)
+    {
+      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+      *(vint32mf2_t*)(out + i + 200) = v3;
+
+      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+    }
+}
+
+/* It should not have redundant vector register spills which produce csrr vlenb instructions allocate stack.  */
+/* { dg-final { scan-assembler-not {csrr\s+[a-x0-9]+,\s*vlenb} } } */