diff mbox series

[V2] RISC-V: Disallow transformation into VLMAX AVL for cond_len_xxx when length is in range [0, 31]

Message ID 20231227023826.226460-1-juzhe.zhong@rivai.ai
State New
Headers show
Series [V2] RISC-V: Disallow transformation into VLMAX AVL for cond_len_xxx when length is in range [0, 31] | expand

Commit Message

juzhe.zhong@rivai.ai Dec. 27, 2023, 2:38 a.m. UTC
Notice we have the following situation:

        vsetivli        zero,4,e32,m1,ta,ma
        vlseg4e32.v     v4,(a5)
        vlseg4e32.v     v12,(a3)
        vsetvli a5,zero,e32,m1,tu,ma             ---> This is redundant since VLMAX AVL = 4 when it is fixed-vlmax
        vfadd.vf        v3,v13,fa0
        vfadd.vf        v1,v12,fa1
        vfmul.vv        v17,v3,v5
        vfmul.vv        v16,v1,v5

The root cause is that we blindly transform COND_LEN_xxx into VLMAX AVL when len == NUNITS.
However, we don't need to transform all of them: when len is in the range [0, 31] it can be used
as an immediate AVL (vsetivli), so we don't need to consume a scalar register.

After this patch:

	vsetivli	zero,4,e32,m1,tu,ma
	addi	a4,a5,400
	vlseg4e32.v	v12,(a3)
	vfadd.vf	v3,v13,fa0
	vfadd.vf	v1,v12,fa1
	vlseg4e32.v	v4,(a4)
	vfadd.vf	v2,v14,fa1
	vfmul.vv	v17,v3,v5
	vfmul.vv	v16,v1,v5

Tested on both RV32 and RV64 with no regressions.

Ok for trunk ?

gcc/ChangeLog:

	* config/riscv/riscv-v.cc (is_vlmax_len_p): New function.
	(expand_load_store): Disallow transformation into VLMAX when len is in range of [0,31]
	(expand_cond_len_op): Ditto.
	(expand_gather_scatter): Ditto.
	(expand_lanes_load_store): Ditto.
	(expand_fold_extract_last): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/post-ra-avl.c: Adapt test.
	* gcc.target/riscv/rvv/base/vf_avl-2.c: New test.

---
 gcc/config/riscv/riscv-v.cc                   | 21 +++++++++++++------
 .../riscv/rvv/autovec/post-ra-avl.c           |  2 +-
 .../gcc.target/riscv/rvv/base/vf_avl-2.c      | 21 +++++++++++++++++++
 3 files changed, 37 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c

Comments

Jeff Law Dec. 28, 2023, 4:33 p.m. UTC | #1
On 12/26/23 19:38, Juzhe-Zhong wrote:
> Notice we have this following situation:
> 
>          vsetivli        zero,4,e32,m1,ta,ma
>          vlseg4e32.v     v4,(a5)
>          vlseg4e32.v     v12,(a3)
>          vsetvli a5,zero,e32,m1,tu,ma             ---> This is redundant since VLMAX AVL = 4 when it is fixed-vlmax
>          vfadd.vf        v3,v13,fa0
>          vfadd.vf        v1,v12,fa1
>          vfmul.vv        v17,v3,v5
>          vfmul.vv        v16,v1,v5
> 
> The rootcause is that we transform COND_LEN_xxx into VLMAX AVL when len == NUNITS blindly.
> However, we don't need to transform all of them since when len is range of [0,31], we don't need to
> consume scalar registers.
> 
> After this patch:
> 
> 	vsetivli	zero,4,e32,m1,tu,ma
> 	addi	a4,a5,400
> 	vlseg4e32.v	v12,(a3)
> 	vfadd.vf	v3,v13,fa0
> 	vfadd.vf	v1,v12,fa1
> 	vlseg4e32.v	v4,(a4)
> 	vfadd.vf	v2,v14,fa1
> 	vfmul.vv	v17,v3,v5
> 	vfmul.vv	v16,v1,v5
> 
> Tested on both RV32 and RV64 no regression.
So it looks like the two fragments above are from different sources, 
though I guess it's also possible one of the cut-n-pastes just got 
truncated.  Note the differing number of vfadd instructions.  That 
doesn't invalidate the patch, but does make it slightly harder to reason 
about what you're doing.


> 
> Ok for trunk ?
> 
> gcc/ChangeLog:
> 
> 	* config/riscv/riscv-v.cc (is_vlmax_len_p): New function.
> 	(expand_load_store): Disallow transformation into VLMAX when len is in range of [0,31]
> 	(expand_cond_len_op): Ditto.
> 	(expand_gather_scatter): Ditto.
> 	(expand_lanes_load_store): Ditto.
> 	(expand_fold_extract_last): Ditto.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/riscv/rvv/autovec/post-ra-avl.c: Adapt test.
> 	* gcc.target/riscv/rvv/base/vf_avl-2.c: New test.
> 
> ---
>   gcc/config/riscv/riscv-v.cc                   | 21 +++++++++++++------
>   .../riscv/rvv/autovec/post-ra-avl.c           |  2 +-
>   .../gcc.target/riscv/rvv/base/vf_avl-2.c      | 21 +++++++++++++++++++
>   3 files changed, 37 insertions(+), 7 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c
> 
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index 038ab084a37..0cc7af58da6 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -68,6 +68,16 @@ imm_avl_p (machine_mode mode)
>   	   : false;
>   }
>   
> +/* Return true if LEN is equal to NUNITS that outbounds range of [0, 31].  */
Perhaps "that is out of the range [0, 31]."?

OK with the comment nit fixed.
jeff
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 038ab084a37..0cc7af58da6 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -68,6 +68,16 @@  imm_avl_p (machine_mode mode)
 	   : false;
 }
 
+/* Return true if LEN is equal to NUNITS that is out of the range [0, 31].  */
+static bool
+is_vlmax_len_p (machine_mode mode, rtx len)
+{
+  poly_int64 value;
+  return poly_int_rtx_p (len, &value)
+	 && known_eq (value, GET_MODE_NUNITS (mode))
+	 && !satisfies_constraint_K (len);
+}
+
 /* Helper functions for insn_flags && insn_types */
 
 /* Return true if caller need pass mask operand for insn pattern with
@@ -3776,7 +3786,7 @@  expand_load_store (rtx *ops, bool is_load)
   rtx len = ops[3];
   machine_mode mode = GET_MODE (ops[0]);
 
-  if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)))
+  if (is_vlmax_len_p (mode, len))
     {
       /* If the length operand is equal to VF, it is VLMAX load/store.  */
       if (is_load)
@@ -3842,8 +3852,7 @@  expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
   machine_mode mask_mode = GET_MODE (mask);
   poly_int64 value;
   bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
-  bool is_vlmax_len
-    = poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode));
+  bool is_vlmax_len = is_vlmax_len_p (mode, len);
 
   unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
   if (is_dummy_mask)
@@ -4012,7 +4021,7 @@  expand_gather_scatter (rtx *ops, bool is_load)
   unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
   poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
   poly_int64 value;
-  bool is_vlmax = poly_int_rtx_p (len, &value) && known_eq (value, nunits);
+  bool is_vlmax = is_vlmax_len_p (vec_mode, len);
 
   /* Extend the offset element to address width.  */
   if (inner_offsize < BITS_PER_WORD)
@@ -4199,7 +4208,7 @@  expand_lanes_load_store (rtx *ops, bool is_load)
   rtx reg = is_load ? ops[0] : ops[1];
   machine_mode mode = GET_MODE (ops[0]);
 
-  if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)))
+  if (is_vlmax_len_p (mode, len))
     {
       /* If the length operand is equal to VF, it is VLMAX load/store.  */
       if (is_load)
@@ -4252,7 +4261,7 @@  expand_fold_extract_last (rtx *ops)
   rtx slide_vect = gen_reg_rtx (mode);
   insn_code icode;
 
-  if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)))
+  if (is_vlmax_len_p (mode, len))
     len = NULL_RTX;
 
   /* Calculate the number of 1-bit in mask. */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c
index f3d12bac7cd..bff6dcb1c38 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c
@@ -13,4 +13,4 @@  int foo() {
   return a;
 }
 
-/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero} 1 } } */
+/* { dg-final { scan-assembler-not {vsetvli\s+[a-x0-9]+,\s*zero} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c
new file mode 100644
index 00000000000..5a94a51f308
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=fixed-vlmax" } */
+
+float f[12][100];
+
+void bad1(float v1, float v2)
+{
+  for (int r = 0; r < 100; r += 4)
+    {
+      int i = r + 1;
+      f[0][r] = f[1][r] * (f[2][r] + v2) - f[1][i] * (f[2][i] + v1);
+      f[0][i] = f[1][r] * (f[2][i] + v1) + f[1][i] * (f[2][r] + v2);
+      f[0][r+2] = f[1][r+2] * (f[2][r+2] + v2) - f[1][i+2] * (f[2][i+2] + v1);
+      f[0][i+2] = f[1][r+2] * (f[2][i+2] + v1) + f[1][i+2] * (f[2][r+2] + v2);
+    }
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 2 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */