diff mbox series

[V2] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation

Message ID 20230612151107.13373-1-juzhe.zhong@rivai.ai
State New
Headers show
Series [V2] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation | expand

Commit Message

juzhe.zhong@rivai.ai June 12, 2023, 3:11 p.m. UTC
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

According to RVV ISA:
https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc

We can enhance VLA SLP auto-vectorization with (16.5.1. Synthesizing vdecompress)
Decompress operation.

Case 1 (nunits = POLY_INT_CST [16, 16]):
_48 = VEC_PERM_EXPR <_37, _35, { 0, POLY_INT_CST [16, 16], 1, POLY_INT_CST [17, 16], 2, POLY_INT_CST [18, 16], ... }>;
We can optimize such VLA SLP permuation pattern into:
_48 = vdecompress (_37, _35, mask = { 0, 1, 0, 1, ... };

Case 2 (nunits = POLY_INT_CST [16, 16]):
_23 = VEC_PERM_EXPR <_46, _44, { POLY_INT_CST [1, 1], POLY_INT_CST [3, 3], POLY_INT_CST [2, 1], POLY_INT_CST [4, 3], POLY_INT_CST [3, 1], POLY_INT_CST [5, 3], ... }>;
We can optimize such VLA SLP permuation pattern into:
_48 = vdecompress (slidedown(_46, 1/2 nunits), slidedown(_44, 1/2 nunits), mask = { 0, 1, 0, 1, ... };

For example:
void __attribute__ ((noinline, noclone))
vec_slp (uint64_t *restrict a, uint64_t b, uint64_t c, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i * 2] += b;
      a[i * 2 + 1] += c;
    }
}

ASM:
...
        vid.v   v0
        vand.vi v0,v0,1
        vmseq.vi        v0,v0,1  ===> mask = { 0, 1, 0, 1, ... }
vdecompress:
        viota.m v3,v0   
        vrgather.vv     v2,v1,v3,v0.t
Loop:
        vsetvli zero,a5,e64,m1,ta,ma
        vle64.v v1,0(a0)
        vsetvli a6,zero,e64,m1,ta,ma
        vadd.vv v1,v2,v1
        vsetvli zero,a5,e64,m1,ta,ma
        mv      a5,a3
        vse64.v v1,0(a0)
        add     a3,a3,a1
        add     a0,a0,a2
        bgtu    a5,a4,.L4


gcc/ChangeLog:

        * config/riscv/riscv-v.cc (emit_vlmax_decompress_insn): New function.
        (shuffle_decompress_patterns): New function.
        (expand_vec_perm_const_1): Add decompress optimization.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/partial/slp-8.c: New test.
        * gcc.target/riscv/rvv/autovec/partial/slp-9.c: New test.
        * gcc.target/riscv/rvv/autovec/partial/slp_run-8.c: New test.
        * gcc.target/riscv/rvv/autovec/partial/slp_run-9.c: New test.

---
 gcc/config/riscv/riscv-v.cc                   | 111 ++++++++++++++++++
 .../riscv/rvv/autovec/partial/slp-8.c         |  30 +++++
 .../riscv/rvv/autovec/partial/slp-9.c         |  31 +++++
 .../riscv/rvv/autovec/partial/slp_run-8.c     |  30 +++++
 .../riscv/rvv/autovec/partial/slp_run-9.c     |  30 +++++
 5 files changed, 232 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c

Comments

Jeff Law June 12, 2023, 7:42 p.m. UTC | #1
On 6/12/23 09:11, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
> According to RVV ISA:
> https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
> 
> We can enhance VLA SLP auto-vectorization with (16.5.1. Synthesizing vdecompress)
> Decompress operation.
> 
> Case 1 (nunits = POLY_INT_CST [16, 16]):
> _48 = VEC_PERM_EXPR <_37, _35, { 0, POLY_INT_CST [16, 16], 1, POLY_INT_CST [17, 16], 2, POLY_INT_CST [18, 16], ... }>;
> We can optimize such VLA SLP permuation pattern into:
> _48 = vdecompress (_37, _35, mask = { 0, 1, 0, 1, ... };
> 
> Case 2 (nunits = POLY_INT_CST [16, 16]):
> _23 = VEC_PERM_EXPR <_46, _44, { POLY_INT_CST [1, 1], POLY_INT_CST [3, 3], POLY_INT_CST [2, 1], POLY_INT_CST [4, 3], POLY_INT_CST [3, 1], POLY_INT_CST [5, 3], ... }>;
> We can optimize such VLA SLP permuation pattern into:
> _48 = vdecompress (slidedown(_46, 1/2 nunits), slidedown(_44, 1/2 nunits), mask = { 0, 1, 0, 1, ... };
> 
> For example:
> void __attribute__ ((noinline, noclone))
> vec_slp (uint64_t *restrict a, uint64_t b, uint64_t c, int n)
> {
>    for (int i = 0; i < n; ++i)
>      {
>        a[i * 2] += b;
>        a[i * 2 + 1] += c;
>      }
> }
> 
> ASM:
> ...
>          vid.v   v0
>          vand.vi v0,v0,1
>          vmseq.vi        v0,v0,1  ===> mask = { 0, 1, 0, 1, ... }
> vdecompress:
>          viota.m v3,v0
>          vrgather.vv     v2,v1,v3,v0.t
> Loop:
>          vsetvli zero,a5,e64,m1,ta,ma
>          vle64.v v1,0(a0)
>          vsetvli a6,zero,e64,m1,ta,ma
>          vadd.vv v1,v2,v1
>          vsetvli zero,a5,e64,m1,ta,ma
>          mv      a5,a3
>          vse64.v v1,0(a0)
>          add     a3,a3,a1
>          add     a0,a0,a2
>          bgtu    a5,a4,.L4
> 
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-v.cc (emit_vlmax_decompress_insn): New function.
>          (shuffle_decompress_patterns): New function.
>          (expand_vec_perm_const_1): Add decompress optimization.
> 
> gcc/testsuite/ChangeLog:
> 
>          * gcc.target/riscv/rvv/autovec/partial/slp-8.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp-9.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp_run-8.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp_run-9.c: New test.
I've been wanting to get inside expand_vec_perm_const to see what 
opportunities might exist to improve code in there.  We had good success 
mining this space at a prior employer.  While we had a lot of weird 
idioms and costs to consider it was well worth the time.

So quite happy to see you diving into this code.

OK for the trunk,
Jeff
Li, Pan2 via Gcc-patches June 13, 2023, 1:28 a.m. UTC | #2
Committed, thanks Jeff.

Pan

-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Jeff Law via Gcc-patches
Sent: Tuesday, June 13, 2023 3:43 AM
To: juzhe.zhong@rivai.ai; gcc-patches@gcc.gnu.org
Cc: kito.cheng@sifive.com; palmer@rivosinc.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH V2] RISC-V: Enhance RVV VLA SLP auto-vectorization with decompress operation



On 6/12/23 09:11, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
> According to RVV ISA:
> https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
> 
> We can enhance VLA SLP auto-vectorization with (16.5.1. Synthesizing 
> vdecompress) Decompress operation.
> 
> Case 1 (nunits = POLY_INT_CST [16, 16]):
> _48 = VEC_PERM_EXPR <_37, _35, { 0, POLY_INT_CST [16, 16], 1, 
> POLY_INT_CST [17, 16], 2, POLY_INT_CST [18, 16], ... }>; We can optimize such VLA SLP permuation pattern into:
> _48 = vdecompress (_37, _35, mask = { 0, 1, 0, 1, ... };
> 
> Case 2 (nunits = POLY_INT_CST [16, 16]):
> _23 = VEC_PERM_EXPR <_46, _44, { POLY_INT_CST [1, 1], POLY_INT_CST [3, 
> 3], POLY_INT_CST [2, 1], POLY_INT_CST [4, 3], POLY_INT_CST [3, 1], POLY_INT_CST [5, 3], ... }>; We can optimize such VLA SLP permuation pattern into:
> _48 = vdecompress (slidedown(_46, 1/2 nunits), slidedown(_44, 1/2 
> nunits), mask = { 0, 1, 0, 1, ... };
> 
> For example:
> void __attribute__ ((noinline, noclone)) vec_slp (uint64_t *restrict 
> a, uint64_t b, uint64_t c, int n) {
>    for (int i = 0; i < n; ++i)
>      {
>        a[i * 2] += b;
>        a[i * 2 + 1] += c;
>      }
> }
> 
> ASM:
> ...
>          vid.v   v0
>          vand.vi v0,v0,1
>          vmseq.vi        v0,v0,1  ===> mask = { 0, 1, 0, 1, ... }
> vdecompress:
>          viota.m v3,v0
>          vrgather.vv     v2,v1,v3,v0.t
> Loop:
>          vsetvli zero,a5,e64,m1,ta,ma
>          vle64.v v1,0(a0)
>          vsetvli a6,zero,e64,m1,ta,ma
>          vadd.vv v1,v2,v1
>          vsetvli zero,a5,e64,m1,ta,ma
>          mv      a5,a3
>          vse64.v v1,0(a0)
>          add     a3,a3,a1
>          add     a0,a0,a2
>          bgtu    a5,a4,.L4
> 
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-v.cc (emit_vlmax_decompress_insn): New function.
>          (shuffle_decompress_patterns): New function.
>          (expand_vec_perm_const_1): Add decompress optimization.
> 
> gcc/testsuite/ChangeLog:
> 
>          * gcc.target/riscv/rvv/autovec/partial/slp-8.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp-9.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp_run-8.c: New test.
>          * gcc.target/riscv/rvv/autovec/partial/slp_run-9.c: New test.
I've been wanting to get inside expand_vec_perm_const to see what opportunities might exist to improve code in there.  We had good success mining this space at a prior employer.  While we had a lot of weird idioms and costs to consider it was well worth the time.

So quite happy to see you diving into this code.

OK for the trunk,
Jeff
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index e1b85a5af91..fb970344521 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -836,6 +836,46 @@  emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
   emit_vlmax_masked_mu_insn (icode, RVV_BINOP_MU, ops);
 }
 
+/* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
+   https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
+
+  There is no inverse vdecompress provided, as this operation can be readily
+  synthesized using iota and a masked vrgather:
+
+      Desired functionality of 'vdecompress'
+	7 6 5 4 3 2 1 0     # vid
+
+	      e d c b a     # packed vector of 5 elements
+	1 0 0 1 1 1 0 1     # mask vector of 8 elements
+	p q r s t u v w     # destination register before vdecompress
+
+	e q r d c b v a     # result of vdecompress
+       # v0 holds mask
+       # v1 holds packed data
+       # v11 holds input expanded vector and result
+       viota.m v10, v0                 # Calc iota from mask in v0
+       vrgather.vv v11, v1, v10, v0.t  # Expand into destination
+     p q r s t u v w  # v11 destination register
+	   e d c b a  # v1 source vector
+     1 0 0 1 1 1 0 1  # v0 mask vector
+
+     4 4 4 3 2 1 1 0  # v10 result of viota.m
+     e q r d c b v a  # v11 destination after vrgather using viota.m under mask
+*/
+static void
+emit_vlmax_decompress_insn (rtx target, rtx op, rtx mask)
+{
+  machine_mode data_mode = GET_MODE (target);
+  machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
+  if (GET_MODE_INNER (data_mode) == QImode)
+    sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
+
+  rtx sel = gen_reg_rtx (sel_mode);
+  rtx iota_ops[] = {sel, mask};
+  emit_vlmax_insn (code_for_pred_iota (sel_mode), RVV_UNOP, iota_ops);
+  emit_vlmax_masked_gather_mu_insn (target, op, sel, mask);
+}
+
 /* Emit merge instruction.  */
 
 static machine_mode
@@ -2337,6 +2377,75 @@  struct expand_vec_perm_d
   bool testing_p;
 };
 
+/* Recognize decompress patterns:
+
+   1. VEC_PERM_EXPR op0 and op1
+      with isel = { 0, nunits, 1, nunits + 1, ... }.
+      Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
+
+   2. VEC_PERM_EXPR op0 and op1
+      with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
+      Slide down op0 and op1 with OFFSET = 1/2 nunits.
+      Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
+*/
+static bool
+shuffle_decompress_patterns (struct expand_vec_perm_d *d)
+{
+  poly_uint64 nelt = d->perm.length ();
+  machine_mode mask_mode = get_mask_mode (d->vmode).require ();
+
+  /* For constant size indices, we dont't need to handle it here.
+     Just leave it to vec_perm<mode>.  */
+  if (d->perm.length ().is_constant ())
+    return false;
+
+  poly_uint64 first = d->perm[0];
+  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
+      || !d->perm.series_p (0, 2, first, 1)
+      || !d->perm.series_p (1, 2, first + nelt, 1))
+    return false;
+
+  /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv.
+     Otherwise, it could overflow the index range.  */
+  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
+  if (GET_MODE_INNER (d->vmode) == QImode
+      && !get_vector_mode (HImode, nelt).exists (&sel_mode))
+    return false;
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  rtx op0, op1;
+  if (known_eq (first, 0U))
+    {
+      op0 = d->op0;
+      op1 = d->op1;
+    }
+  else
+    {
+      op0 = gen_reg_rtx (d->vmode);
+      op1 = gen_reg_rtx (d->vmode);
+      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
+      rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
+      rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
+      emit_vlmax_insn (icode, RVV_BINOP, ops0);
+      emit_vlmax_insn (icode, RVV_BINOP, ops1);
+    }
+  /* Generate { 0, 1, .... } mask.  */
+  rtx vid = gen_reg_rtx (sel_mode);
+  rtx vid_repeat = gen_reg_rtx (sel_mode);
+  emit_insn (gen_vec_series (sel_mode, vid, const0_rtx, const1_rtx));
+  rtx and_ops[] = {vid_repeat, vid, const1_rtx};
+  emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), RVV_BINOP, and_ops);
+  rtx const_vec = gen_const_vector_dup (sel_mode, 1);
+  rtx mask = gen_reg_rtx (mask_mode);
+  expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
+  emit_move_insn (d->target, op0);
+  emit_vlmax_decompress_insn (d->target, op1, mask);
+  return true;
+}
+
 /* Recognize the pattern that can be shuffled by generic approach.  */
 
 static bool
@@ -2388,6 +2497,8 @@  expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
     {
       if (d->vmode == d->op_mode)
 	{
+	  if (shuffle_decompress_patterns (d))
+	    return true;
 	  if (shuffle_generic_patterns (d))
 	    return true;
 	  return false;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c
new file mode 100644
index 00000000000..2568d6947a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-8.c
@@ -0,0 +1,30 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-optimized-details" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)                                                         \
+  TYPE __attribute__ ((noinline, noclone))                                     \
+  vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)                     \
+  {                                                                            \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+	a[i * 2] += b;                                                         \
+	a[i * 2 + 1] += c;                                                     \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (int8_t)                                                                   \
+  T (uint8_t)                                                                  \
+  T (int16_t)                                                                  \
+  T (uint16_t)                                                                 \
+  T (int32_t)                                                                  \
+  T (uint32_t)                                                                 \
+  T (int64_t)                                                                  \
+  T (uint64_t)
+
+TEST_ALL (VEC_PERM)
+
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 2 "optimized" } } */
+/* { dg-final { scan-assembler-times {viota.m} 2 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c
new file mode 100644
index 00000000000..d410e57adbd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-9.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-optimized-details" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)                                                         \
+  TYPE __attribute__ ((noinline, noclone))                                     \
+  vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)                     \
+  {                                                                            \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+	a[i * 4] += b;                                                         \
+	a[i * 4 + 1] += c;                                                     \
+	a[i * 4 + 2] += b;                                                     \
+	a[i * 4 + 3] += c;                                                     \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (int8_t)                                                                   \
+  T (uint8_t)                                                                  \
+  T (int16_t)                                                                  \
+  T (uint16_t)                                                                 \
+  T (int32_t)                                                                  \
+  T (uint32_t)                                                                 \
+  T (int64_t)                                                                  \
+  T (uint64_t)
+
+TEST_ALL (VEC_PERM)
+
+/* { dg-final { scan-assembler-times {viota.m} 2 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c
new file mode 100644
index 00000000000..39ae513812b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-8.c
@@ -0,0 +1,30 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=scalable -fno-vect-cost-model" } */
+
+#include "slp-8.c"
+
+#define N (103 * 2)
+
+#define HARNESS(TYPE)						\
+  {								\
+    TYPE a[N], b[2] = { 3, 11 };				\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	a[i] = i * 2 + i % 5;					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    vec_slp_##TYPE (a, b[0], b[1], N / 2);			\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	TYPE orig = i * 2 + i % 5;				\
+	TYPE expected = orig + b[i % 2];			\
+	if (a[i] != expected)					\
+	  __builtin_abort ();					\
+      }								\
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c
new file mode 100644
index 00000000000..791cfbc2b47
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-9.c
@@ -0,0 +1,30 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=scalable -fno-vect-cost-model" } */
+
+#include "slp-9.c"
+
+#define N (103 * 4)
+
+#define HARNESS(TYPE)						\
+  {								\
+    TYPE a[N], b[2] = { 3, 11 };				\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	a[i] = i * 2 + i % 5;					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    vec_slp_##TYPE (a, b[0], b[1], N / 4);			\
+    for (unsigned int i = 0; i < N; ++i)			\
+      {								\
+	TYPE orig = i * 2 + i % 5;				\
+	TYPE expected = orig + b[i % 2];			\
+	if (a[i] != expected)					\
+	  __builtin_abort ();					\
+      }								\
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST_ALL (HARNESS)
+}