diff mbox series

RISC-V: Add RVV FMA auto-vectorization support

Message ID 20230526091729.2751329-1-juzhe.zhong@rivai.ai
State New
Headers show
Series RISC-V: Add RVV FMA auto-vectorization support | expand

Commit Message

juzhe.zhong@rivai.ai May 26, 2023, 9:17 a.m. UTC
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

This patch supports the FMA auto-vectorization pattern.
1. Let RA decide between vmacc and vmadd.
2. Fix a bug in vector.md which generates incorrect information for the VSETVL
   pass when testing ternop-3.c.

gcc/ChangeLog:

        * config/riscv/autovec.md (fma<mode>4): New pattern.
        (*fma<mode>): Ditto.
        * config/riscv/riscv-protos.h (enum insn_type): Add ternary enum.
        (emit_vlmax_ternop_insn): New function.
        * config/riscv/riscv-v.cc (emit_vlmax_ternop_insn): Ditto.
        * config/riscv/vector.md: Fix ternary patterns bug.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/rvv.exp: Add ternop tests.
        * gcc.target/riscv/rvv/autovec/ternop/ternop-1.c: New test.
        * gcc.target/riscv/rvv/autovec/ternop/ternop-2.c: New test.
        * gcc.target/riscv/rvv/autovec/ternop/ternop-3.c: New test.
        * gcc.target/riscv/rvv/autovec/ternop/ternop_run-1.c: New test.
        * gcc.target/riscv/rvv/autovec/ternop/ternop_run-2.c: New test.
        * gcc.target/riscv/rvv/autovec/ternop/ternop_run-3.c: New test.

---
 gcc/config/riscv/autovec.md                   |  65 +++++++++++
 gcc/config/riscv/riscv-protos.h               |   2 +
 gcc/config/riscv/riscv-v.cc                   |  22 ++++
 gcc/config/riscv/vector.md                    |   2 +-
 .../riscv/rvv/autovec/ternop/ternop-1.c       |  27 +++++
 .../riscv/rvv/autovec/ternop/ternop-2.c       |  33 ++++++
 .../riscv/rvv/autovec/ternop/ternop-3.c       |  33 ++++++
 .../riscv/rvv/autovec/ternop/ternop_run-1.c   |  84 ++++++++++++++
 .../riscv/rvv/autovec/ternop/ternop_run-2.c   | 104 ++++++++++++++++++
 .../riscv/rvv/autovec/ternop/ternop_run-3.c   | 104 ++++++++++++++++++
 gcc/testsuite/gcc.target/riscv/rvv/rvv.exp    |   2 +
 11 files changed, 477 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-3.c

Comments

Robin Dapp May 26, 2023, 10:16 a.m. UTC | #1
Hi Juzhe,

> +;; We can't expand FMA for the following reasons:

But we do :)  We just haven't selected the proper alternative yet.

> +;; 1. Before RA, we don't know which multiply-add instruction is the ideal one.
> +;;    The vmacc is the ideal instruction when operands[3] overlaps operands[0].
> +;;    The vmadd is the ideal instruction when operands[1|2] overlaps operands[0].
> +;; 2. According to vector.md, the multiply-add patterns has 'merge' operand which
> +;;    is the operands[5]. Since operands[5] should overlap operands[0], this operand
> +;;    should be allocated the same regno as operands[1|2|3].
> +;; 3. The 'merge' operand is always a real merge operand and we don't allow undefined
> +;;    operand.
> +;; 3. The operation of FMA pattern needs VLMAX vsetlvi which needs a VL operand.

Can you explain these two points (3 and 4, maybe 2) a bit in the comments?
I.e. what makes fma different from a normal insn?

> +(define_insn_and_split "*fma<mode>"
> +  [(set (match_operand:VI 0 "register_operand"     "=vr, vr, ?&vr")
> +	(plus:VI
> +	  (mult:VI
> +	    (match_operand:VI 1 "register_operand" " %0, vr,   vr")
> +	    (match_operand:VI 2 "register_operand" " vr, vr,   vr"))
> +	  (match_operand:VI 3 "register_operand"   " vr,  0,   vr")))
> +   (clobber (match_scratch:SI 4 "=r,r,r"))]
> +  "TARGET_VECTOR"
> +  "#"
> +  "&& reload_completed"
> +  [(const_int 0)]
> +  {
> +    PUT_MODE (operands[4], Pmode);
> +    riscv_vector::emit_vlmax_vsetvl (<MODE>mode, operands[4]);
> +    if (which_alternative == 3)

We only have three alternatives here.

> +      emit_insn (gen_rtx_SET (operands[0], operands[3]));
> +    rtx ops[] = {operands[0], operands[1], operands[2], operands[3], operands[0]};
> +    riscv_vector::emit_vlmax_ternop_insn (code_for_pred_mul_plus (<MODE>mode),
> +					  riscv_vector::RVV_TERNOP, ops, operands[4]);
> +    DONE;
> +  }
> +  [(set_attr "type" "vimuladd")
> +   (set_attr "mode" "<MODE>")])
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 36419c95bbd..86b2798fb5e 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -140,6 +140,7 @@ enum insn_type
>    RVV_MERGE_OP = 4,
>    RVV_CMP_OP = 4,
>    RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand.  */
> +  RVV_TERNOP = 5,
>  };

> +emit_vlmax_ternop_insn (unsigned icode, int op_num, rtx *ops, rtx vl)

We have a bit of naming overlap between "insn" and "op" already.  I would go
with just ternary_insn or tern_insn here.  That the insn_types have OP in
their name is unfortunate but let's keep that for now. 

> +  machine_mode data_mode = GET_MODE (ops[0]);
> +  machine_mode mask_mode = get_mask_mode (data_mode).require ();
> +  /* We have a maximum of 11 operands for RVV instruction patterns according to
> +   * vector.md.  */
> +  insn_expander<11> e (/*OP_NUM*/ op_num, /*HAS_DEST_P*/ true,
> +		       /*FULLY_UNMASKED_P*/ true,
> +		       /*USE_REAL_MERGE_P*/ true, /*HAS_AVL_P*/ true,
> +		       /*VLMAX_P*/ true,
> +		       /*DEST_MODE*/ data_mode, /*MASK_MODE*/ mask_mode);

Can we call data_mode dest_mode here?  data_mode imho only makes sense in
the context of conditionals where we have a comparison mode and a data mode.
I mean you could argue we always have a data mode and a mask mode so the
naming makes sense again but then we should get rid of dest_mode.

> +  /* According to LRA mov pattern in vector.md, we have a clobber operand
> +     to be used ad VL operand.  */
> +  e.set_vl (vl);

How does the LRA mov pattern (mov_lra?) come into play here?  I know the same
line is already in emit_vlmax_insn but it also is odd there.  What we actually
do is pass either NULL as length (before lra/reload) or a pre-allocated scratch
that we can use as vlmax length.

> +#include <stdint-gcc.h>
> +
> +#define TEST_TYPE(TYPE)                                                        \
> +  __attribute__ ((noipa)) void ternop_##TYPE (TYPE *__restrict dst,            \
> +					      TYPE *__restrict a,              \
> +					      TYPE *__restrict b, int n)       \

> +#define TEST_TYPE(TYPE)                                                        \
> +  __attribute__ ((noipa)) void ternop_##TYPE (TYPE *restrict dest1,            \
> +					      TYPE *restrict dest2,            \
> +					      TYPE *restrict dest3,            \
> +					      TYPE *restrict src1,             \
> +					      TYPE *restrict src2, int n)      

__restrict vs restrict.

> +int __attribute__ ((optimize (1))) main ()
> +int __attribute__ ((optimize (0))) main ()
Why the difference here?  Why do we need to restrict the optimization here
anyway?

Btw. any reason why you don't include fms, vnmsac in the patch?  Wouldn't the
patterns be really similar or do you have other plans for those?  Not needed
for this patch, just curious.

Regards
 Robin
juzhe.zhong@rivai.ai May 26, 2023, 11 a.m. UTC | #2
Hi, Robin.

>> Can you explain these two points (3 and 4, maybe 2) a bit in the comments?
>> I.e. what makes fma different from a normal insn?
You can take a look at vector.md. The ternary instruction pattern has
operands[0] operands[1] operands[2] operands[3] operands[4] operands[5] :

operands[0] = operands[1] ? operands[2] * operands[3] + operands[4] : operands[5]
These operands are not necessarily the same RTX, but we should make them overlap.
Why have operands[5] ? Since we will have len_cond_fma.
So I want to lower simple fma pattern into patterns I define in vector.md.
operands[5] should be operands[1] if operands[1] overlaps operands[0] ---> vmacc
or operands[3] if operands[3] overlaps operands[0] ---> vmadd

>>We only have three alternatives here.
Address in V2.

>>We have a bit of naming overlap between "insn" an "op" already.  I would go
>>with just ternay_insn or tern_insn here.  That the insn_types have OP in
>>their name is unfortunate but let's keep that for now.
Ok


>>Can we call data_mode dest_mode here?  data_mode imho only makes sense in
>>the context of conditionals where we have a comparison mode and a data mode.
>>I mean you could argue we always have a data mode and a mask mode so the
>>naming makes sense again but then we should get rid of dest_mode.

ok

>> __restrict vs restrict.

ok

>>Why the difference here?  Why do we need to restrict the optimization here
>>anyway?
Ok


>>Btw. any reason why you don't include fms, vnmsac in the patch?  Wouldn't the
>>patterns be really similar or do you have other plans for those?  Not needed
>>for this patch, just curious.
I want to make patch small and simple enough to review. After this patch is merged,
I will post fms.

Thanks.


juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-05-26 18:16
To: juzhe.zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; palmer; palmer; jeffreyalaw; pan2.li
Subject: Re: [PATCH] RISC-V: Add RVV FMA auto-vectorization support
Hi Juzhe,
 
> +;; We can't expand FMA for the following reasons:
 
But we do :)  We just haven't selected the proper alternative yet.
 
> +;; 1. Before RA, we don't know which multiply-add instruction is the ideal one.
> +;;    The vmacc is the ideal instruction when operands[3] overlaps operands[0].
> +;;    The vmadd is the ideal instruction when operands[1|2] overlaps operands[0].
> +;; 2. According to vector.md, the multiply-add patterns has 'merge' operand which
> +;;    is the operands[5]. Since operands[5] should overlap operands[0], this operand
> +;;    should be allocated the same regno as operands[1|2|3].
> +;; 3. The 'merge' operand is always a real merge operand and we don't allow undefined
> +;;    operand.
> +;; 3. The operation of FMA pattern needs VLMAX vsetlvi which needs a VL operand.
 
Can you explain these two points (3 and 4, maybe 2) a bit in the comments?
I.e. what makes fma different from a normal insn?
 
> +(define_insn_and_split "*fma<mode>"
> +  [(set (match_operand:VI 0 "register_operand"     "=vr, vr, ?&vr")
> + (plus:VI
> +   (mult:VI
> +     (match_operand:VI 1 "register_operand" " %0, vr,   vr")
> +     (match_operand:VI 2 "register_operand" " vr, vr,   vr"))
> +   (match_operand:VI 3 "register_operand"   " vr,  0,   vr")))
> +   (clobber (match_scratch:SI 4 "=r,r,r"))]
> +  "TARGET_VECTOR"
> +  "#"
> +  "&& reload_completed"
> +  [(const_int 0)]
> +  {
> +    PUT_MODE (operands[4], Pmode);
> +    riscv_vector::emit_vlmax_vsetvl (<MODE>mode, operands[4]);
> +    if (which_alternative == 3)
 
We only have three alternatives here.
 
> +      emit_insn (gen_rtx_SET (operands[0], operands[3]));
> +    rtx ops[] = {operands[0], operands[1], operands[2], operands[3], operands[0]};
> +    riscv_vector::emit_vlmax_ternop_insn (code_for_pred_mul_plus (<MODE>mode),
> +   riscv_vector::RVV_TERNOP, ops, operands[4]);
> +    DONE;
> +  }
> +  [(set_attr "type" "vimuladd")
> +   (set_attr "mode" "<MODE>")])
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 36419c95bbd..86b2798fb5e 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -140,6 +140,7 @@ enum insn_type
>    RVV_MERGE_OP = 4,
>    RVV_CMP_OP = 4,
>    RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand.  */
> +  RVV_TERNOP = 5,
>  };
 
> +emit_vlmax_ternop_insn (unsigned icode, int op_num, rtx *ops, rtx vl)
 
We have a bit of naming overlap between "insn" an "op" already.  I would go
with just ternay_insn or tern_insn here.  That the insn_types have OP in
their name is unfortunate but let's keep that for now. 
 
> +  machine_mode data_mode = GET_MODE (ops[0]);
> +  machine_mode mask_mode = get_mask_mode (data_mode).require ();
> +  /* We have a maximum of 11 operands for RVV instruction patterns according to
> +   * vector.md.  */
> +  insn_expander<11> e (/*OP_NUM*/ op_num, /*HAS_DEST_P*/ true,
> +        /*FULLY_UNMASKED_P*/ true,
> +        /*USE_REAL_MERGE_P*/ true, /*HAS_AVL_P*/ true,
> +        /*VLMAX_P*/ true,
> +        /*DEST_MODE*/ data_mode, /*MASK_MODE*/ mask_mode);
 
Can we call data_mode dest_mode here?  data_mode imho only makes sense in
the context of conditionals where we have a comparison mode and a data mode.
I mean you could argue we always have a data mode and a mask mode so the
naming makes sense again but then we should get rid of dest_mode.
 
> +  /* According to LRA mov pattern in vector.md, we have a clobber operand
> +     to be used ad VL operand.  */
> +  e.set_vl (vl);
 
How does the LRA mov pattern (mov_lra?) come into play here?  I know the same
line is already in emit_vlmax_insn but it also is odd there.  What we actually
do is pass either NULL as length (before lra/reload) or a pre-allocated scratch
that we can use as vlmax length.
 
> +#include <stdint-gcc.h>
> +
> +#define TEST_TYPE(TYPE)                                                        \
> +  __attribute__ ((noipa)) void ternop_##TYPE (TYPE *__restrict dst,            \
> +       TYPE *__restrict a,              \
> +       TYPE *__restrict b, int n)       \
 
> +#define TEST_TYPE(TYPE)                                                        \
> +  __attribute__ ((noipa)) void ternop_##TYPE (TYPE *restrict dest1,            \
> +       TYPE *restrict dest2,            \
> +       TYPE *restrict dest3,            \
> +       TYPE *restrict src1,             \
> +       TYPE *restrict src2, int n)      
 
__restrict vs restrict.
 
> +int __attribute__ ((optimize (1))) main ()
> +int __attribute__ ((optimize (0))) main ()
Why the difference here?  Why do we need to restrict the optimization here
anyway?
 
Btw. any reason why you don't include fms, vnmsac in the patch?  Wouldn't the
patterns be really similar or do you have other plans for those?  Not needed
for this patch, just curious.
 
Regards
Robin
Robin Dapp May 26, 2023, 11:12 a.m. UTC | #3
Hi Juzhe,

>>> Can you explain these two points (3 and 4, maybe 2) a bit in the comments?
>>> I.e. what makes fma different from a normal insn?
> You can take a lookt at vector.md. The ternary instruction pattern has 
> operands[0] operands[1] operands[2] operands[3] operands[4] operands[5] :
> 
> operands[0] = operands[1] ? operands[2] * operands[3] + operands[4] : operands[5]
> These operands are not necessary the same RTX but we should make them overlap.
> Why have operands[5] ? Since we will have len_cond_fma.
> So I want to lower simple fma pattern into patterns I define in vector.md.
> operands[5] should be operands[1] if operands[1] overlap operand[0] --->vmacc
> or operands[3] if operands[3] overlap operand[0] -->vmadd

Yes, the general principle is clear (it's the same for other insns as well).
My point was to make sure we document that a bit more verbosely in the
comments so future readers will immediately know what they are dealing
with.

Thanks
 Robin
diff mbox series

Patch

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 7fe4d94de39..ba1240014dc 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -373,3 +373,68 @@ 
     DONE;
   }
 )
+
+;; =========================================================================
+;; == Ternary arithmetic
+;; =========================================================================
+
+;; -------------------------------------------------------------------------
+;; ---- [INT] VMACC and VMADD
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - vmacc
+;; - vmadd
+;; -------------------------------------------------------------------------
+
+;; We can't select the final multiply-add instruction at expand time because:
+;; 1. Before RA, we don't know which multiply-add instruction is the ideal one.
+;;    The vmacc is the ideal instruction when operands[3] overlaps operands[0].
+;;    The vmadd is the ideal instruction when operands[1|2] overlaps operands[0].
+;; 2. According to vector.md, the multiply-add patterns have a 'merge' operand which
+;;    is the operands[5]. Since operands[5] should overlap operands[0], this operand
+;;    should be allocated the same regno as operands[1|2|3].
+;; 3. The 'merge' operand is always a real merge operand and we don't allow undefined
+;;    operand.
+;; 4. The operation of the FMA pattern needs a VLMAX vsetvli, which needs a VL operand.
+;;
+;; In this situation, we design the codegen of FMA as follows:
+;; 1. clobber a scratch in the expand pattern of FMA.
+;; 2. Let RA decide which input operand (operands[1|2|3]) overlaps operands[0].
+;; 3. Generate instructions (vmacc or vmadd) according to the register allocation
+;;    result after reload_completed.
+(define_expand "fma<mode>4"
+  [(parallel
+    [(set (match_operand:VI 0 "register_operand"     "=vr")
+	  (plus:VI
+	    (mult:VI
+	      (match_operand:VI 1 "register_operand" " vr")
+	      (match_operand:VI 2 "register_operand" " vr"))
+	    (match_operand:VI 3 "register_operand"   " vr")))
+     (clobber (match_scratch:SI 4))])]
+  "TARGET_VECTOR"
+  {})
+
+(define_insn_and_split "*fma<mode>"
+  [(set (match_operand:VI 0 "register_operand"     "=vr, vr, ?&vr")
+	(plus:VI
+	  (mult:VI
+	    (match_operand:VI 1 "register_operand" " %0, vr,   vr")
+	    (match_operand:VI 2 "register_operand" " vr, vr,   vr"))
+	  (match_operand:VI 3 "register_operand"   " vr,  0,   vr")))
+   (clobber (match_scratch:SI 4 "=r,r,r"))]
+  "TARGET_VECTOR"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    PUT_MODE (operands[4], Pmode);
+    riscv_vector::emit_vlmax_vsetvl (<MODE>mode, operands[4]);
+    if (which_alternative == 3)
+      emit_insn (gen_rtx_SET (operands[0], operands[3]));
+    rtx ops[] = {operands[0], operands[1], operands[2], operands[3], operands[0]};
+    riscv_vector::emit_vlmax_ternop_insn (code_for_pred_mul_plus (<MODE>mode),
+					  riscv_vector::RVV_TERNOP, ops, operands[4]);
+    DONE;
+  }
+  [(set_attr "type" "vimuladd")
+   (set_attr "mode" "<MODE>")])
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 36419c95bbd..86b2798fb5e 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -140,6 +140,7 @@  enum insn_type
   RVV_MERGE_OP = 4,
   RVV_CMP_OP = 4,
   RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand.  */
+  RVV_TERNOP = 5,
 };
 enum vlmul_type
 {
@@ -176,6 +177,7 @@  bool legitimize_move (rtx, rtx);
 void emit_vlmax_vsetvl (machine_mode, rtx);
 void emit_hard_vlmax_vsetvl (machine_mode, rtx);
 void emit_vlmax_insn (unsigned, int, rtx *, rtx = 0);
+void emit_vlmax_ternop_insn (unsigned, int, rtx *, rtx = 0);
 void emit_nonvlmax_insn (unsigned, int, rtx *, rtx);
 void emit_vlmax_merge_insn (unsigned, int, rtx *);
 void emit_vlmax_cmp_insn (unsigned, rtx *);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index f71ad9e46a1..840d4ef112a 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -362,6 +362,28 @@  emit_vlmax_insn (unsigned icode, int op_num, rtx *ops, rtx vl)
   e.emit_insn ((enum insn_code) icode, ops);
 }
 
+/* This function emits a {VLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by the
+ * ternary operation which always has a real merge operand.  */
+void
+emit_vlmax_ternop_insn (unsigned icode, int op_num, rtx *ops, rtx vl)
+{
+  machine_mode data_mode = GET_MODE (ops[0]);
+  machine_mode mask_mode = get_mask_mode (data_mode).require ();
+  /* We have a maximum of 11 operands for RVV instruction patterns according to
+   * vector.md.  */
+  insn_expander<11> e (/*OP_NUM*/ op_num, /*HAS_DEST_P*/ true,
+		       /*FULLY_UNMASKED_P*/ true,
+		       /*USE_REAL_MERGE_P*/ true, /*HAS_AVL_P*/ true,
+		       /*VLMAX_P*/ true,
+		       /*DEST_MODE*/ data_mode, /*MASK_MODE*/ mask_mode);
+  e.set_policy (TAIL_ANY);
+  e.set_policy (MASK_ANY);
+  /* According to LRA mov pattern in vector.md, we have a clobber operand
+     to be used as the VL operand.  */
+  e.set_vl (vl);
+  e.emit_insn ((enum insn_code) icode, ops);
+}
+
 /* This function emits a {NONVLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by the
  * actual operation.  */
 void
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 15f66efaa48..cd696da5d89 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -388,7 +388,7 @@ 
 	     (symbol_ref "INTVAL (operands[7])"))
 
 	 (eq_attr "type" "vldux,vldox,vialu,vshift,viminmax,vimul,vidiv,vsalu,\
-			  viwalu,viwmul,vnshift,vimuladd,vaalu,vsmul,vsshift,\
+			  viwalu,viwmul,vnshift,vaalu,vsmul,vsshift,\
 			  vnclip,vicmp,vfalu,vfmul,vfminmax,vfdiv,vfwalu,vfwmul,\
 			  vfsgnj,vfcmp,vfmuladd,vslideup,vslidedown,vislide1up,\
 			  vislide1down,vfslide1up,vfslide1down,vgather,viwmuladd,vfwmuladd,\
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-1.c
new file mode 100644
index 00000000000..e636b9a7a46
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-1.c
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_TYPE(TYPE)                                                        \
+  __attribute__ ((noipa)) void ternop_##TYPE (TYPE *__restrict dst,            \
+					      TYPE *__restrict a,              \
+					      TYPE *__restrict b, int n)       \
+  {                                                                            \
+    for (int i = 0; i < n; i++)                                                \
+      dst[i] += a[i] * b[i];                                                   \
+  }
+
+#define TEST_ALL()                                                             \
+  TEST_TYPE (int8_t)                                                           \
+  TEST_TYPE (uint8_t)                                                          \
+  TEST_TYPE (int16_t)                                                          \
+  TEST_TYPE (uint16_t)                                                         \
+  TEST_TYPE (int32_t)                                                          \
+  TEST_TYPE (uint32_t)                                                         \
+  TEST_TYPE (int64_t)                                                          \
+  TEST_TYPE (uint64_t)
+
+TEST_ALL ()
+
+/* { dg-final { scan-assembler-times {\tvmadd\.vv} 8 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-2.c
new file mode 100644
index 00000000000..2ad3fdd1899
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-2.c
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_TYPE(TYPE)                                                        \
+  __attribute__ ((noipa)) void ternop_##TYPE (TYPE *restrict dest1,            \
+					      TYPE *restrict dest2,            \
+					      TYPE *restrict dest3,            \
+					      TYPE *restrict src1,             \
+					      TYPE *restrict src2, int n)      \
+  {                                                                            \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+	dest1[i] += src1[i] * src2[i];                                         \
+	dest2[i] += src1[i] * dest1[i];                                        \
+	dest3[i] += src2[i] * dest2[i];                                        \
+      }                                                                        \
+  }
+
+#define TEST_ALL()                                                             \
+  TEST_TYPE (int8_t)                                                           \
+  TEST_TYPE (uint8_t)                                                          \
+  TEST_TYPE (int16_t)                                                          \
+  TEST_TYPE (uint16_t)                                                         \
+  TEST_TYPE (int32_t)                                                          \
+  TEST_TYPE (uint32_t)                                                         \
+  TEST_TYPE (int64_t)                                                          \
+  TEST_TYPE (uint64_t)
+
+TEST_ALL ()
+
+/* { dg-final { scan-assembler-times {\tvmacc\.vv} 8 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-3.c
new file mode 100644
index 00000000000..393b26516a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop-3.c
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_TYPE(TYPE)                                                        \
+  __attribute__ ((noipa)) void ternop_##TYPE (TYPE *restrict dest1,            \
+					      TYPE *restrict dest2,            \
+					      TYPE *restrict dest3,            \
+					      TYPE *restrict src1,             \
+					      TYPE *restrict src2, int n)      \
+  {                                                                            \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+	dest1[i] = src1[i] * src2[i] + dest2[i];                               \
+	dest2[i] += src1[i] * dest1[i];                                        \
+	dest3[i] += src2[i] * dest2[i];                                        \
+      }                                                                        \
+  }
+
+#define TEST_ALL()                                                             \
+  TEST_TYPE (int8_t)                                                           \
+  TEST_TYPE (uint8_t)                                                          \
+  TEST_TYPE (int16_t)                                                          \
+  TEST_TYPE (uint16_t)                                                         \
+  TEST_TYPE (int32_t)                                                          \
+  TEST_TYPE (uint32_t)                                                         \
+  TEST_TYPE (int64_t)                                                          \
+  TEST_TYPE (uint64_t)
+
+TEST_ALL ()
+
+/* { dg-final { scan-assembler-times {\tvmv} 8 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-1.c
new file mode 100644
index 00000000000..8ba2b970342
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-1.c
@@ -0,0 +1,84 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */
+
+#include "ternop-1.c"
+
+#define TEST_LOOP(TYPE, NUM)                                                   \
+  {                                                                            \
+    TYPE array1_##NUM[NUM] = {};                                               \
+    TYPE array2_##NUM[NUM] = {};                                               \
+    TYPE array3_##NUM[NUM] = {};                                               \
+    TYPE array4_##NUM[NUM] = {};                                               \
+    for (int i = 0; i < NUM; ++i)                                              \
+      {                                                                        \
+	array1_##NUM[i] = (i & 1) + 5;                                         \
+	array2_##NUM[i] = i - NUM / 3;                                         \
+	array3_##NUM[i] = NUM - NUM / 3 - i;                                   \
+	array4_##NUM[i] = NUM - NUM / 3 - i;                                   \
+	asm volatile("" ::: "memory");                                         \
+      }                                                                        \
+    ternop_##TYPE (array3_##NUM, array1_##NUM, array2_##NUM, NUM);             \
+    for (int i = 0; i < NUM; i++)                                              \
+      if (array3_##NUM[i]                                                      \
+	  != (TYPE) (array1_##NUM[i] * array2_##NUM[i] + array4_##NUM[i]))     \
+	__builtin_abort ();                                                    \
+  }
+
+int __attribute__ ((optimize (1))) main ()
+{
+  TEST_LOOP (int8_t, 7)
+  TEST_LOOP (uint8_t, 7)
+  TEST_LOOP (int16_t, 7)
+  TEST_LOOP (uint16_t, 7)
+  TEST_LOOP (int32_t, 7)
+  TEST_LOOP (uint32_t, 7)
+  TEST_LOOP (int64_t, 7)
+  TEST_LOOP (uint64_t, 7)
+
+  TEST_LOOP (int8_t, 16)
+  TEST_LOOP (uint8_t, 16)
+  TEST_LOOP (int16_t, 16)
+  TEST_LOOP (uint16_t, 16)
+  TEST_LOOP (int32_t, 16)
+  TEST_LOOP (uint32_t, 16)
+  TEST_LOOP (int64_t, 16)
+  TEST_LOOP (uint64_t, 16)
+
+  TEST_LOOP (int8_t, 77)
+  TEST_LOOP (uint8_t, 77)
+  TEST_LOOP (int16_t, 77)
+  TEST_LOOP (uint16_t, 77)
+  TEST_LOOP (int32_t, 77)
+  TEST_LOOP (uint32_t, 77)
+  TEST_LOOP (int64_t, 77)
+  TEST_LOOP (uint64_t, 77)
+  
+  TEST_LOOP (int8_t, 128)
+  TEST_LOOP (uint8_t, 128)
+  TEST_LOOP (int16_t, 128)
+  TEST_LOOP (uint16_t, 128)
+  TEST_LOOP (int32_t, 128)
+  TEST_LOOP (uint32_t, 128)
+  TEST_LOOP (int64_t, 128)
+  TEST_LOOP (uint64_t, 128)
+
+  TEST_LOOP (int8_t, 15641)
+  TEST_LOOP (uint8_t, 15641)
+  TEST_LOOP (int16_t, 15641)
+  TEST_LOOP (uint16_t, 15641)
+  TEST_LOOP (int32_t, 15641)
+  TEST_LOOP (uint32_t, 15641)
+  TEST_LOOP (int64_t, 15641)
+  TEST_LOOP (uint64_t, 15641)
+  
+  TEST_LOOP (int8_t, 795)
+  TEST_LOOP (uint8_t, 795)
+  TEST_LOOP (int16_t, 795)
+  TEST_LOOP (uint16_t, 795)
+  TEST_LOOP (int32_t, 795)
+  TEST_LOOP (uint32_t, 795)
+  TEST_LOOP (int64_t, 795)
+  TEST_LOOP (uint64_t, 795)
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-2.c
new file mode 100644
index 00000000000..103b98acdf0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-2.c
@@ -0,0 +1,104 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */
+
+#include "ternop-2.c"
+
+#define TEST_LOOP(TYPE, NUM) /* Check ternop_##TYPE on NUM-element arrays.  */ \
+  {                                                                            \
+    TYPE array1_##NUM[NUM] = {}; /* 4th argument of ternop_##TYPE.  */         \
+    TYPE array2_##NUM[NUM] = {}; /* 5th argument of ternop_##TYPE.  */         \
+    TYPE array3_##NUM[NUM] = {}; /* 1st argument; checked vs array6.  */       \
+    TYPE array4_##NUM[NUM] = {}; /* 2nd argument; checked vs array7.  */       \
+    TYPE array5_##NUM[NUM] = {}; /* 3rd argument; checked vs array8.  */       \
+    TYPE array6_##NUM[NUM] = {}; /* Expected value for array3.  */             \
+    TYPE array7_##NUM[NUM] = {}; /* Expected value for array4.  */             \
+    TYPE array8_##NUM[NUM] = {}; /* Expected value for array5.  */             \
+    for (int i = 0; i < NUM; ++i) /* Fill the inputs and references.  */       \
+      {                                                                        \
+	array1_##NUM[i] = (i & 1) + 5;                                         \
+	array2_##NUM[i] = i - NUM / 3;                                         \
+	array3_##NUM[i] = NUM - NUM / 3 - i;                                   \
+	array6_##NUM[i] = NUM - NUM / 3 - i;                                   \
+	array4_##NUM[i] = NUM - NUM / 2 + i;                                   \
+	array7_##NUM[i] = NUM - NUM / 2 + i;                                   \
+	array5_##NUM[i] = NUM + i * 7;                                         \
+	array8_##NUM[i] = NUM + i * 7;                                         \
+	asm volatile("" ::: "memory"); /* Compiler memory barrier.  */         \
+      }                                                                        \
+    ternop_##TYPE (array3_##NUM, array4_##NUM, array5_##NUM, array1_##NUM,     \
+		   array2_##NUM, NUM);                                         \
+    for (int i = 0; i < NUM; i++) /* Recompute expected values; compare.  */   \
+      {                                                                        \
+	array6_##NUM[i] /* Expected array3[i]; adds old array6.  */            \
+	  = (TYPE) (array1_##NUM[i] * array2_##NUM[i] + array6_##NUM[i]);      \
+	if (array3_##NUM[i] != array6_##NUM[i])                                \
+	  __builtin_abort ();                                                  \
+	array7_##NUM[i] /* Expected array4[i]; uses the new array6.  */        \
+	  = (TYPE) (array1_##NUM[i] * array6_##NUM[i] + array7_##NUM[i]);      \
+	if (array4_##NUM[i] != array7_##NUM[i])                                \
+	  __builtin_abort ();                                                  \
+	array8_##NUM[i] /* Expected array5[i]; uses the new array7.  */        \
+	  = (TYPE) (array2_##NUM[i] * array7_##NUM[i] + array8_##NUM[i]);      \
+	if (array5_##NUM[i] != array8_##NUM[i])                                \
+	  __builtin_abort ();                                                  \
+      }                                                                        \
+  }
+
+int __attribute__ ((optimize (0))) main ()
+{ /* Run TEST_LOOP for each integer type at six array lengths.  */
+  TEST_LOOP (int8_t, 7) /* Arrays of 7 elements.  */
+  TEST_LOOP (uint8_t, 7)
+  TEST_LOOP (int16_t, 7)
+  TEST_LOOP (uint16_t, 7)
+  TEST_LOOP (int32_t, 7)
+  TEST_LOOP (uint32_t, 7)
+  TEST_LOOP (int64_t, 7)
+  TEST_LOOP (uint64_t, 7)
+
+  TEST_LOOP (int8_t, 16) /* Arrays of 16 elements.  */
+  TEST_LOOP (uint8_t, 16)
+  TEST_LOOP (int16_t, 16)
+  TEST_LOOP (uint16_t, 16)
+  TEST_LOOP (int32_t, 16)
+  TEST_LOOP (uint32_t, 16)
+  TEST_LOOP (int64_t, 16)
+  TEST_LOOP (uint64_t, 16)
+
+  TEST_LOOP (int8_t, 77) /* Arrays of 77 elements.  */
+  TEST_LOOP (uint8_t, 77)
+  TEST_LOOP (int16_t, 77)
+  TEST_LOOP (uint16_t, 77)
+  TEST_LOOP (int32_t, 77)
+  TEST_LOOP (uint32_t, 77)
+  TEST_LOOP (int64_t, 77)
+  TEST_LOOP (uint64_t, 77)
+
+  TEST_LOOP (int8_t, 128) /* Arrays of 128 elements.  */
+  TEST_LOOP (uint8_t, 128)
+  TEST_LOOP (int16_t, 128)
+  TEST_LOOP (uint16_t, 128)
+  TEST_LOOP (int32_t, 128)
+  TEST_LOOP (uint32_t, 128)
+  TEST_LOOP (int64_t, 128)
+  TEST_LOOP (uint64_t, 128)
+
+  TEST_LOOP (int8_t, 15641) /* Arrays of 15641 elements.  */
+  TEST_LOOP (uint8_t, 15641)
+  TEST_LOOP (int16_t, 15641)
+  TEST_LOOP (uint16_t, 15641)
+  TEST_LOOP (int32_t, 15641)
+  TEST_LOOP (uint32_t, 15641)
+  TEST_LOOP (int64_t, 15641)
+  TEST_LOOP (uint64_t, 15641)
+
+  TEST_LOOP (int8_t, 795) /* Arrays of 795 elements.  */
+  TEST_LOOP (uint8_t, 795)
+  TEST_LOOP (int16_t, 795)
+  TEST_LOOP (uint16_t, 795)
+  TEST_LOOP (int32_t, 795)
+  TEST_LOOP (uint32_t, 795)
+  TEST_LOOP (int64_t, 795)
+  TEST_LOOP (uint64_t, 795)
+
+  return 0; /* Reached only if no TEST_LOOP called __builtin_abort.  */
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-3.c
new file mode 100644
index 00000000000..eac5408ce6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_run-3.c
@@ -0,0 +1,104 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */
+
+#include "ternop-3.c"
+
+#define TEST_LOOP(TYPE, NUM) /* Check ternop_##TYPE on NUM-element arrays.  */ \
+  {                                                                            \
+    TYPE array1_##NUM[NUM] = {}; /* 4th argument of ternop_##TYPE.  */         \
+    TYPE array2_##NUM[NUM] = {}; /* 5th argument of ternop_##TYPE.  */         \
+    TYPE array3_##NUM[NUM] = {}; /* 1st argument; checked vs array6.  */       \
+    TYPE array4_##NUM[NUM] = {}; /* 2nd argument; checked vs array7.  */       \
+    TYPE array5_##NUM[NUM] = {}; /* 3rd argument; checked vs array8.  */       \
+    TYPE array6_##NUM[NUM] = {}; /* Expected value for array3.  */             \
+    TYPE array7_##NUM[NUM] = {}; /* Expected value for array4.  */             \
+    TYPE array8_##NUM[NUM] = {}; /* Expected value for array5.  */             \
+    for (int i = 0; i < NUM; ++i) /* Fill the inputs and references.  */       \
+      {                                                                        \
+	array1_##NUM[i] = (i & 1) + 5;                                         \
+	array2_##NUM[i] = i - NUM / 3;                                         \
+	array3_##NUM[i] = NUM - NUM / 3 - i;                                   \
+	array6_##NUM[i] = NUM - NUM / 3 - i;                                   \
+	array4_##NUM[i] = NUM - NUM / 2 + i;                                   \
+	array7_##NUM[i] = NUM - NUM / 2 + i;                                   \
+	array5_##NUM[i] = NUM + i * 7;                                         \
+	array8_##NUM[i] = NUM + i * 7;                                         \
+	asm volatile("" ::: "memory"); /* Compiler memory barrier.  */         \
+      }                                                                        \
+    ternop_##TYPE (array3_##NUM, array4_##NUM, array5_##NUM, array1_##NUM,     \
+		   array2_##NUM, NUM);                                         \
+    for (int i = 0; i < NUM; i++) /* Recompute expected values; compare.  */   \
+      {                                                                        \
+	array6_##NUM[i] /* Expected array3[i]; adds old array7.  */            \
+	  = (TYPE) (array1_##NUM[i] * array2_##NUM[i] + array7_##NUM[i]);      \
+	if (array3_##NUM[i] != array6_##NUM[i])                                \
+	  __builtin_abort ();                                                  \
+	array7_##NUM[i] /* Expected array4[i]; uses the new array6.  */        \
+	  = (TYPE) (array1_##NUM[i] * array6_##NUM[i] + array7_##NUM[i]);      \
+	if (array4_##NUM[i] != array7_##NUM[i])                                \
+	  __builtin_abort ();                                                  \
+	array8_##NUM[i] /* Expected array5[i]; uses the new array7.  */        \
+	  = (TYPE) (array2_##NUM[i] * array7_##NUM[i] + array8_##NUM[i]);      \
+	if (array5_##NUM[i] != array8_##NUM[i])                                \
+	  __builtin_abort ();                                                  \
+      }                                                                        \
+  }
+
+int __attribute__ ((optimize (0))) main ()
+{ /* Run TEST_LOOP for each integer type at six array lengths.  */
+  TEST_LOOP (int8_t, 7) /* Arrays of 7 elements.  */
+  TEST_LOOP (uint8_t, 7)
+  TEST_LOOP (int16_t, 7)
+  TEST_LOOP (uint16_t, 7)
+  TEST_LOOP (int32_t, 7)
+  TEST_LOOP (uint32_t, 7)
+  TEST_LOOP (int64_t, 7)
+  TEST_LOOP (uint64_t, 7)
+
+  TEST_LOOP (int8_t, 16) /* Arrays of 16 elements.  */
+  TEST_LOOP (uint8_t, 16)
+  TEST_LOOP (int16_t, 16)
+  TEST_LOOP (uint16_t, 16)
+  TEST_LOOP (int32_t, 16)
+  TEST_LOOP (uint32_t, 16)
+  TEST_LOOP (int64_t, 16)
+  TEST_LOOP (uint64_t, 16)
+
+  TEST_LOOP (int8_t, 77) /* Arrays of 77 elements.  */
+  TEST_LOOP (uint8_t, 77)
+  TEST_LOOP (int16_t, 77)
+  TEST_LOOP (uint16_t, 77)
+  TEST_LOOP (int32_t, 77)
+  TEST_LOOP (uint32_t, 77)
+  TEST_LOOP (int64_t, 77)
+  TEST_LOOP (uint64_t, 77)
+
+  TEST_LOOP (int8_t, 128) /* Arrays of 128 elements.  */
+  TEST_LOOP (uint8_t, 128)
+  TEST_LOOP (int16_t, 128)
+  TEST_LOOP (uint16_t, 128)
+  TEST_LOOP (int32_t, 128)
+  TEST_LOOP (uint32_t, 128)
+  TEST_LOOP (int64_t, 128)
+  TEST_LOOP (uint64_t, 128)
+
+  TEST_LOOP (int8_t, 15641) /* Arrays of 15641 elements.  */
+  TEST_LOOP (uint8_t, 15641)
+  TEST_LOOP (int16_t, 15641)
+  TEST_LOOP (uint16_t, 15641)
+  TEST_LOOP (int32_t, 15641)
+  TEST_LOOP (uint32_t, 15641)
+  TEST_LOOP (int64_t, 15641)
+  TEST_LOOP (uint64_t, 15641)
+
+  TEST_LOOP (int8_t, 795) /* Arrays of 795 elements.  */
+  TEST_LOOP (uint8_t, 795)
+  TEST_LOOP (int16_t, 795)
+  TEST_LOOP (uint16_t, 795)
+  TEST_LOOP (int32_t, 795)
+  TEST_LOOP (uint32_t, 795)
+  TEST_LOOP (int64_t, 795)
+  TEST_LOOP (uint64_t, 795)
+
+  return 0; /* Reached only if no TEST_LOOP called __builtin_abort.  */
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
index 9809a421fc8..7bd803303d0 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
+++ b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
@@ -65,6 +65,8 @@  foreach op $AUTOVEC_TEST_OPTS {
     "" "$op"
   dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/cmp/*.\[cS\]]] \
     "" "$op"
+  dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/ternop/*.\[cS\]]] \
+    "" "$op"
 }
 
 # VLS-VLMAX tests