diff mbox series

RISC-V: Support COND_LEN_* patterns

Message ID 20230712044424.75724-1-juzhe.zhong@rivai.ai
State New
Headers show
Series RISC-V: Support COND_LEN_* patterns | expand

Commit Message

juzhe.zhong@rivai.ai July 12, 2023, 4:44 a.m. UTC
This patch depends on the following vectorizer patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624179.html

With this patch, we can handle operations that may trap on elements outside the loop.

The following two cases are addressed by this patch:

1. integer division:

  #define TEST_TYPE(TYPE) 				\
  __attribute__((noipa))				\
  void vrem_##TYPE (TYPE * __restrict dst, TYPE * __restrict a, TYPE * __restrict b, int n)	\
  {							\
    for (int i = 0; i < n; i++)				\
      dst[i] = a[i] % b[i];				\
  }
  #define TEST_ALL()	\
   TEST_TYPE(int8_t)

  TEST_ALL()

  Before this patch:

   vrem_int8_t:
        ble     a3,zero,.L14
        csrr    t4,vlenb
        addiw   a5,a3,-1
        addiw   a4,t4,-1
        sext.w  t5,a3
        bltu    a5,a4,.L10
        csrr    t3,vlenb
        subw    t3,t5,t3
        li      a5,0
        vsetvli t6,zero,e8,m1,ta,ma
.L4:
        add     a6,a2,a5
        add     a7,a0,a5
        add     t1,a1,a5
        mv      a4,a5
        add     a5,a5,t4
        vl1re8.v        v2,0(a6)
        vl1re8.v        v1,0(t1)
        sext.w  a6,a5
        vrem.vv v1,v1,v2
        vs1r.v  v1,0(a7)
        bleu    a6,t3,.L4
        csrr    a5,vlenb
        addw    a4,a4,a5
        sext.w  a5,a4
        beq     t5,a4,.L16
.L3:
        csrr    a6,vlenb
        subw    t5,t5,a4
        srli    a6,a6,1
        addiw   t1,t5,-1
        addiw   a7,a6,-1
        bltu    t1,a7,.L9
        slli    a4,a4,32
        srli    a4,a4,32
        add     t0,a1,a4
        add     t6,a2,a4
        add     a4,a0,a4
        vsetvli a7,zero,e8,mf2,ta,ma
        sext.w  t3,a6
        vle8.v  v1,0(t0)
        vle8.v  v2,0(t6)
        subw    t4,t5,a6
        vrem.vv v1,v1,v2
        vse8.v  v1,0(a4)
        mv      t1,t3
        bltu    t4,t3,.L7
        csrr    t1,vlenb
        add     a4,a4,a6
        add     t0,t0,a6
        add     t6,t6,a6
        sext.w  t1,t1
        vle8.v  v1,0(t0)
        vle8.v  v2,0(t6)
        vrem.vv v1,v1,v2
        vse8.v  v1,0(a4)
.L7:
        addw    a5,t1,a5
        beq     t5,t1,.L14
.L9:
        add     a4,a1,a5
        add     a6,a2,a5
        lb      a6,0(a6)
        lb      a4,0(a4)
        add     a7,a0,a5
        addi    a5,a5,1
        remw    a4,a4,a6
        sext.w  a6,a5
        sb      a4,0(a7)
        bgt     a3,a6,.L9
.L14:
        ret
.L10:
        li      a4,0
        li      a5,0
        j       .L3
.L16:
        ret

After this patch:

   vrem_int8_t:
	ble	a3,zero,.L5
.L3:
	vsetvli	a5,a3,e8,m1,tu,ma
	vle8.v	v1,0(a1)
	vle8.v	v2,0(a2)
	sub	a3,a3,a5
	vrem.vv	v1,v1,v2
	vse8.v	v1,0(a0)
	add	a1,a1,a5
	add	a2,a2,a5
	add	a0,a0,a5
	bne	a3,zero,.L3
.L5:
	ret

2. Floating-point operation **WITHOUT** -ffast-math:
     
    #define TEST_TYPE(TYPE) 				\
    __attribute__((noipa))				\
    void vadd_##TYPE (TYPE * __restrict dst, TYPE *__restrict a, TYPE *__restrict b, int n)	\
    {							\
      for (int i = 0; i < n; i++)				\
        dst[i] = a[i] + b[i];				\
    }

    #define TEST_ALL()	\
     TEST_TYPE(float)	\

    TEST_ALL()
   
Before this patch:
   
   vadd_float:
        ble     a3,zero,.L10
        csrr    a4,vlenb
        srli    t3,a4,2
        addiw   a5,a3,-1
        addiw   a6,t3,-1
        sext.w  t6,a3
        bltu    a5,a6,.L7
        subw    t5,t6,t3
        mv      t1,a1
        mv      a7,a2
        mv      a6,a0
        li      a5,0
        vsetvli t4,zero,e32,m1,ta,ma
.L4:
        vl1re32.v       v1,0(t1)
        vl1re32.v       v2,0(a7)
        addw    a5,a5,t3
        vfadd.vv        v1,v1,v2
        vs1r.v  v1,0(a6)
        add     t1,t1,a4
        add     a7,a7,a4
        add     a6,a6,a4
        bgeu    t5,a5,.L4
        beq     t6,a5,.L10
        sext.w  a5,a5
.L3:
        slli    a4,a5,2
.L6:
        add     a6,a1,a4
        add     a7,a2,a4
        flw     fa4,0(a6)
        flw     fa5,0(a7)
        add     a6,a0,a4
        addiw   a5,a5,1
        fadd.s  fa5,fa5,fa4
        addi    a4,a4,4
        fsw     fa5,0(a6)
        bgt     a3,a5,.L6
.L10:
        ret
.L7:
        li      a5,0
        j       .L3

After this patch:

   vadd_float:
	ble	a3,zero,.L5
.L3:
	vsetvli	a5,a3,e32,m1,tu,ma
	slli	a4,a5,2
	vle32.v	v1,0(a1)
	vle32.v	v2,0(a2)
	sub	a3,a3,a5
	vfadd.vv	v1,v1,v2
	vse32.v	v1,0(a0)
	add	a1,a1,a4
	add	a2,a2,a4
	add	a0,a0,a4
	bne	a3,zero,.L3
.L5:
	ret
  
gcc/ChangeLog:

        * config/riscv/autovec.md (cond_len_<optab><mode>): New pattern.
        * config/riscv/riscv-protos.h (enum insn_type): New enum.
        (expand_cond_len_binop): New function.
        * config/riscv/riscv-v.cc (emit_nonvlmax_tu_insn): Ditto.
        (emit_nonvlmax_fp_tu_insn): Ditto.
        (need_frm_p): Ditto.
        (expand_cond_len_binop): Ditto.
        * config/riscv/riscv.cc (riscv_preferred_else_value): Ditto.
        (TARGET_PREFERRED_ELSE_VALUE): New target hook.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c: Adapt testcase.
        * gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c: Ditto.
        * gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c: Ditto.
        * gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c: Ditto.
        * gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c: New test.

---
 gcc/config/riscv/autovec.md                   | 67 ++++++++++++++++
 gcc/config/riscv/riscv-protos.h               |  2 +
 gcc/config/riscv/riscv-v.cc                   | 79 +++++++++++++++++++
 gcc/config/riscv/riscv.cc                     | 17 ++++
 .../riscv/rvv/autovec/binop/vadd-run-nofm.c   |  4 +
 .../rvv/autovec/binop/vadd-rv32gcv-nofm.c     | 13 +++
 .../rvv/autovec/binop/vadd-rv64gcv-nofm.c     | 13 +++
 .../riscv/rvv/autovec/binop/vdiv-run-nofm.c   |  4 +
 .../rvv/autovec/binop/vdiv-rv32gcv-nofm.c     | 12 +++
 .../riscv/rvv/autovec/binop/vdiv-rv32gcv.c    | 11 ++-
 .../rvv/autovec/binop/vdiv-rv64gcv-nofm.c     | 12 +++
 .../riscv/rvv/autovec/binop/vdiv-rv64gcv.c    | 11 ++-
 .../riscv/rvv/autovec/binop/vmul-run-nofm.c   |  4 +
 .../rvv/autovec/binop/vmul-rv32gcv-nofm.c     |  8 ++
 .../rvv/autovec/binop/vmul-rv64gcv-nofm.c     |  8 ++
 .../riscv/rvv/autovec/binop/vrem-rv32gcv.c    | 10 +--
 .../riscv/rvv/autovec/binop/vrem-rv64gcv.c    | 10 +--
 .../riscv/rvv/autovec/binop/vsub-run-nofm.c   |  4 +
 .../rvv/autovec/binop/vsub-rv32gcv-nofm.c     | 14 ++++
 .../rvv/autovec/binop/vsub-rv64gcv-nofm.c     | 14 ++++
 20 files changed, 293 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c

Comments

juzhe.zhong@rivai.ai July 12, 2023, 2:12 p.m. UTC | #1
The middle-end vectorizer patch has been approved and will be merged soon.

The middle-end dependency is resolved.

Ok for trunk?


juzhe.zhong@rivai.ai
 
From: Juzhe-Zhong
Date: 2023-07-12 12:44
To: gcc-patches
CC: kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc; Juzhe-Zhong
Subject: [PATCH] RISC-V: Support COND_LEN_* patterns
This patch is depending on the following patch on Vectorizer:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624179.html
 
With this patch, we can handle operations may trap on elements outside the loop.
 
These 2 following cases will be addressed by this patch:
 
1. integer division:
 
  #define TEST_TYPE(TYPE) \
  __attribute__((noipa)) \
  void vrem_##TYPE (TYPE * __restrict dst, TYPE * __restrict a, TYPE * __restrict b, int n) \
  { \
    for (int i = 0; i < n; i++) \
      dst[i] = a[i] % b[i]; \
  }
  #define TEST_ALL() \
   TEST_TYPE(int8_t) \
  TEST_ALL()
 
  Before this patch:
 
   vrem_int8_t:
        ble     a3,zero,.L14
        csrr    t4,vlenb
        addiw   a5,a3,-1
        addiw   a4,t4,-1
        sext.w  t5,a3
        bltu    a5,a4,.L10
        csrr    t3,vlenb
        subw    t3,t5,t3
        li      a5,0
        vsetvli t6,zero,e8,m1,ta,ma
.L4:
        add     a6,a2,a5
        add     a7,a0,a5
        add     t1,a1,a5
        mv      a4,a5
        add     a5,a5,t4
        vl1re8.v        v2,0(a6)
        vl1re8.v        v1,0(t1)
        sext.w  a6,a5
        vrem.vv v1,v1,v2
        vs1r.v  v1,0(a7)
        bleu    a6,t3,.L4
        csrr    a5,vlenb
        addw    a4,a4,a5
        sext.w  a5,a4
        beq     t5,a4,.L16
.L3:
        csrr    a6,vlenb
        subw    t5,t5,a4
        srli    a6,a6,1
        addiw   t1,t5,-1
        addiw   a7,a6,-1
        bltu    t1,a7,.L9
        slli    a4,a4,32
        srli    a4,a4,32
        add     t0,a1,a4
        add     t6,a2,a4
        add     a4,a0,a4
        vsetvli a7,zero,e8,mf2,ta,ma
        sext.w  t3,a6
        vle8.v  v1,0(t0)
        vle8.v  v2,0(t6)
        subw    t4,t5,a6
        vrem.vv v1,v1,v2
        vse8.v  v1,0(a4)
        mv      t1,t3
        bltu    t4,t3,.L7
        csrr    t1,vlenb
        add     a4,a4,a6
        add     t0,t0,a6
        add     t6,t6,a6
        sext.w  t1,t1
        vle8.v  v1,0(t0)
        vle8.v  v2,0(t6)
        vrem.vv v1,v1,v2
        vse8.v  v1,0(a4)
.L7:
        addw    a5,t1,a5
        beq     t5,t1,.L14
.L9:
        add     a4,a1,a5
        add     a6,a2,a5
        lb      a6,0(a6)
        lb      a4,0(a4)
        add     a7,a0,a5
        addi    a5,a5,1
        remw    a4,a4,a6
        sext.w  a6,a5
        sb      a4,0(a7)
        bgt     a3,a6,.L9
.L14:
        ret
.L10:
        li      a4,0
        li      a5,0
        j       .L3
.L16:
        ret
 
After this patch:
 
   vrem_int8_t:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e8,m1,tu,ma
vle8.v v1,0(a1)
vle8.v v2,0(a2)
sub a3,a3,a5
vrem.vv v1,v1,v2
vse8.v v1,0(a0)
add a1,a1,a5
add a2,a2,a5
add a0,a0,a5
bne a3,zero,.L3
.L5:
ret
 
2. Floating-point operation **WITHOUT** -ffast-math:
     
    #define TEST_TYPE(TYPE) \
    __attribute__((noipa)) \
    void vadd_##TYPE (TYPE * __restrict dst, TYPE *__restrict a, TYPE *__restrict b, int n) \
    { \
      for (int i = 0; i < n; i++) \
        dst[i] = a[i] + b[i]; \
    }
 
    #define TEST_ALL() \
     TEST_TYPE(float) \
 
    TEST_ALL()
   
Before this patch:
   
   vadd_float:
        ble     a3,zero,.L10
        csrr    a4,vlenb
        srli    t3,a4,2
        addiw   a5,a3,-1
        addiw   a6,t3,-1
        sext.w  t6,a3
        bltu    a5,a6,.L7
        subw    t5,t6,t3
        mv      t1,a1
        mv      a7,a2
        mv      a6,a0
        li      a5,0
        vsetvli t4,zero,e32,m1,ta,ma
.L4:
        vl1re32.v       v1,0(t1)
        vl1re32.v       v2,0(a7)
        addw    a5,a5,t3
        vfadd.vv        v1,v1,v2
        vs1r.v  v1,0(a6)
        add     t1,t1,a4
        add     a7,a7,a4
        add     a6,a6,a4
        bgeu    t5,a5,.L4
        beq     t6,a5,.L10
        sext.w  a5,a5
.L3:
        slli    a4,a5,2
.L6:
        add     a6,a1,a4
        add     a7,a2,a4
        flw     fa4,0(a6)
        flw     fa5,0(a7)
        add     a6,a0,a4
        addiw   a5,a5,1
        fadd.s  fa5,fa5,fa4
        addi    a4,a4,4
        fsw     fa5,0(a6)
        bgt     a3,a5,.L6
.L10:
        ret
.L7:
        li      a5,0
        j       .L3
 
After this patch:
 
   vadd_float:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e32,m1,tu,ma
slli a4,a5,2
vle32.v v1,0(a1)
vle32.v v2,0(a2)
sub a3,a3,a5
vfadd.vv v1,v1,v2
vse32.v v1,0(a0)
add a1,a1,a4
add a2,a2,a4
add a0,a0,a4
bne a3,zero,.L3
.L5:
ret
  
gcc/ChangeLog:
 
        * config/riscv/autovec.md (cond_len_<optab><mode>): New pattern.
        * config/riscv/riscv-protos.h (enum insn_type): New enum.
        (expand_cond_len_binop): New function.
        * config/riscv/riscv-v.cc (emit_nonvlmax_tu_insn): Ditto.
        (emit_nonvlmax_fp_tu_insn): Ditto.
        (need_frm_p): Ditto.
        (expand_cond_len_binop): Ditto.
        * config/riscv/riscv.cc (riscv_preferred_else_value): Ditto.
        (TARGET_PREFERRED_ELSE_VALUE): New target hook.
 
gcc/testsuite/ChangeLog:
 
        * gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c: Adapt testcase.
        * gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c: Ditto.
        * gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c: Ditto.
        * gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c: Ditto.
        * gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c: New test.
        * gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c: New test.
 
---
gcc/config/riscv/autovec.md                   | 67 ++++++++++++++++
gcc/config/riscv/riscv-protos.h               |  2 +
gcc/config/riscv/riscv-v.cc                   | 79 +++++++++++++++++++
gcc/config/riscv/riscv.cc                     | 17 ++++
.../riscv/rvv/autovec/binop/vadd-run-nofm.c   |  4 +
.../rvv/autovec/binop/vadd-rv32gcv-nofm.c     | 13 +++
.../rvv/autovec/binop/vadd-rv64gcv-nofm.c     | 13 +++
.../riscv/rvv/autovec/binop/vdiv-run-nofm.c   |  4 +
.../rvv/autovec/binop/vdiv-rv32gcv-nofm.c     | 12 +++
.../riscv/rvv/autovec/binop/vdiv-rv32gcv.c    | 11 ++-
.../rvv/autovec/binop/vdiv-rv64gcv-nofm.c     | 12 +++
.../riscv/rvv/autovec/binop/vdiv-rv64gcv.c    | 11 ++-
.../riscv/rvv/autovec/binop/vmul-run-nofm.c   |  4 +
.../rvv/autovec/binop/vmul-rv32gcv-nofm.c     |  8 ++
.../rvv/autovec/binop/vmul-rv64gcv-nofm.c     |  8 ++
.../riscv/rvv/autovec/binop/vrem-rv32gcv.c    | 10 +--
.../riscv/rvv/autovec/binop/vrem-rv64gcv.c    | 10 +--
.../riscv/rvv/autovec/binop/vsub-run-nofm.c   |  4 +
.../rvv/autovec/binop/vsub-rv32gcv-nofm.c     | 14 ++++
.../rvv/autovec/binop/vsub-rv64gcv-nofm.c     | 14 ++++
20 files changed, 293 insertions(+), 24 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 9e61b2e41d8..3a80ecd5d33 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1178,3 +1178,70 @@
riscv_vector::RVV_BINOP, operands);
   DONE;
})
+
+;; -------------------------------------------------------------------------
+;; ---- [INT] Conditional binary operations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - vadd.vv/vsub.vv/...
+;; - vadd.vi/vsub.vi/...
+;; -------------------------------------------------------------------------
+
+;; Length-controlled conditional integer binary operation.  The operands
+;; are forwarded unchanged to riscv_vector::expand_cond_len_binop, which
+;; reads them as:
+;;   0 = destination, 1 = mask, 2 and 3 = the two sources,
+;;   4 = merge value, 5 = length.
+;; Operand 6 must be constant zero and is not read by the expander
+;; (presumably the length bias -- TODO confirm).
+(define_expand "cond_len_<optab><mode>"
+  [(match_operand:VI 0 "register_operand")
+   (match_operand:<VM> 1 "vector_mask_operand")
+   (any_int_binop_no_shift:VI
+     (match_operand:VI 2 "<binop_rhs1_predicate>")
+     (match_operand:VI 3 "<binop_rhs2_predicate>"))
+   (match_operand:VI 4 "register_operand")
+   (match_operand 5 "autovec_length_operand")
+   (match_operand 6 "const_0_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::expand_cond_len_binop (<CODE>, operands);
+  DONE;
+})
+
+;; -------------------------------------------------------------------------
+;; ---- [FP] Conditional binary operations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - vfadd.vv/vfsub.vv/...
+;; - vfadd.vf/vfsub.vf/...
+;; -------------------------------------------------------------------------
+
+;; Length-controlled conditional floating-point binary operation for the
+;; codes in any_float_binop.  The operands are forwarded unchanged to
+;; riscv_vector::expand_cond_len_binop, which reads them as:
+;;   0 = destination, 1 = mask, 2 and 3 = the two sources,
+;;   4 = merge value, 5 = length.
+;; Operand 6 must be constant zero and is not read by the expander
+;; (presumably the length bias -- TODO confirm).
+(define_expand "cond_len_<optab><mode>"
+  [(match_operand:VF 0 "register_operand")
+   (match_operand:<VM> 1 "vector_mask_operand")
+   (any_float_binop:VF
+     (match_operand:VF 2 "register_operand")
+     (match_operand:VF 3 "register_operand"))
+   (match_operand:VF 4 "register_operand")
+   (match_operand 5 "autovec_length_operand")
+   (match_operand 6 "const_0_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::expand_cond_len_binop (<CODE>, operands);
+  DONE;
+})
+
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - vfmin.vv/vfmax.vv
+;; - vfmin.vf/vfmax.vf
+;; -------------------------------------------------------------------------
+
+;; Length-controlled conditional floating-point binary operation for the
+;; codes in any_float_binop_nofrm (the min/max forms listed above).  The
+;; operands are forwarded unchanged to riscv_vector::expand_cond_len_binop,
+;; which reads them as:
+;;   0 = destination, 1 = mask, 2 and 3 = the two sources,
+;;   4 = merge value, 5 = length.
+;; Operand 6 must be constant zero and is not read by the expander
+;; (presumably the length bias -- TODO confirm).
+(define_expand "cond_len_<optab><mode>"
+  [(match_operand:VF 0 "register_operand")
+   (match_operand:<VM> 1 "vector_mask_operand")
+   (any_float_binop_nofrm:VF
+     (match_operand:VF 2 "register_operand")
+     (match_operand:VF 3 "register_operand"))
+   (match_operand:VF 4 "register_operand")
+   (match_operand 5 "autovec_length_operand")
+   (match_operand 6 "const_0_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::expand_cond_len_binop (<CODE>, operands);
+  DONE;
+})
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 5766e3597e8..df433a10629 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -139,6 +139,7 @@ enum insn_type
   RVV_UNOP = 2,
   RVV_BINOP = 3,
   RVV_BINOP_MU = RVV_BINOP + 2,
+  RVV_BINOP_TU = RVV_BINOP + 2,
   RVV_MERGE_OP = 4,
   RVV_CMP_OP = 4,
   RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand.  */
@@ -230,6 +231,7 @@ bool neg_simm5_p (rtx);
bool has_vi_variant_p (rtx_code, rtx);
void expand_vec_cmp (rtx, rtx_code, rtx, rtx);
bool expand_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
+void expand_cond_len_binop (rtx_code, rtx *);
#endif
bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
  bool, void (*)(rtx *, rtx));
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 8d5bed7ebe4..499f66d6940 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -919,6 +919,45 @@ emit_vlmax_masked_mu_insn (unsigned icode, int op_num, rtx *ops)
   e.emit_insn ((enum insn_code) icode, ops);
}
+/* Emit instruction ICODE on operands OPS as a non-VLMAX insn: AVL is set
+   as the vector length and the policy is tail-undisturbed (TU) with
+   mask-any.  OP_NUM is passed to the expander as its operand count.  The
+   destination mode is taken from OPS[0] and the mask mode is derived
+   from it.  */
+static void
+emit_nonvlmax_tu_insn (unsigned icode, int op_num, rtx *ops, rtx avl)
+{
+  machine_mode dest_mode = GET_MODE (ops[0]);
+  machine_mode mask_mode = get_mask_mode (dest_mode).require ();
+  insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num,
+   /*HAS_DEST_P*/ true,
+   /*FULLY_UNMASKED_P*/ false,
+   /*USE_REAL_MERGE_P*/ true,
+   /*HAS_AVL_P*/ true,
+   /*VLMAX_P*/ false, dest_mode,
+   mask_mode);
+  e.set_policy (TAIL_UNDISTURBED);
+  e.set_policy (MASK_ANY);
+  e.set_vl (avl);
+  e.emit_insn ((enum insn_code) icode, ops);
+}
+
+/* Emit floating-point instruction ICODE on operands OPS as a non-VLMAX
+   insn: AVL is set as the vector length, the policy is tail-undisturbed
+   (TU) with mask-any, and the rounding mode is set to FRM_DYN.  OP_NUM is
+   passed to the expander as its operand count.  The destination mode is
+   taken from OPS[0] and the mask mode is derived from it.  */
+static void
+emit_nonvlmax_fp_tu_insn (unsigned icode, int op_num, rtx *ops, rtx avl)
+{
+  machine_mode dest_mode = GET_MODE (ops[0]);
+  machine_mode mask_mode = get_mask_mode (dest_mode).require ();
+  insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num,
+   /*HAS_DEST_P*/ true,
+   /*FULLY_UNMASKED_P*/ false,
+   /*USE_REAL_MERGE_P*/ true,
+   /*HAS_AVL_P*/ true,
+   /*VLMAX_P*/ false, dest_mode,
+   mask_mode);
+  e.set_policy (TAIL_UNDISTURBED);
+  e.set_policy (MASK_ANY);
+  e.set_rounding_mode (FRM_DYN);
+  e.set_vl (avl);
+  e.emit_insn ((enum insn_code) icode, ops);
+}
+
/* Emit vmv.s.x instruction.  */
void
@@ -2812,4 +2851,44 @@ expand_load_store (rtx *ops, bool is_load)
     }
}
+/* Return true if CODE operating on MODE is a floating-point operation
+   that needs FRM.  Returns false for non-float modes and for SMIN and
+   SMAX; every other code on a float mode returns true.  */
+static bool
+need_frm_p (rtx_code code, machine_mode mode)
+{
+  if (!FLOAT_MODE_P (mode))
+    return false;
+  return code != SMIN && code != SMAX;
+}
+
+/* Expand COND_LEN_* for CODE.  OPS = {dest, mask, src1, src2, merge, len}.  */
+void
+expand_cond_len_binop (rtx_code code, rtx *ops)
+{
+  rtx dest = ops[0];
+  rtx mask = ops[1];
+  rtx src1 = ops[2];
+  rtx src2 = ops[3];
+  rtx merge = ops[4];
+  rtx len = ops[5];
+  machine_mode mode = GET_MODE (dest);
+  machine_mode mask_mode = GET_MODE (mask);
+
+  /* Only an all-ones (dummy) mask is handled for now.  */
+  bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
+
+  if (is_dummy_mask)
+    {
+      /* Use TU, MASK ANY policy.  */
+      rtx cond_ops[] = {dest, mask, merge, src1, src2};
+      insn_code icode = code_for_pred (code, mode);
+      if (need_frm_p (code, mode))
+        emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_TU, cond_ops, len);
+      else
+        emit_nonvlmax_tu_insn (icode, RVV_BINOP_TU, cond_ops, len);
+    }
+  else
+    /* FIXME: Enable this case when we support it in the middle-end.  */
+    gcc_unreachable ();
+}
+
} // namespace riscv_vector
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 38d8eb2fcf5..3febb3f1288 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7855,6 +7855,20 @@ riscv_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return false;
}
+/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
+   prefer to use the first arithmetic operand as the else value if
+   the else value doesn't matter, since that exactly matches the SVE
+   destructive merging form.  For ternary operations we could either
+   pick the first operand and use FMAD-like instructions or the last
+   operand and use FMLA-like instructions; the latter seems more
+   natural.
+
+   Concretely: return OPS[2] when NOPS is 3 and OPS[0] otherwise.  The
+   first two (unnamed) parameters are ignored.
+
+   NOTE(review): SVE, FMAD and FMLA are not RISC-V terms; this rationale
+   looks carried over from another target -- confirm it holds for RVV.  */
+
+static tree
+riscv_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
+{
+  return nops == 3 ? ops[2] : ops[0];
+}
+
/* Initialize the GCC target structure.  */
#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -8156,6 +8170,9 @@ riscv_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST riscv_vectorize_vec_perm_const
+#undef TARGET_PREFERRED_ELSE_VALUE
+#define TARGET_PREFERRED_ELSE_VALUE riscv_preferred_else_value
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-riscv.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c
new file mode 100644
index 00000000000..66ae1167128
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c
@@ -0,0 +1,4 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model --param=riscv-autovec-preference=scalable" } */
+
+#include "vadd-run.c"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c
new file mode 100644
index 00000000000..069bc690697
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vadd-template.h"
+
+/* { dg-final { scan-assembler-times {\tvadd\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvadd\.vi} 8 } } */
+/* { dg-final { scan-assembler-times {\tvfadd\.vv} 7 } } */
+/* There are 2 MINUS operations.  */
+/* { dg-final { scan-assembler-times {\tvfsub\.vv} 2 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_ADD" 7 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_SUB" 2 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c
new file mode 100644
index 00000000000..07fa54878cc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vadd-template.h"
+
+/* { dg-final { scan-assembler-times {\tvadd\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvadd\.vi} 8 } } */
+/* { dg-final { scan-assembler-times {\tvfadd\.vv} 7 } } */
+/* There are 2 MINUS operations.  */
+/* { dg-final { scan-assembler-times {\tvfsub\.vv} 2 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_ADD" 7 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_SUB" 2 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c
new file mode 100644
index 00000000000..ed340b840d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c
@@ -0,0 +1,4 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model --param=riscv-autovec-preference=scalable" } */
+
+#include "vdiv-run.c"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c
new file mode 100644
index 00000000000..5ce2e57e265
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vdiv-template.h"
+
+/* { dg-final { scan-assembler-times {\tvdiv\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvdivu\.vv} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tvfdiv\.vv} 6 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_DIV" 16 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_RDIV" 6 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c
index 604d9acb038..9b984dda452 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c
@@ -1,15 +1,14 @@
/* { dg-do compile } */
-/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=fixed-vlmax -ffast-math" } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=fixed-vlmax -ffast-math -fdump-tree-optimized-details" } */
#include "vdiv-template.h"
-/* Currently we use an epilogue loop which also contains vdivs.  Therefore we
-   expect 14 vdiv[u]s instead of 8.  */
-
-/* { dg-final { scan-assembler-times {\tvdiv\.vv} 14 } } */
-/* { dg-final { scan-assembler-times {\tvdivu\.vv} 14 } } */
+/* { dg-final { scan-assembler-times {\tvdiv\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvdivu\.vv} 8 } } */
/* Division by constant is done by calculating a reciprocal and
    then multiplying.  Hence we do not expect 6 vfdivs.  */
/* { dg-final { scan-assembler-times {\tvfdiv\.vv} 3 } } */
/* { dg-final { scan-assembler-times {\tvfmul\.vv} 3 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_DIV" 16 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c
new file mode 100644
index 00000000000..7b1aa28e45e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vdiv-template.h"
+
+/* { dg-final { scan-assembler-times {\tvdiv\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvdivu\.vv} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tvfdiv\.vv} 6 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_DIV" 16 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_RDIV" 6 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c
index 26884035d57..ca4d23bb1ed 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c
@@ -1,15 +1,14 @@
/* { dg-do compile } */
-/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=fixed-vlmax -ffast-math" } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=fixed-vlmax -ffast-math -fdump-tree-optimized-details" } */
#include "vdiv-template.h"
-/* Currently we use an epilogue loop which also contains vdivs.  Therefore we
-   expect 14 vdiv[u]s instead of 8.  */
-
-/* { dg-final { scan-assembler-times {\tvdiv\.vv} 14 } } */
-/* { dg-final { scan-assembler-times {\tvdivu\.vv} 14 } } */
+/* { dg-final { scan-assembler-times {\tvdiv\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvdivu\.vv} 8 } } */
/* Division by constant is done by calculating a reciprocal and
    then multiplying.  Hence we do not expect 6 vfdivs.  */
/* { dg-final { scan-assembler-times {\tvfdiv\.vv} 3 } } */
/* { dg-final { scan-assembler-times {\tvfmul\.vv} 3 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_DIV" 16 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c
new file mode 100644
index 00000000000..225030e2df0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c
@@ -0,0 +1,4 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model --param=riscv-autovec-preference=scalable" } */
+
+#include "vmul-run.c"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c
new file mode 100644
index 00000000000..3e0f06162fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vmul-template.h"
+
+/* { dg-final { scan-assembler-times {\tvmul\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvfmul\.vv} 6 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_MUL" 6 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c
new file mode 100644
index 00000000000..ca245e28662
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vmul-template.h"
+
+/* { dg-final { scan-assembler-times {\tvmul\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvfmul\.vv} 6 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_MUL" 6 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c
index 86607d03777..6d0493b3194 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c
@@ -1,9 +1,7 @@
-/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=fixed-vlmax" } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=fixed-vlmax -fdump-tree-optimized-details" } */
#include "vrem-template.h"
-/* Currently we use an epilogue loop which also contains vrems.  Therefore we
-   expect 14 vrem[u]s instead of 8.  */
-
-/* { dg-final { scan-assembler-times {\tvrem\.vv} 14 } } */
-/* { dg-final { scan-assembler-times {\tvremu\.vv} 14 } } */
+/* { dg-final { scan-assembler-times {\tvrem\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvremu\.vv} 8 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_MOD" 16 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c
index b7bc1ccb860..24b2bc81a90 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c
@@ -1,10 +1,8 @@
/* { dg-do compile } */
-/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=fixed-vlmax" } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=fixed-vlmax -fdump-tree-optimized-details" } */
#include "vrem-template.h"
-/* Currently we use an epilogue loop which also contains vrems.  Therefore we
-   expect 14 vrem[u]s instead of 8.  */
-
-/* { dg-final { scan-assembler-times {\tvrem\.vv} 14 } } */
-/* { dg-final { scan-assembler-times {\tvremu\.vv} 14 } } */
+/* { dg-final { scan-assembler-times {\tvrem\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvremu\.vv} 8 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_MOD" 16 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c
new file mode 100644
index 00000000000..36a17068e0e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c
@@ -0,0 +1,4 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model --param=riscv-autovec-preference=scalable" } */
+
+#include "vsub-run.c"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c
new file mode 100644
index 00000000000..c4ab934cdf5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vsub-template.h"
+
+/* { dg-final { scan-assembler-times {\tvsub\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvrsub\.vi} 16 } } */
+
+/* { dg-final { scan-assembler-times {\tvfsub\.vv} 12 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_SUB" 12 "optimized" } } */
+
+/* Do not expect vfrsub for now, because we do not properly
+   handle vop.vx and vfop.vf yet.  */
+/* { dg-final { scan-assembler-times {\tvfrsub\.vv} 0 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c
new file mode 100644
index 00000000000..9e71911a92a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vsub-template.h"
+
+/* { dg-final { scan-assembler-times {\tvsub\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvrsub\.vi} 16 } } */
+
+/* { dg-final { scan-assembler-times {\tvfsub\.vv} 12 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_SUB" 12 "optimized" } } */
+
+/* Do not expect vfrsub for now, because we do not properly
+   handle vop.vx and vfop.vf yet.  */
+/* { dg-final { scan-assembler-times {\tvfrsub\.vv} 0 } } */
Robin Dapp July 12, 2023, 2:24 p.m. UTC | #2
Hi Juzhe,

> +/* Return true if the operation is the floating-point operation need FRM.  */
> +static bool
> +need_frm_p (rtx_code code, machine_mode mode)
> +{
> +  if (!FLOAT_MODE_P (mode))
> +    return false;
> +  return code != SMIN && code != SMAX;
> +}

Return true if the operation requires a rounding mode operand.  Maybe also
call it needs_fp_rounding?

> +      if (need_frm_p (code, mode))
> +	emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_MU, ops, len);
> +      else
> +	emit_nonvlmax_tu_insn (icode, RVV_BINOP_MU, ops, len);
> +    }

This feels like we could decide it inside emit_nonvlmax_tu_insn.
Same for without _tu.  But let's keep it like this for now in
order not to stall progress.

> +/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
> +   prefer to use the first arithmetic operand as the else value if
> +   the else value doesn't matter, since that exactly matches the SVE
> +   destructive merging form.  For ternary operations we could either
> +   pick the first operand and use FMAD-like instructions or the last
> +   operand and use FMLA-like instructions; the latter seems more
> +   natural.  */

What's FMLA?  That's SVE I suppose and ours is fmacc?

Apart from that fine from my side, thanks for supporting this.

Regards
 Robin
juzhe.zhong@rivai.ai July 12, 2023, 2:30 p.m. UTC | #3
>> Return true if the operation requires a rounding mode operand.  Maybe also
>>call it needs_fp_rounding?
ok

>>What's FMLA?  That's SVE I suppose and ours is fmacc?
Yes, the comment is misleading; I will fix it soon.


juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-07-12 22:24
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Support COND_LEN_* patterns
Hi Juzhe,
 
> +/* Return true if the operation is the floating-point operation need FRM.  */
> +static bool
> +need_frm_p (rtx_code code, machine_mode mode)
> +{
> +  if (!FLOAT_MODE_P (mode))
> +    return false;
> +  return code != SMIN && code != SMAX;
> +}
 
Return true if the operation requires a rounding mode operand.  Maybe also
call it needs_fp_rounding?
 
> +      if (need_frm_p (code, mode))
> + emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_MU, ops, len);
> +      else
> + emit_nonvlmax_tu_insn (icode, RVV_BINOP_MU, ops, len);
> +    }
 
This feels like we could decide it inside emit_nonvlmax_tu_insn.
Same for without _tu.  But let's keep it like this for now in
order not to stall progress.
 
> +/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
> +   prefer to use the first arithmetic operand as the else value if
> +   the else value doesn't matter, since that exactly matches the SVE
> +   destructive merging form.  For ternary operations we could either
> +   pick the first operand and use FMAD-like instructions or the last
> +   operand and use FMLA-like instructions; the latter seems more
> +   natural.  */
 
What's FMLA?  That's SVE I suppose and ours is fmacc?
 
Apart from that fine from my side, thanks for supporting this.
 
Regards
Robin
diff mbox series

Patch

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 9e61b2e41d8..3a80ecd5d33 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1178,3 +1178,70 @@ 
 				 riscv_vector::RVV_BINOP, operands);
   DONE;
 })
+
+;; -------------------------------------------------------------------------
+;; ---- [INT] Conditional binary operations (cond_len_<optab><mode>)
+;; -------------------------------------------------------------------------
+;; Operands: 0 dest, 1 mask, 2/3 sources, 4 else value, 5 length, 6 const 0.
+;; Includes:
+;; - vadd.vv/vsub.vv/... and vadd.vi/vsub.vi/...
+;; -------------------------------------------------------------------------
+
+(define_expand "cond_len_<optab><mode>"
+  [(match_operand:VI 0 "register_operand")
+   (match_operand:<VM> 1 "vector_mask_operand")
+   (any_int_binop_no_shift:VI
+     (match_operand:VI 2 "<binop_rhs1_predicate>")
+     (match_operand:VI 3 "<binop_rhs2_predicate>"))
+   (match_operand:VI 4 "register_operand")
+   (match_operand 5 "autovec_length_operand")
+   (match_operand 6 "const_0_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::expand_cond_len_binop (<CODE>, operands);
+  DONE;
+})
+
+;; -------------------------------------------------------------------------
+;; ---- [FP] Conditional binary operations (cond_len_<optab><mode>)
+;; -------------------------------------------------------------------------
+;; Operands: 0 dest, 1 mask, 2/3 sources, 4 else value, 5 length, 6 const 0.
+;; Includes:
+;; - vfadd.vv/vfsub.vv/... and vfadd.vf/vfsub.vf/...
+;; -------------------------------------------------------------------------
+
+(define_expand "cond_len_<optab><mode>"
+  [(match_operand:VF 0 "register_operand")
+   (match_operand:<VM> 1 "vector_mask_operand")
+   (any_float_binop:VF
+     (match_operand:VF 2 "register_operand")
+     (match_operand:VF 3 "register_operand"))
+   (match_operand:VF 4 "register_operand")
+   (match_operand 5 "autovec_length_operand")
+   (match_operand 6 "const_0_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::expand_cond_len_binop (<CODE>, operands);
+  DONE;
+})
+
+;; -------------------------------------------------------------------------
+;; ---- [FP] Conditional binary operations without a rounding mode
+;; Includes (any_float_binop_nofrm):
+;; - vfmin.vv/vfmax.vv and vfmin.vf/vfmax.vf
+;; -------------------------------------------------------------------------
+
+(define_expand "cond_len_<optab><mode>"
+  [(match_operand:VF 0 "register_operand")
+   (match_operand:<VM> 1 "vector_mask_operand")
+   (any_float_binop_nofrm:VF
+     (match_operand:VF 2 "register_operand")
+     (match_operand:VF 3 "register_operand"))
+   (match_operand:VF 4 "register_operand")
+   (match_operand 5 "autovec_length_operand")
+   (match_operand 6 "const_0_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::expand_cond_len_binop (<CODE>, operands);
+  DONE;
+})
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 5766e3597e8..df433a10629 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -139,6 +139,7 @@  enum insn_type
   RVV_UNOP = 2,
   RVV_BINOP = 3,
   RVV_BINOP_MU = RVV_BINOP + 2,
+  RVV_BINOP_TU = RVV_BINOP + 2,
   RVV_MERGE_OP = 4,
   RVV_CMP_OP = 4,
   RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand.  */
@@ -230,6 +231,7 @@  bool neg_simm5_p (rtx);
 bool has_vi_variant_p (rtx_code, rtx);
 void expand_vec_cmp (rtx, rtx_code, rtx, rtx);
 bool expand_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
+void expand_cond_len_binop (rtx_code, rtx *);
 #endif
 bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
 			  bool, void (*)(rtx *, rtx));
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 8d5bed7ebe4..499f66d6940 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -919,6 +919,45 @@  emit_vlmax_masked_mu_insn (unsigned icode, int op_num, rtx *ops)
   e.emit_insn ((enum insn_code) icode, ops);
 }
 
+/* Emit non-VLMAX insn ICODE with TU/mask-any policy and length AVL.  */
+static void
+emit_nonvlmax_tu_insn (unsigned icode, int op_num, rtx *ops, rtx avl)
+{
+  machine_mode dest_mode = GET_MODE (ops[0]);
+  machine_mode mask_mode = get_mask_mode (dest_mode).require ();
+  insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num,
+					  /*HAS_DEST_P*/ true,
+					  /*FULLY_UNMASKED_P*/ false,
+					  /*USE_REAL_MERGE_P*/ true,
+					  /*HAS_AVL_P*/ true,
+					  /*VLMAX_P*/ false, dest_mode,
+					  mask_mode);
+  e.set_policy (TAIL_UNDISTURBED);
+  e.set_policy (MASK_ANY);
+  e.set_vl (avl);
+  e.emit_insn ((enum insn_code) icode, ops);
+}
+
+/* As emit_nonvlmax_tu_insn, but also set rounding mode FRM_DYN (FP ops).  */
+static void
+emit_nonvlmax_fp_tu_insn (unsigned icode, int op_num, rtx *ops, rtx avl)
+{
+  machine_mode dest_mode = GET_MODE (ops[0]);
+  machine_mode mask_mode = get_mask_mode (dest_mode).require ();
+  insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num,
+					  /*HAS_DEST_P*/ true,
+					  /*FULLY_UNMASKED_P*/ false,
+					  /*USE_REAL_MERGE_P*/ true,
+					  /*HAS_AVL_P*/ true,
+					  /*VLMAX_P*/ false, dest_mode,
+					  mask_mode);
+  e.set_policy (TAIL_UNDISTURBED);
+  e.set_policy (MASK_ANY);
+  e.set_rounding_mode (FRM_DYN);
+  e.set_vl (avl);
+  e.emit_insn ((enum insn_code) icode, ops);
+}
+
 /* Emit vmv.s.x instruction.  */
 
 void
@@ -2812,4 +2851,44 @@  expand_load_store (rtx *ops, bool is_load)
     }
 }
 
+/* Return true if operation CODE in MODE requires a rounding mode operand.  */
+static bool
+need_frm_p (rtx_code code, machine_mode mode)
+{
+  if (!FLOAT_MODE_P (mode))
+    return false;
+  return code != SMIN && code != SMAX;
+}
+
+/* Expand COND_LEN_* binary operation CODE.  OPS is dest, mask, src1, src2,
+   merge (else value) and len; only an all-ones mask is handled for now.  */
+void
+expand_cond_len_binop (rtx_code code, rtx *ops)
+{
+  rtx dest = ops[0];
+  rtx mask = ops[1];
+  rtx src1 = ops[2];
+  rtx src2 = ops[3];
+  rtx merge = ops[4];
+  rtx len = ops[5];
+  machine_mode mode = GET_MODE (dest);
+  machine_mode mask_mode = GET_MODE (mask);
+
+  bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
+
+  if (is_dummy_mask)
+    {
+      /* Use TU, MASK ANY policy.  */
+      rtx tu_ops[] = {dest, mask, merge, src1, src2};
+      insn_code icode = code_for_pred (code, mode);
+      if (need_frm_p (code, mode))
+	emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_TU, tu_ops, len);
+      else
+	emit_nonvlmax_tu_insn (icode, RVV_BINOP_TU, tu_ops, len);
+    }
+  else
+    /* FIXME: Enable this case when we support it in the middle-end.  */
+    gcc_unreachable ();
+}
+
 } // namespace riscv_vector
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 38d8eb2fcf5..3febb3f1288 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7855,6 +7855,20 @@  riscv_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return false;
 }
 
+/* Implement TARGET_PREFERRED_ELSE_VALUE.  For operations with three
+   arithmetic operands (NOPS == 3), return the last operand OPS[2] as
+   the else value; for all other operations return the first operand
+   OPS[0].  The hook is consulted when the else value doesn't matter.
+   NOTE(review): the ternary choice presumably pairs the merge operand
+   with the accumulator of vmacc/vfmacc-style instructions -- confirm
+   against the ternary COND_LEN_* expanders once they are added.  */
+
+static tree
+riscv_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
+{
+  return nops == 3 ? ops[2] : ops[0];
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -8156,6 +8170,9 @@  riscv_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 #undef TARGET_VECTORIZE_VEC_PERM_CONST
 #define TARGET_VECTORIZE_VEC_PERM_CONST riscv_vectorize_vec_perm_const
 
+#undef TARGET_PREFERRED_ELSE_VALUE
+#define TARGET_PREFERRED_ELSE_VALUE riscv_preferred_else_value
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-riscv.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c
new file mode 100644
index 00000000000..66ae1167128
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c
@@ -0,0 +1,4 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model --param=riscv-autovec-preference=scalable" } */
+
+#include "vadd-run.c"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c
new file mode 100644
index 00000000000..069bc690697
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vadd-template.h"
+
+/* { dg-final { scan-assembler-times {\tvadd\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvadd\.vi} 8 } } */
+/* { dg-final { scan-assembler-times {\tvfadd\.vv} 7 } } */
+/* There are 2 MINUS operations.  */
+/* { dg-final { scan-assembler-times {\tvfsub\.vv} 2 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_ADD" 7 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_SUB" 2 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c
new file mode 100644
index 00000000000..07fa54878cc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vadd-template.h"
+
+/* { dg-final { scan-assembler-times {\tvadd\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvadd\.vi} 8 } } */
+/* { dg-final { scan-assembler-times {\tvfadd\.vv} 7 } } */
+/* There are 2 MINUS operations.  */
+/* { dg-final { scan-assembler-times {\tvfsub\.vv} 2 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_ADD" 7 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_SUB" 2 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c
new file mode 100644
index 00000000000..ed340b840d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c
@@ -0,0 +1,4 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model --param=riscv-autovec-preference=scalable" } */
+
+#include "vdiv-run.c"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c
new file mode 100644
index 00000000000..5ce2e57e265
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vdiv-template.h"
+
+/* { dg-final { scan-assembler-times {\tvdiv\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvdivu\.vv} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tvfdiv\.vv} 6 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_DIV" 16 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_RDIV" 6 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c
index 604d9acb038..9b984dda452 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c
@@ -1,15 +1,14 @@ 
 /* { dg-do compile } */
-/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=fixed-vlmax -ffast-math" } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=fixed-vlmax -ffast-math -fdump-tree-optimized-details" } */
 
 #include "vdiv-template.h"
 
-/* Currently we use an epilogue loop which also contains vdivs.  Therefore we
-   expect 14 vdiv[u]s instead of 8.  */
-
-/* { dg-final { scan-assembler-times {\tvdiv\.vv} 14 } } */
-/* { dg-final { scan-assembler-times {\tvdivu\.vv} 14 } } */
+/* { dg-final { scan-assembler-times {\tvdiv\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvdivu\.vv} 8 } } */
 
 /* Division by constant is done by calculating a reciprocal and
    then multiplying.  Hence we do not expect 6 vfdivs.  */
 /* { dg-final { scan-assembler-times {\tvfdiv\.vv} 3 } } */
 /* { dg-final { scan-assembler-times {\tvfmul\.vv} 3 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_DIV" 16 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c
new file mode 100644
index 00000000000..7b1aa28e45e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vdiv-template.h"
+
+/* { dg-final { scan-assembler-times {\tvdiv\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvdivu\.vv} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tvfdiv\.vv} 6 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_DIV" 16 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_RDIV" 6 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c
index 26884035d57..ca4d23bb1ed 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c
@@ -1,15 +1,14 @@ 
 /* { dg-do compile } */
-/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=fixed-vlmax -ffast-math" } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=fixed-vlmax -ffast-math -fdump-tree-optimized-details" } */
 
 #include "vdiv-template.h"
 
-/* Currently we use an epilogue loop which also contains vdivs.  Therefore we
-   expect 14 vdiv[u]s instead of 8.  */
-
-/* { dg-final { scan-assembler-times {\tvdiv\.vv} 14 } } */
-/* { dg-final { scan-assembler-times {\tvdivu\.vv} 14 } } */
+/* { dg-final { scan-assembler-times {\tvdiv\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvdivu\.vv} 8 } } */
 
 /* Division by constant is done by calculating a reciprocal and
    then multiplying.  Hence we do not expect 6 vfdivs.  */
 /* { dg-final { scan-assembler-times {\tvfdiv\.vv} 3 } } */
 /* { dg-final { scan-assembler-times {\tvfmul\.vv} 3 } } */
+
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_DIV" 16 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c
new file mode 100644
index 00000000000..225030e2df0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c
@@ -0,0 +1,4 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model --param=riscv-autovec-preference=scalable" } */
+
+#include "vmul-run.c"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c
new file mode 100644
index 00000000000..3e0f06162fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c
@@ -0,0 +1,8 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vmul-template.h"
+
+/* { dg-final { scan-assembler-times {\tvmul\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvfmul\.vv} 6 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_MUL" 6 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c
new file mode 100644
index 00000000000..ca245e28662
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c
@@ -0,0 +1,8 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vmul-template.h"
+
+/* { dg-final { scan-assembler-times {\tvmul\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvfmul\.vv} 6 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_MUL" 6 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c
index 86607d03777..6d0493b3194 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c
@@ -1,9 +1,7 @@ 
-/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=fixed-vlmax" } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=fixed-vlmax -fdump-tree-optimized-details" } */
 
 #include "vrem-template.h"
 
-/* Currently we use an epilogue loop which also contains vrems.  Therefore we
-   expect 14 vrem[u]s instead of 8.  */
-
-/* { dg-final { scan-assembler-times {\tvrem\.vv} 14 } } */
-/* { dg-final { scan-assembler-times {\tvremu\.vv} 14 } } */
+/* { dg-final { scan-assembler-times {\tvrem\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvremu\.vv} 8 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_MOD" 16 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c
index b7bc1ccb860..24b2bc81a90 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c
@@ -1,10 +1,8 @@ 
 /* { dg-do compile } */
-/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=fixed-vlmax" } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=fixed-vlmax -fdump-tree-optimized-details" } */
 
 #include "vrem-template.h"
 
-/* Currently we use an epilogue loop which also contains vrems.  Therefore we
-   expect 14 vrem[u]s instead of 8.  */
-
-/* { dg-final { scan-assembler-times {\tvrem\.vv} 14 } } */
-/* { dg-final { scan-assembler-times {\tvremu\.vv} 14 } } */
+/* { dg-final { scan-assembler-times {\tvrem\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {\tvremu\.vv} 8 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_MOD" 16 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c
new file mode 100644
index 00000000000..36a17068e0e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c
@@ -0,0 +1,4 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model --param=riscv-autovec-preference=scalable" } */
+
+#include "vsub-run.c"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c
new file mode 100644
index 00000000000..c4ab934cdf5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vsub-template.h"
+
+/* { dg-final { scan-assembler-times {\tvsub\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvrsub\.vi} 16 } } */
+
+/* { dg-final { scan-assembler-times {\tvfsub\.vv} 12 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_SUB" 12 "optimized" } } */
+
+/* Do not expect vfrsub for now, because we do not properly
+   handle vop.vx and vfop.vf yet.  */
+/* { dg-final { scan-assembler-times {\tvfrsub\.vv} 0 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c
new file mode 100644
index 00000000000..9e71911a92a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model -march=rv64gcv_zvfh -mabi=lp64d --param=riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include "vsub-template.h"
+
+/* { dg-final { scan-assembler-times {\tvsub\.vv} 16 } } */
+/* { dg-final { scan-assembler-times {\tvrsub\.vi} 16 } } */
+
+/* { dg-final { scan-assembler-times {\tvfsub\.vv} 12 } } */
+/* { dg-final { scan-tree-dump-times "\.COND_LEN_SUB" 12 "optimized" } } */
+
+/* Do not expect vfrsub for now, because we do not properly
+   handle vop.vx and vfop.vf yet.  */
+/* { dg-final { scan-assembler-times {\tvfrsub\.vv} 0 } } */