diff mbox series

[RFC,v4,17/70] target/riscv: rvv-1.0: configure instructions

Message ID 20200817084955.28793-18-frank.chang@sifive.com
State New
Headers show
Series support vector extension v1.0 | expand

Commit Message

Frank Chang Aug. 17, 2020, 8:49 a.m. UTC
From: Frank Chang <frank.chang@sifive.com>

Signed-off-by: Frank Chang <frank.chang@sifive.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/riscv/insn_trans/trans_rvv.inc.c | 12 ++++++++----
 target/riscv/vector_helper.c            | 14 +++++++++++++-
 2 files changed, 21 insertions(+), 5 deletions(-)

Comments

Frank Chang Sept. 25, 2020, 8:51 a.m. UTC | #1
On Mon, Aug 17, 2020 at 4:50 PM <frank.chang@sifive.com> wrote:

> From: Frank Chang <frank.chang@sifive.com>
>
> Signed-off-by: Frank Chang <frank.chang@sifive.com>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/riscv/insn_trans/trans_rvv.inc.c | 12 ++++++++----
>  target/riscv/vector_helper.c            | 14 +++++++++++++-
>  2 files changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/target/riscv/insn_trans/trans_rvv.inc.c
> b/target/riscv/insn_trans/trans_rvv.inc.c
> index 4b8ae5470c3..4efe323920b 100644
> --- a/target/riscv/insn_trans/trans_rvv.inc.c
> +++ b/target/riscv/insn_trans/trans_rvv.inc.c
> @@ -98,8 +98,10 @@ static bool trans_vsetvl(DisasContext *ctx, arg_vsetvl
> *a)
>      s2 = tcg_temp_new();
>      dst = tcg_temp_new();
>
> -    /* Using x0 as the rs1 register specifier, encodes an infinite AVL */
> -    if (a->rs1 == 0) {
> +    if (a->rd == 0 && a->rs1 == 0) {
> +        s1 = tcg_temp_new();
> +        tcg_gen_mov_tl(s1, cpu_vl);
> +    } else if (a->rs1 == 0) {
>          /* As the mask is at least one bit, RV_VLEN_MAX is >= VLMAX */
>          s1 = tcg_const_tl(RV_VLEN_MAX);
>      } else {
> @@ -131,8 +133,10 @@ static bool trans_vsetvli(DisasContext *ctx,
> arg_vsetvli *a)
>      s2 = tcg_const_tl(a->zimm);
>      dst = tcg_temp_new();
>
> -    /* Using x0 as the rs1 register specifier, encodes an infinite AVL */
> -    if (a->rs1 == 0) {
> +    if (a->rd == 0 && a->rs1 == 0) {
> +        s1 = tcg_temp_new();
> +        tcg_gen_mov_tl(s1, cpu_vl);
> +    } else if (a->rs1 == 0) {
>          /* As the mask is at least one bit, RV_VLEN_MAX is >= VLMAX */
>          s1 = tcg_const_tl(RV_VLEN_MAX);
>       } else {
>          s1 = tcg_temp_new();
>          gen_get_gpr(s1, a->rs1);
>       }
>       gen_helper_vsetvl(dst, cpu_env, s1, s2);
>       gen_set_gpr(a->rd, dst);
>       gen_goto_tb(ctx, 0, ctx->pc_succ_insn);
>

trans_vsetvli() uses gen_goto_tb() to save the computation in the link to
the next TB.
I know there was a discussion about this back in RVV v0.7.1:
https://patchew.org/QEMU/20200103033347.20909-1-zhiwei_liu@c-sky.com/20200103033347.20909-5-zhiwei_liu@c-sky.com/

However, we had encountered an issue that looked like it was caused by the
linked TB.
The code snippet which cause the issue is:

00000000000104a8 <loop>: 104a8: 0122ffd7 vsetvli t6,t0,e32,m4,tu,mu,d1
104ac: 02036407 vle32.v v8,(t1) 104b0: 028a0a57 vadd.vv v20,v8,v20 104b4:
41f282b3 sub t0,t0,t6 104b8: 002f9893 slli a7,t6,0x2 104bc: 9346 add
t1,t1,a7 104be: fe0295e3 bnez t0,104a8 <loop> 104c2: 012f7057 vsetvli
zero,t5,e32,m4,tu,mu,d1
.....

If $t0 is given with the value, e.g. 68.
<loop> is expected to process 32 elements in each iteration.
That's it, the env->vl after vsetvli at 0x104a8 in each iteration would be:
1st iteration: 32 (remaining elements to be processed: 68 - 32 = 36)
2nd iteration: 32 (remaining elements to be processed: 36 - 32 = 4)
3rd iteration: 4 (remaining elements to be processed: 4 - 4 = 0, will leave
<loop> after 0x104be)

vadd.vv at 0x104b0 is implemented with gvec for acceleration:

if (a->vm && s->vl_eq_vlmax) {
    gvec_fn(s->sew, vreg_ofs(s, a->rd),
            vreg_ofs(s, a->rs2), vreg_ofs(s, a->rs1),
            MAXSZ(s), MAXSZ(s));
} else {
    uint32_t data = 0;

    data = FIELD_DP32(data, VDATA, VM, a->vm);
    data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
    tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0),
                       vreg_ofs(s, a->rs1), vreg_ofs(s, a->rs2),
                       cpu_env, 0, s->vlen / 8, data, fn);
}

gvec function is used when a->vm and s->vl_eq_vlmax are both true.
However, s->vl_eq_vlmax, for the above case, is only true in 1st and 2nd
iterations.
In third iteration, env->vl is 4 which is not equal to vlmax = 32.
But as the TB where vadd.vv resides are already linked with vsetvli's TB,
it won't be retranslated and still use the same gvec function in the third
iteration.
The total elemented being proceeded would be: 32 + 32 + 32 = 96, instead of
68.

I'm wondering under such conditions, is it still correct to use
gen_goto_tb() here?
Or we should use lookup_and_goto_ptr() as in trans_vsetvl() to not link the
TBs.

P.S. We also notice that this issue won't happen when debugging with GDB
because
use_goto_tb() in gen_goto_tb() will return false when GDB is connected and
lookup_and_goto_ptr() will be called instead.

Frank Chang


>       ctx->base.is_jmp = DISAS_NORETURN;
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index 7b4b1151b97..430b25d16c2 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -31,12 +31,24 @@ target_ulong HELPER(vsetvl)(CPURISCVState *env,
> target_ulong s1,
>  {
>      int vlmax, vl;
>      RISCVCPU *cpu = env_archcpu(env);
> +    uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
>      uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
>      uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
>      bool vill = FIELD_EX64(s2, VTYPE, VILL);
>      target_ulong reserved = FIELD_EX64(s2, VTYPE, RESERVED);
>
> -    if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
> +    if (lmul & 4) {
> +        /* Fractional LMUL. */
> +        if (lmul == 4 ||
> +            cpu->cfg.elen >> (8 - lmul) < sew) {
> +            vill = true;
> +        }
> +    }
> +
> +    if ((sew > cpu->cfg.elen)
> +        || vill
> +        || (ediv != 0)
> +        || (reserved != 0)) {
>          /* only set vill bit. */
>          env->vtype = FIELD_DP64(0, VTYPE, VILL, 1);
>          env->vl = 0;
> --
> 2.17.1
>
>
Richard Henderson Sept. 25, 2020, 6:28 p.m. UTC | #2
On 9/25/20 1:51 AM, Frank Chang wrote:
> trans_vsetvli() uses gen_goto_tb() to save the computation in the link to the
> next TB.
> I know there was a discussion about this back in RVV v0.7.1:
> https://patchew.org/QEMU/20200103033347.20909-1-zhiwei_liu@c-sky.com/20200103033347.20909-5-zhiwei_liu@c-sky.com/
> 
> However, we had encountered an issue that looked like it was caused by the
> linked TB.
> The code snippet which cause the issue is:
> 
> 00000000000104a8 <loop>: 104a8: 0122ffd7 vsetvli t6,t0,e32,m4,tu,mu,d1 104ac:
> 02036407 vle32.v v8,(t1) 104b0: 028a0a57 vadd.vv v20,v8,v20 104b4: 41f282b3 sub
> t0,t0,t6 104b8: 002f9893 slli a7,t6,0x2 104bc: 9346 add t1,t1,a7 104be:
> fe0295e3 bnez t0,104a8 <loop> 104c2: 012f7057 vsetvli zero,t5,e32,m4,tu,mu,d1
> .....
> 
> If $t0 is given with the value, e.g. 68.
> <loop> is expected to process 32 elements in each iteration.
> That's it, the env->vl after vsetvli at 0x104a8 in each iteration would be:
> 1st iteration: 32 (remaining elements to be processed: 68 - 32 = 36)
> 2nd iteration: 32 (remaining elements to be processed: 36 - 32 = 4)
> 3rd iteration: 4 (remaining elements to be processed: 4 - 4 = 0, will leave
> <loop> after 0x104be)
> 
> vadd.vv at 0x104b0 is implemented with gvec for acceleration:
> 
> if (a->vm && s->vl_eq_vlmax) {
>     gvec_fn(s->sew, vreg_ofs(s, a->rd),
>             vreg_ofs(s, a->rs2), vreg_ofs(s, a->rs1),
>             MAXSZ(s), MAXSZ(s));
> } else {
>     uint32_t data = 0;
> 
>     data = FIELD_DP32(data, VDATA, VM, a->vm);
>     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
>     tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0),
>                        vreg_ofs(s, a->rs1), vreg_ofs(s, a->rs2),
>                        cpu_env, 0, s->vlen / 8, data, fn);
> }
> 
> gvec function is used when a->vm and s->vl_eq_vlmax are both true.
> However, s->vl_eq_vlmax, for the above case, is only true in 1st and 2nd
> iterations.
> In third iteration, env->vl is 4 which is not equal to vlmax = 32.
> But as the TB where vadd.vv resides are already linked with vsetvli's TB,
> it won't be retranslated and still use the same gvec function in the third
> iteration.
> The total elemented being proceeded would be: 32 + 32 + 32 = 96, instead of 68.
> 
> I'm wondering under such conditions, is it still correct to use gen_goto_tb() here?
> Or we should use lookup_and_goto_ptr() as in trans_vsetvl() to not link the TBs.

You're correct -- because of vl_eq_vlmax we can't use goto_tb when using a
variable input.

It would be possible when using xN,x0 for VLMAX, or x0,x0 for reuse of the
current vl, but I doubt it's worth special-casing that.

I wonder if the goto_tb conversation happened before we introduced vl_eq_vlmax
and forgot to re-evaluate, or if I just missed that in the first place.
Anyway, thanks for finding this.


r~
Frank Chang Sept. 26, 2020, 5:05 a.m. UTC | #3
On Sat, Sep 26, 2020 at 2:28 AM Richard Henderson <
richard.henderson@linaro.org> wrote:

> On 9/25/20 1:51 AM, Frank Chang wrote:
> > trans_vsetvli() uses gen_goto_tb() to save the computation in the link
> to the
> > next TB.
> > I know there was a discussion about this back in RVV v0.7.1:
> >
> https://patchew.org/QEMU/20200103033347.20909-1-zhiwei_liu@c-sky.com/20200103033347.20909-5-zhiwei_liu@c-sky.com/
> >
> > However, we had encountered an issue that looked like it was caused by
> the
> > linked TB.
> > The code snippet which cause the issue is:
> >
> > 00000000000104a8 <loop>: 104a8: 0122ffd7 vsetvli t6,t0,e32,m4,tu,mu,d1
> 104ac:
> > 02036407 vle32.v v8,(t1) 104b0: 028a0a57 vadd.vv v20,v8,v20 104b4:
> 41f282b3 sub
> > t0,t0,t6 104b8: 002f9893 slli a7,t6,0x2 104bc: 9346 add t1,t1,a7 104be:
> > fe0295e3 bnez t0,104a8 <loop> 104c2: 012f7057 vsetvli
> zero,t5,e32,m4,tu,mu,d1
> > .....
> >
> > If $t0 is given with the value, e.g. 68.
> > <loop> is expected to process 32 elements in each iteration.
> > That's it, the env->vl after vsetvli at 0x104a8 in each iteration would
> be:
> > 1st iteration: 32 (remaining elements to be processed: 68 - 32 = 36)
> > 2nd iteration: 32 (remaining elements to be processed: 36 - 32 = 4)
> > 3rd iteration: 4 (remaining elements to be processed: 4 - 4 = 0, will
> leave
> > <loop> after 0x104be)
> >
> > vadd.vv at 0x104b0 is implemented with gvec for acceleration:
> >
> > if (a->vm && s->vl_eq_vlmax) {
> >     gvec_fn(s->sew, vreg_ofs(s, a->rd),
> >             vreg_ofs(s, a->rs2), vreg_ofs(s, a->rs1),
> >             MAXSZ(s), MAXSZ(s));
> > } else {
> >     uint32_t data = 0;
> >
> >     data = FIELD_DP32(data, VDATA, VM, a->vm);
> >     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
> >     tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0),
> >                        vreg_ofs(s, a->rs1), vreg_ofs(s, a->rs2),
> >                        cpu_env, 0, s->vlen / 8, data, fn);
> > }
> >
> > gvec function is used when a->vm and s->vl_eq_vlmax are both true.
> > However, s->vl_eq_vlmax, for the above case, is only true in 1st and 2nd
> > iterations.
> > In third iteration, env->vl is 4 which is not equal to vlmax = 32.
> > But as the TB where vadd.vv resides are already linked with vsetvli's TB,
> > it won't be retranslated and still use the same gvec function in the
> third
> > iteration.
> > The total elemented being proceeded would be: 32 + 32 + 32 = 96, instead
> of 68.
> >
> > I'm wondering under such conditions, is it still correct to use
> gen_goto_tb() here?
> > Or we should use lookup_and_goto_ptr() as in trans_vsetvl() to not link
> the TBs.
>
> You're correct -- because of vl_eq_vlmax we can't use goto_tb when using a
> variable input.
>
> It would be possible when using xN,x0 for VLMAX, or x0,x0 for reuse of the
> current vl, but I doubt it's worth special-casing that.
>
> I wonder if the goto_tb conversation happened before we introduced
> vl_eq_vlmax
> and forgot to re-evaluate, or if I just missed that in the first place.
> Anyway, thanks for finding this.
>
>
> r~
>

Thanks Richard, I'll include the fix in my next version patchset.

Frank Chang
diff mbox series

Patch

diff --git a/target/riscv/insn_trans/trans_rvv.inc.c b/target/riscv/insn_trans/trans_rvv.inc.c
index 4b8ae5470c3..4efe323920b 100644
--- a/target/riscv/insn_trans/trans_rvv.inc.c
+++ b/target/riscv/insn_trans/trans_rvv.inc.c
@@ -98,8 +98,10 @@  static bool trans_vsetvl(DisasContext *ctx, arg_vsetvl *a)
     s2 = tcg_temp_new();
     dst = tcg_temp_new();
 
-    /* Using x0 as the rs1 register specifier, encodes an infinite AVL */
-    if (a->rs1 == 0) {
+    if (a->rd == 0 && a->rs1 == 0) {
+        s1 = tcg_temp_new();
+        tcg_gen_mov_tl(s1, cpu_vl);
+    } else if (a->rs1 == 0) {
         /* As the mask is at least one bit, RV_VLEN_MAX is >= VLMAX */
         s1 = tcg_const_tl(RV_VLEN_MAX);
     } else {
@@ -131,8 +133,10 @@  static bool trans_vsetvli(DisasContext *ctx, arg_vsetvli *a)
     s2 = tcg_const_tl(a->zimm);
     dst = tcg_temp_new();
 
-    /* Using x0 as the rs1 register specifier, encodes an infinite AVL */
-    if (a->rs1 == 0) {
+    if (a->rd == 0 && a->rs1 == 0) {
+        s1 = tcg_temp_new();
+        tcg_gen_mov_tl(s1, cpu_vl);
+    } else if (a->rs1 == 0) {
         /* As the mask is at least one bit, RV_VLEN_MAX is >= VLMAX */
         s1 = tcg_const_tl(RV_VLEN_MAX);
     } else {
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 7b4b1151b97..430b25d16c2 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -31,12 +31,24 @@  target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
 {
     int vlmax, vl;
     RISCVCPU *cpu = env_archcpu(env);
+    uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
     bool vill = FIELD_EX64(s2, VTYPE, VILL);
     target_ulong reserved = FIELD_EX64(s2, VTYPE, RESERVED);
 
-    if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
+    if (lmul & 4) {
+        /* Fractional LMUL. */
+        if (lmul == 4 ||
+            cpu->cfg.elen >> (8 - lmul) < sew) {
+            vill = true;
+        }
+    }
+
+    if ((sew > cpu->cfg.elen)
+        || vill
+        || (ediv != 0)
+        || (reserved != 0)) {
         /* only set vill bit. */
         env->vtype = FIELD_DP64(0, VTYPE, VILL, 1);
         env->vl = 0;