| Message ID | 202605061005.646A537u037157@mse-fl1.zte.com.cn |
|---|---|
| State | New |
| Headers | show |
| Series | target/riscv: Fix tail handling for vmv.s.x and vfmv.s.f | expand |
On Wed, May 6, 2026 at 10:48 PM <chen.zhongyao@zte.com.cn> wrote: > > The risc-v vector spec defines vmv.s.x and vfmv.s.f as writing > element 0 of the destination register while the remaining destination > elements follow the current tail policy. When QEMU runs with > rvv_ta_all_1s enabled, those elements must therefore become all 1s. > > QEMU handled both instructions as translation-time special cases that > directly wrote vd[0] and skipped the usual tail processing. As a result, > vmv.s.x and vfmv.s.f left the remaining destination elements unchanged > instead of applying the configured tail policy. > > Fix this by routing both instructions through a helper that writes > vd[0] and then treats the rest of the destination register as tail, > reusing the existing agnostic-element fill logic. > > Signed-off-by: Zhongyao Chen <chen.zhongyao@zte.com.cn> Acked-by: Alistair Francis <alistair.francis@wdc.com> Alistair > --- > target/riscv/helper.h | 4 +++ > target/riscv/insn_trans/trans_rvv.c.inc | 44 ++++++++----------------- > target/riscv/vector_helper.c | 18 ++++++++++ > 3 files changed, 36 insertions(+), 30 deletions(-) > > diff --git a/target/riscv/helper.h b/target/riscv/helper.h > index b785456ee0..5a7f043edb 100644 > --- a/target/riscv/helper.h > +++ b/target/riscv/helper.h > @@ -660,6 +660,10 @@ DEF_HELPER_4(vmv_v_x_b, void, ptr, i64, env, i32) > DEF_HELPER_4(vmv_v_x_h, void, ptr, i64, env, i32) > DEF_HELPER_4(vmv_v_x_w, void, ptr, i64, env, i32) > DEF_HELPER_4(vmv_v_x_d, void, ptr, i64, env, i32) > +DEF_HELPER_4(vset_velem0_b, void, ptr, i64, env, i32) > +DEF_HELPER_4(vset_velem0_h, void, ptr, i64, env, i32) > +DEF_HELPER_4(vset_velem0_w, void, ptr, i64, env, i32) > +DEF_HELPER_4(vset_velem0_d, void, ptr, i64, env, i32) > > DEF_HELPER_6(vsaddu_vv_b, void, ptr, ptr, ptr, ptr, env, i32) > DEF_HELPER_6(vsaddu_vv_h, void, ptr, ptr, ptr, ptr, env, i32) > diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc > index 
4df9a40b44..6a966c35c3 100644 > --- a/target/riscv/insn_trans/trans_rvv.c.inc > +++ b/target/riscv/insn_trans/trans_rvv.c.inc > @@ -3381,37 +3381,21 @@ static void vec_element_loadi(DisasContext *s, TCGv_i64 dest, > load_element(dest, tcg_env, endian_ofs(s, vreg, idx), s->sew, sign); > } > > -/* Integer Scalar Move Instruction */ > +typedef void gen_helper_vset_velem0(TCGv_ptr, TCGv_i64, TCGv_env, TCGv_i32); > > -static void store_element(TCGv_i64 val, TCGv_ptr base, > - int ofs, int sew) > +static void vec_element_storei_tail(DisasContext *s, int vreg, TCGv_i64 val) > { > - switch (sew) { > - case MO_8: > - tcg_gen_st8_i64(val, base, ofs); > - break; > - case MO_16: > - tcg_gen_st16_i64(val, base, ofs); > - break; > - case MO_32: > - tcg_gen_st32_i64(val, base, ofs); > - break; > - case MO_64: > - tcg_gen_st_i64(val, base, ofs); > - break; > - default: > - g_assert_not_reached(); > - } > -} > + static gen_helper_vset_velem0 * const fns[4] = { > + gen_helper_vset_velem0_b, gen_helper_vset_velem0_h, > + gen_helper_vset_velem0_w, gen_helper_vset_velem0_d, > + }; > + TCGv_ptr dest = tcg_temp_new_ptr(); > + uint32_t data = FIELD_DP32(0, VDATA, VTA, s->vta); > + TCGv_i32 desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb, > + s->cfg_ptr->vlenb, data)); > > -/* > - * Store vreg[idx] = val. > - * The index must be in range of VLMAX. 
> - */ > -static void vec_element_storei(DisasContext *s, int vreg, > - int idx, TCGv_i64 val) > -{ > - store_element(val, tcg_env, endian_ofs(s, vreg, idx), s->sew); > + tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vreg)); > + fns[s->sew](dest, val, tcg_env, desc); > } > > /* vmv.x.s rd, vs2 # x[rd] = vs2[0] */ > @@ -3458,7 +3442,7 @@ static bool trans_vmv_s_x(DisasContext *s, arg_vmv_s_x *a) > */ > s1 = get_gpr(s, a->rs1, EXT_NONE); > tcg_gen_ext_tl_i64(t1, s1); > - vec_element_storei(s, a->rd, 0, t1); > + vec_element_storei_tail(s, a->rd, t1); > gen_set_label(over); > tcg_gen_movi_tl(cpu_vstart, 0); > finalize_rvv_inst(s); > @@ -3514,7 +3498,7 @@ static bool trans_vfmv_s_f(DisasContext *s, arg_vfmv_s_f *a) > t1 = tcg_temp_new_i64(); > do_nanbox(s, t1, cpu_fpr[a->rs1]); > > - vec_element_storei(s, a->rd, 0, t1); > + vec_element_storei_tail(s, a->rd, t1); > > gen_set_label(over); > tcg_gen_movi_tl(cpu_vstart, 0); > diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c > index 83dd26314d..6c7af25d82 100644 > --- a/target/riscv/vector_helper.c > +++ b/target/riscv/vector_helper.c > @@ -2112,6 +2112,24 @@ GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2) > GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4) > GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8) > > +#define GEN_VEXT_SET_VELEM0(NAME, ETYPE, H) \ > +void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env, \ > + uint32_t desc) \ > +{ \ > + uint32_t esz = sizeof(ETYPE); \ > + uint32_t vlenb = riscv_cpu_cfg(env)->vlenb; \ > + uint32_t vta = vext_vta(desc); \ > + \ > + *((ETYPE *)vd + H(0)) = (ETYPE)s1; \ > + /* Treat every element past vd[0] as tail for scalar-to-vector moves. 
*/ \ > + vext_set_elems_1s(vd, vta, esz, vlenb); \ > +} > + > +GEN_VEXT_SET_VELEM0(vset_velem0_b, int8_t, H1) > +GEN_VEXT_SET_VELEM0(vset_velem0_h, int16_t, H2) > +GEN_VEXT_SET_VELEM0(vset_velem0_w, int32_t, H4) > +GEN_VEXT_SET_VELEM0(vset_velem0_d, int64_t, H8) > + > #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H) \ > void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ > CPURISCVState *env, uint32_t desc) \ > -- > 2.43.0 >
On Wed, May 6, 2026 at 10:48 PM <chen.zhongyao@zte.com.cn> wrote: > > The risc-v vector spec defines vmv.s.x and vfmv.s.f as writing > element 0 of the destination register while the remaining destination > elements follow the current tail policy. When QEMU runs with > rvv_ta_all_1s enabled, those elements must therefore become all 1s. > > QEMU handled both instructions as translation-time special cases that > directly wrote vd[0] and skipped the usual tail processing. As a result, > vmv.s.x and vfmv.s.f left the remaining destination elements unchanged > instead of applying the configured tail policy. > > Fix this by routing both instructions through a helper that writes > vd[0] and then treats the rest of the destination register as tail, > reusing the existing agnostic-element fill logic. > > Signed-off-by: Zhongyao Chen <chen.zhongyao@zte.com.cn> Acked-by: Alistair Francis <alistair.francis@wdc.com> Alistair > --- > target/riscv/helper.h | 4 +++ > target/riscv/insn_trans/trans_rvv.c.inc | 44 ++++++++----------------- > target/riscv/vector_helper.c | 18 ++++++++++ > 3 files changed, 36 insertions(+), 30 deletions(-) > > diff --git a/target/riscv/helper.h b/target/riscv/helper.h > index b785456ee0..5a7f043edb 100644 > --- a/target/riscv/helper.h > +++ b/target/riscv/helper.h > @@ -660,6 +660,10 @@ DEF_HELPER_4(vmv_v_x_b, void, ptr, i64, env, i32) > DEF_HELPER_4(vmv_v_x_h, void, ptr, i64, env, i32) > DEF_HELPER_4(vmv_v_x_w, void, ptr, i64, env, i32) > DEF_HELPER_4(vmv_v_x_d, void, ptr, i64, env, i32) > +DEF_HELPER_4(vset_velem0_b, void, ptr, i64, env, i32) > +DEF_HELPER_4(vset_velem0_h, void, ptr, i64, env, i32) > +DEF_HELPER_4(vset_velem0_w, void, ptr, i64, env, i32) > +DEF_HELPER_4(vset_velem0_d, void, ptr, i64, env, i32) > > DEF_HELPER_6(vsaddu_vv_b, void, ptr, ptr, ptr, ptr, env, i32) > DEF_HELPER_6(vsaddu_vv_h, void, ptr, ptr, ptr, ptr, env, i32) > diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc > index 
4df9a40b44..6a966c35c3 100644 > --- a/target/riscv/insn_trans/trans_rvv.c.inc > +++ b/target/riscv/insn_trans/trans_rvv.c.inc > @@ -3381,37 +3381,21 @@ static void vec_element_loadi(DisasContext *s, TCGv_i64 dest, > load_element(dest, tcg_env, endian_ofs(s, vreg, idx), s->sew, sign); > } > > -/* Integer Scalar Move Instruction */ > +typedef void gen_helper_vset_velem0(TCGv_ptr, TCGv_i64, TCGv_env, TCGv_i32); > > -static void store_element(TCGv_i64 val, TCGv_ptr base, > - int ofs, int sew) > +static void vec_element_storei_tail(DisasContext *s, int vreg, TCGv_i64 val) > { > - switch (sew) { > - case MO_8: > - tcg_gen_st8_i64(val, base, ofs); > - break; > - case MO_16: > - tcg_gen_st16_i64(val, base, ofs); > - break; > - case MO_32: > - tcg_gen_st32_i64(val, base, ofs); > - break; > - case MO_64: > - tcg_gen_st_i64(val, base, ofs); > - break; > - default: > - g_assert_not_reached(); > - } > -} > + static gen_helper_vset_velem0 * const fns[4] = { > + gen_helper_vset_velem0_b, gen_helper_vset_velem0_h, > + gen_helper_vset_velem0_w, gen_helper_vset_velem0_d, > + }; > + TCGv_ptr dest = tcg_temp_new_ptr(); > + uint32_t data = FIELD_DP32(0, VDATA, VTA, s->vta); > + TCGv_i32 desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb, > + s->cfg_ptr->vlenb, data)); > > -/* > - * Store vreg[idx] = val. > - * The index must be in range of VLMAX. 
> - */ > -static void vec_element_storei(DisasContext *s, int vreg, > - int idx, TCGv_i64 val) > -{ > - store_element(val, tcg_env, endian_ofs(s, vreg, idx), s->sew); > + tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vreg)); > + fns[s->sew](dest, val, tcg_env, desc); > } > > /* vmv.x.s rd, vs2 # x[rd] = vs2[0] */ > @@ -3458,7 +3442,7 @@ static bool trans_vmv_s_x(DisasContext *s, arg_vmv_s_x *a) > */ > s1 = get_gpr(s, a->rs1, EXT_NONE); > tcg_gen_ext_tl_i64(t1, s1); > - vec_element_storei(s, a->rd, 0, t1); > + vec_element_storei_tail(s, a->rd, t1); > gen_set_label(over); > tcg_gen_movi_tl(cpu_vstart, 0); > finalize_rvv_inst(s); > @@ -3514,7 +3498,7 @@ static bool trans_vfmv_s_f(DisasContext *s, arg_vfmv_s_f *a) > t1 = tcg_temp_new_i64(); > do_nanbox(s, t1, cpu_fpr[a->rs1]); > > - vec_element_storei(s, a->rd, 0, t1); > + vec_element_storei_tail(s, a->rd, t1); > > gen_set_label(over); > tcg_gen_movi_tl(cpu_vstart, 0); > diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c > index 83dd26314d..6c7af25d82 100644 > --- a/target/riscv/vector_helper.c > +++ b/target/riscv/vector_helper.c > @@ -2112,6 +2112,24 @@ GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2) > GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4) > GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8) > > +#define GEN_VEXT_SET_VELEM0(NAME, ETYPE, H) \ > +void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env, \ > + uint32_t desc) \ > +{ \ > + uint32_t esz = sizeof(ETYPE); \ > + uint32_t vlenb = riscv_cpu_cfg(env)->vlenb; \ > + uint32_t vta = vext_vta(desc); \ > + \ > + *((ETYPE *)vd + H(0)) = (ETYPE)s1; \ > + /* Treat every element past vd[0] as tail for scalar-to-vector moves. 
*/ \ > + vext_set_elems_1s(vd, vta, esz, vlenb); \ > +} > + > +GEN_VEXT_SET_VELEM0(vset_velem0_b, int8_t, H1) > +GEN_VEXT_SET_VELEM0(vset_velem0_h, int16_t, H2) > +GEN_VEXT_SET_VELEM0(vset_velem0_w, int32_t, H4) > +GEN_VEXT_SET_VELEM0(vset_velem0_d, int64_t, H8) > + > #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H) \ > void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ > CPURISCVState *env, uint32_t desc) \ > -- > 2.43.0 >
diff --git a/target/riscv/helper.h b/target/riscv/helper.h index b785456ee0..5a7f043edb 100644 --- a/target/riscv/helper.h +++ b/target/riscv/helper.h @@ -660,6 +660,10 @@ DEF_HELPER_4(vmv_v_x_b, void, ptr, i64, env, i32) DEF_HELPER_4(vmv_v_x_h, void, ptr, i64, env, i32) DEF_HELPER_4(vmv_v_x_w, void, ptr, i64, env, i32) DEF_HELPER_4(vmv_v_x_d, void, ptr, i64, env, i32) +DEF_HELPER_4(vset_velem0_b, void, ptr, i64, env, i32) +DEF_HELPER_4(vset_velem0_h, void, ptr, i64, env, i32) +DEF_HELPER_4(vset_velem0_w, void, ptr, i64, env, i32) +DEF_HELPER_4(vset_velem0_d, void, ptr, i64, env, i32) DEF_HELPER_6(vsaddu_vv_b, void, ptr, ptr, ptr, ptr, env, i32) DEF_HELPER_6(vsaddu_vv_h, void, ptr, ptr, ptr, ptr, env, i32) diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc index 4df9a40b44..6a966c35c3 100644 --- a/target/riscv/insn_trans/trans_rvv.c.inc +++ b/target/riscv/insn_trans/trans_rvv.c.inc @@ -3381,37 +3381,21 @@ static void vec_element_loadi(DisasContext *s, TCGv_i64 dest, load_element(dest, tcg_env, endian_ofs(s, vreg, idx), s->sew, sign); } -/* Integer Scalar Move Instruction */ +typedef void gen_helper_vset_velem0(TCGv_ptr, TCGv_i64, TCGv_env, TCGv_i32); -static void store_element(TCGv_i64 val, TCGv_ptr base, - int ofs, int sew) +static void vec_element_storei_tail(DisasContext *s, int vreg, TCGv_i64 val) { - switch (sew) { - case MO_8: - tcg_gen_st8_i64(val, base, ofs); - break; - case MO_16: - tcg_gen_st16_i64(val, base, ofs); - break; - case MO_32: - tcg_gen_st32_i64(val, base, ofs); - break; - case MO_64: - tcg_gen_st_i64(val, base, ofs); - break; - default: - g_assert_not_reached(); - } -} + static gen_helper_vset_velem0 * const fns[4] = { + gen_helper_vset_velem0_b, gen_helper_vset_velem0_h, + gen_helper_vset_velem0_w, gen_helper_vset_velem0_d, + }; + TCGv_ptr dest = tcg_temp_new_ptr(); + uint32_t data = FIELD_DP32(0, VDATA, VTA, s->vta); + TCGv_i32 desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb, + 
s->cfg_ptr->vlenb, data)); -/* - * Store vreg[idx] = val. - * The index must be in range of VLMAX. - */ -static void vec_element_storei(DisasContext *s, int vreg, - int idx, TCGv_i64 val) -{ - store_element(val, tcg_env, endian_ofs(s, vreg, idx), s->sew); + tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vreg)); + fns[s->sew](dest, val, tcg_env, desc); } /* vmv.x.s rd, vs2 # x[rd] = vs2[0] */ @@ -3458,7 +3442,7 @@ static bool trans_vmv_s_x(DisasContext *s, arg_vmv_s_x *a) */ s1 = get_gpr(s, a->rs1, EXT_NONE); tcg_gen_ext_tl_i64(t1, s1); - vec_element_storei(s, a->rd, 0, t1); + vec_element_storei_tail(s, a->rd, t1); gen_set_label(over); tcg_gen_movi_tl(cpu_vstart, 0); finalize_rvv_inst(s); @@ -3514,7 +3498,7 @@ static bool trans_vfmv_s_f(DisasContext *s, arg_vfmv_s_f *a) t1 = tcg_temp_new_i64(); do_nanbox(s, t1, cpu_fpr[a->rs1]); - vec_element_storei(s, a->rd, 0, t1); + vec_element_storei_tail(s, a->rd, t1); gen_set_label(over); tcg_gen_movi_tl(cpu_vstart, 0); diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index 83dd26314d..6c7af25d82 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -2112,6 +2112,24 @@ GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2) GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4) GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8) +#define GEN_VEXT_SET_VELEM0(NAME, ETYPE, H) \ +void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env, \ + uint32_t desc) \ +{ \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t vlenb = riscv_cpu_cfg(env)->vlenb; \ + uint32_t vta = vext_vta(desc); \ + \ + *((ETYPE *)vd + H(0)) = (ETYPE)s1; \ + /* Treat every element past vd[0] as tail for scalar-to-vector moves. 
*/ \ + vext_set_elems_1s(vd, vta, esz, vlenb); \ +} + +GEN_VEXT_SET_VELEM0(vset_velem0_b, int8_t, H1) +GEN_VEXT_SET_VELEM0(vset_velem0_h, int16_t, H2) +GEN_VEXT_SET_VELEM0(vset_velem0_w, int32_t, H4) +GEN_VEXT_SET_VELEM0(vset_velem0_d, int64_t, H8) + #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H) \ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ CPURISCVState *env, uint32_t desc) \
The RISC-V vector spec defines vmv.s.x and vfmv.s.f as writing element 0 of the destination register while the remaining destination elements follow the current tail policy. When QEMU runs with rvv_ta_all_1s enabled, those elements must therefore become all 1s. QEMU handled both instructions as translation-time special cases that directly wrote vd[0] and skipped the usual tail processing. As a result, vmv.s.x and vfmv.s.f left the remaining destination elements unchanged instead of applying the configured tail policy. Fix this by routing both instructions through a helper that writes vd[0] and then treats the rest of the destination register as tail, reusing the existing agnostic-element fill logic. Signed-off-by: Zhongyao Chen <chen.zhongyao@zte.com.cn> --- target/riscv/helper.h | 4 +++ target/riscv/insn_trans/trans_rvv.c.inc | 44 ++++++++----------------- target/riscv/vector_helper.c | 18 ++++++++++ 3 files changed, 36 insertions(+), 30 deletions(-)