[29/65] target/riscv: Add integer merge and move instructions for XTheadVector

Message ID 20240412073735.76413-30-eric.huang@linux.alibaba.com
State New
Series target/riscv: Support XTheadVector extension

Commit Message

Huang Tao April 12, 2024, 7:36 a.m. UTC
The instructions have the same function as their RVV1.0 counterparts.
Overall there are only the general differences between XTheadVector and
RVV1.0. The exception is th.vmv.v.x: XTheadVector does not limit SEW to
the range of 8 to 64, so it is not suitable to use acceleration when
xlen < SEW.
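
As a rough illustration of this decision (not part of the patch), the
fast path reduces to checking whether one element fits in an XLEN-wide
register; the names below are simplified stand-ins for the real
DisasContext fields:

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch: th.vmv.v.x may use the gvec dup fast path only when the
 * whole vector is active and SEW fits in XLEN. (8 << sew) is SEW in
 * bits, with sew = log2(SEW / 8) as in QEMU.
 */
static bool can_use_gvec_dup(uint32_t sew, uint32_t xlen, bool vl_eq_vlmax)
{
    return vl_eq_vlmax && (8u << sew) <= xlen;
}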

Signed-off-by: Huang Tao <eric.huang@linux.alibaba.com>
---
 target/riscv/helper.h                         |  17 +++
 .../riscv/insn_trans/trans_xtheadvector.c.inc | 124 +++++++++++++++++-
 target/riscv/xtheadvector_helper.c            | 104 +++++++++++++++
 3 files changed, 239 insertions(+), 6 deletions(-)

Patch

diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index 8b8dd62761..ba548ebdc9 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -1868,3 +1868,20 @@  DEF_HELPER_6(th_vwmaccsu_vx_w, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(th_vwmaccus_vx_b, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(th_vwmaccus_vx_h, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(th_vwmaccus_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+
+DEF_HELPER_6(th_vmerge_vvm_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vvm_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vvm_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vvm_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vxm_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vxm_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vxm_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vxm_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_4(th_vmv_v_v_b, void, ptr, ptr, env, i32)
+DEF_HELPER_4(th_vmv_v_v_h, void, ptr, ptr, env, i32)
+DEF_HELPER_4(th_vmv_v_v_w, void, ptr, ptr, env, i32)
+DEF_HELPER_4(th_vmv_v_v_d, void, ptr, ptr, env, i32)
+DEF_HELPER_4(th_vmv_v_x_b, void, ptr, i64, env, i32)
+DEF_HELPER_4(th_vmv_v_x_h, void, ptr, i64, env, i32)
+DEF_HELPER_4(th_vmv_v_x_w, void, ptr, i64, env, i32)
+DEF_HELPER_4(th_vmv_v_x_d, void, ptr, i64, env, i32)
diff --git a/target/riscv/insn_trans/trans_xtheadvector.c.inc b/target/riscv/insn_trans/trans_xtheadvector.c.inc
index bfa3a26f78..6d0ce9f966 100644
--- a/target/riscv/insn_trans/trans_xtheadvector.c.inc
+++ b/target/riscv/insn_trans/trans_xtheadvector.c.inc
@@ -1576,18 +1576,130 @@  GEN_OPIVX_WIDEN_TRANS_TH(th_vwmacc_vx, opivx_widen_check_th)
 GEN_OPIVX_WIDEN_TRANS_TH(th_vwmaccsu_vx, opivx_widen_check_th)
 GEN_OPIVX_WIDEN_TRANS_TH(th_vwmaccus_vx, opivx_widen_check_th)
 
+/* Vector Integer Merge and Move Instructions */
+
+/*
+ * This function is almost a copy of trans_vmv_v_v, except:
+ * 1) XTheadVector simplifies the judgment of whether to
+ *    accelerate, owing to its lack of fractional LMUL and
+ *    VTA.
+ */
+static bool trans_th_vmv_v_v(DisasContext *s, arg_th_vmv_v_v *a)
+{
+    if (require_xtheadvector(s) &&
+        vext_check_isa_ill(s) &&
+        th_check_reg(s, a->rd, false) &&
+        th_check_reg(s, a->rs1, false)) {
+
+        if (s->vl_eq_vlmax) {
+            tcg_gen_gvec_mov(s->sew, vreg_ofs(s, a->rd),
+                             vreg_ofs(s, a->rs1),
+                             MAXSZ(s), MAXSZ(s));
+        } else {
+            uint32_t data = FIELD_DP32(0, VDATA_TH, LMUL, s->lmul);
+            static gen_helper_gvec_2_ptr * const fns[4] = {
+                gen_helper_th_vmv_v_v_b, gen_helper_th_vmv_v_v_h,
+                gen_helper_th_vmv_v_v_w, gen_helper_th_vmv_v_v_d,
+            };
+
+            tcg_gen_gvec_2_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, a->rs1),
+                               tcg_env, s->cfg_ptr->vlenb,
+                               s->cfg_ptr->vlenb, data,
+                               fns[s->sew]);
+        }
+        finalize_rvv_inst(s);
+        return true;
+    }
+    return false;
+}
+
+#define gen_helper_vmv_vx_th gen_helper_vmv_vx
+/*
+ * This function is almost a copy of trans_vmv_v_x, except:
+ * 1) Simpler judgment of whether to accelerate
+ * 2) XTheadVector does not limit SEW to the range of 8 to 64,
+ *    so it is not suitable to use acceleration when xlen < SEW.
+ */
+static bool trans_th_vmv_v_x(DisasContext *s, arg_th_vmv_v_x *a)
+{
+    if (require_xtheadvector(s) &&
+        vext_check_isa_ill(s) &&
+        th_check_reg(s, a->rd, false)) {
+
+        TCGv s1;
+        s1 = get_gpr(s, a->rs1, EXT_SIGN);
+
+        if (s->vl_eq_vlmax && (8 << s->sew) <= get_xlen(s)) {
+            tcg_gen_gvec_dup_tl(s->sew, vreg_ofs(s, a->rd),
+                                MAXSZ(s), MAXSZ(s), s1);
+        } else {
+            TCGv_i32 desc;
+            TCGv_i64 s1_i64 = tcg_temp_new_i64();
+            TCGv_ptr dest = tcg_temp_new_ptr();
+            uint32_t data = FIELD_DP32(0, VDATA_TH, LMUL, s->lmul);
+            static gen_helper_vmv_vx_th * const fns[4] = {
+                gen_helper_th_vmv_v_x_b, gen_helper_th_vmv_v_x_h,
+                gen_helper_th_vmv_v_x_w, gen_helper_th_vmv_v_x_d,
+            };
+
+            tcg_gen_ext_tl_i64(s1_i64, s1);
+            desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
+                                              s->cfg_ptr->vlenb, data));
+            tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, a->rd));
+            fns[s->sew](dest, s1_i64, tcg_env, desc);
+        }
+
+        finalize_rvv_inst(s);
+        return true;
+    }
+    return false;
+}
+
+/* The differences are the same as in trans_th_vmv_v_v */
+static bool trans_th_vmv_v_i(DisasContext *s, arg_th_vmv_v_i *a)
+{
+    if (require_xtheadvector(s) &&
+        vext_check_isa_ill(s) &&
+        th_check_reg(s, a->rd, false)) {
+
+        int64_t simm = sextract64(a->rs1, 0, 5);
+        if (s->vl_eq_vlmax) {
+            tcg_gen_gvec_dup_imm(s->sew, vreg_ofs(s, a->rd),
+                                 MAXSZ(s), MAXSZ(s), simm);
+        } else {
+            TCGv_i32 desc;
+            TCGv_i64 s1;
+            TCGv_ptr dest;
+            uint32_t data = FIELD_DP32(0, VDATA_TH, LMUL, s->lmul);
+            static gen_helper_vmv_vx_th * const fns[4] = {
+                gen_helper_th_vmv_v_x_b, gen_helper_th_vmv_v_x_h,
+                gen_helper_th_vmv_v_x_w, gen_helper_th_vmv_v_x_d,
+            };
+
+            s1 = tcg_constant_i64(simm);
+            dest = tcg_temp_new_ptr();
+            desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
+                                              s->cfg_ptr->vlenb, data));
+            tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, a->rd));
+            fns[s->sew](dest, s1, tcg_env, desc);
+        }
+        finalize_rvv_inst(s);
+        return true;
+    }
+    return false;
+}
+
+GEN_OPIVV_TRANS_TH(th_vmerge_vvm, opivv_vadc_check_th)
+GEN_OPIVX_TRANS_TH(th_vmerge_vxm, opivx_vadc_check_th)
+GEN_OPIVI_TRANS_TH(th_vmerge_vim, IMM_SX, th_vmerge_vxm, opivx_vadc_check_th)
+
 #define TH_TRANS_STUB(NAME)                                \
 static bool trans_##NAME(DisasContext *s, arg_##NAME *a)   \
 {                                                          \
     return require_xtheadvector(s);                        \
 }
 
-TH_TRANS_STUB(th_vmv_v_v)
-TH_TRANS_STUB(th_vmv_v_x)
-TH_TRANS_STUB(th_vmv_v_i)
-TH_TRANS_STUB(th_vmerge_vvm)
-TH_TRANS_STUB(th_vmerge_vxm)
-TH_TRANS_STUB(th_vmerge_vim)
 TH_TRANS_STUB(th_vsaddu_vv)
 TH_TRANS_STUB(th_vsaddu_vx)
 TH_TRANS_STUB(th_vsaddu_vi)
diff --git a/target/riscv/xtheadvector_helper.c b/target/riscv/xtheadvector_helper.c
index 19aad626c9..d8a0e3af90 100644
--- a/target/riscv/xtheadvector_helper.c
+++ b/target/riscv/xtheadvector_helper.c
@@ -1923,3 +1923,107 @@  GEN_TH_VX(th_vwmaccsu_vx_w, 4, 8, clearq_th)
 GEN_TH_VX(th_vwmaccus_vx_b, 1, 2, clearh_th)
 GEN_TH_VX(th_vwmaccus_vx_h, 2, 4, clearl_th)
 GEN_TH_VX(th_vwmaccus_vx_w, 4, 8, clearq_th)
+
+/* Vector Integer Merge and Move Instructions */
+
+/*
+ * The vmv and vmerge functions below are all copies of the RVV1.0 functions,
+ * except:
+ * 1) different desc encoding
+ * 2) different tail/masked element process policy
+ * 3) different mask layout
+ */
+#define GEN_TH_VMV_VV(NAME, ETYPE, H, CLEAR_FN)                      \
+void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
+                  uint32_t desc)                                     \
+{                                                                    \
+    uint32_t vl = env->vl;                                           \
+    uint32_t esz = sizeof(ETYPE);                                    \
+    uint32_t vlmax = th_maxsz(desc) / esz;                           \
+    uint32_t i;                                                      \
+                                                                     \
+    VSTART_CHECK_EARLY_EXIT(env);                                    \
+    for (i = env->vstart; i < vl; i++) {                             \
+        ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
+        *((ETYPE *)vd + H(i)) = s1;                                  \
+    }                                                                \
+    env->vstart = 0;                                                 \
+    CLEAR_FN(vd, vl, vl * esz, vlmax * esz);                         \
+}
+
+GEN_TH_VMV_VV(th_vmv_v_v_b, int8_t,  H1, clearb_th)
+GEN_TH_VMV_VV(th_vmv_v_v_h, int16_t, H2, clearh_th)
+GEN_TH_VMV_VV(th_vmv_v_v_w, int32_t, H4, clearl_th)
+GEN_TH_VMV_VV(th_vmv_v_v_d, int64_t, H8, clearq_th)
+
+#define GEN_TH_VMV_VX(NAME, ETYPE, H, CLEAR_FN)                      \
+void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
+                  uint32_t desc)                                     \
+{                                                                    \
+    uint32_t vl = env->vl;                                           \
+    uint32_t esz = sizeof(ETYPE);                                    \
+    uint32_t vlmax = th_maxsz(desc) / esz;                           \
+    uint32_t i;                                                      \
+                                                                     \
+    VSTART_CHECK_EARLY_EXIT(env);                                    \
+    for (i = env->vstart; i < vl; i++) {                             \
+        *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
+    }                                                                \
+    env->vstart = 0;                                                 \
+    CLEAR_FN(vd, vl, vl * esz, vlmax * esz);                         \
+}
+
+GEN_TH_VMV_VX(th_vmv_v_x_b, int8_t,  H1, clearb_th)
+GEN_TH_VMV_VX(th_vmv_v_x_h, int16_t, H2, clearh_th)
+GEN_TH_VMV_VX(th_vmv_v_x_w, int32_t, H4, clearl_th)
+GEN_TH_VMV_VX(th_vmv_v_x_d, int64_t, H8, clearq_th)
+
+#define GEN_TH_VMERGE_VV(NAME, ETYPE, H, CLEAR_FN)                   \
+void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
+                  CPURISCVState *env, uint32_t desc)                 \
+{                                                                    \
+    uint32_t mlen = th_mlen(desc);                                   \
+    uint32_t vl = env->vl;                                           \
+    uint32_t esz = sizeof(ETYPE);                                    \
+    uint32_t vlmax = th_maxsz(desc) / esz;                           \
+    uint32_t i;                                                      \
+                                                                     \
+    VSTART_CHECK_EARLY_EXIT(env);                                    \
+    for (i = env->vstart; i < vl; i++) {                             \
+        ETYPE *vt = (!th_elem_mask(v0, mlen, i) ? vs2 : vs1);        \
+        *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
+    }                                                                \
+    env->vstart = 0;                                                 \
+    CLEAR_FN(vd, vl, vl * esz, vlmax * esz);                         \
+}
+
+GEN_TH_VMERGE_VV(th_vmerge_vvm_b, int8_t,  H1, clearb_th)
+GEN_TH_VMERGE_VV(th_vmerge_vvm_h, int16_t, H2, clearh_th)
+GEN_TH_VMERGE_VV(th_vmerge_vvm_w, int32_t, H4, clearl_th)
+GEN_TH_VMERGE_VV(th_vmerge_vvm_d, int64_t, H8, clearq_th)
+
+#define GEN_TH_VMERGE_VX(NAME, ETYPE, H, CLEAR_FN)                   \
+void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
+                  void *vs2, CPURISCVState *env, uint32_t desc)      \
+{                                                                    \
+    uint32_t mlen = th_mlen(desc);                                   \
+    uint32_t vl = env->vl;                                           \
+    uint32_t esz = sizeof(ETYPE);                                    \
+    uint32_t vlmax = th_maxsz(desc) / esz;                           \
+    uint32_t i;                                                      \
+                                                                     \
+    VSTART_CHECK_EARLY_EXIT(env);                                    \
+    for (i = env->vstart; i < vl; i++) {                             \
+        ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
+        ETYPE d = (!th_elem_mask(v0, mlen, i) ? s2 :                 \
+                   (ETYPE)(target_long)s1);                          \
+        *((ETYPE *)vd + H(i)) = d;                                   \
+    }                                                                \
+    env->vstart = 0;                                                 \
+    CLEAR_FN(vd, vl, vl * esz, vlmax * esz);                         \
+}
+
+GEN_TH_VMERGE_VX(th_vmerge_vxm_b, int8_t,  H1, clearb_th)
+GEN_TH_VMERGE_VX(th_vmerge_vxm_h, int16_t, H2, clearh_th)
+GEN_TH_VMERGE_VX(th_vmerge_vxm_w, int32_t, H4, clearl_th)
+GEN_TH_VMERGE_VX(th_vmerge_vxm_d, int64_t, H8, clearq_th)
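
For reference, a standalone sketch of the element semantics these merge
helpers implement (not part of the patch): masked-off elements take vs2,
masked-on elements take vs1, and the tail past vl is zeroed as CLEAR_FN
does. mask_bit() is a simplified stand-in for th_elem_mask() and assumes
the one-bit-per-mlen mask layout of the pre-1.0 vector drafts:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

static bool mask_bit(const uint8_t *v0, uint32_t mlen, uint32_t i)
{
    uint32_t bit = i * mlen;   /* element i's mask bit sits at bit i * mlen */
    return (v0[bit / 8] >> (bit % 8)) & 1;
}

static void vmerge_vvm_w(int32_t *vd, const uint8_t *v0,
                         const int32_t *vs1, const int32_t *vs2,
                         uint32_t vl, uint32_t vlmax, uint32_t mlen)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = mask_bit(v0, mlen, i) ? vs1[i] : vs2[i];
    }
    memset(&vd[vl], 0, (vlmax - vl) * sizeof(int32_t));  /* tail zeroing */
}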