===========================================================
|| instruction || BEFORE || LOOP UNROLL || TCG ||
===========================================================
|| ilvr.b || 106.461 ms || 52.131 ms || 7.813 ms ||
|| ilvr.h || 82.962 ms || 36.222 ms || 3.622 ms ||
|| ilvr.w || 109.451 ms || 33.042 ms || 2.331 ms ||
|| ilvr.d || 32.270 ms || 27.328 ms || 2.025 ms ||
===========================================================
Suggested-by: Aleksandar Markovic <amarkovic@wavecomp.com>
Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
---
target/mips/helper.h | 1 -
target/mips/msa_helper.c | 8 ---
target/mips/translate.c | 184 ++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 183 insertions(+), 10 deletions(-)
@@ -862,7 +862,6 @@ DEF_HELPER_5(msa_sld_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_splat_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
@@ -1181,14 +1181,6 @@ MSA_FN_DF(pckev_df)
} while (0)
MSA_FN_DF(pckod_df)
#undef MSA_DO
-
-#define MSA_DO(DF) \
- do { \
- pwx->DF[2*i] = R##DF(pwt, i); \
- pwx->DF[2*i+1] = R##DF(pws, i); \
- } while (0)
-MSA_FN_DF(ilvr_df)
-#undef MSA_DO
#undef MSA_LOOP_COND
#define MSA_LOOP_COND(DF) \
@@ -28002,6 +28002,173 @@ static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
}
/*
+ * [MSA] ILVR.B wd, ws, wt
+ *
+ * Vector Interleave Right (byte data elements)
+ *
+ */
+static inline void gen_ilvr_b(CPUMIPSState *env, uint32_t wd,
+                              uint32_t ws, uint32_t wt)
+{
+    /*
+     * Interleave the eight bytes held in msa_wr_d[wt * 2] and
+     * msa_wr_d[ws * 2].  Result bytes, least significant first:
+     *
+     *   msa_wr_d[wd * 2]     = wt.b[0], ws.b[0], ... wt.b[3], ws.b[3]
+     *   msa_wr_d[wd * 2 + 1] = wt.b[4], ws.b[4], ... wt.b[7], ws.b[7]
+     *
+     * wd may name the same register as ws or wt.  msa_wr_d[wd * 2 + 1] is
+     * never read here, so it is produced first; msa_wr_d[wd * 2] is written
+     * by the very last operation, after every read of the sources, so an
+     * aliased source is never clobbered while still needed.
+     *
+     * env is not used.
+     */
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+    uint64_t mask = 0x000000ff00000000ULL;
+
+    /* Second result doubleword: source bytes 4..7, moved down. */
+    tcg_gen_andi_i64(t2, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t2, t2, 32);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shri_i64(t1, t1, 24);
+    tcg_gen_or_i64(t2, t2, t1);
+
+    mask = 0x0000ff0000000000ULL;
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t1, t1, 24);
+    tcg_gen_or_i64(t2, t2, t1);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shri_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+
+    mask = 0x00ff000000000000ULL;
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shri_i64(t1, t1, 8);
+    tcg_gen_or_i64(t2, t2, t1);
+
+    mask = 0xff00000000000000ULL;
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t1, t1, 8);
+    tcg_gen_or_i64(t2, t2, t1);
+    /* ws.b[7] is already in its final position: no shift needed. */
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
+
+    /* First result doubleword: source bytes 0..3, spread upwards. */
+    mask = 0x00000000000000ffULL;
+    tcg_gen_andi_i64(t2, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 8);
+    tcg_gen_or_i64(t2, t2, t1);
+
+    mask = 0x000000000000ff00ULL;
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 8);
+    tcg_gen_or_i64(t2, t2, t1);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+
+    mask = 0x0000000000ff0000ULL;
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 24);
+    tcg_gen_or_i64(t2, t2, t1);
+
+    mask = 0x00000000ff000000ULL;
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 24);
+    tcg_gen_or_i64(t2, t2, t1);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 32);
+    /* Last operation: the only write that can overlap a source. */
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
+
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+/*
+ * [MSA] ILVR.H wd, ws, wt
+ *
+ * Vector Interleave Right (halfword data elements)
+ *
+ */
+static inline void gen_ilvr_h(CPUMIPSState *env, uint32_t wd,
+                              uint32_t ws, uint32_t wt)
+{
+    /*
+     * Interleave the four halfwords held in msa_wr_d[wt * 2] and
+     * msa_wr_d[ws * 2].  Result halfwords, least significant first:
+     *
+     *   msa_wr_d[wd * 2]     = wt.h[0], ws.h[0], wt.h[1], ws.h[1]
+     *   msa_wr_d[wd * 2 + 1] = wt.h[2], ws.h[2], wt.h[3], ws.h[3]
+     *
+     * wd may name the same register as ws or wt.  msa_wr_d[wd * 2 + 1] is
+     * never read here, so it is produced first; msa_wr_d[wd * 2] is written
+     * by the very last operation, after every read of the sources, so an
+     * aliased source is never clobbered while still needed.
+     *
+     * env is not used.
+     */
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+    uint64_t mask = 0x0000ffff00000000ULL;
+
+    /* Second result doubleword: source halfwords 2..3, moved down. */
+    tcg_gen_andi_i64(t2, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t2, t2, 32);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shri_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+
+    mask = 0xffff000000000000ULL;
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+    /* ws.h[3] is already in its final position: no shift needed. */
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
+
+    /* First result doubleword: source halfwords 0..1, spread upwards. */
+    mask = 0x000000000000ffffULL;
+    tcg_gen_andi_i64(t2, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+
+    mask = 0x00000000ffff0000ULL;
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 32);
+    /* Last operation: the only write that can overlap a source. */
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
+
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+/*
+ * [MSA] ILVR.W wd, ws, wt
+ *
+ * Vector Interleave Right (word data elements)
+ *
+ */
+static inline void gen_ilvr_w(CPUMIPSState *env, uint32_t wd,
+                              uint32_t ws, uint32_t wt)
+{
+    /*
+     * Interleave the two words held in msa_wr_d[wt * 2] and
+     * msa_wr_d[ws * 2].  Result words, least significant first:
+     *
+     *   msa_wr_d[wd * 2]     = wt.w[0], ws.w[0]
+     *   msa_wr_d[wd * 2 + 1] = wt.w[1], ws.w[1]
+     *
+     * wd may name the same register as ws or wt.  msa_wr_d[wd * 2 + 1] is
+     * never read here, so it is produced first; msa_wr_d[wd * 2] is written
+     * by the very last operation, after every read of the sources, so an
+     * aliased source is never clobbered while still needed.
+     *
+     * env is not used.
+     */
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+    uint64_t mask = 0xffffffff00000000ULL;
+
+    /* Second result doubleword: wt.w[1] moved down, ws.w[1] kept in place. */
+    tcg_gen_andi_i64(t2, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t2, t2, 32);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
+
+    /* First result doubleword: wt.w[0] kept in place, ws.w[0] moved up. */
+    mask = 0x00000000ffffffffULL;
+    tcg_gen_andi_i64(t2, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 32);
+    /* Last operation: the only write that can overlap a source. */
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
+
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+/*
+ * [MSA] ILVR.D wd, ws, wt
+ *
+ * Vector Interleave Right (doubleword data elements)
+ *
+ */
+static inline void gen_ilvr_d(CPUMIPSState *env, uint32_t wd,
+ uint32_t ws, uint32_t wt)
+{ /* Emits two 64-bit moves: msa_wr_d[wd * 2 + 1] = msa_wr_d[ws * 2], then
+   * msa_wr_d[wd * 2] = msa_wr_d[wt * 2].  env is not used.
+   * The order matters: if wd and ws are the same register, doing the second
+   * move first would overwrite msa_wr_d[ws * 2] before it has been read. */
+ tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
+ tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
+}
+
+/*
* [MSA] ILVL.B wd, ws, wt
*
* Vector Interleave Left (byte data elements)
@@ -28468,7 +28635,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
gen_helper_msa_div_u_df(cpu_env, tdf, twd, tws, twt);
break;
case OPC_ILVR_df:
- gen_helper_msa_ilvr_df(cpu_env, tdf, twd, tws, twt);
+ switch (df) {
+ case DF_BYTE:
+ gen_ilvr_b(env, wd, ws, wt);
+ break;
+ case DF_HALF:
+ gen_ilvr_h(env, wd, ws, wt);
+ break;
+ case DF_WORD:
+ gen_ilvr_w(env, wd, ws, wt);
+ break;
+ case DF_DOUBLE:
+ gen_ilvr_d(env, wd, ws, wt);
+ break;
+ default:
+ assert(0);
+ }
break;
case OPC_BINSL_df:
gen_helper_msa_binsl_df(cpu_env, tdf, twd, tws, twt);
From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com> Optimize the ILVR.<B|H|W|D> instructions by operating directly on TCG registers and performing the interleave with logic operations on them. Instead of shifting the bit mask or allocating a new TCG constant for each bit mask, assign a new (pre-shifted) uint64_t value to the bit mask variable. Performance was measured by executing each instruction 10 million times on a computer with an Intel Core i7-3770 CPU @ 3.40GHz×8.