diff mbox series

[2/2] target/mips: Optimize ILVEV.<B|H|W|D> MSA instructions

Message ID 1552994915-7185-3-git-send-email-mateja.marjanovic@rt-rk.com
State New
Headers show
Series target/mips: Optimize MSA ILVEV and ILVOD instructions | expand

Commit Message

Mateja Marjanovic March 19, 2019, 11:28 a.m. UTC
From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>

Optimize the ILVEV set of MSA instructions by using TCG
registers directly and performing the logic on them,
instead of calling helpers.

 instr    ||   before    ||   after

Comments

Aleksandar Markovic March 19, 2019, 1:55 p.m. UTC | #1
> From: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> Subject: [PATCH 2/2] target/mips: Optimize ILVEV.<B|H|W|D> MSA instructions
> 
> ...
> 
> +static inline void gen_ilvev_d(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt)
> +{
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
> +}
> +

This code segment introduces a bug when wd == ws.

Please examine also the patch 1/2 for any similar behavior.

You should test these cases (wd == ws, wd == wt, etc.) too.

Aleksandar

> 
> ...
Richard Henderson March 19, 2019, 4:42 p.m. UTC | #2
On 3/19/19 6:55 AM, Aleksandar Markovic wrote:
>> From: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
>> Subject: [PATCH 2/2] target/mips: Optimize ILVEV.<B|H|W|D> MSA instructions
>>
>> ...
>>
>> +static inline void gen_ilvev_d(CPUMIPSState *env, uint32_t wd,
>> +                               uint32_t ws, uint32_t wt)
>> +{
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
>> +}
>> +
> 
> This code segment introduces a bug when wd == ws.

Note that the fix is to swap the two lines,
since x * 2 cannot equal y * 2 + 1.


r~
Richard Henderson March 19, 2019, 4:42 p.m. UTC | #3
On 3/19/19 4:28 AM, Mateja Marjanovic wrote:
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +    tcg_gen_shli_i64(t0, t0, 32);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);

Deposit, again.


r~
Aleksandar Markovic March 19, 2019, 5:05 p.m. UTC | #4
> From: Richard Henderson <richard.henderson@linaro.org>
> Subject: Re: [Qemu-devel] [PATCH 2/2] target/mips: Optimize ILVEV.<B|H|W|D> MSA instructions
> 
> On 3/19/19 4:28 AM, Mateja Marjanovic wrote:
> > +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> > +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> > +    tcg_gen_shli_i64(t0, t0, 32);
> > +    tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
> 
> Deposit, again.

Mateja,

May I ask you to redo this segment of code as Richard
describes (the exact invocations of TCG functions are in
Richard's comment on one of the previous versions of
this patch). This means redo ILVEV.W handling. Then you
can compare the performance of two versions, and attach
the results here. You can also (using -d out_asm or similar
QEMU options) find out what code is generated for both
alternatives, and attach the generated code here, maybe
some folks will find them interesting (I will).

Thanks,
Aleksandar
Richard Henderson March 19, 2019, 5:30 p.m. UTC | #5
On 3/19/19 10:05 AM, Aleksandar Markovic wrote:
> May I ask you to redo this segment of code as Richard
> describe (the exact invocations of TCG functions are in
> a Richard's comment to some of the previous versions of
> this patch). This means redo ILVEV.W handling. Then you
> can compare the performance of two versions, and attach
> the results here. You can also (using -d out-asm or similar
> QEMU options) find out what code is generated for both
> alternatives, and attach the generated code here, maybe
> some folks will find them interesting (I will).

To be fair, this will not affect x86 as a host at the moment.

But the underlying deposit operation is supported by AArch64, PowerPC, and S390
as hosts.  There is some support for deposit on x86 as a host within my
tcg_gen_extract2 patch set,

https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg02036.html

which will, in these cases, use rol+shld instead of shr+movq+and+or.


r~
diff mbox series

Patch

======================================
 ilvev.b  ||  126.92 ms  ||  26.41 ms
 ilvev.h  ||   93.67 ms  ||  25.79 ms
 ilvev.w  ||  117.86 ms  ||  24.42 ms
 ilvev.d  ||   45.49 ms  ||  20.28 ms

Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
---
 target/mips/helper.h     |   1 -
 target/mips/msa_helper.c |   9 ----
 target/mips/translate.c  | 111 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index d162836..2f23b0d 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -864,7 +864,6 @@  DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index 9e52a31..a500c59 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1197,15 +1197,6 @@  MSA_FN_DF(ilvl_df)
     } while (0)
 MSA_FN_DF(ilvr_df)
 #undef MSA_DO
-
-#define MSA_DO(DF)                      \
-    do {                                \
-        pwx->DF[2*i]   = pwt->DF[2*i];  \
-        pwx->DF[2*i+1] = pws->DF[2*i];  \
-    } while (0)
-MSA_FN_DF(ilvev_df)
-#undef MSA_DO
-
 #undef MSA_LOOP_COND
 
 #define MSA_LOOP_COND(DF) \
diff --git a/target/mips/translate.c b/target/mips/translate.c
index 10c5c55..d65db46 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28978,6 +28978,100 @@  static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
     tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
 }
 
+/*
+ * [MSA] ILVEV.B wd, ws, wt
+ *
+ *   Vector Interleave Even (byte data elements): the even bytes of wt
+ *   fill the even byte lanes of wd; the even bytes of ws fill the odd
+ *   lanes.  Sources are read into temps first, so aliasing wd is safe.
+ */
+static inline void gen_ilvev_b(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    const uint64_t mask = 0x00ff00ff00ff00ffULL;
+
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t0, t0, 8);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
+
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_shli_i64(t0, t0, 8);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+/*
+ * [MSA] ILVEV.H wd, ws, wt
+ *
+ *   Vector Interleave Even (halfword data elements): the even halfwords
+ *   of wt fill the even halfword lanes of wd; the even halfwords of ws
+ *   fill the odd lanes.  Sources are read into temps, so aliasing is safe.
+ */
+static inline void gen_ilvev_h(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    const uint64_t mask = 0x0000ffff0000ffffULL;
+
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t0, t0, 16);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
+
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_shli_i64(t0, t0, 16);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+/*
+ * [MSA] ILVEV.W wd, ws, wt
+ *
+ *   Vector Interleave Even (word data elements): the even (low) word
+ *   of each 64-bit half of wt goes to the low word of wd, and the
+ *   even word of ws goes to the high word.
+ *
+ *   Implemented with tcg_gen_deposit_i64 as suggested in review:
+ *   hosts with a native deposit instruction (AArch64, PPC, S390x)
+ *   emit a single insn per 64-bit half instead of the previous
+ *   and/and/shift/or sequence, and no temps are needed.
+ */
+static inline void gen_ilvev_w(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    /*
+     * deposit(dest, base, val, ofs, len): dest = base with val's low
+     * len bits inserted at bit ofs.  Both inputs are read before the
+     * result is written, so this remains correct when wd aliases ws
+     * and/or wt.
+     */
+    tcg_gen_deposit_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2],
+                        msa_wr_d[ws * 2], 32, 32);
+    tcg_gen_deposit_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[wt * 2 + 1],
+                        msa_wr_d[ws * 2 + 1], 32, 32);
+}
+
+/*
+ * [MSA] ILVEV.D wd, ws, wt
+ *
+ *   Vector Interleave Even (doubleword data elements).  The odd
+ *   half of wd is written first: wd * 2 + 1 can never equal the
+ *   even indices ws * 2 / wt * 2, so wd == ws and wd == wt are safe.
+ */
+static inline void gen_ilvev_d(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
+}
+
 static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
 {
 #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
@@ -29134,7 +29228,22 @@  static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
         gen_helper_msa_mod_s_df(cpu_env, tdf, twd, tws, twt);
         break;
     case OPC_ILVEV_df:
-        gen_helper_msa_ilvev_df(cpu_env, tdf, twd, tws, twt);
+        switch (df) {
+        case DF_BYTE:
+            gen_ilvev_b(env, wd, ws, wt);
+            break;
+        case DF_HALF:
+            gen_ilvev_h(env, wd, ws, wt);
+            break;
+        case DF_WORD:
+            gen_ilvev_w(env, wd, ws, wt);
+            break;
+        case DF_DOUBLE:
+            gen_ilvev_d(env, wd, ws, wt);
+            break;
+        default:
+            assert(0);
+        }
         break;
     case OPC_BINSR_df:
         gen_helper_msa_binsr_df(cpu_env, tdf, twd, tws, twt);