
[1/8] target/ppc: Optimize emulation of lvsl and lvsr instructions

Message ID 1559816130-17113-2-git-send-email-stefan.brankovic@rt-rk.com
State New
Series Optimize emulation of ten Altivec instructions: lvsl, ...

Commit Message

Stefan Brankovic June 6, 2019, 10:15 a.m. UTC
Add a simple macro that calls the TCG implementation of the appropriate
instruction if Altivec support is active.

Optimization of the Altivec instruction lvsl (Load Vector for Shift Left).
It places bytes sh:sh+15 of the value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F
in the destination register. sh is calculated by adding the two source registers
and taking bits 60-63 of the result.

First we place bits 28-31 of the EA into the variable sh. After that
we create bytes sh:(sh+7) of X (from the description above) in a for loop
(by incrementing sh in each iteration and placing it in the
appropriate byte of the variable result) and save them in the higher
doubleword element of vD. We then repeat this for the lower
doubleword element of vD by creating bytes (sh+8):(sh+15) in
a for loop and saving the result.

Optimization of the Altivec instruction lvsr (Load Vector for Shift Right).
It places bytes (16-sh):(31-sh) of the value 0x00 || 0x01 || 0x02 || ... || 0x1E ||
0x1F in the destination register. sh is calculated by adding the two source
registers and taking bits 60-63 of the result.

First we place bits 28-31 of the EA into the variable sh. After that
we create bytes (16-sh):(23-sh) of X (from the description above) in a for loop
(by incrementing sh in each iteration and placing it in the
appropriate byte of the variable result) and save them in the higher
doubleword element of vD. We then repeat this for the lower
doubleword element of vD by creating bytes (24-sh):(31-sh) in
a for loop and saving the result.

Signed-off-by: Stefan Brankovic <stefan.brankovic@rt-rk.com>
---
 target/ppc/translate/vmx-impl.inc.c | 143 ++++++++++++++++++++++++++++--------
 1 file changed, 111 insertions(+), 32 deletions(-)
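
For reference, the following minimal C model (an illustrative sketch, not part
of the patch; the function name and arguments are made up) computes the two
doublewords that lvsl and lvsr, as described above, place into vD for a given
sh:

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    /*
     * Model of the 16 bytes lvsl/lvsr select from
     * X = 0x00 || 0x01 || ... || 0x1F for a given sh = EA & 0xf.
     * The higher doubleword of vD receives the first 8 selected bytes,
     * the lower doubleword the next 8.
     */
    static void model(unsigned sh, int is_lvsr, uint64_t *hi, uint64_t *lo)
    {
        unsigned first = is_lvsr ? 16 - sh : sh;  /* first byte of X taken */
        uint64_t h = 0, l = 0;
        int i;

        for (i = 0; i < 8; i++) {
            h = (h << 8) | (first + i);      /* bytes first .. first+7     */
            l = (l << 8) | (first + 8 + i);  /* bytes first+8 .. first+15  */
        }
        *hi = h;
        *lo = l;
    }

    int main(void)
    {
        uint64_t hi, lo;

        model(3, 0, &hi, &lo);   /* lvsl, sh = 3 */
        printf("lvsl sh=3: %016" PRIx64 " %016" PRIx64 "\n", hi, lo);
        model(3, 1, &hi, &lo);   /* lvsr, sh = 3 */
        printf("lvsr sh=3: %016" PRIx64 " %016" PRIx64 "\n", hi, lo);
        return 0;
    }

For sh = 3 this prints 030405060708090a 0b0c0d0e0f101112 for lvsl (bytes 3:18
of X) and 0d0e0f1011121314 15161718191a1b1c for lvsr (bytes 13:28 of X),
matching the descriptions above.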

Comments

Richard Henderson June 6, 2019, 4:46 p.m. UTC | #1
On 6/6/19 5:15 AM, Stefan Brankovic wrote:
> +    tcg_gen_addi_i64(result, sh, 7);
> +    for (i = 7; i >= 1; i--) {
> +        tcg_gen_shli_i64(tmp, sh, i * 8);
> +        tcg_gen_or_i64(result, result, tmp);
> +        tcg_gen_addi_i64(sh, sh, 1);
> +    }

Better to replicate sh into the 8 positions and then use one add.

    tcg_gen_muli_i64(sh, sh, 0x0101010101010101ull);
    tcg_gen_addi_i64(hi_result, sh, 0x0001020304050607ull);
    tcg_gen_addi_i64(lo_result, sh, 0x08090a0b0c0d0e0full);

and

    tcg_gen_subfi_i64(hi_result, 0x1011121314151617ull, sh);
    tcg_gen_subfi_i64(lo_result, 0x18191a1b1c1d1e1full, sh);

for lvsr.


r~
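
Since sh is at most 15, none of the per-byte additions or subtractions above
carries or borrows across a byte lane, so the replicate-and-add form is
byte-for-byte equivalent to the per-byte loops in the posted patch. A
standalone C check of the suggested constants (an illustrative sketch, using
plain uint64_t arithmetic in place of the TCG ops):

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t sh;

        for (sh = 0; sh < 16; sh++) {
            /* sh replicated into every byte lane */
            uint64_t rep = sh * 0x0101010101010101ull;
            uint64_t hi = 0, lo = 0, rhi = 0, rlo = 0;
            int i;

            for (i = 0; i < 8; i++) {
                /* lvsl: byte i of the result is sh + i (the patch's loop) */
                hi = (hi << 8) | (sh + i);
                lo = (lo << 8) | (sh + 8 + i);
                /* lvsr: byte i of the result is (16 - sh) + i */
                rhi = (rhi << 8) | (16 - sh + i);
                rlo = (rlo << 8) | (16 - sh + 8 + i);
            }
            if (hi != rep + 0x0001020304050607ull ||
                lo != rep + 0x08090a0b0c0d0e0full ||
                rhi != 0x1011121314151617ull - rep ||
                rlo != 0x18191a1b1c1d1e1full - rep) {
                printf("mismatch at sh=%" PRIu64 "\n", sh);
                return 1;
            }
        }
        printf("replicate-and-add matches the per-byte loops for all sh\n");
        return 0;
    }
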
Stefan Brankovic June 17, 2019, 11:31 a.m. UTC | #2
On 6.6.19. 18:46, Richard Henderson wrote:
> On 6/6/19 5:15 AM, Stefan Brankovic wrote:
>> +    tcg_gen_addi_i64(result, sh, 7);
>> +    for (i = 7; i >= 1; i--) {
>> +        tcg_gen_shli_i64(tmp, sh, i * 8);
>> +        tcg_gen_or_i64(result, result, tmp);
>> +        tcg_gen_addi_i64(sh, sh, 1);
>> +    }
> Better to replicate sh into the 8 positions and then use one add.
>
>      tcg_gen_muli_i64(sh, sh, 0x0101010101010101ull);
>      tcg_gen_addi_i64(hi_result, sh, 0x0001020304050607ull);
>      tcg_gen_addi_i64(lo_result, sh, 0x08090a0b0c0d0e0full);
>
> and
>
>      tcg_gen_subfi_i64(hi_result, 0x1011121314151617ull, sh);
>      tcg_gen_subfi_i64(lo_result, 0x18191a1b1c1d1e1full, sh);
>
> for lvsr.
>
I think you are right, this is definitely a better way of implementing it.
I will adopt your approach in v2.

Kind Regards,

Stefan

> r~

Patch

diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c
index bd3ff40..140bb05 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -142,38 +142,6 @@  GEN_VR_STVE(bx, 0x07, 0x04, 1);
 GEN_VR_STVE(hx, 0x07, 0x05, 2);
 GEN_VR_STVE(wx, 0x07, 0x06, 4);
 
-static void gen_lvsl(DisasContext *ctx)
-{
-    TCGv_ptr rd;
-    TCGv EA;
-    if (unlikely(!ctx->altivec_enabled)) {
-        gen_exception(ctx, POWERPC_EXCP_VPU);
-        return;
-    }
-    EA = tcg_temp_new();
-    gen_addr_reg_index(ctx, EA);
-    rd = gen_avr_ptr(rD(ctx->opcode));
-    gen_helper_lvsl(rd, EA);
-    tcg_temp_free(EA);
-    tcg_temp_free_ptr(rd);
-}
-
-static void gen_lvsr(DisasContext *ctx)
-{
-    TCGv_ptr rd;
-    TCGv EA;
-    if (unlikely(!ctx->altivec_enabled)) {
-        gen_exception(ctx, POWERPC_EXCP_VPU);
-        return;
-    }
-    EA = tcg_temp_new();
-    gen_addr_reg_index(ctx, EA);
-    rd = gen_avr_ptr(rD(ctx->opcode));
-    gen_helper_lvsr(rd, EA);
-    tcg_temp_free(EA);
-    tcg_temp_free_ptr(rd);
-}
-
 static void gen_mfvscr(DisasContext *ctx)
 {
     TCGv_i32 t;
@@ -316,6 +284,16 @@  static void glue(gen_, name)(DisasContext *ctx)                         \
     tcg_temp_free_ptr(rd);                                              \
 }
 
+#define GEN_VXFORM_TRANS(name, opc2, opc3)                              \
+static void glue(gen_, name)(DisasContext *ctx)                         \
+{                                                                       \
+    if (unlikely(!ctx->altivec_enabled)) {                              \
+        gen_exception(ctx, POWERPC_EXCP_VPU);                           \
+        return;                                                         \
+    }                                                                   \
+    trans_##name(ctx);                                                  \
+}
+
 #define GEN_VXFORM_ENV(name, opc2, opc3)                                \
 static void glue(gen_, name)(DisasContext *ctx)                         \
 {                                                                       \
@@ -515,6 +493,105 @@  static void gen_vmrgow(DisasContext *ctx)
     tcg_temp_free_i64(avr);
 }
 
+/*
+ * lvsl VRT,RA,RB - Load Vector for Shift Left
+ *
+ * Let the EA be the sum (rA|0)+(rB). Let sh=EA[28–31].
+ * Let X be the 32-byte value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F.
+ * Bytes sh:sh+15 of X are placed into vD.
+ */
+static void trans_lvsl(DisasContext *ctx)
+{
+    int VT = rD(ctx->opcode);
+    TCGv_i64 result = tcg_temp_new_i64();
+    TCGv_i64 tmp = tcg_temp_new_i64();
+    TCGv_i64 sh = tcg_temp_new_i64();
+    TCGv_i64 EA = tcg_temp_new();
+    int i;
+
+    /* Get sh(from description) by anding EA with 0xf. */
+    gen_addr_reg_index(ctx, EA);
+    tcg_gen_andi_i64(sh, EA, 0xfULL);
+    /*
+     * Create bytes sh:sh+7 of X(from description) and place them in
+     * higher doubleword of vD.
+     */
+    tcg_gen_addi_i64(result, sh, 7);
+    for (i = 7; i >= 1; i--) {
+        tcg_gen_shli_i64(tmp, sh, i * 8);
+        tcg_gen_or_i64(result, result, tmp);
+        tcg_gen_addi_i64(sh, sh, 1);
+    }
+    set_avr64(VT, result, true);
+    /*
+     * Create bytes sh+8:sh+15 of X(from description) and place them in
+     * lower doubleword of vD.
+     */
+    tcg_gen_addi_i64(result, sh, 8);
+    for (i = 7; i >= 1; i--) {
+        tcg_gen_addi_i64(sh, sh, 1);
+        tcg_gen_shli_i64(tmp, sh, i * 8);
+        tcg_gen_or_i64(result, result, tmp);
+    }
+    set_avr64(VT, result, false);
+
+    tcg_temp_free_i64(result);
+    tcg_temp_free_i64(tmp);
+    tcg_temp_free_i64(sh);
+    tcg_temp_free(EA);
+}
+
+/*
+ * lvsr VRT,RA,RB - Load Vector for Shift Right
+ *
+ * Let the EA be the sum (rA|0)+(rB). Let sh=EA[28–31].
+ * Let X be the 32-byte value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F.
+ * Bytes (16-sh):(31-sh) of X are placed into vD.
+ */
+static void trans_lvsr(DisasContext *ctx)
+{
+    int VT = rD(ctx->opcode);
+    TCGv_i64 result = tcg_temp_new_i64();
+    TCGv_i64 tmp = tcg_temp_new_i64();
+    TCGv_i64 sh = tcg_temp_new_i64();
+    TCGv_i64 EA = tcg_temp_new();
+    int i;
+
+    /* Get sh(from description) by anding EA with 0xf. */
+    gen_addr_reg_index(ctx, EA);
+    tcg_gen_andi_i64(sh, EA, 0xfULL);
+    /* Make (16-sh) and save it in sh. */
+    tcg_gen_subi_i64(sh, sh, 0x10ULL);
+    tcg_gen_neg_i64(sh, sh);
+    /*
+     * Create bytes (16-sh):(23-sh) of X(from description) and place them in
+     * higher doubleword of vD.
+     */
+    tcg_gen_addi_i64(result, sh, 7);
+    for (i = 7; i >= 1; i--) {
+        tcg_gen_shli_i64(tmp, sh, i * 8);
+        tcg_gen_or_i64(result, result, tmp);
+        tcg_gen_addi_i64(sh, sh, 1);
+    }
+    set_avr64(VT, result, true);
+    /*
+     * Create bytes (24-sh):(31-sh) of X (from the description) and place them in
+     * lower doubleword of vD.
+     */
+    tcg_gen_addi_i64(result, sh, 8);
+    for (i = 7; i >= 1; i--) {
+        tcg_gen_addi_i64(sh, sh, 1);
+        tcg_gen_shli_i64(tmp, sh, i * 8);
+        tcg_gen_or_i64(result, result, tmp);
+    }
+    set_avr64(VT, result, false);
+
+    tcg_temp_free_i64(result);
+    tcg_temp_free_i64(tmp);
+    tcg_temp_free_i64(sh);
+    tcg_temp_free(EA);
+}
+
 GEN_VXFORM(vmuloub, 4, 0);
 GEN_VXFORM(vmulouh, 4, 1);
 GEN_VXFORM(vmulouw, 4, 2);
@@ -657,6 +734,8 @@  GEN_VXFORM_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
 GEN_VXFORM_HETRO(vextubrx, 6, 28)
 GEN_VXFORM_HETRO(vextuhrx, 6, 29)
 GEN_VXFORM_HETRO(vextuwrx, 6, 30)
+GEN_VXFORM_TRANS(lvsl, 6, 31)
+GEN_VXFORM_TRANS(lvsr, 6, 32)
 GEN_VXFORM_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207, \
                 vextuwrx, PPC_NONE, PPC2_ISA300)