diff mbox

[2/6] target-ppc: add vextu[bhw]lx instructions

Message ID 1475041518-9757-3-git-send-email-raji@linux.vnet.ibm.com
State New
Headers show

Commit Message

Rajalakshmi Srinivasaraghavan Sept. 28, 2016, 5:45 a.m. UTC
From: Avinesh Kumar <avinesku@linux.vnet.ibm.com>

vextublx:  Vector Extract Unsigned Byte Left
vextuhlx:  Vector Extract Unsigned Halfword Left
vextuwlx:  Vector Extract Unsigned Word Left

Signed-off-by: Avinesh Kumar <avinesku@linux.vnet.ibm.com>
[ Remove else part in helper ]
Signed-off-by: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
---
 target-ppc/helper.h                 |    3 ++
 target-ppc/int_helper.c             |   37 +++++++++++++++++++++++++++++++++++
 target-ppc/translate/vmx-impl.inc.c |   19 ++++++++++++++++++
 target-ppc/translate/vmx-ops.inc.c  |    4 ++-
 4 files changed, 62 insertions(+), 1 deletions(-)

Comments

Richard Henderson Sept. 28, 2016, 4:54 p.m. UTC | #1
On 09/27/2016 10:45 PM, Rajalakshmi Srinivasaraghavan wrote:
> +#if defined(HOST_WORDS_BIGENDIAN)
> +#define VEXTULX_DO(name, elem)                                  \
> +target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
> +{                                                               \
> +    target_ulong r = 0;                                         \
> +    int i;                                                      \
> +    int index = a & 0xf;                                        \
> +    for (i = 0; i < elem; i++) {                                \
> +        r = r << 8;                                             \
> +        if (index + i <= 15) {                                  \
> +            r = r | b->u8[index + i];                           \
> +        }                                                       \
> +    }                                                           \
> +    return r;                                                   \
> +}
> +#else
> +#define VEXTULX_DO(name, elem)                                  \
> +target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
> +{                                                               \
> +    target_ulong r = 0;                                         \
> +    int i;                                                      \
> +    int index = 15 - (a & 0xf);                                 \
> +    for (i = 0; i < elem; i++) {                                \
> +        r = r << 8;                                             \
> +        if (index - i >= 0) {                                   \
> +            r = r | b->u8[index - i];                           \
> +        }                                                       \
> +    }                                                           \
> +    return r;                                                   \
> +}
> +#endif
> +
> +VEXTULX_DO(vextublx, 1)
> +VEXTULX_DO(vextuhlx, 2)
> +VEXTULX_DO(vextuwlx, 4)
> +#undef VEXTULX_DO

Ew.

This should be one 128-bit shift and one and.

Since the shift amount is a multiple of 8, the 128-bit shift for vextub[lr]x
does not need to cross a double-word boundary, and so can be decomposed into
one 64-bit shift of (count & 64 ? hi : lo).

For vextu[hw][lr]x, you'd need to do the whole left-shift, right-shift, OR thing.

But still, fantastically better than a loop.


r~
Rajalakshmi Srinivasaraghavan Oct. 5, 2016, 5:21 a.m. UTC | #2
On 09/28/2016 10:24 PM, Richard Henderson wrote:
> On 09/27/2016 10:45 PM, Rajalakshmi Srinivasaraghavan wrote:
>> +#if defined(HOST_WORDS_BIGENDIAN)
>> +#define VEXTULX_DO(name, elem)                                  \
>> +target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
>> +{                                                               \
>> +    target_ulong r = 0;                                         \
>> +    int i;                                                      \
>> +    int index = a & 0xf;                                        \
>> +    for (i = 0; i < elem; i++) {                                \
>> +        r = r << 8;                                             \
>> +        if (index + i <= 15) {                                  \
>> +            r = r | b->u8[index + i];                           \
>> +        }                                                       \
>> +    }                                                           \
>> +    return r;                                                   \
>> +}
>> +#else
>> +#define VEXTULX_DO(name, elem)                                  \
>> +target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
>> +{                                                               \
>> +    target_ulong r = 0;                                         \
>> +    int i;                                                      \
>> +    int index = 15 - (a & 0xf);                                 \
>> +    for (i = 0; i < elem; i++) {                                \
>> +        r = r << 8;                                             \
>> +        if (index - i >= 0) {                                   \
>> +            r = r | b->u8[index - i];                           \
>> +        }                                                       \
>> +    }                                                           \
>> +    return r;                                                   \
>> +}
>> +#endif
>> +
>> +VEXTULX_DO(vextublx, 1)
>> +VEXTULX_DO(vextuhlx, 2)
>> +VEXTULX_DO(vextuwlx, 4)
>> +#undef VEXTULX_DO
> Ew.
>
> This should be one 128-bit shift and one and.
>
> Since the shift amount is a multiple of 8, the 128-bit shift for vextub[lr]x
> does not need to cross a double-word boundary, and so can be decomposed into
> one 64-bit shift of (count & 64 ? hi : lo).
>
> For vextu[hw][lr]x, you'd need to do the whole left-shift, right-shift, OR thing.
>
> But still, fantastically better than a loop.
Ack. Will send an updated patch.
>
>
> r~
>
>
diff mbox

Patch

diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index a1c2962..3041199 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -344,6 +344,9 @@  DEF_HELPER_3(vpmsumb, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumh, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumw, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumd, void, avr, avr, avr)
+DEF_HELPER_2(vextublx, tl, tl, avr)
+DEF_HELPER_2(vextuhlx, tl, tl, avr)
+DEF_HELPER_2(vextuwlx, tl, tl, avr)
 
 DEF_HELPER_2(vsbox, void, avr, avr)
 DEF_HELPER_3(vcipher, void, avr, avr, avr)
diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c
index 51a9ac5..c24cc07 100644
--- a/target-ppc/int_helper.c
+++ b/target-ppc/int_helper.c
@@ -1705,6 +1705,43 @@  void helper_vlogefp(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *b)
     }
 }
 
+#if defined(HOST_WORDS_BIGENDIAN)
+#define VEXTULX_DO(name, elem)                                  \
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
+{                                                               \
+    target_ulong r = 0;                                         \
+    int i;                                                      \
+    int index = a & 0xf;                                        \
+    for (i = 0; i < elem; i++) {                                \
+        r = r << 8;                                             \
+        if (index + i <= 15) {                                  \
+            r = r | b->u8[index + i];                           \
+        }                                                       \
+    }                                                           \
+    return r;                                                   \
+}
+#else
+#define VEXTULX_DO(name, elem)                                  \
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
+{                                                               \
+    target_ulong r = 0;                                         \
+    int i;                                                      \
+    int index = 15 - (a & 0xf);                                 \
+    for (i = 0; i < elem; i++) {                                \
+        r = r << 8;                                             \
+        if (index - i >= 0) {                                   \
+            r = r | b->u8[index - i];                           \
+        }                                                       \
+    }                                                           \
+    return r;                                                   \
+}
+#endif
+
+VEXTULX_DO(vextublx, 1)
+VEXTULX_DO(vextuhlx, 2)
+VEXTULX_DO(vextuwlx, 4)
+#undef VEXTULX_DO
+
 /* The specification says that the results are undefined if all of the
  * shift counts are not identical.  We check to make sure that they are
  * to conform to what real hardware appears to do.  */
diff --git a/target-ppc/translate/vmx-impl.inc.c b/target-ppc/translate/vmx-impl.inc.c
index abfde27..815ba96 100644
--- a/target-ppc/translate/vmx-impl.inc.c
+++ b/target-ppc/translate/vmx-impl.inc.c
@@ -342,6 +342,19 @@  static void glue(gen_, name0##_##name1)(DisasContext *ctx)              \
     }                                                                   \
 }
 
+#define GEN_VXFORM_HETRO(name, opc2, opc3)                              \
+static void glue(gen_, name)(DisasContext *ctx)                         \
+{                                                                       \
+    TCGv_ptr rb;                                                        \
+    if (unlikely(!ctx->altivec_enabled)) {                              \
+        gen_exception(ctx, POWERPC_EXCP_VPU);                           \
+        return;                                                         \
+    }                                                                   \
+    rb = gen_avr_ptr(rB(ctx->opcode));                                  \
+    gen_helper_##name(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)], rb); \
+    tcg_temp_free_ptr(rb);                                              \
+}
+
 GEN_VXFORM(vaddubm, 0, 0);
 GEN_VXFORM_DUAL_EXT(vaddubm, PPC_NONE, PPC2_ALTIVEC_207, 0,       \
                     vmul10cuq, PPC_NONE, PPC2_ISA300, 0x0000F800)
@@ -516,6 +529,12 @@  GEN_VXFORM_ENV(vsubfp, 5, 1);
 GEN_VXFORM_ENV(vmaxfp, 5, 16);
 GEN_VXFORM_ENV(vminfp, 5, 17);
 
+GEN_VXFORM_HETRO(vextublx, 6, 24)
+GEN_VXFORM_HETRO(vextuhlx, 6, 25)
+GEN_VXFORM_HETRO(vextuwlx, 6, 26)
+GEN_VXFORM_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
+                vextuwlx, PPC_NONE, PPC2_ISA300)
+
 #define GEN_VXRFORM1(opname, name, str, opc2, opc3)                     \
 static void glue(gen_, name)(DisasContext *ctx)                         \
     {                                                                   \
diff --git a/target-ppc/translate/vmx-ops.inc.c b/target-ppc/translate/vmx-ops.inc.c
index 5d47b0f..3e0047d 100644
--- a/target-ppc/translate/vmx-ops.inc.c
+++ b/target-ppc/translate/vmx-ops.inc.c
@@ -91,8 +91,10 @@  GEN_VXFORM(vmrghw, 6, 2),
 GEN_VXFORM(vmrglb, 6, 4),
 GEN_VXFORM(vmrglh, 6, 5),
 GEN_VXFORM(vmrglw, 6, 6),
+GEN_VXFORM_300(vextublx, 6, 24),
+GEN_VXFORM_300(vextuhlx, 6, 25),
+GEN_VXFORM_DUAL(vmrgow, vextuwlx, 6, 26, PPC_ALTIVEC, PPC_NONE),
 GEN_VXFORM_207(vmrgew, 6, 30),
-GEN_VXFORM_207(vmrgow, 6, 26),
 GEN_VXFORM(vmuloub, 4, 0),
 GEN_VXFORM(vmulouh, 4, 1),
 GEN_VXFORM_DUAL(vmulouw, vmuluwm, 4, 2, PPC_ALTIVEC, PPC_NONE),