diff mbox

[3/4] target-ppc: add vrldmi and vrlwmi instructions

Message ID 1477300500-25930-4-git-send-email-nikunj@linux.vnet.ibm.com
State New
Headers show

Commit Message

Nikunj A Dadhania Oct. 24, 2016, 9:14 a.m. UTC
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>

vrldmi: Vector Rotate Left Dword then Mask Insert
vrlwmi: Vector Rotate Left Word then Mask Insert

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
---
 disas/ppc.c                         |  2 +
 target-ppc/helper.h                 |  2 +
 target-ppc/int_helper.c             | 88 +++++++++++++++++++++++++++++++++++++
 target-ppc/translate/vmx-impl.inc.c |  6 +++
 target-ppc/translate/vmx-ops.inc.c  |  4 +-
 5 files changed, 100 insertions(+), 2 deletions(-)

Comments

Richard Henderson Oct. 24, 2016, 4:16 p.m. UTC | #1
On 10/24/2016 02:14 AM, Nikunj A Dadhania wrote:
> +#define EXTRACT_BITS(size)                                              \
> +static inline uint##size##_t extract_bits_u##size(uint##size##_t reg,   \
> +                                                  uint##size##_t start, \
> +                                                  uint##size##_t end)   \
> +{                                                                       \
> +    uint##size##_t nr_mask_bits = end - start + 1;                      \
> +    uint##size##_t val = 1;                                             \
> +    uint##size##_t mask = (val << nr_mask_bits) - 1;                    \
> +    uint##size##_t shifted_reg = reg  >> ((size - 1)  - end);           \
> +    return shifted_reg & mask;                                          \
> +}
> +
> +EXTRACT_BITS(64);
> +EXTRACT_BITS(32);

We already have extract32 and extract64, which you're (nearly) duplicating.

> +#define MASK(size, max_val)                                     \
> +static inline uint##size##_t mask_u##size(uint##size##_t start, \
> +                                uint##size##_t end)             \
> +{                                                               \
> +    uint##size##_t ret, max_bit = size - 1;                     \
> +                                                                \
> +    if (likely(start == 0)) {                                   \
> +        ret = max_val << (max_bit - end);                       \
> +    } else if (likely(end == max_bit)) {                        \
> +        ret = max_val >> start;                                 \
> +    } else {                                                    \
> +        ret = (((uint##size##_t)(-1ULL)) >> (start)) ^          \
> +            (((uint##size##_t)(-1ULL) >> (end)) >> 1);          \
> +        if (unlikely(start > end)) {                            \
> +            return ~ret;                                        \
> +        }                                                       \
> +    }                                                           \

Why the two likely cases?  Doesn't the third case cover them?

Also, (uint##size##_t)(-1ULL) should be just (uint##size##_t)-1.
Please remove all the other unnecessary parenthesis too.

Hmph.  I see you've copied all this silliness from translate.c, so...
nevermind, I guess.  Let's leave this a near-exact copy.

> +#define LEFT_ROTATE(size)                                            \
> +static inline uint##size##_t left_rotate_u##size(uint##size##_t val, \
> +                                              uint##size##_t shift)  \
> +{                                                                    \
> +    if (!shift) {                                                    \
> +        return val;                                                  \
> +    }                                                                \
> +                                                                     \
> +    uint##size##_t left_val = extract_bits_u##size(val, 0, shift - 1); \
> +    uint##size##_t right_val = val & mask_u##size(shift, size - 1);    \
> +                                                                     \
> +    return right_val << shift | left_val;                            \
> +}
> +
> +LEFT_ROTATE(32);
> +LEFT_ROTATE(64);

We already have rol32 and rol64.

Which I see are broken for shift == 0.  Let's please fix that, as a separate
patch, like so:

  return (word << shift) | (word >> ((32 - shift) & 31));


r~
Nikunj A Dadhania Oct. 25, 2016, 4:08 a.m. UTC | #2
Richard Henderson <rth@twiddle.net> writes:

> On 10/24/2016 02:14 AM, Nikunj A Dadhania wrote:
>> +#define EXTRACT_BITS(size)                                              \
>> +static inline uint##size##_t extract_bits_u##size(uint##size##_t reg,   \
>> +                                                  uint##size##_t start, \
>> +                                                  uint##size##_t end)   \
>> +{                                                                       \
>> +    uint##size##_t nr_mask_bits = end - start + 1;                      \
>> +    uint##size##_t val = 1;                                             \
>> +    uint##size##_t mask = (val << nr_mask_bits) - 1;                    \
>> +    uint##size##_t shifted_reg = reg  >> ((size - 1)  - end);           \
>> +    return shifted_reg & mask;                                          \
>> +}
>> +
>> +EXTRACT_BITS(64);
>> +EXTRACT_BITS(32);
>
> We already have extract32 and extract64, which you're (nearly) duplicating.

The bit position number notation is different, because of this using the
above routine, MSB=0 and LSB=63.

While the below assumes: MSB=63 and LSB=0

static inline uint64_t extract64(uint64_t value, int start, int length)
{
    assert(start >= 0 && length > 0 && length <= 64 - start);
    return (value >> start) & (~0ULL >> (64 - length));
}

Let me know if I am missing something here.

>> +#define MASK(size, max_val)                                     \
>> +static inline uint##size##_t mask_u##size(uint##size##_t start, \
>> +                                uint##size##_t end)             \
>> +{                                                               \
>> +    uint##size##_t ret, max_bit = size - 1;                     \
>> +                                                                \
>> +    if (likely(start == 0)) {                                   \
>> +        ret = max_val << (max_bit - end);                       \
>> +    } else if (likely(end == max_bit)) {                        \
>> +        ret = max_val >> start;                                 \
>> +    } else {                                                    \
>> +        ret = (((uint##size##_t)(-1ULL)) >> (start)) ^          \
>> +            (((uint##size##_t)(-1ULL) >> (end)) >> 1);          \
>> +        if (unlikely(start > end)) {                            \
>> +            return ~ret;                                        \
>> +        }                                                       \
>> +    }                                                           \
>
> Why the two likely cases?  Doesn't the third case cover them?
>
> Also, (uint##size##_t)(-1ULL) should be just (uint##size##_t)-1.
> Please remove all the other unnecessary parenthesis too.
>
> Hmph.  I see you've copied all this silliness from translate.c, so...
> nevermind, I guess.  Let's leave this a near-exact copy.

Ok.

>> +#define LEFT_ROTATE(size)                                            \
>> +static inline uint##size##_t left_rotate_u##size(uint##size##_t val, \
>> +                                              uint##size##_t shift)  \
>> +{                                                                    \
>> +    if (!shift) {                                                    \
>> +        return val;                                                  \
>> +    }                                                                \
>> +                                                                     \
>> +    uint##size##_t left_val = extract_bits_u##size(val, 0, shift - 1); \
>> +    uint##size##_t right_val = val & mask_u##size(shift, size - 1);    \
>> +                                                                     \
>> +    return right_val << shift | left_val;                            \
>> +}
>> +
>> +LEFT_ROTATE(32);
>> +LEFT_ROTATE(64);
>
> We already have rol32 and rol64.
>
> Which I see are broken for shift == 0.  Let's please fix that, as a separate
> patch, like so:
>
>   return (word << shift) | (word >> ((32 - shift) & 31));

Sure.

Regards
Nikunj
Richard Henderson Oct. 25, 2016, 4:21 a.m. UTC | #3
On 10/24/2016 09:08 PM, Nikunj A Dadhania wrote:
> Richard Henderson <rth@twiddle.net> writes:
>
>> On 10/24/2016 02:14 AM, Nikunj A Dadhania wrote:
>>> +#define EXTRACT_BITS(size)                                              \
>>> +static inline uint##size##_t extract_bits_u##size(uint##size##_t reg,   \
>>> +                                                  uint##size##_t start, \
>>> +                                                  uint##size##_t end)   \
>>> +{                                                                       \
>>> +    uint##size##_t nr_mask_bits = end - start + 1;                      \
>>> +    uint##size##_t val = 1;                                             \
>>> +    uint##size##_t mask = (val << nr_mask_bits) - 1;                    \
>>> +    uint##size##_t shifted_reg = reg  >> ((size - 1)  - end);           \
>>> +    return shifted_reg & mask;                                          \
>>> +}
>>> +
>>> +EXTRACT_BITS(64);
>>> +EXTRACT_BITS(32);
>>
>> We already have extract32 and extract64, which you're (nearly) duplicating.
>
> The bit position number notation is different, because of this using the
> above routine, MSB=0 and LSB=63.
>
> While the below assumes: MSB=63 and LSB=0
>
> static inline uint64_t extract64(uint64_t value, int start, int length)
> {
>     assert(start >= 0 && length > 0 && length <= 64 - start);
>     return (value >> start) & (~0ULL >> (64 - length));
> }
>
> Let me know if I am missing something here.

Since the arguments to extract_bits_uN are completely under your control, via 
the arguments to VRLMI, this is a non-argument.  Just change them to 
little-endian position + length.

(And, after you do that conversion for vrldmi and vrlwmi, you'll see why 
big-endian bit numbering is the spawn of the devil.  ;-)


r~
Nikunj A Dadhania Oct. 25, 2016, 4:44 a.m. UTC | #4
Richard Henderson <rth@twiddle.net> writes:

> On 10/24/2016 09:08 PM, Nikunj A Dadhania wrote:
>> Richard Henderson <rth@twiddle.net> writes:
>>
>>> On 10/24/2016 02:14 AM, Nikunj A Dadhania wrote:
>>>> +#define EXTRACT_BITS(size)                                              \
>>>> +static inline uint##size##_t extract_bits_u##size(uint##size##_t reg,   \
>>>> +                                                  uint##size##_t start, \
>>>> +                                                  uint##size##_t end)   \
>>>> +{                                                                       \
>>>> +    uint##size##_t nr_mask_bits = end - start + 1;                      \
>>>> +    uint##size##_t val = 1;                                             \
>>>> +    uint##size##_t mask = (val << nr_mask_bits) - 1;                    \
>>>> +    uint##size##_t shifted_reg = reg  >> ((size - 1)  - end);           \
>>>> +    return shifted_reg & mask;                                          \
>>>> +}
>>>> +
>>>> +EXTRACT_BITS(64);
>>>> +EXTRACT_BITS(32);
>>>
>>> We already have extract32 and extract64, which you're (nearly) duplicating.
>>
>> The bit position number notation is different, because of this using the
>> above routine, MSB=0 and LSB=63.
>>
>> While the below assumes: MSB=63 and LSB=0
>>
>> static inline uint64_t extract64(uint64_t value, int start, int length)
>> {
>>     assert(start >= 0 && length > 0 && length <= 64 - start);
>>     return (value >> start) & (~0ULL >> (64 - length));
>> }
>>
>> Let me know if I am missing something here.
>
> Since the arguments to extract_bits_uN are completely under your control, via 
> the arguments to VRLMI, this is a non-argument.  Just change them to 
> little-endian position + length.

Sure, was already trying that, I have the changed version now:

#define VRLMI(name, size, element,                                  \
              begin_last,                                           \
              end_last,                                             \
              shift_last, num_bits, insert)                         \
void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)        \
{                                                                   \
    int i;                                                          \
    for (i = 0; i < ARRAY_SIZE(r->element); i++) {                  \
        uint##size##_t src1 = a->element[i];                        \
        uint##size##_t src2 = b->element[i];                        \
        uint##size##_t src3 = r->element[i];                        \
        uint##size##_t begin, end, shift, mask, rot_val;            \
                                                                    \
        begin = extract##size(src2, size - begin_last - 1, num_bits);   \
        end = extract##size(src2, size - end_last - 1, num_bits);       \
        shift = extract##size(src2, size - shift_last - 1, num_bits);   \
        rot_val = rol##size(src1, shift);                               \
        mask = mask_u##size(begin, end);                            \
        if (insert) {                                               \
            r->element[i] = (rot_val & mask) | (src3 & ~mask);      \
        } else {                                                    \
            r->element[i] = (rot_val & mask);                       \
        }                                                           \
    }                                                               \
}

VRLMI(vrldmi, 64, u64,
      47,  /* begin_last */
      55,  /* end_last */
      63,  /* shift_last */
      6,   /* num_bits */
      1);  /* mask and insert */

>
> (And, after you do that conversion for vrldmi and vilwmi, you'll see why 
> big-endian bit numbering is the spawn of the devil.  ;-)

That bit numbering gives nightmares ;-)

Regards
Nikunj
Nikunj A Dadhania Oct. 25, 2016, 6:02 a.m. UTC | #5
Richard Henderson <rth@twiddle.net> writes:


> We already have rol32 and rol64.
>
> Which I see are broken for shift == 0.

I tried with different shift (including 0) in a test program, and the
result is as expected:

0: ccddeeff

static inline unsigned int rol32(unsigned int word, unsigned int shift)
{
  return (word << shift) | (word >> (32 - shift));
}

void main(void)
{
  unsigned int value32 = 0xCCDDEEFF;

  for (int i = 0; i < 32; i++)
    printf("%d: %08x\n", i, rol32(value32, i));
}

> Let's please fix that, as a separate patch, like so:
>
>   return (word << shift) | (word >> ((32 - shift) & 31));

Doesn't seems to be necessary.

Regards
Nikunj
Richard Henderson Oct. 25, 2016, 4:33 p.m. UTC | #6
On 10/24/2016 11:02 PM, Nikunj A Dadhania wrote:
> Richard Henderson <rth@twiddle.net> writes:
> 
> 
>> We already have rol32 and rol64.
>>
>> Which I see are broken for shift == 0.
> 
> I tried with different shift (including 0) in a test program, and the
> result is as expected:
> 
> 0: ccddeeff
> 
> static inline unsigned int rol32(unsigned int word, unsigned int shift)
> {
>   return (word << shift) | (word >> (32 - shift));
> }

Technically, a shift by 32 is invalid.  Practically, there are two common
cases: shift >= 32 produces zero and shift is truncated to the word size, both
of which produce the correct results here.

That said, there's also the case of clang's sanitizers, which will in fact
signal this as a runtime error.


r~
Nikunj A Dadhania Oct. 26, 2016, 4:44 a.m. UTC | #7
Richard Henderson <rth@twiddle.net> writes:

> On 10/24/2016 11:02 PM, Nikunj A Dadhania wrote:
>> Richard Henderson <rth@twiddle.net> writes:
>> 
>> 
>>> We already have rol32 and rol64.
>>>
>>> Which I see are broken for shift == 0.
>> 
>> I tried with different shift (including 0) in a test program, and the
>> result is as expected:
>> 
>> 0: ccddeeff
>> 
>> static inline unsigned int rol32(unsigned int word, unsigned int shift)
>> {
>>   return (word << shift) | (word >> (32 - shift));
>> }
>
> Technically, a shift by 32 is invalid.  Practically, there are two common
> cases: shift >= 32 produces zero and shift is truncated to the word size, both
> of which produce the correct results here.
>
> That said, there's also the case of clang's sanitizers, which will in fact
> signal this as a runtime error.

In that case, will send patch updating them as part of my next revision

Regards
Nikunj
diff mbox

Patch

diff --git a/disas/ppc.c b/disas/ppc.c
index 052cebe..32f0d8d 100644
--- a/disas/ppc.c
+++ b/disas/ppc.c
@@ -2286,6 +2286,8 @@  const struct powerpc_opcode powerpc_opcodes[] = {
 { "vrlh",      VX(4,   68), VX_MASK,	PPCVEC,		{ VD, VA, VB } },
 { "vrlw",      VX(4,  132), VX_MASK,	PPCVEC,		{ VD, VA, VB } },
 { "vrsqrtefp", VX(4,  330), VX_MASK,	PPCVEC,		{ VD, VB } },
+{ "vrldmi",    VX(4,  197), VX_MASK,    PPCVEC,         { VD, VA, VB } },
+{ "vrlwmi",    VX(4,  133), VX_MASK,    PPCVEC,         { VD, VA, VB} },
 { "vsel",      VXA(4,  42), VXA_MASK,	PPCVEC,		{ VD, VA, VB, VC } },
 { "vsl",       VX(4,  452), VX_MASK,	PPCVEC,		{ VD, VA, VB } },
 { "vslb",      VX(4,  260), VX_MASK,	PPCVEC,		{ VD, VA, VB } },
diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index 0337292..9fb8f0d 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -325,6 +325,8 @@  DEF_HELPER_4(vmaxfp, void, env, avr, avr, avr)
 DEF_HELPER_4(vminfp, void, env, avr, avr, avr)
 DEF_HELPER_3(vrefp, void, env, avr, avr)
 DEF_HELPER_3(vrsqrtefp, void, env, avr, avr)
+DEF_HELPER_3(vrlwmi, void, avr, avr, avr)
+DEF_HELPER_3(vrldmi, void, avr, avr, avr)
 DEF_HELPER_5(vmaddfp, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vnmsubfp, void, env, avr, avr, avr, avr)
 DEF_HELPER_3(vexptefp, void, env, avr, avr)
diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c
index dca4798..2273872 100644
--- a/target-ppc/int_helper.c
+++ b/target-ppc/int_helper.c
@@ -1717,6 +1717,94 @@  void helper_vrsqrtefp(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *b)
     }
 }
 
+#define EXTRACT_BITS(size)                                              \
+static inline uint##size##_t extract_bits_u##size(uint##size##_t reg,   \
+                                                  uint##size##_t start, \
+                                                  uint##size##_t end)   \
+{                                                                       \
+    uint##size##_t nr_mask_bits = end - start + 1;                      \
+    uint##size##_t val = 1;                                             \
+    uint##size##_t mask = (val << nr_mask_bits) - 1;                    \
+    uint##size##_t shifted_reg = reg  >> ((size - 1)  - end);           \
+    return shifted_reg & mask;                                          \
+}
+
+EXTRACT_BITS(64);
+EXTRACT_BITS(32);
+
+#define MASK(size, max_val)                                     \
+static inline uint##size##_t mask_u##size(uint##size##_t start, \
+                                uint##size##_t end)             \
+{                                                               \
+    uint##size##_t ret, max_bit = size - 1;                     \
+                                                                \
+    if (likely(start == 0)) {                                   \
+        ret = max_val << (max_bit - end);                       \
+    } else if (likely(end == max_bit)) {                        \
+        ret = max_val >> start;                                 \
+    } else {                                                    \
+        ret = (((uint##size##_t)(-1ULL)) >> (start)) ^          \
+            (((uint##size##_t)(-1ULL) >> (end)) >> 1);          \
+        if (unlikely(start > end)) {                            \
+            return ~ret;                                        \
+        }                                                       \
+    }                                                           \
+                                                                \
+    return ret;                                                 \
+}
+
+MASK(32, UINT32_MAX);
+MASK(64, UINT64_MAX);
+
+#define LEFT_ROTATE(size)                                            \
+static inline uint##size##_t left_rotate_u##size(uint##size##_t val, \
+                                              uint##size##_t shift)  \
+{                                                                    \
+    if (!shift) {                                                    \
+        return val;                                                  \
+    }                                                                \
+                                                                     \
+    uint##size##_t left_val = extract_bits_u##size(val, 0, shift - 1); \
+    uint##size##_t right_val = val & mask_u##size(shift, size - 1);    \
+                                                                     \
+    return right_val << shift | left_val;                            \
+}
+
+LEFT_ROTATE(32);
+LEFT_ROTATE(64);
+
+#define VRLMI(name, size, element,                                  \
+                     begin_first, begin_last,                       \
+                     end_first, end_last,                           \
+                     shift_first, shift_last)                       \
+void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)        \
+{                                                                   \
+    int i;                                                          \
+    for (i = 0; i < ARRAY_SIZE(r->element); i++) {                  \
+        uint##size##_t src1 = a->element[i];                        \
+        uint##size##_t src2 = b->element[i];                        \
+        uint##size##_t src3 = r->element[i];                        \
+        uint##size##_t begin, end, shift, mask, rot_val;            \
+                                                                    \
+        begin = extract_bits_u##size(src2, begin_first, begin_last);\
+        end = extract_bits_u##size(src2, end_first, end_last);      \
+        shift = extract_bits_u##size(src2, shift_first, shift_last);\
+        rot_val = left_rotate_u##size(src1, shift);                 \
+        mask = mask_u##size(begin, end);                            \
+        r->element[i] = (rot_val & mask) | (src3 & ~mask);          \
+    }                                                               \
+}
+
+VRLMI(vrldmi, 64, u64,
+             42, 47,  /* begin_first, begin_last */
+             50, 55,  /* end_first, end_last */
+             58, 63); /* shift_first, shift_last */
+
+VRLMI(vrlwmi, 32, u32,
+             11, 15,  /* begin_first, begin_last */
+             19, 23,  /* end_first, end_last */
+             27, 31); /* shift_first, shift_last */
+
 void helper_vsel(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b,
                  ppc_avr_t *c)
 {
diff --git a/target-ppc/translate/vmx-impl.inc.c b/target-ppc/translate/vmx-impl.inc.c
index fc612d9..fdfbd6a 100644
--- a/target-ppc/translate/vmx-impl.inc.c
+++ b/target-ppc/translate/vmx-impl.inc.c
@@ -488,7 +488,13 @@  GEN_VXFORM_DUAL(vsubeuqm, PPC_NONE, PPC2_ALTIVEC_207, \
 GEN_VXFORM(vrlb, 2, 0);
 GEN_VXFORM(vrlh, 2, 1);
 GEN_VXFORM(vrlw, 2, 2);
+GEN_VXFORM(vrlwmi, 2, 2);
+GEN_VXFORM_DUAL(vrlw, PPC_ALTIVEC, PPC_NONE, \
+                vrlwmi, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM(vrld, 2, 3);
+GEN_VXFORM(vrldmi, 2, 3);
+GEN_VXFORM_DUAL(vrld, PPC_NONE, PPC2_ALTIVEC_207, \
+                vrldmi, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM(vsl, 2, 7);
 GEN_VXFORM(vsr, 2, 11);
 GEN_VXFORM_ENV(vpkuhum, 7, 0);
diff --git a/target-ppc/translate/vmx-ops.inc.c b/target-ppc/translate/vmx-ops.inc.c
index cc7ed7e..76b3593 100644
--- a/target-ppc/translate/vmx-ops.inc.c
+++ b/target-ppc/translate/vmx-ops.inc.c
@@ -143,8 +143,8 @@  GEN_VXFORM_207(vsubcuq, 0, 21),
 GEN_VXFORM_DUAL(vsubeuqm, vsubecuq, 31, 0xFF, PPC_NONE, PPC2_ALTIVEC_207),
 GEN_VXFORM(vrlb, 2, 0),
 GEN_VXFORM(vrlh, 2, 1),
-GEN_VXFORM(vrlw, 2, 2),
-GEN_VXFORM_207(vrld, 2, 3),
+GEN_VXFORM_DUAL(vrlw, vrlwmi, 2, 2, PPC_ALTIVEC, PPC_NONE),
+GEN_VXFORM_DUAL(vrld, vrldmi, 2, 3, PPC_NONE, PPC2_ALTIVEC_207),
 GEN_VXFORM(vsl, 2, 7),
 GEN_VXFORM(vsr, 2, 11),
 GEN_VXFORM(vpkuhum, 7, 0),