diff mbox series

[v4,2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

Message ID 20190129191746.14868-3-mark.cave-ayland@ilande.co.uk
State New
Headers show
Series target/ppc: remove various endian hacks from int_helper.c | expand

Commit Message

Mark Cave-Ayland Jan. 29, 2019, 7:17 p.m. UTC
The current implementations make use of the endian-specific macros MRGLO/MRGHI
and also reference HI_IDX and LO_IDX directly to calculate array offsets.

Rework the implementation to use the Vsr* macros so that these per-endian
references can be removed.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
---
 target/ppc/int_helper.c | 38 +++++++++++---------------------------
 1 file changed, 11 insertions(+), 27 deletions(-)

Comments

Richard Henderson Jan. 29, 2019, 11:05 p.m. UTC | #1
On 1/29/19 11:17 AM, Mark Cave-Ayland wrote:
> +#define VMRG_DO(name, element, access, ofs)                             \
>      void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)       \
>      {                                                                   \
>          ppc_avr_t result;                                               \
>          int i;                                                          \
>                                                                          \
> +        for (i = 0; i < ARRAY_SIZE(r->element) / 2; i++) {              \
> +            result.access(i * 2 + 0) = a->access(i + ofs);              \
> +            result.access(i * 2 + 1) = b->access(i + ofs);              \
>          }                                                               \
>          *r = result;                                                    \
>      }
> +
> +#define VMRG(suffix, element, access, half)        \
> +    VMRG_DO(mrgl##suffix, element, access, half)   \
> +    VMRG_DO(mrgh##suffix, element, access, 0)
> +VMRG(b, u8, VsrB, 8)
> +VMRG(h, u16, VsrH, 4)
> +VMRG(w, u32, VsrW, 2)

So... the point of "half" was to not replicate knowledge out to VMRG.
Just use

    int i, half = ARRAY_SIZE(r->element) / 2;
    for (i = 0; i < half; i++) {

within VMRG_DO.


r~
Mark Cave-Ayland Jan. 30, 2019, 5:10 a.m. UTC | #2
On 29/01/2019 23:05, Richard Henderson wrote:

> On 1/29/19 11:17 AM, Mark Cave-Ayland wrote:
>> +#define VMRG_DO(name, element, access, ofs)                             \
>>      void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)       \
>>      {                                                                   \
>>          ppc_avr_t result;                                               \
>>          int i;                                                          \
>>                                                                          \
>> +        for (i = 0; i < ARRAY_SIZE(r->element) / 2; i++) {              \
>> +            result.access(i * 2 + 0) = a->access(i + ofs);              \
>> +            result.access(i * 2 + 1) = b->access(i + ofs);              \
>>          }                                                               \
>>          *r = result;                                                    \
>>      }
>> +
>> +#define VMRG(suffix, element, access, half)        \
>> +    VMRG_DO(mrgl##suffix, element, access, half)   \
>> +    VMRG_DO(mrgh##suffix, element, access, 0)
>> +VMRG(b, u8, VsrB, 8)
>> +VMRG(h, u16, VsrH, 4)
>> +VMRG(w, u32, VsrW, 2)
> 
> So... the point of "half" was to not replicate knowledge out to VMRG.
> Just use
> 
>     int i, half = ARRAY_SIZE(r->element) / 2;
>     for (i = 0; i < half; i++) {
> 
> within VMRG_DO.

Okay - I was a bit confused because in your example macro signature you added ofs
which made me think you wanted its value to be determined outside, but never mind.

What about the following instead? With high set as part of the macro then the initial
assignment to ofs should be inlined accordingly at compile time:


#define VMRG_DO(name, element, access, high)                                 \
    void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)            \
    {                                                                        \
        ppc_avr_t result;                                                    \
        int ofs, half = ARRAY_SIZE(r->element) / 2;                          \
        int i;                                                               \
                                                                             \
        if (high) {                                                          \
            ofs = 0;                                                         \
        } else {                                                             \
            ofs = half;                                                      \
        }                                                                    \
                                                                             \
        for (i = 0; i < half; i++) {                                         \
            result.access(i * 2 + 0) = a->access(i + ofs);                   \
            result.access(i * 2 + 1) = b->access(i + ofs);                   \
        }                                                                    \
        *r = result;                                                         \
    }

#define VMRG(suffix, element, access)          \
    VMRG_DO(mrgl##suffix, element, access, 0)  \
    VMRG_DO(mrgh##suffix, element, access, 1)
VMRG(b, u8, VsrB)
VMRG(h, u16, VsrH)
VMRG(w, u32, VsrW)
#undef VMRG_DO
#undef VMRG
#undef VMRG


ATB,

Mark.
BALATON Zoltan Jan. 30, 2019, 11:42 a.m. UTC | #3
On Wed, 30 Jan 2019, Mark Cave-Ayland wrote:
> On 29/01/2019 23:05, Richard Henderson wrote:
>
>> On 1/29/19 11:17 AM, Mark Cave-Ayland wrote:
>>> +#define VMRG_DO(name, element, access, ofs)                             \
>>>      void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)       \
>>>      {                                                                   \
>>>          ppc_avr_t result;                                               \
>>>          int i;                                                          \
>>>                                                                          \
>>> +        for (i = 0; i < ARRAY_SIZE(r->element) / 2; i++) {              \
>>> +            result.access(i * 2 + 0) = a->access(i + ofs);              \
>>> +            result.access(i * 2 + 1) = b->access(i + ofs);              \
>>>          }                                                               \
>>>          *r = result;                                                    \
>>>      }
>>> +
>>> +#define VMRG(suffix, element, access, half)        \
>>> +    VMRG_DO(mrgl##suffix, element, access, half)   \
>>> +    VMRG_DO(mrgh##suffix, element, access, 0)
>>> +VMRG(b, u8, VsrB, 8)
>>> +VMRG(h, u16, VsrH, 4)
>>> +VMRG(w, u32, VsrW, 2)
>>
>> So... the point of "half" was to not replicate knowledge out to VMRG.
>> Just use
>>
>>     int i, half = ARRAY_SIZE(r->element) / 2;
>>     for (i = 0; i < half; i++) {
>>
>> within VMRG_DO.
>
> Okay - I was a bit confused because in your example macro signature you added ofs
> which made me think you wanted its value to be determined outside, but never mind.
>
> What about the following instead? With high set as part of the macro then the initial
> assignment to ofs should be inlined accordingly at compile time:
>
>
> #define VMRG_DO(name, element, access, high)                                 \
>    void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)            \
>    {                                                                        \
>        ppc_avr_t result;                                                    \
>        int ofs, half = ARRAY_SIZE(r->element) / 2;                          \
>        int i;                                                               \
>                                                                             \
>        if (high) {                                                          \
>            ofs = 0;                                                         \
>        } else {                                                             \
>            ofs = half;                                                      \
>        }                                                                    \

This could be also written as:

int i, half = ARRAY_SIZE(r->element) / 2;                            \
int ofs = (high ? 0 : half);                                         \

to save a few lines but it depends on personal taste which one is more 
readable for someone.

Regards,
BALATON Zoltan

>                                                                             \
>        for (i = 0; i < half; i++) {                                         \
>            result.access(i * 2 + 0) = a->access(i + ofs);                   \
>            result.access(i * 2 + 1) = b->access(i + ofs);                   \
>        }                                                                    \
>        *r = result;                                                         \
>    }
>
> #define VMRG(suffix, element, access)          \
>    VMRG_DO(mrgl##suffix, element, access, 0)  \
>    VMRG_DO(mrgh##suffix, element, access, 1)
> VMRG(b, u8, VsrB)
> VMRG(h, u16, VsrH)
> VMRG(w, u32, VsrW)
> #undef VMRG_DO
> #undef VMRG
> #undef VMRG
>
>
> ATB,
>
> Mark.
>
>
Richard Henderson Jan. 30, 2019, 12:51 p.m. UTC | #4
On 1/29/19 9:10 PM, Mark Cave-Ayland wrote:
>> So... the point of "half" was to not replicate knowledge out to VMRG.
>> Just use
>>
>>     int i, half = ARRAY_SIZE(r->element) / 2;
>>     for (i = 0; i < half; i++) {
>>
>> within VMRG_DO.
> 
> Okay - I was a bit confused because in your example macro signature you
> added ofs which made me think you wanted its value to be determined outside,
> but nevermind.
> 
> What about the following instead? With high set as part of the macro then
> the initial assignment to ofs should be inlined accordingly at compile time:
> 
> #define VMRG_DO(name, element, access, high)                                 \
>     void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)            \
>     {                                                                        \
>         ppc_avr_t result;                                                    \
>         int ofs, half = ARRAY_SIZE(r->element) / 2;                          \
>         int i;                                                               \
>                                                                              \
>         if (high) {                                                          \
>             ofs = 0;                                                         \
>         } else {                                                             \
>             ofs = half;                                                      \
>         }                                                                    \

Why are you over-complicating things?  I thought I'd explicitly said this
twice, but perhaps not:

Pass the symbol "half" directly to VMRG_DO:

#define VMRG(suffix, element, access)          \
    VMRG_DO(mrgl##suffix, element, access, 0)  \
    VMRG_DO(mrgh##suffix, element, access, half)

You do not need another conditional within VMRG_DO.

Also, I'm pretty sure it's the "low" merge that wants the "ofs" to be non-zero,
since all of the Vsr* symbols implement big-endian indexing.


r~
Mark Cave-Ayland Jan. 30, 2019, 1:37 p.m. UTC | #5
On 30/01/2019 12:51, Richard Henderson wrote:

> 
> Why are you over-complicating things?  I thought I'd explicitly said this
> twice, but perhaps not:
> 
> Pass the symbol "half" directly to VMRG_DO:
> 
> #define VMRG(suffix, element, access)          \
>     VMRG_DO(mrgl##suffix, element, access, 0)  \
>     VMRG_DO(mrgh##suffix, element, access, half)
> 
> You do not need another conditional within VMRG_DO.

I'm sorry - I think I got confused by your original macro definitions which used ofs
as both the macro parameter and variable name, which is why I thought you wanted ofs
passed in by value. Based upon the above I now have this which appears to work:


#define VMRG_DO(name, element, access, sofs)                                 \
    void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)            \
    {                                                                        \
        ppc_avr_t result;                                                    \
        int ofs, half = ARRAY_SIZE(r->element) / 2;                          \
        int i;                                                               \
                                                                             \
        ofs = sofs;                                                          \
        for (i = 0; i < half; i++) {                                         \
            result.access(i * 2 + 0) = a->access(i + ofs);                   \
            result.access(i * 2 + 1) = b->access(i + ofs);                   \
        }                                                                    \
        *r = result;                                                         \
    }

#define VMRG(suffix, element, access)          \
    VMRG_DO(mrgl##suffix, element, access, half)   \
    VMRG_DO(mrgh##suffix, element, access, 0)
VMRG(b, u8, VsrB)
VMRG(h, u16, VsrH)
VMRG(w, u32, VsrW)
#undef VMRG_DO
#undef VMRG


Is this what you were intending? If this looks better then I'll resubmit a v5 later
this evening.

> Also, I'm pretty sure it's the "low" merge that wants the "ofs" to be non-zero,
> since all of the Vsr* symbols implement big-endian indexing.

I've just tried swapping them around as you suggested, but then all my OS X
background fills appear with corrupted colors. So to the best of my knowledge without
dumping the register contents directly, the above version with low = half and high =
0 still seems correct.


ATB,

Mark.
Richard Henderson Jan. 30, 2019, 3:31 p.m. UTC | #6
On 1/30/19 5:37 AM, Mark Cave-Ayland wrote:
> On 30/01/2019 12:51, Richard Henderson wrote:
> 
>>
>> Why are you over-complicating things?  I thought I'd explicitly said this
>> twice, but perhaps not:
>>
>> Pass the symbol "half" directly to VMRG_DO:
>>
>> #define VMRG(suffix, element, access)          \
>>     VMRG_DO(mrgl##suffix, element, access, 0)  \
>>     VMRG_DO(mrgh##suffix, element, access, half)
>>
>> You do not need another conditional within VMRG_DO.
> 
> I'm sorry - I think I got confused by your original macro definitions which used ofs
> as both the macro parameter and variable name, which is why I thought you wanted ofs
> passed in by value. Based upon the above I now have this which appears to work:
> 
> 
> #define VMRG_DO(name, element, access, sofs)                                 \
>     void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)            \
>     {                                                                        \
>         ppc_avr_t result;                                                    \
>         int ofs, half = ARRAY_SIZE(r->element) / 2;                          \
>         int i;                                                               \
>                                                                              \
>         ofs = sofs;                                                          \

What is this variable and assignment for?
What does it accomplish?

>         for (i = 0; i < half; i++) {                                         \
>             result.access(i * 2 + 0) = a->access(i + ofs);                   \
>             result.access(i * 2 + 1) = b->access(i + ofs);                   \
>         }                                                                    \
>         *r = result;                                                         \
>     }
> 
> #define VMRG(suffix, element, access)          \
>     VMRG_DO(mrgl##suffix, element, access, half)   \
>     VMRG_DO(mrgh##suffix, element, access, 0)
> VMRG(b, u8, VsrB)
> VMRG(h, u16, VsrH)
> VMRG(w, u32, VsrW)
> #undef VMRG_DO
> #undef VMRG
> 
> 
> Is this what you were intending? If this looks better then I'll resubmit a v5 later
> this evening.
> 
>> Also, I'm pretty sure it's the "low" merge that wants the "ofs" to be non-zero,
>> since all of the Vsr* symbols implement big-endian indexing.
> 
> I've just tried swapping them around as you suggested, but then all my OS X
> background fills appear with corrupted colors. So to the best of my knowledge without
> dumping the register contents directly, the above version with low = half and high =
> 0 still seems correct.

That is exactly what I said, still quoted above.
It is not what you had written in the previous version.


r~
BALATON Zoltan Jan. 30, 2019, 4:06 p.m. UTC | #7
On Wed, 30 Jan 2019, Mark Cave-Ayland wrote:
> On 30/01/2019 12:51, Richard Henderson wrote:
>
>>
>> Why are you over-complicating things?  I thought I'd explicitly said this
>> twice, but perhaps not:
>>
>> Pass the symbol "half" directly to VMRG_DO:
>>
>> #define VMRG(suffix, element, access)          \
>>     VMRG_DO(mrgl##suffix, element, access, 0)  \
>>     VMRG_DO(mrgh##suffix, element, access, half)
>>
>> You do not need another conditional within VMRG_DO.
>
> I'm sorry - I think I got confused by your original macro definitions which used ofs
> as both the macro parameter and variable name, which is why I thought you wanted ofs
> passed in by value. Based upon the above I now have this which appears to work:
>
>
> #define VMRG_DO(name, element, access, sofs)                                 \
>    void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)            \
>    {                                                                        \
>        ppc_avr_t result;                                                    \
>        int ofs, half = ARRAY_SIZE(r->element) / 2;                          \
>        int i;                                                               \
>                                                                             \
>        ofs = sofs;                                                          \

I think you don't need the ofs local variable, you can just use the macro 
parameter (in paranthesis but in this case that does not matter, just 
usual precaution in case expression is passed to macro instead of 
constant). This is a macro so it will be literal replace, that's the trick 
we were missing from Richard's suggestion I think so don't look at it as 
a function parameter.

Also I wonder if you really need the result local? Can't it just access 
the result via *r directly and save a copy at the end? (Although 
that probably would be optimised out by the compiler anyway.)

By the way, thanks a lot for doing this and sorry that my comment caused 
so much trouble but your benchmark has proven my suspicion that it would 
have been two times slower with your first version.

Regards,
BALATON Zoltan

>        for (i = 0; i < half; i++) {                                         \
>            result.access(i * 2 + 0) = a->access(i + ofs);                   \
>            result.access(i * 2 + 1) = b->access(i + ofs);                   \
>        }                                                                    \
>        *r = result;                                                         \
>    }
>
> #define VMRG(suffix, element, access)          \
>    VMRG_DO(mrgl##suffix, element, access, half)   \
>    VMRG_DO(mrgh##suffix, element, access, 0)
> VMRG(b, u8, VsrB)
> VMRG(h, u16, VsrH)
> VMRG(w, u32, VsrW)
> #undef VMRG_DO
> #undef VMRG
>
>
> Is this what you were intending? If this looks better then I'll resubmit a v5 later
> this evening.
>
>> Also, I'm pretty sure it's the "low" merge that wants the "ofs" to be non-zero,
>> since all of the Vsr* symbols implement big-endian indexing.
>
> I've just tried swapping them around as you suggested, but then all my OS X
> background fills appear with corrupted colors. So to the best of my knowledge without
> dumping the register contents directly, the above version with low = half and high =
> 0 still seems correct.
>
>
> ATB,
>
> Mark.
>
>
Richard Henderson Jan. 30, 2019, 6:22 p.m. UTC | #8
On 1/30/19 8:06 AM, BALATON Zoltan wrote:
> Also I wonder if you really need the result local? Can't it just access the
> result via *r directly and save a copy at the end? (Although that probably
> would be optimised out by the compiler anyway.)

I don't think so, because of potential overlap between *r and *a or *b.


r~
Mark Cave-Ayland Jan. 30, 2019, 6:26 p.m. UTC | #9
On 30/01/2019 15:31, Richard Henderson wrote:

>> I'm sorry - I think I got confused by your original macro definitions which used ofs
>> as both the macro parameter and variable name, which is why I thought you wanted ofs
>> passed in by value. Based upon the above I now have this which appears to work:
>>
>>
>> #define VMRG_DO(name, element, access, sofs)                                 \
>>     void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)            \
>>     {                                                                        \
>>         ppc_avr_t result;                                                    \
>>         int ofs, half = ARRAY_SIZE(r->element) / 2;                          \
>>         int i;                                                               \
>>                                                                              \
>>         ofs = sofs;                                                          \
> 
> What is this variable and assignment for?
> What does it accomplish?

Sigh. So I thought this was required because of the "int ofs" declaration above which
I was under the impression came from one of your previous emails. But in fact it was
me that introduced this earlier before I realised this morning that in fact the
preprocessor can do all the work.

>>         for (i = 0; i < half; i++) {                                         \
>>             result.access(i * 2 + 0) = a->access(i + ofs);                   \
>>             result.access(i * 2 + 1) = b->access(i + ofs);                   \
>>         }                                                                    \
>>         *r = result;                                                         \
>>     }
>>
>> #define VMRG(suffix, element, access)          \
>>     VMRG_DO(mrgl##suffix, element, access, half)   \
>>     VMRG_DO(mrgh##suffix, element, access, 0)
>> VMRG(b, u8, VsrB)
>> VMRG(h, u16, VsrH)
>> VMRG(w, u32, VsrW)
>> #undef VMRG_DO
>> #undef VMRG
>>
>>
>> Is this what you were intending? If this looks better then I'll resubmit a v5 later
>> this evening.
>>
>>> Also, I'm pretty sure it's the "low" merge that wants the "ofs" to be non-zero,
>>> since all of the Vsr* symbols implement big-endian indexing.
>>
>> I've just tried swapping them around as you suggested, but then all my OS X
>> background fills appear with corrupted colors. So to the best of my knowledge without
>> dumping the register contents directly, the above version with low = half and high =
>> 0 still seems correct.
> 
> That is exactly what I said, still quoted above.
> It is not what you had written in the previous version.

Yes indeed, you are right that was a simple cut and paste error from where I was
testing both variants and pasted the wrong one into the email. Below is the hopefully
final version which I've just tested here against OS X:


#define VMRG_DO(name, element, access, ofs)                                  \
    void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)            \
    {                                                                        \
        ppc_avr_t result;                                                    \
        int i, half = ARRAY_SIZE(r->element) / 2;                            \
                                                                             \
        for (i = 0; i < half; i++) {                                         \
            result.access(i * 2 + 0) = a->access(i + ofs);                   \
            result.access(i * 2 + 1) = b->access(i + ofs);                   \
        }                                                                    \
        *r = result;                                                         \
    }

#define VMRG(suffix, element, access)          \
    VMRG_DO(mrgl##suffix, element, access, half)   \
    VMRG_DO(mrgh##suffix, element, access, 0)
VMRG(b, u8, VsrB)
VMRG(h, u16, VsrH)
VMRG(w, u32, VsrW)
#undef VMRG_DO
#undef VMRG


ATB,

Mark.
diff mbox series

Patch

diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 598731d47a..59324b2d13 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -976,43 +976,27 @@  void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
     }
 }
 
-#define VMRG_DO(name, element, highp)                                   \
+#define VMRG_DO(name, element, access, ofs)                             \
     void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)       \
     {                                                                   \
         ppc_avr_t result;                                               \
         int i;                                                          \
-        size_t n_elems = ARRAY_SIZE(r->element);                        \
                                                                         \
-        for (i = 0; i < n_elems / 2; i++) {                             \
-            if (highp) {                                                \
-                result.element[i*2+HI_IDX] = a->element[i];             \
-                result.element[i*2+LO_IDX] = b->element[i];             \
-            } else {                                                    \
-                result.element[n_elems - i * 2 - (1 + HI_IDX)] =        \
-                    b->element[n_elems - i - 1];                        \
-                result.element[n_elems - i * 2 - (1 + LO_IDX)] =        \
-                    a->element[n_elems - i - 1];                        \
-            }                                                           \
+        for (i = 0; i < ARRAY_SIZE(r->element) / 2; i++) {              \
+            result.access(i * 2 + 0) = a->access(i + ofs);              \
+            result.access(i * 2 + 1) = b->access(i + ofs);              \
         }                                                               \
         *r = result;                                                    \
     }
-#if defined(HOST_WORDS_BIGENDIAN)
-#define MRGHI 0
-#define MRGLO 1
-#else
-#define MRGHI 1
-#define MRGLO 0
-#endif
-#define VMRG(suffix, element)                   \
-    VMRG_DO(mrgl##suffix, element, MRGHI)       \
-    VMRG_DO(mrgh##suffix, element, MRGLO)
-VMRG(b, u8)
-VMRG(h, u16)
-VMRG(w, u32)
+
+#define VMRG(suffix, element, access, half)        \
+    VMRG_DO(mrgl##suffix, element, access, half)   \
+    VMRG_DO(mrgh##suffix, element, access, 0)
+VMRG(b, u8, VsrB, 8)
+VMRG(h, u16, VsrH, 4)
+VMRG(w, u32, VsrW, 2)
 #undef VMRG_DO
 #undef VMRG
-#undef MRGHI
-#undef MRGLO
 
 void helper_vmsummbm(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
                      ppc_avr_t *b, ppc_avr_t *c)