diff mbox

[v3,17/27] tcg-ppc64: Implement bswap64

Message ID 1364876610-3933-18-git-send-email-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson April 2, 2013, 4:23 a.m. UTC
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/ppc64/tcg-target.c | 35 +++++++++++++++++++++++++++++++++++
 tcg/ppc64/tcg-target.h |  2 +-
 2 files changed, 36 insertions(+), 1 deletion(-)

Comments

Alexander Graf April 2, 2013, 6:34 a.m. UTC | #1
On 02.04.2013, at 06:23, Richard Henderson wrote:

> Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
> Signed-off-by: Richard Henderson <rth@twiddle.net>

Is this faster than a load/store with std/ldbrx?


Alex

> ---
> tcg/ppc64/tcg-target.c | 35 +++++++++++++++++++++++++++++++++++
> tcg/ppc64/tcg-target.h |  2 +-
> 2 files changed, 36 insertions(+), 1 deletion(-)
> 
> diff --git a/tcg/ppc64/tcg-target.c b/tcg/ppc64/tcg-target.c
> index d8131ec..1806364 100644
> --- a/tcg/ppc64/tcg-target.c
> +++ b/tcg/ppc64/tcg-target.c
> @@ -1706,6 +1706,40 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
>         }
>         break;
> 
> +    case INDEX_op_bswap64_i64:
> +        a0 = args[0], a1 = args[1], a2 = 0;
> +        if (a0 == a1) {
> +            a0 = 0;
> +            a2 = a1;
> +        }
> +
> +        /* a1 = # abcd efgh */
> +        /* a0 = rl32(a1, 8) # 0000 fghe */
> +        tcg_out_rlw(s, RLWINM, a0, a1, 8, 0, 31);
> +        /* a0 = dep(a0, rl32(a1, 24), 0xff000000) # 0000 hghe */
> +        tcg_out_rlw(s, RLWIMI, a0, a1, 24, 0, 7);
> +        /* a0 = dep(a0, rl32(a1, 24), 0x0000ff00) # 0000 hgfe */
> +        tcg_out_rlw(s, RLWIMI, a0, a1, 24, 16, 23);
> +
> +        /* a0 = rl64(a0, 32) # hgfe 0000 */
> +        /* a2 = rl64(a1, 32) # efgh abcd */
> +        tcg_out_rld(s, RLDICL, a0, a0, 32, 0);
> +        tcg_out_rld(s, RLDICL, a2, a1, 32, 0);
> +
> +        /* a0 = dep(a0, rl32(a2, 8), 0xffffffff)  # hgfe bcda */
> +        tcg_out_rlw(s, RLWIMI, a0, a2, 8, 0, 31);
> +        /* a0 = dep(a0, rl32(a2, 24), 0xff000000) # hgfe dcda */
> +        tcg_out_rlw(s, RLWIMI, a0, a2, 24, 0, 7);
> +        /* a0 = dep(a0, rl32(a2, 24), 0x0000ff00) # hgfe dcba */
> +        tcg_out_rlw(s, RLWIMI, a0, a2, 24, 16, 23);
> +
> +        if (a0 == 0) {
> +            tcg_out_mov(s, TCG_TYPE_I64, args[0], a0);
> +            /* Revert the source rotate that we performed above.  */
> +            tcg_out_rld(s, RLDICL, a1, a1, 32, 0);
> +        }
> +        break;
> +
>     default:
>         tcg_dump_ops (s);
>         tcg_abort ();
> @@ -1816,6 +1850,7 @@ static const TCGTargetOpDef ppc_op_defs[] = {
>     { INDEX_op_bswap16_i64, { "r", "r" } },
>     { INDEX_op_bswap32_i32, { "r", "r" } },
>     { INDEX_op_bswap32_i64, { "r", "r" } },
> +    { INDEX_op_bswap64_i64, { "r", "r" } },
> 
>     { -1 },
> };
> diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
> index bd011b8..d8e1820 100644
> --- a/tcg/ppc64/tcg-target.h
> +++ b/tcg/ppc64/tcg-target.h
> @@ -102,7 +102,7 @@ typedef enum {
> #define TCG_TARGET_HAS_ext32u_i64       1
> #define TCG_TARGET_HAS_bswap16_i64      1
> #define TCG_TARGET_HAS_bswap32_i64      1
> -#define TCG_TARGET_HAS_bswap64_i64      0
> +#define TCG_TARGET_HAS_bswap64_i64      1
> #define TCG_TARGET_HAS_not_i64          1
> #define TCG_TARGET_HAS_neg_i64          1
> #define TCG_TARGET_HAS_andc_i64         0
> -- 
> 1.8.1.4
>
Richard Henderson April 2, 2013, 1:44 p.m. UTC | #2
On 2013-04-01 23:34, Alexander Graf wrote:
> Is this faster than a load/store with std/ldbrx?

Hmm.  Almost certainly not.  And since we've got stack space
allocated for function calls, we've got scratch space to do it in.

Probably similar for bswap32 too, eh?

I'll do a tiny bit o benchmarking for power7.


r~
Alexander Graf April 2, 2013, 2:41 p.m. UTC | #3
Am 02.04.2013 um 15:44 schrieb Richard Henderson <rth@twiddle.net>:

> On 2013-04-01 23:34, Alexander Graf wrote:
>> Is this faster than a load/store with std/ldbrx?
> 
> Hmm.  Almost certainly not.  And since we've got stack space
> allocated for function calls, we've got scratch space to do it in.
> 
> Probably similar for bswap32 too, eh?

Depends - memory load/store doesn't come for free and bswap32 is quite short.

> 
> I'll do a tiny bit o benchmarking for power7.

Cool, thanks a bunch :)

Alex

> 
> 
> r~
Richard Henderson April 2, 2013, 3:12 p.m. UTC | #4
On 2013-04-02 07:41, Alexander Graf wrote:
>> On 2013-04-01 23:34, Alexander Graf wrote:
>>> Is this faster than a load/store with std/ldbrx?
>>
>> Hmm.  Almost certainly not.  And since we've got stack space
>> allocated for function calls, we've got scratch space to do it in.
>>
>> Probably similar for bswap32 too, eh?
>
> Depends - memory load/store doesn't come for free and bswap32 is quite short.
>
>>
>> I'll do a tiny bit o benchmarking for power7.
>
> Cool, thanks a bunch :)

Heh.  "Almost certainly not" indeed.  Unless I've made some silly mistake,
going through memory stalls badly.  No store buffer forwarding on power7?

With the following test case, time reports:

f1		2.967s
f2		8.930s
f3		7.071s
f4		7.166s

And note that f4 is a normal store/load pair, trying to determine what the
store buffer forwarding delay might be.


r~
Alexander Graf April 2, 2013, 3:23 p.m. UTC | #5
On 04/02/2013 05:12 PM, Richard Henderson wrote:
> On 2013-04-02 07:41, Alexander Graf wrote:
>>> On 2013-04-01 23:34, Alexander Graf wrote:
>>>> Is this faster than a load/store with std/ldbrx?
>>>
>>> Hmm.  Almost certainly not.  And since we've got stack space
>>> allocated for function calls, we've got scratch space to do it in.
>>>
>>> Probably similar for bswap32 too, eh?
>>
>> Depends - memory load/store doesn't come for free and bswap32 is 
>> quite short.
>>
>>>
>>> I'll do a tiny bit o benchmarking for power7.
>>
>> Cool, thanks a bunch :)
>
> Heh.  "Almost certainly not" indeed.  Unless I've made some silly 
> mistake,
> going through memory stalls badly.  No store buffer forwarding on power7?
>
> With the following test case, time reports:
>
> f1        2.967s
> f2        8.930s
> f3        7.071s
> f4        7.166s
>
> And note that f4 is a normal store/load pair, trying to determine what 
> the
> store buffer forwarding delay might be.

Yeah, doesn't look like it makes any sense at all to do a load/store 
cycle then. What a shame :).

Keep in mind that this tests icache hot cycles. However, you might get 
bad icache penalties due to the long bswap64 sequence. So all the memory 
latency you see here might also affect the instruction stream when it 
gets executed. But then again we only care about performance of cache 
hot sequences in the first place....


Alex
diff mbox

Patch

diff --git a/tcg/ppc64/tcg-target.c b/tcg/ppc64/tcg-target.c
index d8131ec..1806364 100644
--- a/tcg/ppc64/tcg-target.c
+++ b/tcg/ppc64/tcg-target.c
@@ -1706,6 +1706,40 @@  static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
         }
         break;
 
+    case INDEX_op_bswap64_i64:
+        a0 = args[0], a1 = args[1], a2 = 0;
+        if (a0 == a1) {
+            a0 = 0;
+            a2 = a1;
+        }
+
+        /* a1 = # abcd efgh */
+        /* a0 = rl32(a1, 8) # 0000 fghe */
+        tcg_out_rlw(s, RLWINM, a0, a1, 8, 0, 31);
+        /* a0 = dep(a0, rl32(a1, 24), 0xff000000) # 0000 hghe */
+        tcg_out_rlw(s, RLWIMI, a0, a1, 24, 0, 7);
+        /* a0 = dep(a0, rl32(a1, 24), 0x0000ff00) # 0000 hgfe */
+        tcg_out_rlw(s, RLWIMI, a0, a1, 24, 16, 23);
+
+        /* a0 = rl64(a0, 32) # hgfe 0000 */
+        /* a2 = rl64(a1, 32) # efgh abcd */
+        tcg_out_rld(s, RLDICL, a0, a0, 32, 0);
+        tcg_out_rld(s, RLDICL, a2, a1, 32, 0);
+
+        /* a0 = dep(a0, rl32(a2, 8), 0xffffffff)  # hgfe bcda */
+        tcg_out_rlw(s, RLWIMI, a0, a2, 8, 0, 31);
+        /* a0 = dep(a0, rl32(a2, 24), 0xff000000) # hgfe dcda */
+        tcg_out_rlw(s, RLWIMI, a0, a2, 24, 0, 7);
+        /* a0 = dep(a0, rl32(a2, 24), 0x0000ff00) # hgfe dcba */
+        tcg_out_rlw(s, RLWIMI, a0, a2, 24, 16, 23);
+
+        if (a0 == 0) {
+            tcg_out_mov(s, TCG_TYPE_I64, args[0], a0);
+            /* Revert the source rotate that we performed above.  */
+            tcg_out_rld(s, RLDICL, a1, a1, 32, 0);
+        }
+        break;
+
     default:
         tcg_dump_ops (s);
         tcg_abort ();
@@ -1816,6 +1850,7 @@  static const TCGTargetOpDef ppc_op_defs[] = {
     { INDEX_op_bswap16_i64, { "r", "r" } },
     { INDEX_op_bswap32_i32, { "r", "r" } },
     { INDEX_op_bswap32_i64, { "r", "r" } },
+    { INDEX_op_bswap64_i64, { "r", "r" } },
 
     { -1 },
 };
diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
index bd011b8..d8e1820 100644
--- a/tcg/ppc64/tcg-target.h
+++ b/tcg/ppc64/tcg-target.h
@@ -102,7 +102,7 @@  typedef enum {
 #define TCG_TARGET_HAS_ext32u_i64       1
 #define TCG_TARGET_HAS_bswap16_i64      1
 #define TCG_TARGET_HAS_bswap32_i64      1
-#define TCG_TARGET_HAS_bswap64_i64      0
+#define TCG_TARGET_HAS_bswap64_i64      1
 #define TCG_TARGET_HAS_not_i64          1
 #define TCG_TARGET_HAS_neg_i64          1
 #define TCG_TARGET_HAS_andc_i64         0