Patchwork [11/18] tcg/arm: add bswap ops

login
register
mail settings
Submitter Aurelien Jarno
Date April 7, 2010, 5:51 p.m.
Message ID <1270662685-7379-12-git-send-email-aurelien@aurel32.net>
Download mbox | patch
Permalink /patch/49625/
State New
Headers show

Comments

Aurelien Jarno - April 7, 2010, 5:51 p.m.
Add bswap16 and bswap32 ops, using either the rev and rev16
instructions on ARMv6+ or shifts and logical operations on earlier
ARM versions. In both cases the result uses fewer instructions than
the pure TCG version.

These ops are also needed by the qemu_ld/st functions.

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 tcg/arm/tcg-target.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
 tcg/arm/tcg-target.h |    4 ++--
 2 files changed, 50 insertions(+), 2 deletions(-)
Paul Brook - April 8, 2010, 11:32 p.m.
> +static inline void tcg_out_bswap32(TCGContext *s, int cond, int rd, int rn)
> +#else
> +    /* This code only uses one temporary register. There is probably
> +       a faster way to do that with more temporary registers. */

You can do better even without a temporary:

  eor	r8, rn, rn, ror #16
  bic	r8, r8, #0x00ff0000
  mov	 rd, rn, ror #8
  eor	rd, rd, r8, lsr #8

Paul
Aurelien Jarno - April 9, 2010, 5:11 p.m.
On Fri, Apr 09, 2010 at 12:32:04AM +0100, Paul Brook wrote:
> > +static inline void tcg_out_bswap32(TCGContext *s, int cond, int rd, int rn)
> > +#else
> > +    /* This code only uses one temporary register. There is probably
> > +       a faster way to do that with more temporary registers. */
> 
> You can do better even without a temporary:
> 
>   eor	r8, rn, rn, ror #16
>   bic	r8, r8, #0x00ff0000
>   mov	 rd, rn, ror #8
>   eor	rd, rd, r8, lsr #8
> 

That's indeed much better, though it still uses one temporary (r8).
I'll use that in the next version. Thanks.

Patch

diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index d8ba5f1..aec1183 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -503,6 +503,44 @@  static inline void tcg_out_ext16u(TCGContext *s, int cond,
 #endif
 }
 
+static inline void tcg_out_bswap16(TCGContext *s, int cond, int rd, int rn)
+{
+#ifdef USE_ARMV6_INSTRUCTIONS
+    /* rev16 */
+    tcg_out32(s, 0x06bf0fb0 | (cond << 28) | (rd << 12) | rn);
+#else
+    tcg_out_dat_reg(s, cond, ARITH_MOV,
+                    TCG_REG_R8, 0, rn, SHIFT_IMM_LSL(24));
+    tcg_out_dat_reg(s, cond, ARITH_MOV,
+                    TCG_REG_R8, 0, TCG_REG_R8, SHIFT_IMM_LSR(16));
+    tcg_out_dat_reg(s, cond, ARITH_ORR,
+                    rd, TCG_REG_R8, rn, SHIFT_IMM_LSR(8));
+#endif
+}
+
+static inline void tcg_out_bswap32(TCGContext *s, int cond, int rd, int rn)
+{
+#ifdef USE_ARMV6_INSTRUCTIONS
+     /* rev */
+    tcg_out32(s, 0x06bf0f30 | (cond << 28) | (rd << 12) | rn);
+#else
+    /* This code only uses one temporary register. There is probably
+       a faster way to do that with more temporary registers. */
+    tcg_out_dat_reg(s, cond, ARITH_MOV,
+                    TCG_REG_R8, 0, rn, SHIFT_IMM_ROR(8));
+    tcg_out_dat_imm(s, cond, ARITH_BIC,
+                    rd, TCG_REG_R8, 0xff);
+    tcg_out_dat_imm(s, cond, ARITH_BIC,
+                    rd, rd, 0xff | 0x800);
+    tcg_out_dat_imm(s, cond, ARITH_BIC,
+                    TCG_REG_R8, TCG_REG_R8, 0xff | 0x400);
+    tcg_out_dat_imm(s, cond, ARITH_BIC,
+                    TCG_REG_R8, TCG_REG_R8, 0xff | 0xc00);
+    tcg_out_dat_reg(s, cond, ARITH_ORR,
+                    rd, rd, TCG_REG_R8, SHIFT_IMM_ROR(16));
+#endif
+}
+
 static inline void tcg_out_ld32_12(TCGContext *s, int cond,
                 int rd, int rn, tcg_target_long im)
 {
@@ -1520,6 +1558,13 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_qemu_st(s, COND_AL, args, 3);
         break;
 
+    case INDEX_op_bswap16_i32:
+        tcg_out_bswap16(s, COND_AL, args[0], args[1]);
+        break;
+    case INDEX_op_bswap32_i32:
+        tcg_out_bswap32(s, COND_AL, args[0], args[1]);
+        break;
+
     case INDEX_op_ext8s_i32:
         tcg_out_ext8s(s, COND_AL, args[0], args[1]);
         break;
@@ -1607,6 +1652,9 @@  static const TCGTargetOpDef arm_op_defs[] = {
     { INDEX_op_qemu_st64, { "x", "D", "x", "X" } },
 #endif
 
+    { INDEX_op_bswap16_i32, { "r", "r" } },
+    { INDEX_op_bswap32_i32, { "r", "r" } },
+
     { INDEX_op_ext8s_i32, { "r", "r" } },
     { INDEX_op_ext16s_i32, { "r", "r" } },
     { INDEX_op_ext16u_i32, { "r", "r" } },
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 1f6d665..d8d7d94 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -62,8 +62,8 @@  enum {
 #define TCG_TARGET_HAS_ext16s_i32
 #undef TCG_TARGET_HAS_ext8u_i32       /* and r0, r1, #0xff */
 #define TCG_TARGET_HAS_ext16u_i32
-// #define TCG_TARGET_HAS_bswap16_i32
-// #define TCG_TARGET_HAS_bswap32_i32
+#define TCG_TARGET_HAS_bswap16_i32
+#define TCG_TARGET_HAS_bswap32_i32
 #define TCG_TARGET_HAS_not_i32
 #define TCG_TARGET_HAS_neg_i32
 #define TCG_TARGET_HAS_rot_i32