Patchwork [05/21] tcg-i386: Tidy bswap operations.

login
register
mail settings
Submitter Richard Henderson
Date April 13, 2010, 11:33 p.m.
Message ID <e36e7f72dbd7724145773f759814a3c0a184c667.1271277329.git.rth@twiddle.net>
Download mbox | patch
Permalink /patch/50183/
State New
Headers show

Comments

Richard Henderson - April 13, 2010, 11:33 p.m.
Define OPC_BSWAP.  Factor opcode emission to separate functions.
Use bswap+shift to implement 16-bit swap instead of a rolw; this
gets the proper zero-extension required by INDEX_op_bswap16_i32.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   53 +++++++++++++++++++++++++------------------------
 1 files changed, 27 insertions(+), 26 deletions(-)
Aurelien Jarno - April 18, 2010, 10:13 p.m.
On Tue, Apr 13, 2010 at 04:33:59PM -0700, Richard Henderson wrote:
> Define OPC_BSWAP.  Factor opcode emission to separate functions.
> Use bswap+shift to implement 16-bit swap instead of a rolw; this
> gets the proper zero-extension required by INDEX_op_bswap16_i32.

This is not required by INDEX_op_bswap16_i32. What is need is that the
value in the input register has the 16 upper bits set to 0. Considering
that, the rolw instruction is faster than bswap + shift.

> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.c |   53 +++++++++++++++++++++++++------------------------
>  1 files changed, 27 insertions(+), 26 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 75b9915..0bafd00 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -163,6 +163,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  
>  #define P_EXT   0x100 /* 0x0f opcode prefix */
>  
> +#define OPC_BSWAP	(0xc8 | P_EXT)
>  #define OPC_MOVZBL	(0xb6 | P_EXT)
>  #define OPC_MOVZWL	(0xb7 | P_EXT)
>  #define OPC_MOVSBL	(0xbe | P_EXT)

> @@ -339,6 +340,22 @@ static inline void tcg_out_ext16s(TCGContext *s, int dest, int src)
>      tcg_out_modrm(s, OPC_MOVSWL, dest, src);
>  }
>  
> +static inline void tcg_out_bswap32(TCGContext *s, int reg)
> +{
> +    tcg_out_opc(s, OPC_BSWAP + reg);
> +}
> +
> +static inline void tcg_out_bswap16(TCGContext *s, int reg, int sign)
> +{
> +    /* This swap+shift combination guarantees that the high part contains
> +       the sign or zero extension required.  It also doesn't suffer the
> +       problem of partial register stalls that using rolw does.  */
> +    tcg_out_bswap32(s, reg);
> +    /* shr $16, dest */
> +    tcg_out_modrm(s, 0xc1, (sign ? SHIFT_SAR : SHIFT_SHR), reg);
> +    tcg_out8(s, 16);
> +}
> +
>  static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
>  {
>      if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
> @@ -745,31 +762,21 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>          /* movzwl */
>          tcg_out_modrm_offset(s, OPC_MOVZWL, data_reg, r0, GUEST_BASE);
>          if (bswap) {
> -            /* rolw $8, data_reg */
> -            tcg_out8(s, 0x66); 
> -            tcg_out_modrm(s, 0xc1, 0, data_reg);
> -            tcg_out8(s, 8);
> +            tcg_out_bswap16(s, data_reg, 0);
>          }
>          break;
>      case 1 | 4:
>          /* movswl */
>          tcg_out_modrm_offset(s, OPC_MOVSWL, data_reg, r0, GUEST_BASE);
>          if (bswap) {
> -            /* rolw $8, data_reg */
> -            tcg_out8(s, 0x66); 
> -            tcg_out_modrm(s, 0xc1, 0, data_reg);
> -            tcg_out8(s, 8);
> -
> -            /* movswl data_reg, data_reg */
> -            tcg_out_modrm(s, OPC_MOVSWL, data_reg, data_reg);
> +            tcg_out_bswap16(s, data_reg, 1);
>          }
>          break;
>      case 2:
>          /* movl (r0), data_reg */
>          tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE);
>          if (bswap) {
> -            /* bswap */
> -            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
> +            tcg_out_bswap32(s, data_reg);
>          }
>          break;
>      case 3:
> @@ -786,11 +793,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>              tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE + 4);
>          } else {
>              tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE + 4);
> -            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
> +            tcg_out_bswap32(s, data_reg);
>  
>              tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE);
> -            /* bswap */
> -            tcg_out_opc(s, (0xc8 + data_reg2) | P_EXT);
> +            tcg_out_bswap32(s, data_reg2);
>          }
>          break;
>      default:
> @@ -982,8 +988,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      case 2:
>          if (bswap) {
>              tcg_out_mov(s, r1, data_reg);
> -            /* bswap data_reg */
> -            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
> +            tcg_out_bswap32(s, r1);
>              data_reg = r1;
>          }
>          /* movl */
> @@ -992,12 +997,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      case 3:
>          if (bswap) {
>              tcg_out_mov(s, r1, data_reg2);
> -            /* bswap data_reg */
> -            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
> +            tcg_out_bswap32(s, r1);
>              tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE);
>              tcg_out_mov(s, r1, data_reg);
> -            /* bswap data_reg */
> -            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
> +            tcg_out_bswap32(s, r1);
>              tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE + 4);
>          } else {
>              tcg_out_modrm_offset(s, 0x89, data_reg, r0, GUEST_BASE);
> @@ -1195,12 +1198,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          break;
>  
>      case INDEX_op_bswap16_i32:
> -        tcg_out8(s, 0x66);
> -        tcg_out_modrm(s, 0xc1, SHIFT_ROL, args[0]);
> -        tcg_out8(s, 8);
> +        tcg_out_bswap16(s, args[0], 0);
>          break;
>      case INDEX_op_bswap32_i32:
> -        tcg_out_opc(s, (0xc8 + args[0]) | P_EXT);
> +        tcg_out_bswap32(s, args[0]);
>          break;
>  
>      case INDEX_op_neg_i32:
> -- 
> 1.6.2.5
> 
> 
> 
>
Richard Henderson - April 19, 2010, 1:56 p.m.
On 04/18/2010 05:13 PM, Aurelien Jarno wrote:
> On Tue, Apr 13, 2010 at 04:33:59PM -0700, Richard Henderson wrote:
>> Define OPC_BSWAP.  Factor opcode emission to separate functions.
>> Use bswap+shift to implement 16-bit swap instead of a rolw; this
>> gets the proper zero-extension required by INDEX_op_bswap16_i32.
> 
> This is not required by INDEX_op_bswap16_i32. What is need is that the
> value in the input register has the 16 upper bits set to 0.

Ah.

> Considering
> that, the rolw instruction is faster than bswap + shift.

Well, no, it isn't.

 static inline int test_rolw(unsigned short *s)
 {
   int i, start, end;
   asm volatile("rdtsc\n\t"
                "movl %%eax, %1\n\t"
                "movzwl %3,%2\n\t"
                "rolw $8, %w2\n\t"
                "addl $1,%2\n\t"
                "rdtsc"
                : "=&a"(end), "=r"(start), "=r"(i) : "m"(*s) : "edx");
   return end - start;
 }
 
 static inline int test_bswap(unsigned short *s)
 {
   int i, start, end;
   asm volatile("rdtsc\n\t"
                "movl %%eax, %1\n\t"
                "movzwl %3,%2\n\t"
                "bswap %2\n\t"
                "shl $16,%2\n\t"
                "addl $1,%2\n\t"
                "rdtsc"
                : "=&a"(end), "=r"(start), "=r"(i) : "m"(*s) : "edx");
   return end - start;
 }


model name	: Intel(R) Core(TM)2 Duo CPU     T7700  @ 2.40GHz
 rolw	   60   60   72   60   60   72   60   60   72   60
 bswap	   60   60   60   60   60   60   60   60   60   60

model name	: Dual-Core AMD Opteron(tm) Processor 1210
 rolw	    9   10    9    9    8    8    8    8    8    8
 bswap	    9    9    8    8    8    8    8    8    8    8

The rolw sequence isn't ever faster, and it's more unstable,
likely due to the partial register stall I mentioned.

I will grant that the rolw sequence is smaller, and I can 
adjust this patch to use that sequence if you wish.


r~

Patch

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 75b9915..0bafd00 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -163,6 +163,7 @@  static inline int tcg_target_const_match(tcg_target_long val,
 
 #define P_EXT   0x100 /* 0x0f opcode prefix */
 
+#define OPC_BSWAP	(0xc8 | P_EXT)
 #define OPC_MOVZBL	(0xb6 | P_EXT)
 #define OPC_MOVZWL	(0xb7 | P_EXT)
 #define OPC_MOVSBL	(0xbe | P_EXT)
@@ -339,6 +340,22 @@  static inline void tcg_out_ext16s(TCGContext *s, int dest, int src)
     tcg_out_modrm(s, OPC_MOVSWL, dest, src);
 }
 
+static inline void tcg_out_bswap32(TCGContext *s, int reg)
+{
+    tcg_out_opc(s, OPC_BSWAP + reg);
+}
+
+static inline void tcg_out_bswap16(TCGContext *s, int reg, int sign)
+{
+    /* This swap+shift combination guarantees that the high part contains
+       the sign or zero extension required.  It also doesn't suffer the
+       problem of partial register stalls that using rolw does.  */
+    tcg_out_bswap32(s, reg);
+    /* shr $16, dest */
+    tcg_out_modrm(s, 0xc1, (sign ? SHIFT_SAR : SHIFT_SHR), reg);
+    tcg_out8(s, 16);
+}
+
 static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
 {
     if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
@@ -745,31 +762,21 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         /* movzwl */
         tcg_out_modrm_offset(s, OPC_MOVZWL, data_reg, r0, GUEST_BASE);
         if (bswap) {
-            /* rolw $8, data_reg */
-            tcg_out8(s, 0x66); 
-            tcg_out_modrm(s, 0xc1, 0, data_reg);
-            tcg_out8(s, 8);
+            tcg_out_bswap16(s, data_reg, 0);
         }
         break;
     case 1 | 4:
         /* movswl */
         tcg_out_modrm_offset(s, OPC_MOVSWL, data_reg, r0, GUEST_BASE);
         if (bswap) {
-            /* rolw $8, data_reg */
-            tcg_out8(s, 0x66); 
-            tcg_out_modrm(s, 0xc1, 0, data_reg);
-            tcg_out8(s, 8);
-
-            /* movswl data_reg, data_reg */
-            tcg_out_modrm(s, OPC_MOVSWL, data_reg, data_reg);
+            tcg_out_bswap16(s, data_reg, 1);
         }
         break;
     case 2:
         /* movl (r0), data_reg */
         tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE);
         if (bswap) {
-            /* bswap */
-            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
+            tcg_out_bswap32(s, data_reg);
         }
         break;
     case 3:
@@ -786,11 +793,10 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
             tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE + 4);
         } else {
             tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE + 4);
-            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
+            tcg_out_bswap32(s, data_reg);
 
             tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE);
-            /* bswap */
-            tcg_out_opc(s, (0xc8 + data_reg2) | P_EXT);
+            tcg_out_bswap32(s, data_reg2);
         }
         break;
     default:
@@ -982,8 +988,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     case 2:
         if (bswap) {
             tcg_out_mov(s, r1, data_reg);
-            /* bswap data_reg */
-            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
+            tcg_out_bswap32(s, r1);
             data_reg = r1;
         }
         /* movl */
@@ -992,12 +997,10 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     case 3:
         if (bswap) {
             tcg_out_mov(s, r1, data_reg2);
-            /* bswap data_reg */
-            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
+            tcg_out_bswap32(s, r1);
             tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE);
             tcg_out_mov(s, r1, data_reg);
-            /* bswap data_reg */
-            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
+            tcg_out_bswap32(s, r1);
             tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE + 4);
         } else {
             tcg_out_modrm_offset(s, 0x89, data_reg, r0, GUEST_BASE);
@@ -1195,12 +1198,10 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_bswap16_i32:
-        tcg_out8(s, 0x66);
-        tcg_out_modrm(s, 0xc1, SHIFT_ROL, args[0]);
-        tcg_out8(s, 8);
+        tcg_out_bswap16(s, args[0], 0);
         break;
     case INDEX_op_bswap32_i32:
-        tcg_out_opc(s, (0xc8 + args[0]) | P_EXT);
+        tcg_out_bswap32(s, args[0]);
         break;
 
     case INDEX_op_neg_i32: