
[03/22] tcg-i386: Tidy ext8u and ext16u operations.

Message ID d3bd14117b5ecc639dcad07f997c86581cc03bf8.1272479073.git.rth@twiddle.net
State New

Commit Message

Richard Henderson April 13, 2010, 10:59 p.m. UTC
Define OPC_MOVZBL and OPC_MOVZWL.  Factor opcode emission to
separate functions.  Don't restrict the input register to the
low 4 "q" registers; emit an AND instead if needed.
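
For illustration (the register choices below are just examples), the
sequences emitted by the new helper for the three cases are:

    tcg_out_ext8u(s, TCG_REG_EBX, TCG_REG_EBX); /* movzbl %bl,%ebx */
    tcg_out_ext8u(s, TCG_REG_EAX, TCG_REG_ESI); /* mov %esi,%eax; movzbl %al,%eax */
    tcg_out_ext8u(s, TCG_REG_EDI, TCG_REG_ESI); /* mov %esi,%edi; and $0xff,%edi */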

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   68 ++++++++++++++++++++++++++++++------------------
 1 files changed, 42 insertions(+), 26 deletions(-)

Comments

Aurelien Jarno May 19, 2010, 6:47 a.m. UTC | #1
Hi,

On Tue, Apr 13, 2010 at 03:59:20PM -0700, Richard Henderson wrote:
> Define OPC_MOVZBL and OPC_MOVZWL.  Factor opcode emission to
> separate functions.  Don't restrict the input register to the
> low 4 "q" registers; emit an AND instead if needed.

I am fine with the cleanup part, but I don't know what to think about
the constraints change.

The reg allocator is able to issue a move if needed, so the only
improvement from this patch is for doing an ext8u on both "q" registers.

OTOH the reg allocator knows about this situation and will try to avoid
it during allocation. Cheating on the reg allocator might have some
unwanted effects, especially after your patch "Allocate call-saved
registers first". I am thinking of the scenario where the value is in
memory (which is likely to be the case given the limited number of
registers): it will likely be loaded into an "r" register (they now have
the top priority), and then ext8u will be called, which will issue
"mov" + "and" instructions instead of a single "movzbl" instruction.
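
For example (the register choice here is purely hypothetical), with the
value loaded in %esi and the result wanted in %edi, the new helper emits
two instructions:

    tcg_out_ext8u(s, TCG_REG_EDI, TCG_REG_ESI); /* mov %esi,%edi; and $0xff,%edi */

whereas with the old "q" constraint the allocator would have arranged
for the source to be in a "q" register, so that a single movzbl suffices.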

If there are still cases to optimize, I think it should be done in the
reg allocator instead, so that it benefits all ops and all targets. I
have started to work on that with Laurent Desnogues; I have a few more
ideas on how to improve it, but unfortunately I don't have time to code
them.

All of that is purely theoretical. Do you know how it behaves in
practice?

> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.c |   68 ++++++++++++++++++++++++++++++------------------
>  1 files changed, 42 insertions(+), 26 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 359f81b..2cc1191 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -161,6 +161,11 @@ static inline int tcg_target_const_match(tcg_target_long val,
>          return 0;
>  }
>  
> +#define P_EXT   0x100 /* 0x0f opcode prefix */
> +
> +#define OPC_MOVZBL	(0xb6 | P_EXT)
> +#define OPC_MOVZWL	(0xb7 | P_EXT)
> +
>  #define ARITH_ADD 0
>  #define ARITH_OR  1
>  #define ARITH_ADC 2
> @@ -194,8 +199,6 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  #define JCC_JLE 0xe
>  #define JCC_JG  0xf
>  
> -#define P_EXT   0x100 /* 0x0f opcode prefix */
> -
>  static const uint8_t tcg_cond_to_jcc[10] = {
>      [TCG_COND_EQ] = JCC_JE,
>      [TCG_COND_NE] = JCC_JNE,
> @@ -288,6 +291,27 @@ static inline void tcg_out_st(TCGContext *s, TCGType type, int arg,
>      tcg_out_modrm_offset(s, 0x89, arg, arg1, arg2);
>  }
>  
> +static void tcg_out_ext8u(TCGContext *s, int dest, int src)
> +{
> +    if (src >= 4) {
> +        tcg_out_mov(s, dest, src);
> +        if (dest >= 4) {
> +            tcg_out_modrm(s, 0x81, ARITH_AND, dest);
> +            tcg_out32(s, 0xff);
> +            return;
> +        }
> +        src = dest;
> +    }
> +    /* movzbl */
> +    tcg_out_modrm(s, OPC_MOVZBL, dest, src);
> +}
> +
> +static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
> +{
> +    /* movzwl */
> +    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
> +}
> +
>  static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
>  {
>      if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
> @@ -300,11 +324,9 @@ static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf
>          tcg_out_modrm(s, 0x83, c, r0);
>          tcg_out8(s, val);
>      } else if (c == ARITH_AND && val == 0xffu && r0 < 4) {
> -        /* movzbl */
> -        tcg_out_modrm(s, 0xb6 | P_EXT, r0, r0);
> +        tcg_out_ext8u(s, r0, r0);
>      } else if (c == ARITH_AND && val == 0xffffu) {
> -        /* movzwl */
> -        tcg_out_modrm(s, 0xb7 | P_EXT, r0, r0);
> +        tcg_out_ext16u(s, r0, r0);
>      } else {
>          tcg_out_modrm(s, 0x81, c, r0);
>          tcg_out32(s, val);
> @@ -645,12 +667,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>          tcg_out_modrm(s, 0xbf | P_EXT, data_reg, TCG_REG_EAX);
>          break;
>      case 0:
> -        /* movzbl */
> -        tcg_out_modrm(s, 0xb6 | P_EXT, data_reg, TCG_REG_EAX);
> +        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
>          break;
>      case 1:
> -        /* movzwl */
> -        tcg_out_modrm(s, 0xb7 | P_EXT, data_reg, TCG_REG_EAX);
> +        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
>          break;
>      case 2:
>      default:
> @@ -690,7 +710,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>      switch(opc) {
>      case 0:
>          /* movzbl */
> -        tcg_out_modrm_offset(s, 0xb6 | P_EXT, data_reg, r0, GUEST_BASE);
> +        tcg_out_modrm_offset(s, OPC_MOVZBL, data_reg, r0, GUEST_BASE);
>          break;
>      case 0 | 4:
>          /* movsbl */
> @@ -698,7 +718,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>          break;
>      case 1:
>          /* movzwl */
> -        tcg_out_modrm_offset(s, 0xb7 | P_EXT, data_reg, r0, GUEST_BASE);
> +        tcg_out_modrm_offset(s, OPC_MOVZWL, data_reg, r0, GUEST_BASE);
>          if (bswap) {
>              /* rolw $8, data_reg */
>              tcg_out8(s, 0x66); 
> @@ -850,12 +870,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      } else {
>          switch(opc) {
>          case 0:
> -            /* movzbl */
> -            tcg_out_modrm(s, 0xb6 | P_EXT, TCG_REG_EDX, data_reg);
> +            tcg_out_ext8u(s, TCG_REG_EDX, data_reg);
>              break;
>          case 1:
> -            /* movzwl */
> -            tcg_out_modrm(s, 0xb7 | P_EXT, TCG_REG_EDX, data_reg);
> +            tcg_out_ext16u(s, TCG_REG_EDX, data_reg);
>              break;
>          case 2:
>              tcg_out_mov(s, TCG_REG_EDX, data_reg);
> @@ -881,12 +899,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>          tcg_out_mov(s, TCG_REG_EDX, addr_reg2);
>          switch(opc) {
>          case 0:
> -            /* movzbl */
> -            tcg_out_modrm(s, 0xb6 | P_EXT, TCG_REG_ECX, data_reg);
> +            tcg_out_ext8u(s, TCG_REG_ECX, data_reg);
>              break;
>          case 1:
> -            /* movzwl */
> -            tcg_out_modrm(s, 0xb7 | P_EXT, TCG_REG_ECX, data_reg);
> +            tcg_out_ext16u(s, TCG_REG_ECX, data_reg);
>              break;
>          case 2:
>              tcg_out_mov(s, TCG_REG_ECX, data_reg);
> @@ -1022,7 +1038,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          break;
>      case INDEX_op_ld8u_i32:
>          /* movzbl */
> -        tcg_out_modrm_offset(s, 0xb6 | P_EXT, args[0], args[1], args[2]);
> +        tcg_out_modrm_offset(s, OPC_MOVZBL, args[0], args[1], args[2]);
>          break;
>      case INDEX_op_ld8s_i32:
>          /* movsbl */
> @@ -1030,7 +1046,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          break;
>      case INDEX_op_ld16u_i32:
>          /* movzwl */
> -        tcg_out_modrm_offset(s, 0xb7 | P_EXT, args[0], args[1], args[2]);
> +        tcg_out_modrm_offset(s, OPC_MOVZWL, args[0], args[1], args[2]);
>          break;
>      case INDEX_op_ld16s_i32:
>          /* movswl */
> @@ -1177,10 +1193,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          tcg_out_modrm(s, 0xbf | P_EXT, args[0], args[1]);
>          break;
>      case INDEX_op_ext8u_i32:
> -        tcg_out_modrm(s, 0xb6 | P_EXT, args[0], args[1]);
> +        tcg_out_ext8u(s, args[0], args[1]);
>          break;
>      case INDEX_op_ext16u_i32:
> -        tcg_out_modrm(s, 0xb7 | P_EXT, args[0], args[1]);
> +        tcg_out_ext16u(s, args[0], args[1]);
>          break;
>  
>      case INDEX_op_setcond_i32:
> @@ -1275,8 +1291,8 @@ static const TCGTargetOpDef x86_op_defs[] = {
>  
>      { INDEX_op_ext8s_i32, { "r", "q" } },
>      { INDEX_op_ext16s_i32, { "r", "r" } },
> -    { INDEX_op_ext8u_i32, { "r", "q"} },
> -    { INDEX_op_ext16u_i32, { "r", "r"} },
> +    { INDEX_op_ext8u_i32, { "r", "r" } },
> +    { INDEX_op_ext16u_i32, { "r", "r" } },
>  
>      { INDEX_op_setcond_i32, { "q", "r", "ri" } },
>      { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
> -- 
> 1.6.6.1
> 
> 
> 
>
Richard Henderson May 19, 2010, 6:31 p.m. UTC | #2
On 05/18/2010 11:47 PM, Aurelien Jarno wrote:
> The reg allocator is able to issue a move if needed, so the only
> improvement from this patch is for doing an ext8u on both "q" registers.
> 
> OTOH the reg allocator knows about this situation and will try to avoid
> it during allocation. Cheating on the reg allocator might have some
> unwanted effects, especially after your patch "Allocate call-saved
> registers first". I am thinking of the scenario where the value is in
> memory (which is likely to be the case given the limited number of
> registers): it will likely be loaded into an "r" register (they now have
> the top priority), and then ext8u will be called, which will issue
> "mov" + "and" instructions instead of a single "movzbl" instruction.

The case I was concerned with is the fact that if we have a value
allocated to, say, %esi, and we need to do an ext8u, then the
register allocator has been told that it must move the value to a
"q" register in order to perform the movzbl.  In this case, the
new code will simply emit the andl.
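
As a concrete (hypothetical) example, an in-place ext8u on %esi now
becomes a single instruction:

    tcg_out_ext8u(s, TCG_REG_ESI, TCG_REG_ESI); /* and $0xff,%esi */

where the old constraint would have forced a move through one of the
"q" registers followed by a movzbl.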

I.e. the real problem is that we've told the register allocator
one way that the extend can be implemented, but not every way.

> All of that is purely theoretical. Do you know how it behaves in
> practice?

Picking the i386 target since it seems to use more extensions than
any other target, from linux-user-test -d op_opt,out_asm i386/ls:

There are 176 instances of ext8u.
Of those, 83 instances are in-place, i.e. "ext8u_i32 tmp0,tmp0"

I examined the first 2 dozen appearances in the output assembly:

There are several instances of the value being in an "r" register:

 shr_i32 tmp1,edx,tmp13
 ext8u_i32 tmp1,tmp1
 =>
0x601c5468:  shr    $0x8,%edi
0x601c546b:  and    $0xff,%edi

All of the instances that I looked at that were not in-place happened
to already be using a "q" register -- usually %ebx.  I assume that's
because we place %ebx as the first allocation register and that's just
how things happen to work out once we've flushed the registers before
the qemu_ld.

 qemu_ld8u tmp0,tmp2,$0xffffffff
 ext8u_i32 tmp13,tmp0
 =>
0x601c82f9:  movzbl (%esi),%ebx
0x601c82fc:  movzbl %bl,%ebx


r~
Aurelien Jarno May 20, 2010, 1:39 p.m. UTC | #3
On Wed, May 19, 2010 at 11:31:27AM -0700, Richard Henderson wrote:
> On 05/18/2010 11:47 PM, Aurelien Jarno wrote:
> > The reg allocator is able to issue a move if needed, so the only
> > improvement from this patch is for doing an ext8u on both "q" registers.
> > 
> > OTOH the reg allocator knows about this situation and will try to avoid
> > it during allocation. Cheating on the reg allocator might have some
> > unwanted effects, especially after your patch "Allocate call-saved
> > registers first". I am thinking of the scenario where the value is in
> > memory (which is likely to be the case given the limited number of
> > registers): it will likely be loaded into an "r" register (they now have
> > the top priority), and then ext8u will be called, which will issue
> > "mov" + "and" instructions instead of a single "movzbl" instruction.
> 
> The case I was concerned with is the fact that if we have a value
> > allocated to, say, %esi, and we need to do an ext8u, then the
> register allocator has been told that it must move the value to a
> "q" register in order to perform the movzbl.  In this case, the
> new code will simply emit the andl.
> 
> I.e. the real problem is that we've told the register allocator
> one way that the extend can be implemented, but not every way.
> 
> > All of that is purely theoretical. Do you know how it behaves in
> > practice?
> 
> Picking the i386 target since it seems to use more extensions than
> any other target, from linux-user-test -d op_opt,out_asm i386/ls:
> 
> There are 176 instances of ext8u.
> Of those, 83 instances are in-place, i.e. "ext8u_i32 tmp0,tmp0"
> 
> I examined the first 2 dozen appearances in the output assembly:
> 
> There are several instances of the value being in an "r" register:
> 
>  shr_i32 tmp1,edx,tmp13
>  ext8u_i32 tmp1,tmp1
>  =>
> 0x601c5468:  shr    $0x8,%edi
> 0x601c546b:  and    $0xff,%edi
> 
> All of the instances that I looked at that were not in-place happened
> to already be using a "q" register -- usually %ebx.  I assume that's
> because we place %ebx as the first allocation register and that's just
> how things happen to work out once we've flushed the registers before
> the qemu_ld.
> 
>  qemu_ld8u tmp0,tmp2,$0xffffffff
>  ext8u_i32 tmp13,tmp0
>  =>
> 0x601c82f9:  movzbl (%esi),%ebx
> 0x601c82fc:  movzbl %bl,%ebx
> 

Have you tried to compare the generated code before and after your
patch? I expect a few cases where your patch has some drawbacks, so I
don't know if there is a net gain in the size of the translated code.
Aurelien Jarno May 20, 2010, 2:04 p.m. UTC | #4
On Thu, May 20, 2010 at 03:39:08PM +0200, Aurelien Jarno wrote:
> On Wed, May 19, 2010 at 11:31:27AM -0700, Richard Henderson wrote:
> > On 05/18/2010 11:47 PM, Aurelien Jarno wrote:
> > > The reg allocator is able to issue a move if needed, so the only
> > > improvement from this patch is for doing an ext8u on both "q" registers.
> > > 
> > > OTOH the reg allocator knows about this situation and will try to avoid
> > > it during allocation. Cheating on the reg allocator might have some
> > > unwanted effects, especially after your patch "Allocate call-saved
> > > registers first". I am thinking of the scenario where the value is in
> > > memory (which is likely to be the case given the limited number of
> > > registers): it will likely be loaded into an "r" register (they now have
> > > the top priority), and then ext8u will be called, which will issue
> > > "mov" + "and" instructions instead of a single "movzbl" instruction.
> > 
> > The case I was concerned with is the fact that if we have a value
> > > allocated to, say, %esi, and we need to do an ext8u, then the
> > register allocator has been told that it must move the value to a
> > "q" register in order to perform the movzbl.  In this case, the
> > new code will simply emit the andl.
> > 
> > I.e. the real problem is that we've told the register allocator
> > one way that the extend can be implemented, but not every way.
> > 
> > > All of that is purely theoretical. Do you know how it behaves in
> > > practice?
> > 
> > Picking the i386 target since it seems to use more extensions than
> > any other target, from linux-user-test -d op_opt,out_asm i386/ls:
> > 
> > There are 176 instances of ext8u.
> > Of those, 83 instances are in-place, i.e. "ext8u_i32 tmp0,tmp0"
> > 
> > I examined the first 2 dozen appearances in the output assembly:
> > 
> > There are several instances of the value being in an "r" register:
> > 
> >  shr_i32 tmp1,edx,tmp13
> >  ext8u_i32 tmp1,tmp1
> >  =>
> > 0x601c5468:  shr    $0x8,%edi
> > 0x601c546b:  and    $0xff,%edi
> > 
> > All of the instances that I looked at that were not in-place happened
> > to already be using a "q" register -- usually %ebx.  I assume that's
> > because we place %ebx as the first allocation register and that's just
> > how things happen to work out once we've flushed the registers before
> > the qemu_ld.
> > 
> >  qemu_ld8u tmp0,tmp2,$0xffffffff
> >  ext8u_i32 tmp13,tmp0
> >  =>
> > 0x601c82f9:  movzbl (%esi),%ebx
> > 0x601c82fc:  movzbl %bl,%ebx
> > 
> 
> Have you tried to compare the generated code before and after your
> patch? I expect a few cases where your patch has some drawbacks, so I
> don't know if there is a net gain in the size of the translated code.
> 

I have done a quick test on /bin/ls. 
       |  instr |  size  |
       +--------+--------+
before | 101305 | 344770 |
after  | 101258 | 344829 |

In short, a small gain in the number of instructions, and a small loss in
the size of the translated code.
Richard Henderson May 20, 2010, 2:40 p.m. UTC | #5
On 05/20/2010 07:04 AM, Aurelien Jarno wrote:
>> Have you tried to compare the generated code before and after your
>> patch? I expect a few cases where your patch has some drawbacks, so I
>> don't know if there is a net gain in the size of the translated code.
>>
> 
> I have done a quick test on /bin/ls. 
>        |  instr |  size  |
>        +--------+--------+
> before | 101305 | 344770 |
> after  | 101258 | 344829 |
> 
> In short, a small gain in the number of instructions, and a small loss in
> the size of the translated code.

That was pretty much the test I would have done.

So where are we?  Is the patch acceptable as-is, or should I be
re-writing it without the constraints change?


r~
Aurelien Jarno May 20, 2010, 6:50 p.m. UTC | #6
On Thu, May 20, 2010 at 07:40:59AM -0700, Richard Henderson wrote:
> On 05/20/2010 07:04 AM, Aurelien Jarno wrote:
> >> Have you tried to compare the generated code before and after your
> >> patch? I expect a few cases where your patch has some drawbacks, so I
> >> don't know if there is a net gain in the size of the translated code.
> >>
> > 
> > I have done a quick test on /bin/ls. 
> >        |  instr |  size  |
> >        +--------+--------+
> > before | 101305 | 344770 |
> > after  | 101258 | 344829 |
> > 
> > In short, a small gain in the number of instructions, and a small loss in
> > the size of the translated code.
> 
> That was pretty much the test I would have done.
> 
> So where are we?  Is the patch acceptable as-is, or should I be
> re-writing it without the constraints change?
> 

Given that the tests do not show a real improvement, and given that it
complicates the code generation, I don't think we should have such a
patch. Could you please rewrite it without the constraints change?

Patch

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 359f81b..2cc1191 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -161,6 +161,11 @@  static inline int tcg_target_const_match(tcg_target_long val,
         return 0;
 }
 
+#define P_EXT   0x100 /* 0x0f opcode prefix */
+
+#define OPC_MOVZBL	(0xb6 | P_EXT)
+#define OPC_MOVZWL	(0xb7 | P_EXT)
+
 #define ARITH_ADD 0
 #define ARITH_OR  1
 #define ARITH_ADC 2
@@ -194,8 +199,6 @@  static inline int tcg_target_const_match(tcg_target_long val,
 #define JCC_JLE 0xe
 #define JCC_JG  0xf
 
-#define P_EXT   0x100 /* 0x0f opcode prefix */
-
 static const uint8_t tcg_cond_to_jcc[10] = {
     [TCG_COND_EQ] = JCC_JE,
     [TCG_COND_NE] = JCC_JNE,
@@ -288,6 +291,27 @@  static inline void tcg_out_st(TCGContext *s, TCGType type, int arg,
     tcg_out_modrm_offset(s, 0x89, arg, arg1, arg2);
 }
 
+static void tcg_out_ext8u(TCGContext *s, int dest, int src)
+{
+    if (src >= 4) {
+        tcg_out_mov(s, dest, src);
+        if (dest >= 4) {
+            tcg_out_modrm(s, 0x81, ARITH_AND, dest);
+            tcg_out32(s, 0xff);
+            return;
+        }
+        src = dest;
+    }
+    /* movzbl */
+    tcg_out_modrm(s, OPC_MOVZBL, dest, src);
+}
+
+static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
+{
+    /* movzwl */
+    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
+}
+
 static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
 {
     if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
@@ -300,11 +324,9 @@  static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf
         tcg_out_modrm(s, 0x83, c, r0);
         tcg_out8(s, val);
     } else if (c == ARITH_AND && val == 0xffu && r0 < 4) {
-        /* movzbl */
-        tcg_out_modrm(s, 0xb6 | P_EXT, r0, r0);
+        tcg_out_ext8u(s, r0, r0);
     } else if (c == ARITH_AND && val == 0xffffu) {
-        /* movzwl */
-        tcg_out_modrm(s, 0xb7 | P_EXT, r0, r0);
+        tcg_out_ext16u(s, r0, r0);
     } else {
         tcg_out_modrm(s, 0x81, c, r0);
         tcg_out32(s, val);
@@ -645,12 +667,10 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         tcg_out_modrm(s, 0xbf | P_EXT, data_reg, TCG_REG_EAX);
         break;
     case 0:
-        /* movzbl */
-        tcg_out_modrm(s, 0xb6 | P_EXT, data_reg, TCG_REG_EAX);
+        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
         break;
     case 1:
-        /* movzwl */
-        tcg_out_modrm(s, 0xb7 | P_EXT, data_reg, TCG_REG_EAX);
+        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
         break;
     case 2:
     default:
@@ -690,7 +710,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     switch(opc) {
     case 0:
         /* movzbl */
-        tcg_out_modrm_offset(s, 0xb6 | P_EXT, data_reg, r0, GUEST_BASE);
+        tcg_out_modrm_offset(s, OPC_MOVZBL, data_reg, r0, GUEST_BASE);
         break;
     case 0 | 4:
         /* movsbl */
@@ -698,7 +718,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         break;
     case 1:
         /* movzwl */
-        tcg_out_modrm_offset(s, 0xb7 | P_EXT, data_reg, r0, GUEST_BASE);
+        tcg_out_modrm_offset(s, OPC_MOVZWL, data_reg, r0, GUEST_BASE);
         if (bswap) {
             /* rolw $8, data_reg */
             tcg_out8(s, 0x66); 
@@ -850,12 +870,10 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     } else {
         switch(opc) {
         case 0:
-            /* movzbl */
-            tcg_out_modrm(s, 0xb6 | P_EXT, TCG_REG_EDX, data_reg);
+            tcg_out_ext8u(s, TCG_REG_EDX, data_reg);
             break;
         case 1:
-            /* movzwl */
-            tcg_out_modrm(s, 0xb7 | P_EXT, TCG_REG_EDX, data_reg);
+            tcg_out_ext16u(s, TCG_REG_EDX, data_reg);
             break;
         case 2:
             tcg_out_mov(s, TCG_REG_EDX, data_reg);
@@ -881,12 +899,10 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
         tcg_out_mov(s, TCG_REG_EDX, addr_reg2);
         switch(opc) {
         case 0:
-            /* movzbl */
-            tcg_out_modrm(s, 0xb6 | P_EXT, TCG_REG_ECX, data_reg);
+            tcg_out_ext8u(s, TCG_REG_ECX, data_reg);
             break;
         case 1:
-            /* movzwl */
-            tcg_out_modrm(s, 0xb7 | P_EXT, TCG_REG_ECX, data_reg);
+            tcg_out_ext16u(s, TCG_REG_ECX, data_reg);
             break;
         case 2:
             tcg_out_mov(s, TCG_REG_ECX, data_reg);
@@ -1022,7 +1038,7 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_ld8u_i32:
         /* movzbl */
-        tcg_out_modrm_offset(s, 0xb6 | P_EXT, args[0], args[1], args[2]);
+        tcg_out_modrm_offset(s, OPC_MOVZBL, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld8s_i32:
         /* movsbl */
@@ -1030,7 +1046,7 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_ld16u_i32:
         /* movzwl */
-        tcg_out_modrm_offset(s, 0xb7 | P_EXT, args[0], args[1], args[2]);
+        tcg_out_modrm_offset(s, OPC_MOVZWL, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld16s_i32:
         /* movswl */
@@ -1177,10 +1193,10 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_modrm(s, 0xbf | P_EXT, args[0], args[1]);
         break;
     case INDEX_op_ext8u_i32:
-        tcg_out_modrm(s, 0xb6 | P_EXT, args[0], args[1]);
+        tcg_out_ext8u(s, args[0], args[1]);
         break;
     case INDEX_op_ext16u_i32:
-        tcg_out_modrm(s, 0xb7 | P_EXT, args[0], args[1]);
+        tcg_out_ext16u(s, args[0], args[1]);
         break;
 
     case INDEX_op_setcond_i32:
@@ -1275,8 +1291,8 @@  static const TCGTargetOpDef x86_op_defs[] = {
 
     { INDEX_op_ext8s_i32, { "r", "q" } },
     { INDEX_op_ext16s_i32, { "r", "r" } },
-    { INDEX_op_ext8u_i32, { "r", "q"} },
-    { INDEX_op_ext16u_i32, { "r", "r"} },
+    { INDEX_op_ext8u_i32, { "r", "r" } },
+    { INDEX_op_ext16u_i32, { "r", "r" } },
 
     { INDEX_op_setcond_i32, { "q", "r", "ri" } },
     { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },