diff mbox

[2/2] tcg-x86_64: Avoid unnecessary REX.B prefixes.

Message ID 20100106010537.9C9D2CBB@are.twiddle.net
State New
Headers show

Commit Message

Richard Henderson Jan. 6, 2010, 12:31 a.m. UTC
A while ago Laurent pointed out that the setcc opcode emitted by
the setcond patch had unnecessary REX prefixes.

The existing P_REXB internal opcode flag unconditionally emits
the REX prefix.  Technically it's not needed if the register in
question is %al, %bl, %cl, %dl.

Eliding the prefix requires splitting the P_REXB flag into two,
in order to indicate whether the byte register in question is
in the REG or the R/M field.  Within TCG, the byte register is
in the REG field only for stores.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/x86_64/tcg-target.c |   46 ++++++++++++++++++++++++++++++----------------
 1 files changed, 30 insertions(+), 16 deletions(-)

Comments

Richard Henderson Jan. 6, 2010, 4:16 a.m. UTC | #1
On 01/05/2010 04:31 PM, Richard Henderson wrote:
> A while ago Laurent pointed out that the setcc opcode emitted by
> the setcond patch had unnecessary REX prefixes.
>
> The existing P_REXB internal opcode flag unconditionally emits
> the REX prefix.  Technically it's not needed if the register in
> question is %al, %bl, %cl, %dl.

I should perhaps mention that this patch doesn't help out all
that much.  Only EBX is free for use by the code generator, so
more often than not we're using %bpl, which does require the
REX prefix.


r~
Aurelien Jarno Jan. 14, 2010, 4:10 p.m. UTC | #2
On Tue, Jan 05, 2010 at 04:31:11PM -0800, Richard Henderson wrote:
> A while ago Laurent pointed out that the setcc opcode emitted by
> the setcond patch had unnecessary REX prefixes.
> 
> The existing P_REXB internal opcode flag unconditionally emits
> the REX prefix.  Technically it's not needed if the register in
> question is %al, %bl, %cl, %dl.
> 
> Eliding the prefix requires splitting the P_REXB flag into two,
> in order to indicate whether the byte register in question is
> in the REG or the R/M field.  Within TCG, the byte register is
> in the REG field only for stores.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/x86_64/tcg-target.c |   46 ++++++++++++++++++++++++++++++----------------
>  1 files changed, 30 insertions(+), 16 deletions(-)
> 
> diff --git a/tcg/x86_64/tcg-target.c b/tcg/x86_64/tcg-target.c
> index f584c94..8b6b68c 100644
> --- a/tcg/x86_64/tcg-target.c
> +++ b/tcg/x86_64/tcg-target.c
> @@ -217,9 +217,10 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  #define JCC_JLE 0xe
>  #define JCC_JG  0xf
>  
> -#define P_EXT   0x100 /* 0x0f opcode prefix */
> -#define P_REXW  0x200 /* set rex.w = 1 */
> -#define P_REXB  0x400 /* force rex use for byte registers */
> +#define P_EXT		0x100		/* 0x0f opcode prefix */
> +#define P_REXW		0x200		/* set rex.w = 1 */
> +#define P_REXB_R	0x400		/* REG field as byte register */
> +#define P_REXB_RM	0x800		/* R/M field as byte register */
>                                    
>  static const uint8_t tcg_cond_to_jcc[10] = {
>      [TCG_COND_EQ] = JCC_JE,
> @@ -234,17 +235,30 @@ static const uint8_t tcg_cond_to_jcc[10] = {
>      [TCG_COND_GTU] = JCC_JA,
>  };
>  
> -static inline void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
> +static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
>  {
> -    int rex;
> -    rex = ((opc >> 6) & 0x8) | ((r >> 1) & 0x4) | 
> -        ((x >> 2) & 2) | ((rm >> 3) & 1);
> -    if (rex || (opc & P_REXB)) {
> +    int rex = 0;
> +
> +    rex |= (opc & P_REXW) >> 6;		/* REX.W */
> +    rex |= (r & 8) >> 1;		/* REX.R */
> +    rex |= (x & 8) >> 2;		/* REX.X */
> +    rex |= (rm & 8) >> 3;		/* REX.B */
> +
> +    /* P_REXB_{R,RM} indicates that the given register is the low byte.
> +       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
> +       as otherwise the encoding indicates %[abcd]h.  Note that the values
> +       that are ORed in merely indicate that the REX byte must be present;
> +       those bits get discarded in output.  */
> +    rex |= (r >= 4 ? opc & P_REXB_R : 0);
> +    rex |= (rm >= 4 ? opc & P_REXB_RM : 0);
> +
> +    if (rex) {
>          tcg_out8(s, rex | 0x40);
>      }

With the above change, rex can be > 0xff. Not sure it's not a good idea
to not have an explicit cast when calling tcg_out8(), even if it 
technically works.

> -    if (opc & P_EXT)
> +    if (opc & P_EXT) {
>          tcg_out8(s, 0x0f);
> -    tcg_out8(s, opc & 0xff);
> +    }
> +    tcg_out8(s, opc);

What's the reason for removing the '& 0xff' part? tcg_out8() takes an uint8_t.

>  }
>  
>  static inline void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
> @@ -408,7 +422,7 @@ static inline void tgen_arithi32(TCGContext *s, int c, int r0, int32_t val)
>          tcg_out8(s, val);
>      } else if (c == ARITH_AND && val == 0xffu) {
>          /* movzbl */
> -        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB, r0, r0);
> +        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB_RM, r0, r0);
>      } else if (c == ARITH_AND && val == 0xffffu) {
>          /* movzwl */
>          tcg_out_modrm(s, 0xb7 | P_EXT, r0, r0);
> @@ -776,7 +790,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      switch(opc) {
>      case 0:
>          /* movzbl */
> -        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB, TCG_REG_RSI, data_reg);
> +        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB_RM, TCG_REG_RSI, data_reg);
>          break;
>      case 1:
>          /* movzwl */
> @@ -829,7 +843,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      switch(opc) {
>      case 0:
>          /* movb */
> -        tcg_out_modrm_offset(s, 0x88 | P_REXB, data_reg, r0, offset);
> +        tcg_out_modrm_offset(s, 0x88 | P_REXB_R, data_reg, r0, offset);
>          break;
>      case 1:
>          if (bswap) {
> @@ -964,7 +978,7 @@ static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
>      case INDEX_op_st8_i32:
>      case INDEX_op_st8_i64:
>          /* movb */
> -        tcg_out_modrm_offset(s, 0x88 | P_REXB, args[0], args[1], args[2]);
> +        tcg_out_modrm_offset(s, 0x88 | P_REXB_R, args[0], args[1], args[2]);
>          break;
>      case INDEX_op_st16_i32:
>      case INDEX_op_st16_i64:
> @@ -1161,7 +1175,7 @@ static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
>          break;
>  
>      case INDEX_op_ext8s_i32:
> -        tcg_out_modrm(s, 0xbe | P_EXT | P_REXB, args[0], args[1]);
> +        tcg_out_modrm(s, 0xbe | P_EXT | P_REXB_RM, args[0], args[1]);
>          break;
>      case INDEX_op_ext16s_i32:
>          tcg_out_modrm(s, 0xbf | P_EXT, args[0], args[1]);
> @@ -1177,7 +1191,7 @@ static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
>          break;
>      case INDEX_op_ext8u_i32:
>      case INDEX_op_ext8u_i64:
> -        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB, args[0], args[1]);
> +        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB_RM, args[0], args[1]);
>          break;
>      case INDEX_op_ext16u_i32:
>      case INDEX_op_ext16u_i64:
> -- 
> 1.6.5.2
> 
> 
> 
>
Richard Henderson Jan. 14, 2010, 6:09 p.m. UTC | #3
On 01/14/2010 08:10 AM, Aurelien Jarno wrote:
> On Tue, Jan 05, 2010 at 04:31:11PM -0800, Richard Henderson wrote:
>> A while ago Laurent pointed out that the setcc opcode emitted by
>> the setcond patch had unnecessary REX prefixes.
>>
>> The existing P_REXB internal opcode flag unconditionally emits
>> the REX prefix.  Technically it's not needed if the register in
>> question is %al, %bl, %cl, %dl.
>>
>> Eliding the prefix requires splitting the P_REXB flag into two,
>> in order to indicate whether the byte register in question is
>> in the REG or the R/M field.  Within TCG, the byte register is
>> in the REG field only for stores.
>>
>> Signed-off-by: Richard Henderson<rth@twiddle.net>
>> ---
>>   tcg/x86_64/tcg-target.c |   46 ++++++++++++++++++++++++++++++----------------
>>   1 files changed, 30 insertions(+), 16 deletions(-)
>>
>> diff --git a/tcg/x86_64/tcg-target.c b/tcg/x86_64/tcg-target.c
>> index f584c94..8b6b68c 100644
>> --- a/tcg/x86_64/tcg-target.c
>> +++ b/tcg/x86_64/tcg-target.c
>> @@ -217,9 +217,10 @@ static inline int tcg_target_const_match(tcg_target_long val,
>>   #define JCC_JLE 0xe
>>   #define JCC_JG  0xf
>>
>> -#define P_EXT   0x100 /* 0x0f opcode prefix */
>> -#define P_REXW  0x200 /* set rex.w = 1 */
>> -#define P_REXB  0x400 /* force rex use for byte registers */
>> +#define P_EXT		0x100		/* 0x0f opcode prefix */
>> +#define P_REXW		0x200		/* set rex.w = 1 */
>> +#define P_REXB_R	0x400		/* REG field as byte register */
>> +#define P_REXB_RM	0x800		/* R/M field as byte register */
>>
>>   static const uint8_t tcg_cond_to_jcc[10] = {
>>       [TCG_COND_EQ] = JCC_JE,
>> @@ -234,17 +235,30 @@ static const uint8_t tcg_cond_to_jcc[10] = {
>>       [TCG_COND_GTU] = JCC_JA,
>>   };
>>
>> -static inline void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
>> +static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
>>   {
>> -    int rex;
>> -    rex = ((opc>>  6)&  0x8) | ((r>>  1)&  0x4) |
>> -        ((x>>  2)&  2) | ((rm>>  3)&  1);
>> -    if (rex || (opc&  P_REXB)) {
>> +    int rex = 0;
>> +
>> +    rex |= (opc&  P_REXW)>>  6;		/* REX.W */
>> +    rex |= (r&  8)>>  1;		/* REX.R */
>> +    rex |= (x&  8)>>  2;		/* REX.X */
>> +    rex |= (rm&  8)>>  3;		/* REX.B */
>> +
>> +    /* P_REXB_{R,RM} indicates that the given register is the low byte.
>> +       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
>> +       as otherwise the encoding indicates %[abcd]h.  Note that the values
>> +       that are ORed in merely indicate that the REX byte must be present;
>> +       those bits get discarded in output.  */
>> +    rex |= (r>= 4 ? opc&  P_REXB_R : 0);
>> +    rex |= (rm>= 4 ? opc&  P_REXB_RM : 0);
>> +
>> +    if (rex) {
>>           tcg_out8(s, rex | 0x40);
>>       }
>
> With the above change, rex can be > 0xff. Not sure it's not a good idea
> to not have an explicit cast when calling tcg_out8(), even if it
> technically works.

Same as below.

>
>> -    if (opc&  P_EXT)
>> +    if (opc&  P_EXT) {
>>           tcg_out8(s, 0x0f);
>> -    tcg_out8(s, opc&  0xff);
>> +    }
>> +    tcg_out8(s, opc);
>
> What's the reason for removing the '&  0xff' part? tcg_out8() takes an uint8_t.

Yes, and the uint8_t truncates the value just fine.  Is there any 
particular reason you want to clutter the code with a duplicate 
truncation?  It might have been reasonable if the function name didn't 
quite clearly indicate that a single byte was going to be output...


r~
Aurelien Jarno Jan. 14, 2010, 6:51 p.m. UTC | #4
On Thu, Jan 14, 2010 at 10:09:48AM -0800, Richard Henderson wrote:
> On 01/14/2010 08:10 AM, Aurelien Jarno wrote:
>> On Tue, Jan 05, 2010 at 04:31:11PM -0800, Richard Henderson wrote:
>>> A while ago Laurent pointed out that the setcc opcode emitted by
>>> the setcond patch had unnecessary REX prefixes.
>>>
>>> The existing P_REXB internal opcode flag unconditionally emits
>>> the REX prefix.  Technically it's not needed if the register in
>>> question is %al, %bl, %cl, %dl.
>>>
>>> Eliding the prefix requires splitting the P_REXB flag into two,
>>> in order to indicate whether the byte register in question is
>>> in the REG or the R/M field.  Within TCG, the byte register is
>>> in the REG field only for stores.
>>>
>>> Signed-off-by: Richard Henderson<rth@twiddle.net>
>>> ---
>>>   tcg/x86_64/tcg-target.c |   46 ++++++++++++++++++++++++++++++----------------
>>>   1 files changed, 30 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/tcg/x86_64/tcg-target.c b/tcg/x86_64/tcg-target.c
>>> index f584c94..8b6b68c 100644
>>> --- a/tcg/x86_64/tcg-target.c
>>> +++ b/tcg/x86_64/tcg-target.c
>>> @@ -217,9 +217,10 @@ static inline int tcg_target_const_match(tcg_target_long val,
>>>   #define JCC_JLE 0xe
>>>   #define JCC_JG  0xf
>>>
>>> -#define P_EXT   0x100 /* 0x0f opcode prefix */
>>> -#define P_REXW  0x200 /* set rex.w = 1 */
>>> -#define P_REXB  0x400 /* force rex use for byte registers */
>>> +#define P_EXT		0x100		/* 0x0f opcode prefix */
>>> +#define P_REXW		0x200		/* set rex.w = 1 */
>>> +#define P_REXB_R	0x400		/* REG field as byte register */
>>> +#define P_REXB_RM	0x800		/* R/M field as byte register */
>>>
>>>   static const uint8_t tcg_cond_to_jcc[10] = {
>>>       [TCG_COND_EQ] = JCC_JE,
>>> @@ -234,17 +235,30 @@ static const uint8_t tcg_cond_to_jcc[10] = {
>>>       [TCG_COND_GTU] = JCC_JA,
>>>   };
>>>
>>> -static inline void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
>>> +static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
>>>   {
>>> -    int rex;
>>> -    rex = ((opc>>  6)&  0x8) | ((r>>  1)&  0x4) |
>>> -        ((x>>  2)&  2) | ((rm>>  3)&  1);
>>> -    if (rex || (opc&  P_REXB)) {
>>> +    int rex = 0;
>>> +
>>> +    rex |= (opc&  P_REXW)>>  6;		/* REX.W */
>>> +    rex |= (r&  8)>>  1;		/* REX.R */
>>> +    rex |= (x&  8)>>  2;		/* REX.X */
>>> +    rex |= (rm&  8)>>  3;		/* REX.B */
>>> +
>>> +    /* P_REXB_{R,RM} indicates that the given register is the low byte.
>>> +       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
>>> +       as otherwise the encoding indicates %[abcd]h.  Note that the values
>>> +       that are ORed in merely indicate that the REX byte must be present;
>>> +       those bits get discarded in output.  */
>>> +    rex |= (r>= 4 ? opc&  P_REXB_R : 0);
>>> +    rex |= (rm>= 4 ? opc&  P_REXB_RM : 0);
>>> +
>>> +    if (rex) {
>>>           tcg_out8(s, rex | 0x40);
>>>       }
>>
>> With the above change, rex can be > 0xff. Not sure it's not a good idea
>> to not have an explicit cast when calling tcg_out8(), even if it
>> technically works.
>
> Same as below.
>
>>
>>> -    if (opc&  P_EXT)
>>> +    if (opc&  P_EXT) {
>>>           tcg_out8(s, 0x0f);
>>> -    tcg_out8(s, opc&  0xff);
>>> +    }
>>> +    tcg_out8(s, opc);
>>
>> What's the reason for removing the '&  0xff' part? tcg_out8() takes an uint8_t.
>
> Yes, and the uint8_t truncates the value just fine.  Is there any  
> particular reason you want to clutter the code with a duplicate  
> truncation?  It might have been reasonable if the function name didn't  
> quite clearly indicate that a single byte was going to be output...
>

I have to say I don't really like this way of programming. It seems I
agree with gcc people, as they have created the -Wconversion option (ok,
it emits tons of warning within QEMU).

Moreover here, the second truncation removal is totally unreleated to 
this patch.
Jamie Lokier Jan. 15, 2010, 1:37 a.m. UTC | #5
Richard Henderson wrote:
> On 01/14/2010 08:10 AM, Aurelien Jarno wrote:
> >With the above change, rex can be > 0xff. Not sure it's not a good idea
> >to not have an explicit cast when calling tcg_out8(), even if it
> >technically works.

> >What's the reason for removing the '&  0xff' part? tcg_out8() takes an 
> >uint8_t.
> 
> Yes, and the uint8_t truncates the value just fine.  Is there any 
> particular reason you want to clutter the code with a duplicate 
> truncation?  It might have been reasonable if the function name didn't 
> quite clearly indicate that a single byte was going to be output...

The & 0xff makes it clear that rex > 0xff is intentional; that you
have thought about it.

Otherwise it looks like rex > 0xff might be unintentional.  Anyone can
check the code isn't mistaken, but it's better if it doesn't *look*
like a mistake.  After all, there have been mistakes in this sort of
code elsewhere many times.

In this sense, I think it's not cluttering; it's removing excessive
subtlety.

I would hope that GCC optimises the & 0xff away.

-- Jamie
diff mbox

Patch

diff --git a/tcg/x86_64/tcg-target.c b/tcg/x86_64/tcg-target.c
index f584c94..8b6b68c 100644
--- a/tcg/x86_64/tcg-target.c
+++ b/tcg/x86_64/tcg-target.c
@@ -217,9 +217,10 @@  static inline int tcg_target_const_match(tcg_target_long val,
 #define JCC_JLE 0xe
 #define JCC_JG  0xf
 
-#define P_EXT   0x100 /* 0x0f opcode prefix */
-#define P_REXW  0x200 /* set rex.w = 1 */
-#define P_REXB  0x400 /* force rex use for byte registers */
+#define P_EXT		0x100		/* 0x0f opcode prefix */
+#define P_REXW		0x200		/* set rex.w = 1 */
+#define P_REXB_R	0x400		/* REG field as byte register */
+#define P_REXB_RM	0x800		/* R/M field as byte register */
                                   
 static const uint8_t tcg_cond_to_jcc[10] = {
     [TCG_COND_EQ] = JCC_JE,
@@ -234,17 +235,30 @@  static const uint8_t tcg_cond_to_jcc[10] = {
     [TCG_COND_GTU] = JCC_JA,
 };
 
-static inline void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
+static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 {
-    int rex;
-    rex = ((opc >> 6) & 0x8) | ((r >> 1) & 0x4) | 
-        ((x >> 2) & 2) | ((rm >> 3) & 1);
-    if (rex || (opc & P_REXB)) {
+    int rex = 0;
+
+    rex |= (opc & P_REXW) >> 6;		/* REX.W */
+    rex |= (r & 8) >> 1;		/* REX.R */
+    rex |= (x & 8) >> 2;		/* REX.X */
+    rex |= (rm & 8) >> 3;		/* REX.B */
+
+    /* P_REXB_{R,RM} indicates that the given register is the low byte.
+       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
+       as otherwise the encoding indicates %[abcd]h.  Note that the values
+       that are ORed in merely indicate that the REX byte must be present;
+       those bits get discarded in output.  */
+    rex |= (r >= 4 ? opc & P_REXB_R : 0);
+    rex |= (rm >= 4 ? opc & P_REXB_RM : 0);
+
+    if (rex) {
         tcg_out8(s, rex | 0x40);
     }
-    if (opc & P_EXT)
+    if (opc & P_EXT) {
         tcg_out8(s, 0x0f);
-    tcg_out8(s, opc & 0xff);
+    }
+    tcg_out8(s, opc);
 }
 
 static inline void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
@@ -408,7 +422,7 @@  static inline void tgen_arithi32(TCGContext *s, int c, int r0, int32_t val)
         tcg_out8(s, val);
     } else if (c == ARITH_AND && val == 0xffu) {
         /* movzbl */
-        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB, r0, r0);
+        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB_RM, r0, r0);
     } else if (c == ARITH_AND && val == 0xffffu) {
         /* movzwl */
         tcg_out_modrm(s, 0xb7 | P_EXT, r0, r0);
@@ -776,7 +790,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     switch(opc) {
     case 0:
         /* movzbl */
-        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB, TCG_REG_RSI, data_reg);
+        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB_RM, TCG_REG_RSI, data_reg);
         break;
     case 1:
         /* movzwl */
@@ -829,7 +843,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     switch(opc) {
     case 0:
         /* movb */
-        tcg_out_modrm_offset(s, 0x88 | P_REXB, data_reg, r0, offset);
+        tcg_out_modrm_offset(s, 0x88 | P_REXB_R, data_reg, r0, offset);
         break;
     case 1:
         if (bswap) {
@@ -964,7 +978,7 @@  static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
     case INDEX_op_st8_i32:
     case INDEX_op_st8_i64:
         /* movb */
-        tcg_out_modrm_offset(s, 0x88 | P_REXB, args[0], args[1], args[2]);
+        tcg_out_modrm_offset(s, 0x88 | P_REXB_R, args[0], args[1], args[2]);
         break;
     case INDEX_op_st16_i32:
     case INDEX_op_st16_i64:
@@ -1161,7 +1175,7 @@  static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
         break;
 
     case INDEX_op_ext8s_i32:
-        tcg_out_modrm(s, 0xbe | P_EXT | P_REXB, args[0], args[1]);
+        tcg_out_modrm(s, 0xbe | P_EXT | P_REXB_RM, args[0], args[1]);
         break;
     case INDEX_op_ext16s_i32:
         tcg_out_modrm(s, 0xbf | P_EXT, args[0], args[1]);
@@ -1177,7 +1191,7 @@  static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
         break;
     case INDEX_op_ext8u_i32:
     case INDEX_op_ext8u_i64:
-        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB, args[0], args[1]);
+        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB_RM, args[0], args[1]);
         break;
     case INDEX_op_ext16u_i32:
     case INDEX_op_ext16u_i64: