
[v4,04/33] tcg-aarch64: Hoist common argument loads in tcg_out_op

Message ID 1379195690-6509-5-git-send-email-rth@twiddle.net
State New

Commit Message

Richard Henderson Sept. 14, 2013, 9:54 p.m. UTC
This reduces the code size of the function significantly.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/aarch64/tcg-target.c | 95 +++++++++++++++++++++++++-----------------------
 1 file changed, 50 insertions(+), 45 deletions(-)

Comments

Claudio Fontana Sept. 16, 2013, 7:42 a.m. UTC | #1
Hello Richard,

On 14.09.2013 23:54, Richard Henderson wrote:
> This reduces the code size of the function significantly.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/aarch64/tcg-target.c | 95 +++++++++++++++++++++++++-----------------------
>  1 file changed, 50 insertions(+), 45 deletions(-)
> 
> diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
> index 8f19b50..8f5814d 100644
> --- a/tcg/aarch64/tcg-target.c
> +++ b/tcg/aarch64/tcg-target.c
> @@ -1113,15 +1113,22 @@ static inline void tcg_out_load_pair(TCGContext *s, TCGReg addr,
>  }
>  
>  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
> -                       const TCGArg *args, const int *const_args)
> +                       const TCGArg args[TCG_MAX_OP_ARGS],
> +                       const int const_args[TCG_MAX_OP_ARGS])
>  {
>      /* 99% of the time, we can signal the use of extension registers
>         by looking to see if the opcode handles 64-bit data.  */
>      TCGType ext = (tcg_op_defs[opc].flags & TCG_OPF_64BIT) != 0;
>  
> +    /* Hoist the loads of the most common arguments.  */
> +    TCGArg a0 = args[0];
> +    TCGArg a1 = args[1];
> +    TCGArg a2 = args[2];
> +    int c2 = const_args[2];
> +

Either all or none (add c0, c1); I would expect the compiler not to generate code for the paths that don't use c[n].

Btw, if the compiler generates bloated code without this, we should notify the projects working on gcc for aarch64.

>      switch (opc) {
>      case INDEX_op_exit_tb:
> -        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, args[0]);
> +        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
>          tcg_out_goto(s, (tcg_target_long)tb_ret_addr);
>          break;
>  
> @@ -1130,23 +1137,23 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
>  #error "USE_DIRECT_JUMP required for aarch64"
>  #endif
>          assert(s->tb_jmp_offset != NULL); /* consistency for USE_DIRECT_JUMP */
> -        s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
> +        s->tb_jmp_offset[a0] = s->code_ptr - s->code_buf;
>          /* actual branch destination will be patched by
>             aarch64_tb_set_jmp_target later, beware retranslation. */
>          tcg_out_goto_noaddr(s);
> -        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
> +        s->tb_next_offset[a0] = s->code_ptr - s->code_buf;
>          break;
>  
>      case INDEX_op_call:
>          if (const_args[0]) {
> -            tcg_out_call(s, args[0]);
> +            tcg_out_call(s, a0);
>          } else {
> -            tcg_out_callr(s, args[0]);
> +            tcg_out_callr(s, a0);
>          }
>          break;
>  
>      case INDEX_op_br:
> -        tcg_out_goto_label(s, args[0]);
> +        tcg_out_goto_label(s, a0);
>          break;
>  
>      case INDEX_op_ld_i32:
> @@ -1169,97 +1176,95 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
>      case INDEX_op_st16_i64:
>      case INDEX_op_st32_i64:
>          tcg_out_ldst(s, aarch64_ldst_get_data(opc), aarch64_ldst_get_type(opc),
> -                     args[0], args[1], args[2]);
> +                     a0, a1, a2);
>          break;
>  
>      case INDEX_op_add_i64:
>      case INDEX_op_add_i32:
> -        tcg_out_arith(s, ARITH_ADD, ext, args[0], args[1], args[2], 0);
> +        tcg_out_arith(s, ARITH_ADD, ext, a0, a1, a2, 0);
>          break;
>  
>      case INDEX_op_sub_i64:
>      case INDEX_op_sub_i32:
> -        tcg_out_arith(s, ARITH_SUB, ext, args[0], args[1], args[2], 0);
> +        tcg_out_arith(s, ARITH_SUB, ext, a0, a1, a2, 0);
>          break;
>  
>      case INDEX_op_and_i64:
>      case INDEX_op_and_i32:
> -        tcg_out_arith(s, ARITH_AND, ext, args[0], args[1], args[2], 0);
> +        tcg_out_arith(s, ARITH_AND, ext, a0, a1, a2, 0);
>          break;
>  
>      case INDEX_op_or_i64:
>      case INDEX_op_or_i32:
> -        tcg_out_arith(s, ARITH_OR, ext, args[0], args[1], args[2], 0);
> +        tcg_out_arith(s, ARITH_OR, ext, a0, a1, a2, 0);
>          break;
>  
>      case INDEX_op_xor_i64:
>      case INDEX_op_xor_i32:
> -        tcg_out_arith(s, ARITH_XOR, ext, args[0], args[1], args[2], 0);
> +        tcg_out_arith(s, ARITH_XOR, ext, a0, a1, a2, 0);
>          break;
>  
>      case INDEX_op_mul_i64:
>      case INDEX_op_mul_i32:
> -        tcg_out_mul(s, ext, args[0], args[1], args[2]);
> +        tcg_out_mul(s, ext, a0, a1, a2);
>          break;
>  
>      case INDEX_op_shl_i64:
>      case INDEX_op_shl_i32:
> -        if (const_args[2]) {    /* LSL / UBFM Wd, Wn, (32 - m) */
> -            tcg_out_shl(s, ext, args[0], args[1], args[2]);
> +        if (c2) {    /* LSL / UBFM Wd, Wn, (32 - m) */
> +            tcg_out_shl(s, ext, a0, a1, a2);
>          } else {                /* LSL / LSLV */
> -            tcg_out_shiftrot_reg(s, SRR_SHL, ext, args[0], args[1], args[2]);
> +            tcg_out_shiftrot_reg(s, SRR_SHL, ext, a0, a1, a2);
>          }
>          break;
>  
>      case INDEX_op_shr_i64:
>      case INDEX_op_shr_i32:
> -        if (const_args[2]) {    /* LSR / UBFM Wd, Wn, m, 31 */
> -            tcg_out_shr(s, ext, args[0], args[1], args[2]);
> +        if (c2) {    /* LSR / UBFM Wd, Wn, m, 31 */
> +            tcg_out_shr(s, ext, a0, a1, a2);
>          } else {                /* LSR / LSRV */
> -            tcg_out_shiftrot_reg(s, SRR_SHR, ext, args[0], args[1], args[2]);
> +            tcg_out_shiftrot_reg(s, SRR_SHR, ext, a0, a1, a2);
>          }
>          break;
>  
>      case INDEX_op_sar_i64:
>      case INDEX_op_sar_i32:
> -        if (const_args[2]) {    /* ASR / SBFM Wd, Wn, m, 31 */
> -            tcg_out_sar(s, ext, args[0], args[1], args[2]);
> +        if (c2) {    /* ASR / SBFM Wd, Wn, m, 31 */
> +            tcg_out_sar(s, ext, a0, a1, a2);
>          } else {                /* ASR / ASRV */
> -            tcg_out_shiftrot_reg(s, SRR_SAR, ext, args[0], args[1], args[2]);
> +            tcg_out_shiftrot_reg(s, SRR_SAR, ext, a0, a1, a2);
>          }
>          break;
>  
>      case INDEX_op_rotr_i64:
>      case INDEX_op_rotr_i32:
> -        if (const_args[2]) {    /* ROR / EXTR Wd, Wm, Wm, m */
> -            tcg_out_rotr(s, ext, args[0], args[1], args[2]);
> +        if (c2) {    /* ROR / EXTR Wd, Wm, Wm, m */
> +            tcg_out_rotr(s, ext, a0, a1, a2);
>          } else {                /* ROR / RORV */
> -            tcg_out_shiftrot_reg(s, SRR_ROR, ext, args[0], args[1], args[2]);
> +            tcg_out_shiftrot_reg(s, SRR_ROR, ext, a0, a1, a2);
>          }
>          break;
>  
>      case INDEX_op_rotl_i64:
>      case INDEX_op_rotl_i32:     /* same as rotate right by (32 - m) */
> -        if (const_args[2]) {    /* ROR / EXTR Wd, Wm, Wm, 32 - m */
> -            tcg_out_rotl(s, ext, args[0], args[1], args[2]);
> +        if (c2) {    /* ROR / EXTR Wd, Wm, Wm, 32 - m */
> +            tcg_out_rotl(s, ext, a0, a1, a2);
>          } else {
> -            tcg_out_arith(s, ARITH_SUB, 0,
> -                          TCG_REG_TMP, TCG_REG_XZR, args[2], 0);
> -            tcg_out_shiftrot_reg(s, SRR_ROR, ext,
> -                                 args[0], args[1], TCG_REG_TMP);
> +            tcg_out_arith(s, ARITH_SUB, 0, TCG_REG_TMP, TCG_REG_XZR, a2, 0);
> +            tcg_out_shiftrot_reg(s, SRR_ROR, ext, a0, a1, TCG_REG_TMP);
>          }
>          break;
>  
>      case INDEX_op_brcond_i64:
> -    case INDEX_op_brcond_i32: /* CMP 0, 1, cond(2), label 3 */
> -        tcg_out_cmp(s, ext, args[0], args[1], 0);
> -        tcg_out_goto_label_cond(s, args[2], args[3]);
> +    case INDEX_op_brcond_i32:
> +        tcg_out_cmp(s, ext, a0, a1, 0);
> +        tcg_out_goto_label_cond(s, a2, args[3]);
>          break;
>  
>      case INDEX_op_setcond_i64:
>      case INDEX_op_setcond_i32:
> -        tcg_out_cmp(s, ext, args[1], args[2], 0);
> -        tcg_out_cset(s, 0, args[0], args[3]);
> +        tcg_out_cmp(s, ext, a1, a2, 0);
> +        tcg_out_cset(s, 0, a0, args[3]);
>          break;
>  
>      case INDEX_op_qemu_ld8u:
> @@ -1305,34 +1310,34 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          /* FALLTHRU */
>      case INDEX_op_bswap64_i64:
>      case INDEX_op_bswap32_i32:
> -        tcg_out_rev(s, ext, args[0], args[1]);
> +        tcg_out_rev(s, ext, a0, a1);
>          break;
>      case INDEX_op_bswap16_i64:
>      case INDEX_op_bswap16_i32:
> -        tcg_out_rev16(s, 0, args[0], args[1]);
> +        tcg_out_rev16(s, 0, a0, a1);
>          break;
>  
>      case INDEX_op_ext8s_i64:
>      case INDEX_op_ext8s_i32:
> -        tcg_out_sxt(s, ext, 0, args[0], args[1]);
> +        tcg_out_sxt(s, ext, 0, a0, a1);
>          break;
>      case INDEX_op_ext16s_i64:
>      case INDEX_op_ext16s_i32:
> -        tcg_out_sxt(s, ext, 1, args[0], args[1]);
> +        tcg_out_sxt(s, ext, 1, a0, a1);
>          break;
>      case INDEX_op_ext32s_i64:
> -        tcg_out_sxt(s, 1, 2, args[0], args[1]);
> +        tcg_out_sxt(s, 1, 2, a0, a1);
>          break;
>      case INDEX_op_ext8u_i64:
>      case INDEX_op_ext8u_i32:
> -        tcg_out_uxt(s, 0, args[0], args[1]);
> +        tcg_out_uxt(s, 0, a0, a1);
>          break;
>      case INDEX_op_ext16u_i64:
>      case INDEX_op_ext16u_i32:
> -        tcg_out_uxt(s, 1, args[0], args[1]);
> +        tcg_out_uxt(s, 1, a0, a1);
>          break;
>      case INDEX_op_ext32u_i64:
> -        tcg_out_movr(s, 0, args[0], args[1]);
> +        tcg_out_movr(s, 0, a0, a1);
>          break;
>  
>      case INDEX_op_mov_i64:
> 

Claudio
Richard Henderson Sept. 16, 2013, 4:20 p.m. UTC | #2
On 09/16/2013 12:42 AM, Claudio Fontana wrote:
>> +    /* Hoist the loads of the most common arguments.  */
>> +    TCGArg a0 = args[0];
>> +    TCGArg a1 = args[1];
>> +    TCGArg a2 = args[2];
>> +    int c2 = const_args[2];
>> +
> Either all or none (add c0, c1); I would expect the compiler not to
> generate code for the paths that don't use c[n].

I chose the most common ones, those used in 90% of the cases.

> 
> Btw, if the compiler generates bloated code without this, we should notify
> the projects working on gcc for aarch64.

It's not the compiler's fault.  After parameter decomposition, the arrays
become pointers, and the compiler can't tell that it's always safe to perform
the loads.  So in general it can't hoist the loads higher than the first
explicit reference that proves the pointers must be non-null.
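
As a minimal sketch of the problem (an invented example, not the actual
TCG code):

    int f(int op, const int *p)
    {
        switch (op) {
        case 0:
            return 42;       /* p is never dereferenced on this path */
        default:
            return p[0];     /* first load that proves p is valid */
        }
    }

The load of p[0] cannot be hoisted above the switch: on the case-0 path
p is never touched, and for all the compiler knows it could be null or
point to fewer than one element there.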

Now that I think about it, we might actually do better, generically, to package
all of these arguments up into a struct.  The compiler can more easily reason
about the collective safety of structure access...
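
Something like this, hypothetically (TCGOpArgs is a name invented here
for illustration):

    typedef struct {
        TCGArg args[TCG_MAX_OP_ARGS];
        int const_args[TCG_MAX_OP_ARGS];
    } TCGOpArgs;

    static void tcg_out_op(TCGContext *s, TCGOpcode opc,
                           const TCGOpArgs *a);

Once any field of *a has been read, every other field of the same object
is known to be accessible, so the compiler could hoist the remaining
loads to the top of the function.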


r~
Claudio Fontana Sept. 17, 2013, 8:01 a.m. UTC | #3
On 16.09.2013 18:20, Richard Henderson wrote:
> On 09/16/2013 12:42 AM, Claudio Fontana wrote:
>>> +    /* Hoist the loads of the most common arguments.  */
>>> +    TCGArg a0 = args[0];
>>> +    TCGArg a1 = args[1];
>>> +    TCGArg a2 = args[2];
>>> +    int c2 = const_args[2];
>>> +
>> Either all or none (add c0, c1); I would expect the compiler not to
>> generate code for the paths that don't use c[n].
> 
> I chose the most common ones, those used in 90% of the cases.

What you did is clear, but it does not change the fact that mixing the hoisted variables with direct uses of args[] is confusing.
So either we add int c0 and int c1 and use the hoisted variables exclusively afterwards, or we don't do it at all.

>> Btw, if the compiler generates bloated code without this, we should notify
>> the projects working on gcc for aarch64.
> 
> It's not the compiler's fault.  After parameter decomposition, the arrays
> become pointers, and the compiler can't tell that it's always safe to perform
> the loads.  So in general it can't hoist the loads higher than the first
> explicit reference that proves the pointers must be non-null.
> 
> Now that I think about it, we might actually do better, generically, to package
> all of these arguments up into a struct.  The compiler can more easily reason
> about the collective safety of structure access...

I don't have anything against it in principle, but just adding c0 and c1, which iirc should cover all uses, would be fine by me.

Claudio
Richard Henderson Sept. 17, 2013, 2:27 p.m. UTC | #4
On 09/17/2013 01:01 AM, Claudio Fontana wrote:
> I don't have anything against it in principle, but just adding c0 and c1,
> which iirc should cover all uses, would be fine by me.

Not really.

There are 6 potential args[] values, 5 of which might be const_args[].


r~
Claudio Fontana Sept. 18, 2013, 8:10 a.m. UTC | #5
On 17.09.2013 16:27, Richard Henderson wrote:
> On 09/17/2013 01:01 AM, Claudio Fontana wrote:
>> I don't have anything against it in principle, but just adding c0 and c1,
>> which iirc should cover all uses, would be fine by me.
> 
> Not really.
> 
> There are 6 potential args[] values, 5 of which might be const_args[].
> 

Current uses, i.e. the args[] values that are actually used right now.
Richard Henderson Sept. 18, 2013, 2 p.m. UTC | #6
On 09/18/2013 01:10 AM, Claudio Fontana wrote:
> On 17.09.2013 16:27, Richard Henderson wrote:
>> On 09/17/2013 01:01 AM, Claudio Fontana wrote:
>>> I don't have anything against it in principle, but just adding c0 and c1,
>>> which iirc should cover all uses, would be fine by me.
>>
>> Not really.
>>
>> There are 6 potential args[] values, 5 of which might be const_args[].
>>
> 
> Current uses, i.e. the args[] values that are actually used right now.

That's what I'm talking about.  E.g. movcond: 6 args[], of which args
1-4 are inputs that might have const_args[] set.
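
    /* movcond argument layout (as I recall it): args[0] = output,
       args[1], args[2] = comparison operands, args[3], args[4] = move
       sources, args[5] = condition; args[1..4] may each be constant.  */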


r~
Claudio Fontana Sept. 18, 2013, 2:18 p.m. UTC | #7
On 17.09.2013 16:27, Richard Henderson wrote:
> On 09/17/2013 01:01 AM, Claudio Fontana wrote:
>> I don't have anything against it in principle, but just adding c0 and c1,
>> which iirc should cover all uses, would be fine by me.
> 
> Not really.
> 
> There are 6 potential args[] values, 5 of which might be const_args[].
> 
> 
> r~

Ok, adding all of those would be a waste; I did not take the movcond changes into consideration.
Your initial proposal was indeed the best tradeoff then; let's just hoist the most commonly used args.

C.

Patch

diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 8f19b50..8f5814d 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -1113,15 +1113,22 @@  static inline void tcg_out_load_pair(TCGContext *s, TCGReg addr,
 }
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
-                       const TCGArg *args, const int *const_args)
+                       const TCGArg args[TCG_MAX_OP_ARGS],
+                       const int const_args[TCG_MAX_OP_ARGS])
 {
     /* 99% of the time, we can signal the use of extension registers
        by looking to see if the opcode handles 64-bit data.  */
     TCGType ext = (tcg_op_defs[opc].flags & TCG_OPF_64BIT) != 0;
 
+    /* Hoist the loads of the most common arguments.  */
+    TCGArg a0 = args[0];
+    TCGArg a1 = args[1];
+    TCGArg a2 = args[2];
+    int c2 = const_args[2];
+
     switch (opc) {
     case INDEX_op_exit_tb:
-        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, args[0]);
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
         tcg_out_goto(s, (tcg_target_long)tb_ret_addr);
         break;
 
@@ -1130,23 +1137,23 @@  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 #error "USE_DIRECT_JUMP required for aarch64"
 #endif
         assert(s->tb_jmp_offset != NULL); /* consistency for USE_DIRECT_JUMP */
-        s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
+        s->tb_jmp_offset[a0] = s->code_ptr - s->code_buf;
         /* actual branch destination will be patched by
            aarch64_tb_set_jmp_target later, beware retranslation. */
         tcg_out_goto_noaddr(s);
-        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
+        s->tb_next_offset[a0] = s->code_ptr - s->code_buf;
         break;
 
     case INDEX_op_call:
         if (const_args[0]) {
-            tcg_out_call(s, args[0]);
+            tcg_out_call(s, a0);
         } else {
-            tcg_out_callr(s, args[0]);
+            tcg_out_callr(s, a0);
         }
         break;
 
     case INDEX_op_br:
-        tcg_out_goto_label(s, args[0]);
+        tcg_out_goto_label(s, a0);
         break;
 
     case INDEX_op_ld_i32:
@@ -1169,97 +1176,95 @@  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_st16_i64:
     case INDEX_op_st32_i64:
         tcg_out_ldst(s, aarch64_ldst_get_data(opc), aarch64_ldst_get_type(opc),
-                     args[0], args[1], args[2]);
+                     a0, a1, a2);
         break;
 
     case INDEX_op_add_i64:
     case INDEX_op_add_i32:
-        tcg_out_arith(s, ARITH_ADD, ext, args[0], args[1], args[2], 0);
+        tcg_out_arith(s, ARITH_ADD, ext, a0, a1, a2, 0);
         break;
 
     case INDEX_op_sub_i64:
     case INDEX_op_sub_i32:
-        tcg_out_arith(s, ARITH_SUB, ext, args[0], args[1], args[2], 0);
+        tcg_out_arith(s, ARITH_SUB, ext, a0, a1, a2, 0);
         break;
 
     case INDEX_op_and_i64:
     case INDEX_op_and_i32:
-        tcg_out_arith(s, ARITH_AND, ext, args[0], args[1], args[2], 0);
+        tcg_out_arith(s, ARITH_AND, ext, a0, a1, a2, 0);
         break;
 
     case INDEX_op_or_i64:
     case INDEX_op_or_i32:
-        tcg_out_arith(s, ARITH_OR, ext, args[0], args[1], args[2], 0);
+        tcg_out_arith(s, ARITH_OR, ext, a0, a1, a2, 0);
         break;
 
     case INDEX_op_xor_i64:
     case INDEX_op_xor_i32:
-        tcg_out_arith(s, ARITH_XOR, ext, args[0], args[1], args[2], 0);
+        tcg_out_arith(s, ARITH_XOR, ext, a0, a1, a2, 0);
         break;
 
     case INDEX_op_mul_i64:
     case INDEX_op_mul_i32:
-        tcg_out_mul(s, ext, args[0], args[1], args[2]);
+        tcg_out_mul(s, ext, a0, a1, a2);
         break;
 
     case INDEX_op_shl_i64:
     case INDEX_op_shl_i32:
-        if (const_args[2]) {    /* LSL / UBFM Wd, Wn, (32 - m) */
-            tcg_out_shl(s, ext, args[0], args[1], args[2]);
+        if (c2) {    /* LSL / UBFM Wd, Wn, (32 - m) */
+            tcg_out_shl(s, ext, a0, a1, a2);
         } else {                /* LSL / LSLV */
-            tcg_out_shiftrot_reg(s, SRR_SHL, ext, args[0], args[1], args[2]);
+            tcg_out_shiftrot_reg(s, SRR_SHL, ext, a0, a1, a2);
         }
         break;
 
     case INDEX_op_shr_i64:
     case INDEX_op_shr_i32:
-        if (const_args[2]) {    /* LSR / UBFM Wd, Wn, m, 31 */
-            tcg_out_shr(s, ext, args[0], args[1], args[2]);
+        if (c2) {    /* LSR / UBFM Wd, Wn, m, 31 */
+            tcg_out_shr(s, ext, a0, a1, a2);
         } else {                /* LSR / LSRV */
-            tcg_out_shiftrot_reg(s, SRR_SHR, ext, args[0], args[1], args[2]);
+            tcg_out_shiftrot_reg(s, SRR_SHR, ext, a0, a1, a2);
         }
         break;
 
     case INDEX_op_sar_i64:
     case INDEX_op_sar_i32:
-        if (const_args[2]) {    /* ASR / SBFM Wd, Wn, m, 31 */
-            tcg_out_sar(s, ext, args[0], args[1], args[2]);
+        if (c2) {    /* ASR / SBFM Wd, Wn, m, 31 */
+            tcg_out_sar(s, ext, a0, a1, a2);
         } else {                /* ASR / ASRV */
-            tcg_out_shiftrot_reg(s, SRR_SAR, ext, args[0], args[1], args[2]);
+            tcg_out_shiftrot_reg(s, SRR_SAR, ext, a0, a1, a2);
         }
         break;
 
     case INDEX_op_rotr_i64:
     case INDEX_op_rotr_i32:
-        if (const_args[2]) {    /* ROR / EXTR Wd, Wm, Wm, m */
-            tcg_out_rotr(s, ext, args[0], args[1], args[2]);
+        if (c2) {    /* ROR / EXTR Wd, Wm, Wm, m */
+            tcg_out_rotr(s, ext, a0, a1, a2);
         } else {                /* ROR / RORV */
-            tcg_out_shiftrot_reg(s, SRR_ROR, ext, args[0], args[1], args[2]);
+            tcg_out_shiftrot_reg(s, SRR_ROR, ext, a0, a1, a2);
         }
         break;
 
     case INDEX_op_rotl_i64:
     case INDEX_op_rotl_i32:     /* same as rotate right by (32 - m) */
-        if (const_args[2]) {    /* ROR / EXTR Wd, Wm, Wm, 32 - m */
-            tcg_out_rotl(s, ext, args[0], args[1], args[2]);
+        if (c2) {    /* ROR / EXTR Wd, Wm, Wm, 32 - m */
+            tcg_out_rotl(s, ext, a0, a1, a2);
         } else {
-            tcg_out_arith(s, ARITH_SUB, 0,
-                          TCG_REG_TMP, TCG_REG_XZR, args[2], 0);
-            tcg_out_shiftrot_reg(s, SRR_ROR, ext,
-                                 args[0], args[1], TCG_REG_TMP);
+            tcg_out_arith(s, ARITH_SUB, 0, TCG_REG_TMP, TCG_REG_XZR, a2, 0);
+            tcg_out_shiftrot_reg(s, SRR_ROR, ext, a0, a1, TCG_REG_TMP);
         }
         break;
 
     case INDEX_op_brcond_i64:
-    case INDEX_op_brcond_i32: /* CMP 0, 1, cond(2), label 3 */
-        tcg_out_cmp(s, ext, args[0], args[1], 0);
-        tcg_out_goto_label_cond(s, args[2], args[3]);
+    case INDEX_op_brcond_i32:
+        tcg_out_cmp(s, ext, a0, a1, 0);
+        tcg_out_goto_label_cond(s, a2, args[3]);
         break;
 
     case INDEX_op_setcond_i64:
     case INDEX_op_setcond_i32:
-        tcg_out_cmp(s, ext, args[1], args[2], 0);
-        tcg_out_cset(s, 0, args[0], args[3]);
+        tcg_out_cmp(s, ext, a1, a2, 0);
+        tcg_out_cset(s, 0, a0, args[3]);
         break;
 
     case INDEX_op_qemu_ld8u:
@@ -1305,34 +1310,34 @@  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
         /* FALLTHRU */
     case INDEX_op_bswap64_i64:
     case INDEX_op_bswap32_i32:
-        tcg_out_rev(s, ext, args[0], args[1]);
+        tcg_out_rev(s, ext, a0, a1);
         break;
     case INDEX_op_bswap16_i64:
     case INDEX_op_bswap16_i32:
-        tcg_out_rev16(s, 0, args[0], args[1]);
+        tcg_out_rev16(s, 0, a0, a1);
         break;
 
     case INDEX_op_ext8s_i64:
     case INDEX_op_ext8s_i32:
-        tcg_out_sxt(s, ext, 0, args[0], args[1]);
+        tcg_out_sxt(s, ext, 0, a0, a1);
         break;
     case INDEX_op_ext16s_i64:
     case INDEX_op_ext16s_i32:
-        tcg_out_sxt(s, ext, 1, args[0], args[1]);
+        tcg_out_sxt(s, ext, 1, a0, a1);
         break;
     case INDEX_op_ext32s_i64:
-        tcg_out_sxt(s, 1, 2, args[0], args[1]);
+        tcg_out_sxt(s, 1, 2, a0, a1);
         break;
     case INDEX_op_ext8u_i64:
     case INDEX_op_ext8u_i32:
-        tcg_out_uxt(s, 0, args[0], args[1]);
+        tcg_out_uxt(s, 0, a0, a1);
         break;
     case INDEX_op_ext16u_i64:
     case INDEX_op_ext16u_i32:
-        tcg_out_uxt(s, 1, args[0], args[1]);
+        tcg_out_uxt(s, 1, a0, a1);
         break;
     case INDEX_op_ext32u_i64:
-        tcg_out_movr(s, 0, args[0], args[1]);
+        tcg_out_movr(s, 0, a0, a1);
         break;
 
     case INDEX_op_mov_i64: