Patchwork [v2,0/6] Implement constant folding and copy propagation in TCG

login
register
mail settings
Submitter Richard Henderson
Date June 10, 2011, 6:08 p.m.
Message ID <4DF25D9A.80606@twiddle.net>
Download mbox | patch
Permalink /patch/99942/
State New
Headers show

Comments

Richard Henderson - June 10, 2011, 6:08 p.m.
On 06/09/2011 03:45 AM, Kirill Batuzov wrote:
> Changes:
> v1 -> v2
>  - State and Vals arrays merged to an array of structures.
>  - Added reference counting of temp's copies. This helps to reset temp's state
>    faster in most cases.
>  - Do not make copy propagation through operations with TCG_OPF_CALL_CLOBBER or
>    TCG_OPF_SIDE_EFFECTS flag.
>  - Split some expression simplifications into independent switch.
>  - Let compiler handle signed shifts and sign/zero extends in it's
>    implementation defined way.
> 
> Kirill Batuzov (6):
>   Add TCG optimizations stub
>   Add copy and constant propagation.
>   Do constant folding for basic arithmetic operations.
>   Do constant folding for boolean operations.
>   Do constant folding for shift operations.
>   Do constant folding for unary operations.
> 
>  Makefile.target |    2 +-
>  tcg/optimize.c  |  633 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tcg/tcg.c       |    6 +
>  tcg/tcg.h       |    3 +
>  4 files changed, 643 insertions(+), 1 deletions(-)
>  create mode 100644 tcg/optimize.c
> 

FWIW, a patch that implements most of the suggestions that I made
to the individual patches in this thread, including a major restructure
of the body of the optimization to avoid code duplication.

I haven't bothered to break it up for now, but at least you can see 
what I was talking about.


r~

---
From 2b49e328d3d2461f97f0b6802e0c8e88e0165b1c Mon Sep 17 00:00:00 2001
From: Richard Henderson <rth@anchor.twiddle.net>
Date: Fri, 10 Jun 2011 10:08:17 -0700
Subject: [PATCH] Re-org tcg optimizer.

1: Put equivalence class members in a double-linked list.
2: Use glue to handle 32+64-bit opcodes at the same time.
3: Tidy duplicate code sequences inside tcg_constant_folding.
---
 tcg/optimize.c |  822 +++++++++++++++++++++++++++++---------------------------
 1 files changed, 419 insertions(+), 403 deletions(-)

Patch

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 2cdcc29..9768043 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -32,63 +32,124 @@ 
 #include "tcg-op.h"
 
 typedef enum {
-    TCG_TEMP_UNDEF = 0,
+    TCG_TEMP_VAR = 0,
     TCG_TEMP_CONST,
     TCG_TEMP_COPY,
-    TCG_TEMP_ANY
 } tcg_temp_state;
 
 struct tcg_temp_info {
     tcg_temp_state state;
-    uint16_t num_copies;
+    uint16_t prev_copy;
+    uint16_t next_copy;
     tcg_target_ulong val;
 };
 
-/* Reset TEMP's state to TCG_TEMP_ANY.  If TEMP was a representative of some
-   class of equivalent temp's, a new representative should be chosen in this
-   class. */
-static void reset_temp(struct tcg_temp_info *temps, TCGArg temp, int nb_temps,
-                       int nb_globals)
+/* Array TEMPS has an element for each temp.
+   If this temp holds a constant then its value is kept in .VAL.
+   If this temp is a copy of other ones then this equivalence class'
+   representative is kept in .VAL.  */
+struct tcg_temp_info temps[TCG_MAX_TEMPS];
+
+/* Avoid some of the rampant if-deffery related to 32 vs 64-bit operations.  */
+#if TCG_TARGET_REG_BITS == 64
+#define CASE_OP_32_64(x)                        \
+        glue(glue(case INDEX_op_, x), _i32):    \
+        glue(glue(case INDEX_op_, x), _i64)
+#else
+#define CASE_OP_32_64(x)                        \
+        glue(glue(case INDEX_op_, x), _i32)
+#endif
+
+
+/* Reset all temporaries to TCG_TEMP_VAR.  */
+static void reset_all_temps(int nb_temps)
 {
     int i;
-    TCGArg new_base;
-    if (temps[temp].state == TCG_TEMP_COPY) {
-        temps[temps[temp].val].num_copies--;
+    for (i = 0; i < nb_temps; ++i) {
+        temps[i].state = TCG_TEMP_VAR;
+        temps[i].prev_copy = i;
+        temps[i].next_copy = i;
     }
-    if (temps[temp].num_copies > 0) {
-        new_base = (TCGArg)-1;
-        for (i = nb_globals; i < nb_temps; i++) {
-            if (temps[i].state == TCG_TEMP_COPY && temps[i].val == temp) {
-                if (new_base == ((TCGArg)-1)) {
-                    new_base = (TCGArg)i;
-                    temps[i].state = TCG_TEMP_ANY;
-                    temps[i].num_copies = 0;
-                } else {
-                    temps[i].val = new_base;
-                    temps[new_base].num_copies++;
-                }
+}
+
+/* Reset T's state to TCG_TEMP_VAR.  If T was a representative of some
+   class of equivalent temp's, a new representative should be chosen
+   in this class. */
+static void reset_temp(int t)
+{
+    int prev = temps[t].prev_copy;
+    int next = temps[t].next_copy;
+
+    switch (temps[t].state) {
+    case TCG_TEMP_VAR:
+        /* If next (and prev) equals temp, that means there are no copies.
+           Otherwise, promote NEXT to be the new representative of the
+           equivalence class.  */
+        if (next != t) {
+            int i;
+            temps[next].state = TCG_TEMP_VAR;
+            for (i = temps[next].next_copy; i != t; i = temps[i].next_copy) {
+                temps[i].val = next;
             }
         }
-        for (i = 0; i < nb_globals; i++) {
-            if (temps[i].state == TCG_TEMP_COPY && temps[i].val == temp) {
-                if (new_base == ((TCGArg)-1)) {
-                    temps[i].state = TCG_TEMP_ANY;
-                    temps[i].num_copies = 0;
-                } else {
-                    temps[i].val = new_base;
-                    temps[new_base].num_copies++;
-                }
-            }
+        /* FALLTHRU */
+
+    case TCG_TEMP_COPY:
+        /* Unlink T from the equivalence class.  */
+        temps[prev].next_copy = next;
+        temps[next].prev_copy = prev;
+        break;
+
+    case TCG_TEMP_CONST:
+        break;
+
+    default:
+        tcg_abort();
+    }
+
+    temps[t].state = TCG_TEMP_VAR;
+    temps[t].prev_copy = t;
+    temps[t].next_copy = t;
+}
+
+static void reset_temps_bb(TCGContext *s)
+{
+    int i, nb_temps = s->nb_temps;
+
+    for (i = s->nb_globals; i < nb_temps; ++i) {
+        if (!s->temps[i].temp_local) {
+            reset_temp(i);
         }
     }
-    temps[temp].state = TCG_TEMP_ANY;
-    temps[temp].num_copies = 0;
 }
 
-static int op_bits(int op)
+static void make_copy(int dest, int src, int nb_globals)
+{
+    int prev, next, head = src;
+
+    if (temps[head].state == TCG_TEMP_COPY) {
+        head = temps[head].val;
+    }
+
+    temps[dest].state = TCG_TEMP_COPY;
+    temps[dest].val = head;
+
+    /* Insert at the tail of the list.  This will tend to choose
+       the next-oldest copy when HEAD gets flushed.  */
+    prev = temps[head].prev_copy;
+    next = head;
+
+    temps[prev].next_copy = dest;
+    temps[dest].prev_copy = prev;
+    temps[dest].next_copy = next;
+    temps[next].prev_copy = dest;
+}
+
+static int op_bits(TCGOpcode op)
 {
     switch (op) {
     case INDEX_op_mov_i32:
+    case INDEX_op_movi_i32:
     case INDEX_op_add_i32:
     case INDEX_op_sub_i32:
     case INDEX_op_mul_i32:
@@ -101,6 +162,7 @@  static int op_bits(int op)
     case INDEX_op_rotl_i32:
     case INDEX_op_rotr_i32:
     case INDEX_op_not_i32:
+    case INDEX_op_neg_i32:
     case INDEX_op_ext8s_i32:
     case INDEX_op_ext16s_i32:
     case INDEX_op_ext8u_i32:
@@ -108,6 +170,7 @@  static int op_bits(int op)
         return 32;
 #if TCG_TARGET_REG_BITS == 64
     case INDEX_op_mov_i64:
+    case INDEX_op_movi_i64:
     case INDEX_op_add_i64:
     case INDEX_op_sub_i64:
     case INDEX_op_mul_i64:
@@ -120,6 +183,7 @@  static int op_bits(int op)
     case INDEX_op_rotl_i64:
     case INDEX_op_rotr_i64:
     case INDEX_op_not_i64:
+    case INDEX_op_neg_i64:
     case INDEX_op_ext8s_i64:
     case INDEX_op_ext16s_i64:
     case INDEX_op_ext32s_i64:
@@ -134,163 +198,105 @@  static int op_bits(int op)
     }
 }
 
-static int op_to_movi(int op)
+static int op_to_movi(TCGOpcode op)
 {
-    if (op_bits(op) == 32) {
+    switch (op_bits(op)) {
+    case 32:
         return INDEX_op_movi_i32;
-    }
 #if TCG_TARGET_REG_BITS == 64
-    if (op_bits(op) == 64) {
+    case 64:
         return INDEX_op_movi_i64;
-    }
 #endif
+    }
     tcg_abort();
 }
 
-static int op_to_mov(int op)
+static int op_to_mov(TCGOpcode op)
 {
-    if (op_bits(op) == 32) {
+    switch (op_bits(op)) {
+    case 32:
         return INDEX_op_mov_i32;
-    }
 #if TCG_TARGET_REG_BITS == 64
-    if (op_bits(op) == 64) {
+    case 64:
         return INDEX_op_mov_i64;
-    }
 #endif
+    }
     tcg_abort();
 }
 
-static TCGArg do_constant_folding_2(int op, TCGArg x, TCGArg y)
+static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
 {
     switch (op) {
-    case INDEX_op_add_i32:
-#if TCG_TARGET_REG_BITS == 64
-    case INDEX_op_add_i64:
-#endif
+    CASE_OP_32_64(add):
         return x + y;
-
-    case INDEX_op_sub_i32:
-#if TCG_TARGET_REG_BITS == 64
-    case INDEX_op_sub_i64:
-#endif
+    CASE_OP_32_64(sub):
         return x - y;
-
-    case INDEX_op_mul_i32:
-#if TCG_TARGET_REG_BITS == 64
-    case INDEX_op_mul_i64:
-#endif
+    CASE_OP_32_64(mul):
         return x * y;
-
-    case INDEX_op_and_i32:
-#if TCG_TARGET_REG_BITS == 64
-    case INDEX_op_and_i64:
-#endif
+    CASE_OP_32_64(and):
         return x & y;
-
-    case INDEX_op_or_i32:
-#if TCG_TARGET_REG_BITS == 64
-    case INDEX_op_or_i64:
-#endif
+    CASE_OP_32_64(or):
         return x | y;
-
-    case INDEX_op_xor_i32:
-#if TCG_TARGET_REG_BITS == 64
-    case INDEX_op_xor_i64:
-#endif
+    CASE_OP_32_64(xor):
         return x ^ y;
 
-    case INDEX_op_shl_i32:
-#if TCG_TARGET_REG_BITS == 64
-        y &= 0xffffffff;
-    case INDEX_op_shl_i64:
-#endif
-        return x << y;
+    CASE_OP_32_64(neg):
+        return -x;
+    CASE_OP_32_64(not):
+        return ~x;
+
+    CASE_OP_32_64(shl):
+        return x << (y & (op_bits(op) - 1));
 
     case INDEX_op_shr_i32:
+        return (uint32_t)x >> (y & 31);
 #if TCG_TARGET_REG_BITS == 64
-        x &= 0xffffffff;
-        y &= 0xffffffff;
     case INDEX_op_shr_i64:
+        return (uint64_t)x >> (y & 63);
 #endif
-        /* Assuming TCGArg to be unsigned */
-        return x >> y;
 
     case INDEX_op_sar_i32:
-#if TCG_TARGET_REG_BITS == 64
-        x &= 0xffffffff;
-        y &= 0xffffffff;
-#endif
-        return (int32_t)x >> (int32_t)y;
-
+        return (int32_t)x >> (y & 31);
 #if TCG_TARGET_REG_BITS == 64
     case INDEX_op_sar_i64:
-        return (int64_t)x >> (int64_t)y;
+        return (int64_t)x >> (y & 63);
 #endif
 
     case INDEX_op_rotr_i32:
-#if TCG_TARGET_REG_BITS == 64
-        x &= 0xffffffff;
-        y &= 0xffffffff;
-#endif
-        x = (x << (32 - y)) | (x >> y);
+        y &= 31;
+        x = ((uint32_t)x << (32 - y)) | ((uint32_t)x >> y);
         return x;
-
 #if TCG_TARGET_REG_BITS == 64
     case INDEX_op_rotr_i64:
-        x = (x << (64 - y)) | (x >> y);
+        y &= 63;
+        x = ((uint64_t)x << (64 - y)) | ((uint64_t)x >> y);
         return x;
 #endif
 
     case INDEX_op_rotl_i32:
-#if TCG_TARGET_REG_BITS == 64
-        x &= 0xffffffff;
-        y &= 0xffffffff;
-#endif
-        x = (x << y) | (x >> (32 - y));
+        y &= 31;
+        x = ((uint32_t)x << y) | ((uint32_t)x >> (32 - y));
         return x;
-
 #if TCG_TARGET_REG_BITS == 64
     case INDEX_op_rotl_i64:
-        x = (x << y) | (x >> (64 - y));
+        y &= 63;
+        x = ((uint64_t)x << y) | ((uint64_t)x >> (64 - y));
         return x;
 #endif
 
-    case INDEX_op_not_i32:
-#if TCG_TARGET_REG_BITS == 64
-    case INDEX_op_not_i64:
-#endif
-        return ~x;
-
-    case INDEX_op_ext8s_i32:
-        return (int32_t)(int8_t)x;
-
-    case INDEX_op_ext16s_i32:
-        return (int32_t)(int16_t)x;
-
-    case INDEX_op_ext8u_i32:
-        return (uint32_t)(uint8_t)x;
-
-    case INDEX_op_ext16u_i32:
-        return (uint32_t)(uint16_t)x;
-
+    CASE_OP_32_64(ext8s):
+        return (int8_t)x;
+    CASE_OP_32_64(ext8u):
+        return (uint8_t)x;
+    CASE_OP_32_64(ext16s):
+        return (int16_t)x;
+    CASE_OP_32_64(ext16u):
+        return (uint16_t)x;
 #if TCG_TARGET_REG_BITS == 64
-    case INDEX_op_ext8s_i64:
-        return (int64_t)(int8_t)x;
-
-    case INDEX_op_ext16s_i64:
-        return (int64_t)(int16_t)x;
-
     case INDEX_op_ext32s_i64:
-        return (int64_t)(int32_t)x;
-
-    case INDEX_op_ext8u_i64:
-        return (uint64_t)(uint8_t)x;
-
-    case INDEX_op_ext16u_i64:
-        return (uint64_t)(uint16_t)x;
-
+        return (int32_t)x;
     case INDEX_op_ext32u_i64:
-        return (uint64_t)(uint32_t)x;
+        return (uint32_t)x;
 #endif
 
     default:
@@ -300,7 +306,7 @@  static TCGArg do_constant_folding_2(int op, TCGArg x, TCGArg y)
     }
 }
 
-static TCGArg do_constant_folding(int op, TCGArg x, TCGArg y)
+static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y)
 {
     TCGArg res = do_constant_folding_2(op, x, y);
 #if TCG_TARGET_REG_BITS == 64
@@ -315,310 +321,320 @@  static TCGArg do_constant_folding(int op, TCGArg x, TCGArg y)
 static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                                     TCGArg *args, TCGOpDef *tcg_op_defs)
 {
-    int i, nb_ops, op_index, op, nb_temps, nb_globals;
-    const TCGOpDef *def;
+    int i, nb_ops, op_index, nb_temps, nb_globals;
     TCGArg *gen_args;
-    TCGArg tmp;
-    /* Array VALS has an element for each temp.
-       If this temp holds a constant then its value is kept in VALS' element.
-       If this temp is a copy of other ones then this equivalence class'
-       representative is kept in VALS' element.
-       If this temp is neither copy nor constant then corresponding VALS'
-       element is unused. */
-    struct tcg_temp_info temps[TCG_MAX_TEMPS];
 
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
-    memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
+    reset_all_temps(nb_temps);
 
     nb_ops = tcg_opc_ptr - gen_opc_buf;
     gen_args = args;
     for (op_index = 0; op_index < nb_ops; op_index++) {
-        op = gen_opc_buf[op_index];
-        def = &tcg_op_defs[op];
-        /* Do copy propagation */
-        if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS))) {
-            assert(op != INDEX_op_call);
-            for (i = def->nb_oargs; i < def->nb_oargs + def->nb_iargs; i++) {
-                if (temps[args[i]].state == TCG_TEMP_COPY
-                    && !(def->args_ct[i].ct & TCG_CT_IALIAS)
-                    && (def->args_ct[i].ct & TCG_CT_REG)) {
-                    args[i] = temps[args[i]].val;
+        TCGOpcode op = gen_opc_buf[op_index];
+        const TCGOpDef *def = &tcg_op_defs[op];
+        TCGArg arg0, arg1, arg2, tmp;
+
+        /* The format of CALL operations is special.  */
+        if (op == INDEX_op_call) {
+            int n_in, n_out, flags;
+
+            n_in = *gen_args++ = *args++;  /* in/out argument */
+            n_out = n_in >> 16;
+            n_in &= 0xffff;
+
+            /* Both pure and const functions do not modify globals.  */
+            flags = args[n_out + n_in];
+            if ((flags & (TCG_CALL_PURE | TCG_CALL_CONST)) == 0) {
+                for (i = 0; i < nb_globals; i++) {
+                    reset_temp(i);
                 }
             }
-        }
 
-        /* Simplify expression if possible. */
-        switch (op) {
-        case INDEX_op_add_i32:
-#if TCG_TARGET_REG_BITS == 64
-        case INDEX_op_add_i64:
-#endif
-            if (temps[args[1]].state == TCG_TEMP_CONST) {
-                /* 0 + x == x + 0 */
-                tmp = args[1];
-                args[1] = args[2];
-                args[2] = tmp;
+            /* All outputs are now variable.  */
+            for (i = 0; i < n_out; i++) {
+                tmp = *args++;
+                *gen_args++ = tmp;
+                reset_temp(tmp);
             }
-            /* Fallthrough */
-        case INDEX_op_sub_i32:
-        case INDEX_op_shl_i32:
-        case INDEX_op_shr_i32:
-        case INDEX_op_sar_i32:
-        case INDEX_op_rotl_i32:
-        case INDEX_op_rotr_i32:
-#if TCG_TARGET_REG_BITS == 64
-        case INDEX_op_sub_i64:
-        case INDEX_op_shl_i64:
-        case INDEX_op_shr_i64:
-        case INDEX_op_sar_i64:
-        case INDEX_op_rotl_i64:
-        case INDEX_op_rotr_i64:
-#endif
-            if (temps[args[1]].state == TCG_TEMP_CONST) {
-                /* Proceed with possible constant folding. */
-                break;
-            }
-            if (temps[args[2]].state == TCG_TEMP_CONST
-                && temps[args[2]].val == 0) {
-                if ((temps[args[0]].state == TCG_TEMP_COPY
-                    && temps[args[0]].val == args[1])
-                    || args[0] == args[1]) {
-                    args += 3;
-                    gen_opc_buf[op_index] = INDEX_op_nop;
-                } else {
-                    reset_temp(temps, args[0], nb_temps, nb_globals);
-                    if (args[1] >= s->nb_globals) {
-                        temps[args[0]].state = TCG_TEMP_COPY;
-                        temps[args[0]].val = args[1];
-                        temps[args[1]].num_copies++;
-                    }
-                    gen_opc_buf[op_index] = op_to_mov(op);
-                    gen_args[0] = args[0];
-                    gen_args[1] = args[1];
-                    gen_args += 2;
-                    args += 3;
+
+            /* Copy propagate into input arguments.  */
+            for (i = 0; i < n_in; i++) {
+                tmp = *args++;
+                if (temps[tmp].state == TCG_TEMP_COPY) {
+                    tmp = temps[tmp].val;
                 }
-                continue;
+                *gen_args++ = tmp;
             }
-            break;
-        case INDEX_op_mul_i32:
-#if TCG_TARGET_REG_BITS == 64
-        case INDEX_op_mul_i64:
-#endif
-            if ((temps[args[1]].state == TCG_TEMP_CONST
-                 && temps[args[1]].val == 0)
-                || (temps[args[2]].state == TCG_TEMP_CONST
-                    && temps[args[2]].val == 0)) {
-                reset_temp(temps, args[0], nb_temps, nb_globals);
-                temps[args[0]].state = TCG_TEMP_CONST;
-                temps[args[0]].val = 0;
-                assert(temps[args[0]].num_copies == 0);
-                gen_opc_buf[op_index] = op_to_movi(op);
-                gen_args[0] = args[0];
-                gen_args[1] = 0;
-                args += 3;
-                gen_args += 2;
-                continue;
+
+            for (i = def->nb_cargs; i > 0; --i) {
+                *gen_args++ = *args++;
             }
-            break;
-        case INDEX_op_or_i32:
-        case INDEX_op_and_i32:
-#if TCG_TARGET_REG_BITS == 64
-        case INDEX_op_and_i64:
-        case INDEX_op_or_i64:
-#endif
-            if (args[1] == args[2]) {
-                if (args[1] == args[0]) {
-                    args += 3;
-                    gen_opc_buf[op_index] = INDEX_op_nop;
-                } else {
-                    reset_temp(temps, args[0], nb_temps, nb_globals);
-                    if (args[1] >= s->nb_globals) {
-                        temps[args[0]].state = TCG_TEMP_COPY;
-                        temps[args[0]].val = args[1];
-                        assert(temps[args[0]].num_copies == 0);
-                    }
-                    gen_opc_buf[op_index] = op_to_mov(op);
-                    gen_args[0] = args[0];
-                    gen_args[1] = args[1];
-                    gen_args += 2;
-                    args += 3;
-                }
-                continue;
+            continue;
+        }
+
+        /* Do copy propagation.  */
+        for (i = def->nb_oargs; i < def->nb_oargs + def->nb_iargs; i++) {
+            tmp = args[i];
+            if (temps[tmp].state == TCG_TEMP_COPY
+                && (def->args_ct[i].ct & TCG_CT_REG)) {
+                args[i] = temps[tmp].val;
             }
-            break;
         }
 
-        /* Propagate constants through copy operations and do constant
-           folding.  Constants will be substituted to arguments by register
-           allocator where needed and possible.  Also detect copies. */
+        /* Most everything below operates on unary or binary ops.  We
+           can encourage better code by pulling the arguments into
+           registers now.  At the same time, filter out ops that we
+           don't handle at all.  */
+        arg2 = 0;
         switch (op) {
-        case INDEX_op_mov_i32:
+        CASE_OP_32_64(add):
+        CASE_OP_32_64(sub):
+        CASE_OP_32_64(mul):
+        CASE_OP_32_64(and):
+        CASE_OP_32_64(or):
+        CASE_OP_32_64(xor):
+        CASE_OP_32_64(shl):
+        CASE_OP_32_64(shr):
+        CASE_OP_32_64(sar):
+        CASE_OP_32_64(rotl):
+        CASE_OP_32_64(rotr):
+            arg2 = args[2];
+            /* FALLTHRU */
+
+        CASE_OP_32_64(mov):
+        CASE_OP_32_64(movi):
+        CASE_OP_32_64(not):
+        CASE_OP_32_64(neg):
+        CASE_OP_32_64(ext8s):
+        CASE_OP_32_64(ext8u):
+        CASE_OP_32_64(ext16s):
+        CASE_OP_32_64(ext16u):
 #if TCG_TARGET_REG_BITS == 64
-        case INDEX_op_mov_i64:
+        case INDEX_op_ext32s_i64:
+        case INDEX_op_ext32u_i64:
 #endif
-            if ((temps[args[1]].state == TCG_TEMP_COPY
-                && temps[args[1]].val == args[0])
-                || args[0] == args[1]) {
-                args += 2;
-                gen_opc_buf[op_index] = INDEX_op_nop;
-                break;
+            arg0 = args[0];
+            arg1 = args[1];
+            break;
+
+        case INDEX_op_set_label:
+            /* Flush info at the beginning of an extended basic block.  */
+            reset_all_temps(nb_temps);
+            *gen_args++ = *args++;
+            continue;
+
+        default:
+            /* At the end of a basic block (but not extended bb)
+               we must flush all bb temporaries.  */
+            if (def->flags & TCG_OPF_BB_END) {
+                reset_temps_bb(s);
             }
-            if (temps[args[1]].state != TCG_TEMP_CONST) {
-                reset_temp(temps, args[0], nb_temps, nb_globals);
-                if (args[1] >= s->nb_globals) {
-                    temps[args[0]].state = TCG_TEMP_COPY;
-                    temps[args[0]].val = args[1];
-                    temps[args[1]].num_copies++;
-                }
-                gen_args[0] = args[0];
-                gen_args[1] = args[1];
-                gen_args += 2;
-                args += 2;
-                break;
+
+            /* An unhandled opcode.  Outputs are variable.  */
+            for (i = 0; i < def->nb_oargs; i++) {
+                reset_temp(args[i]);
+            }
+            for (i = def->nb_args; i > 0; --i) {
+                *gen_args++ = *args++;
+            }
+            continue;
+        }
+
+        /* Do full constant folding.  */
+        switch (op) {
+        CASE_OP_32_64(add):
+        CASE_OP_32_64(sub):
+        CASE_OP_32_64(mul):
+        CASE_OP_32_64(and):
+        CASE_OP_32_64(or):
+        CASE_OP_32_64(xor):
+        CASE_OP_32_64(shl):
+        CASE_OP_32_64(shr):
+        CASE_OP_32_64(sar):
+        CASE_OP_32_64(rotl):
+        CASE_OP_32_64(rotr):
+            if (temps[arg1].state == TCG_TEMP_CONST
+                && temps[arg2].state == TCG_TEMP_CONST) {
+                arg1 = do_constant_folding(op, temps[arg1].val,
+                                           temps[arg2].val);
+                gen_opc_buf[op_index] = op_to_movi(op);
+                goto do_const_prop;
             }
-            /* Source argument is constant.  Rewrite the operation and
-               let movi case handle it. */
-            op = op_to_movi(op);
-            gen_opc_buf[op_index] = op;
-            args[1] = temps[args[1]].val;
-            /* fallthrough */
-        case INDEX_op_movi_i32:
-#if TCG_TARGET_REG_BITS == 64
-        case INDEX_op_movi_i64:
-#endif
-            reset_temp(temps, args[0], nb_temps, nb_globals);
-            temps[args[0]].state = TCG_TEMP_CONST;
-            temps[args[0]].val = args[1];
-            assert(temps[args[0]].num_copies == 0);
-            gen_args[0] = args[0];
-            gen_args[1] = args[1];
-            gen_args += 2;
-            args += 2;
             break;
-        case INDEX_op_not_i32:
-        case INDEX_op_ext8s_i32:
-        case INDEX_op_ext16s_i32:
-        case INDEX_op_ext8u_i32:
-        case INDEX_op_ext16u_i32:
+
+        CASE_OP_32_64(not):
+        CASE_OP_32_64(neg):
+        CASE_OP_32_64(ext8s):
+        CASE_OP_32_64(ext8u):
+        CASE_OP_32_64(ext16s):
+        CASE_OP_32_64(ext16u):
 #if TCG_TARGET_REG_BITS == 64
-        case INDEX_op_not_i64:
-        case INDEX_op_ext8s_i64:
-        case INDEX_op_ext16s_i64:
         case INDEX_op_ext32s_i64:
-        case INDEX_op_ext8u_i64:
-        case INDEX_op_ext16u_i64:
         case INDEX_op_ext32u_i64:
 #endif
-            if (temps[args[1]].state == TCG_TEMP_CONST) {
+            if (temps[arg1].state == TCG_TEMP_CONST) {
+                arg1 = do_constant_folding(op, temps[arg1].val, 0);
                 gen_opc_buf[op_index] = op_to_movi(op);
-                gen_args[0] = args[0];
-                gen_args[1] = do_constant_folding(op, temps[args[1]].val, 0);
-                reset_temp(temps, gen_args[0], nb_temps, nb_globals);
-                temps[gen_args[0]].state = TCG_TEMP_CONST;
-                temps[gen_args[0]].val = gen_args[1];
-                assert(temps[gen_args[0]].num_copies == 0);
-                gen_args += 2;
-                args += 2;
-                break;
-            } else {
-                reset_temp(temps, args[0], nb_temps, nb_globals);
-                gen_args[0] = args[0];
-                gen_args[1] = args[1];
-                gen_args += 2;
-                args += 2;
-                break;
+                goto do_const_prop;
             }
-        case INDEX_op_or_i32:
-        case INDEX_op_and_i32:
-        case INDEX_op_xor_i32:
-        case INDEX_op_add_i32:
-        case INDEX_op_sub_i32:
-        case INDEX_op_mul_i32:
-        case INDEX_op_shl_i32:
-        case INDEX_op_shr_i32:
-        case INDEX_op_sar_i32:
-        case INDEX_op_rotl_i32:
-        case INDEX_op_rotr_i32:
-#if TCG_TARGET_REG_BITS == 64
-        case INDEX_op_and_i64:
-        case INDEX_op_or_i64:
-        case INDEX_op_xor_i64:
-        case INDEX_op_add_i64:
-        case INDEX_op_sub_i64:
-        case INDEX_op_mul_i64:
-        case INDEX_op_shl_i64:
-        case INDEX_op_shr_i64:
-        case INDEX_op_sar_i64:
-        case INDEX_op_rotl_i64:
-        case INDEX_op_rotr_i64:
-#endif
-            if (temps[args[1]].state == TCG_TEMP_CONST
-                && temps[args[2]].state == TCG_TEMP_CONST) {
-                gen_opc_buf[op_index] = op_to_movi(op);
-                gen_args[0] = args[0];
-                gen_args[1] =
-                    do_constant_folding(op, temps[args[1]].val,
-                                        temps[args[2]].val);
-                reset_temp(temps, gen_args[0], nb_temps, nb_globals);
-                temps[gen_args[0]].state = TCG_TEMP_CONST;
-                temps[gen_args[0]].val = gen_args[1];
-                assert(temps[gen_args[0]].num_copies == 0);
-                gen_args += 2;
-                args += 3;
-                break;
-            } else {
-                reset_temp(temps, args[0], nb_temps, nb_globals);
-                gen_args[0] = args[0];
-                gen_args[1] = args[1];
-                gen_args[2] = args[2];
-                gen_args += 3;
-                args += 3;
-                break;
+            break;
+
+        default:
+            break;
+        }
+
+        /* For commutative operations, put the constant second.
+           Also swap operands to help matching operands.  */
+        switch (op) {
+        CASE_OP_32_64(add):
+        CASE_OP_32_64(mul):
+        CASE_OP_32_64(and):
+        CASE_OP_32_64(or):
+        CASE_OP_32_64(xor):
+            if (temps[arg1].state == TCG_TEMP_CONST || arg0 == arg2) {
+                tmp = arg1;
+                arg1 = arg2;
+                arg2 = tmp;
             }
-        case INDEX_op_call:
-            for (i = 0; i < nb_globals; i++) {
-                reset_temp(temps, i, nb_temps, nb_globals);
+            break;
+
+        default:
+            break;
+        }
+
+        /* Simplify expressions involving one constant.  */
+        switch (op) {
+        CASE_OP_32_64(add):
+        CASE_OP_32_64(sub):
+        CASE_OP_32_64(shl):
+        CASE_OP_32_64(shr):
+        CASE_OP_32_64(sar):
+        CASE_OP_32_64(rotl):
+        CASE_OP_32_64(rotr):
+        CASE_OP_32_64(or):
+        CASE_OP_32_64(xor):
+            if (temps[arg2].state == TCG_TEMP_CONST && temps[arg2].val == 0) {
+                goto do_copy_const_prop;
             }
-            for (i = 0; i < (args[0] >> 16); i++) {
-                reset_temp(temps, args[i + 1], nb_temps, nb_globals);
+            break;
+
+        CASE_OP_32_64(mul):
+        CASE_OP_32_64(and):
+            if (temps[arg2].state == TCG_TEMP_CONST && temps[arg2].val == 0) {
+                arg1 = 0;
+                goto do_const_prop;
             }
-            i = (args[0] >> 16) + (args[0] & 0xffff) + 3;
-            while (i) {
-                *gen_args = *args;
-                args++;
-                gen_args++;
-                i--;
+            break;
+
+        default:
+            break;
+        }
+
+        /* Simplify expressions involving no constants.  */
+        switch (op) {
+        CASE_OP_32_64(and):
+        CASE_OP_32_64(or):
+            if (arg1 == arg2) {
+                goto do_copy_const_prop;
             }
             break;
-        case INDEX_op_set_label:
-        case INDEX_op_jmp:
-        case INDEX_op_br:
-        case INDEX_op_brcond_i32:
-#if TCG_TARGET_REG_BITS == 64
-        case INDEX_op_brcond_i64:
-#endif
-            memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
-            for (i = 0; i < def->nb_args; i++) {
-                *gen_args = *args;
-                args++;
-                gen_args++;
+
+        CASE_OP_32_64(xor):
+            if (arg1 == arg2) {
+                arg1 = 0;
+                goto do_const_prop;
             }
             break;
+
         default:
-            /* Default case: we do know nothing about operation so no
-               propagation is done.  We only trash output args.  */
-            for (i = 0; i < def->nb_oargs; i++) {
-                reset_temp(temps, args[i], nb_temps, nb_globals);
+            break;
+        }
+
+        /* Finish constant and copy propagation.  */
+        switch (op) {
+        CASE_OP_32_64(mov):
+        do_copy_const_prop:
+            /* Notice copies back to self.  */
+            if (arg0 == arg1) {
+            do_nop:
+                gen_opc_buf[op_index] = INDEX_op_nop;
+                break;
+            }
+
+            if (temps[arg1].state == TCG_TEMP_CONST) {
+                arg1 = temps[arg1].val;
+                goto do_const_prop;
             }
-            for (i = 0; i < def->nb_args; i++) {
-                gen_args[i] = args[i];
+
+            /* Notice copies through another member of the equivalence.  */
+            for (i = temps[arg1].next_copy; i != arg1; i = temps[i].next_copy) {
+                if (i == arg0) {
+                    goto do_nop;
+                }
             }
-            args += def->nb_args;
-            gen_args += def->nb_args;
+
+            /* Set up the equivalence class for copy propagation.  */
+            reset_temp(arg0);
+            make_copy(arg0, arg1, nb_globals);
+
+            /* "Copy" the move down.  Note that we get here by any path
+               that simplifies to a move, so OP may not itself be a move.  */
+            gen_opc_buf[op_index] = op_to_mov(op);
+            gen_args[0] = arg0;
+            gen_args[1] = arg1;
+            gen_args += 2;
             break;
+
+        CASE_OP_32_64(movi):
+        do_const_prop:
+            reset_temp(arg0);
+            temps[arg0].state = TCG_TEMP_CONST;
+            temps[arg0].val = arg1;
+
+            gen_opc_buf[op_index] = op_to_movi(op);
+            gen_args[0] = arg0;
+            gen_args[1] = arg1;
+            gen_args += 2;
+            break;
+
+        CASE_OP_32_64(add):
+        CASE_OP_32_64(sub):
+        CASE_OP_32_64(mul):
+        CASE_OP_32_64(and):
+        CASE_OP_32_64(or):
+        CASE_OP_32_64(xor):
+        CASE_OP_32_64(shl):
+        CASE_OP_32_64(shr):
+        CASE_OP_32_64(sar):
+        CASE_OP_32_64(rotl):
+        CASE_OP_32_64(rotr):
+            /* Given that the opcode is already as simple as we can make it,
+               and the result is not a simple copy or constant, the output
+               must now be variable.  */
+            reset_temp(arg0);
+            gen_args[0] = arg0;
+            gen_args[1] = arg1;
+            gen_args[2] = arg2;
+            gen_args += 3;
+            break;
+
+        CASE_OP_32_64(not):
+        CASE_OP_32_64(neg):
+        CASE_OP_32_64(ext8s):
+        CASE_OP_32_64(ext8u):
+        CASE_OP_32_64(ext16s):
+        CASE_OP_32_64(ext16u):
+            reset_temp(arg0);
+            gen_args[0] = arg0;
+            gen_args[1] = arg1;
+            gen_args += 2;
+            break;
+
+        default:
+            tcg_abort();
         }
+        args += def->nb_args;
     }
 
     return gen_args;