diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index df1bdfc..b4e8e74 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -165,7 +165,12 @@ static inline int tcg_target_const_match(tcg_target_long val,
 
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
+#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
+#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
+#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
+#define OPC_DEC_r32	(0x48)
 #define OPC_BSWAP	(0xc8 | P_EXT)
+#define OPC_INC_r32	(0x40)
 #define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
 #define OPC_JCC_short	(0x70)		/* ... plus condition code */
 #define OPC_JMP_long	(0xe9)
@@ -180,6 +185,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_SHIFT_1	(0xd1)
 #define OPC_SHIFT_Ib	(0xc1)
 #define OPC_SHIFT_cl	(0xd3)
+#define OPC_TESTL	(0x85)
 
 /* Group 1 opcode extensions for 0x80-0x83.  */
 #define ARITH_ADD 0
@@ -280,6 +286,12 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, int rm,
     }
 }
 
+/* Emit dest op= src; subop is an ARITH_* code, as in tgen_arithi.  */
+static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
+{
+    tcg_out_modrm(s, OPC_ARITH_GvEv | (subop << 3), dest, src);
+}
+
 static inline void tcg_out_mov(TCGContext *s, int ret, int arg)
 {
     if (arg != ret) {
@@ -291,8 +303,7 @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
                                 int ret, int32_t arg)
 {
     if (arg == 0) {
-        /* xor r0,r0 */
-        tcg_out_modrm(s, 0x01 | (ARITH_XOR << 3), ret, ret);
+        tgen_arithr(s, ARITH_XOR, ret, ret);
     } else {
         tcg_out8(s, 0xb8 + ret);
         tcg_out32(s, arg);
@@ -374,14 +385,15 @@ static inline void tcg_out_rolw_8(TCGContext *s, int reg)
     tcg_out_shifti(s, SHIFT_ROL, reg, 8);
 }
 
-static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
+static inline void tgen_arithi(TCGContext *s, int c, int r0,
+                               int32_t val, int cf)
 {
-    if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
-        /* inc */
-        tcg_out_opc(s, 0x40 + r0);
-    } else if (!cf && ((c == ARITH_ADD && val == -1) || (c == ARITH_SUB && val == 1))) {
-        /* dec */
-        tcg_out_opc(s, 0x48 + r0);
+    /* ??? While INC and DEC are 2 bytes shorter than ADDL/SUBL $1, they
+       also induce partial flags update stalls on Pentium4 and are not
+       recommended by current Intel optimization manuals.  */
+    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
+        int opc = ((c == ARITH_ADD) ^ (val < 0) ? OPC_INC_r32 : OPC_DEC_r32);
+        tcg_out_opc(s, opc + r0);
     } else if (val == (int8_t)val) {
         tcg_out_modrm(s, OPC_ARITH_EvIb, c, r0);
         tcg_out8(s, val);
@@ -454,12 +466,12 @@ static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
     if (const_arg2) {
         if (arg2 == 0) {
             /* test r, r */
-            tcg_out_modrm(s, 0x85, arg1, arg1);
+            tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
         } else {
             tgen_arithi(s, ARITH_CMP, arg1, arg2, 0);
         }
     } else {
-        tcg_out_modrm(s, 0x01 | (ARITH_CMP << 3), arg2, arg1);
+        tgen_arithr(s, ARITH_CMP, arg1, arg2);
     }
 }
 
@@ -674,7 +686,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     tcg_out32(s, offsetof(CPUState, tlb_table[mem_index][0].addr_read));
 
     /* cmp 0(r1), r0 */
-    tcg_out_modrm_offset(s, 0x3b, r0, r1, 0);
+    tcg_out_modrm_offset(s, OPC_CMP_GvEv, r0, r1, 0);
     
     tcg_out_mov(s, r0, addr_reg);
     
@@ -690,7 +702,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     s->code_ptr++;
     
     /* cmp 4(r1), addr_reg2 */
-    tcg_out_modrm_offset(s, 0x3b, addr_reg2, r1, 4);
+    tcg_out_modrm_offset(s, OPC_CMP_GvEv, addr_reg2, r1, 4);
 
     /* je label1 */
     tcg_out8(s, OPC_JCC_short + JCC_JE);
@@ -749,7 +761,8 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     *label1_ptr = s->code_ptr - label1_ptr - 1;
 
     /* add x(r1), r0 */
-    tcg_out_modrm_offset(s, 0x03, r0, r1, offsetof(CPUTLBEntry, addend) - 
+    tcg_out_modrm_offset(s, OPC_ADD_GvEv, r0, r1,
+                         offsetof(CPUTLBEntry, addend) - 
                          offsetof(CPUTLBEntry, addr_read));
 #else
     r0 = addr_reg;
@@ -864,7 +877,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out32(s, offsetof(CPUState, tlb_table[mem_index][0].addr_write));
 
     /* cmp 0(r1), r0 */
-    tcg_out_modrm_offset(s, 0x3b, r0, r1, 0);
+    tcg_out_modrm_offset(s, OPC_CMP_GvEv, r0, r1, 0);
     
     tcg_out_mov(s, r0, addr_reg);
     
@@ -880,7 +893,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     s->code_ptr++;
     
     /* cmp 4(r1), addr_reg2 */
-    tcg_out_modrm_offset(s, 0x3b, addr_reg2, r1, 4);
+    tcg_out_modrm_offset(s, OPC_CMP_GvEv, addr_reg2, r1, 4);
 
     /* je label1 */
     tcg_out8(s, OPC_JCC_short + JCC_JE);
@@ -961,7 +974,8 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     *label1_ptr = s->code_ptr - label1_ptr - 1;
 
     /* add x(r1), r0 */
-    tcg_out_modrm_offset(s, 0x03, r0, r1, offsetof(CPUTLBEntry, addend) - 
+    tcg_out_modrm_offset(s, OPC_ADD_GvEv, r0, r1,
+                         offsetof(CPUTLBEntry, addend) - 
                          offsetof(CPUTLBEntry, addr_write));
 #else
     r0 = addr_reg;
@@ -1113,7 +1127,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         if (const_args[2]) {
             tgen_arithi(s, c, args[0], args[2], 0);
         } else {
-            tcg_out_modrm(s, 0x01 | (c << 3), args[2], args[0]);
+            tgen_arithr(s, c, args[0], args[2]);
         }
         break;
     case INDEX_op_mul_i32:
@@ -1163,24 +1177,28 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         goto gen_shift32;
 
     case INDEX_op_add2_i32:
-        if (const_args[4]) 
+        if (const_args[4]) {
             tgen_arithi(s, ARITH_ADD, args[0], args[4], 1);
-        else
-            tcg_out_modrm(s, 0x01 | (ARITH_ADD << 3), args[4], args[0]);
-        if (const_args[5]) 
+        } else {
+            tgen_arithr(s, ARITH_ADD, args[0], args[4]);
+        }
+        if (const_args[5]) {
             tgen_arithi(s, ARITH_ADC, args[1], args[5], 1);
-        else
-            tcg_out_modrm(s, 0x01 | (ARITH_ADC << 3), args[5], args[1]);
+        } else {
+            tgen_arithr(s, ARITH_ADC, args[1], args[5]);
+        }
         break;
     case INDEX_op_sub2_i32:
-        if (const_args[4]) 
+        if (const_args[4]) {
             tgen_arithi(s, ARITH_SUB, args[0], args[4], 1);
-        else
-            tcg_out_modrm(s, 0x01 | (ARITH_SUB << 3), args[4], args[0]);
-        if (const_args[5]) 
+        } else {
+            tgen_arithr(s, ARITH_SUB, args[0], args[4]);
+        }
+        if (const_args[5]) {
             tgen_arithi(s, ARITH_SBB, args[1], args[5], 1);
-        else
-            tcg_out_modrm(s, 0x01 | (ARITH_SBB << 3), args[5], args[1]);
+        } else {
+            tgen_arithr(s, ARITH_SBB, args[1], args[5]);
+        }
         break;
     case INDEX_op_brcond_i32:
         tcg_out_brcond(s, args[2], args[0], args[1], const_args[1],
