Patchwork [39/57] target-i386: optimize flags checking after sub using CC_SRC2

login
register
mail settings
Submitter Richard Henderson
Date Jan. 24, 2013, 4:03 a.m.
Message ID <1359000221-19834-40-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/215204/
State New
Headers show

Comments

Richard Henderson - Jan. 24, 2013, 4:03 a.m.
After a comparison or subtraction, the original value of the LHS will
currently be reconstructed using an addition.  However, in most cases
it is already available: store it in a temp-local variable and save 1
or 2 TCG ops (2 if the result of the addition needs to be extended).

The temp-local can be declared dead as soon as the cc_op changes again,
and also before the translation block ends, because gen_prepare_cc will
always make a copy before returning it.  All this magic, plus copy
propagation and dead-code elimination, ensures that the temp-local will
(almost) never be spilled.

Example (cmp $0x21,%rax + jbe), first as TCG ops, then as generated host code:

 Before                                     After
----------------------------------------------------------------------------
 movi_i64 tmp1,$0x21                        movi_i64 tmp1,$0x21
 movi_i64 cc_src,$0x21                      movi_i64 cc_src,$0x21
 sub_i64 cc_dst,rax,tmp1                    sub_i64 cc_dst,rax,tmp1
 add_i64 tmp7,cc_dst,cc_src
 movi_i32 cc_op,$0x11                       movi_i32 cc_op,$0x11
 brcond_i64 tmp7,cc_src,leu,$0x0            discard loc11
                                            brcond_i64 rax,cc_src,leu,$0x0

 Before                                     After
----------------------------------------------------------------------------
  mov    (%r14),%rbp                        mov    (%r14),%rbp
  mov    %rbp,%rbx                          mov    %rbp,%rbx
  sub    $0x21,%rbx                         sub    $0x21,%rbx
  lea    0x21(%rbx),%r12
  movl   $0x11,0xa0(%r14)                   movl   $0x11,0xa0(%r14)
  movq   $0x21,0x90(%r14)                   movq   $0x21,0x90(%r14)
  mov    %rbx,0x98(%r14)                    mov    %rbx,0x98(%r14)
  cmp    $0x21,%r12                     |   cmp    $0x21,%rbp
  jbe    ...                                jbe    ...

[rth: Make cc_src2 a global rather than a temp local.  We're about to use it
for other things, so there's no point in creating a local now.  That also
means that we cannot consider cc_src2 dead before exceptions are taken.]

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-i386/cpu.h       |  3 ++-
 target-i386/translate.c | 59 +++++++++++++++++++++++++++++++++++--------------
 2 files changed, 45 insertions(+), 17 deletions(-)

Patch

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 04ac339..868627e 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -725,8 +725,9 @@  typedef struct CPUX86State {
                         stored elsewhere */
 
     /* emulator internal eflags handling */
-    target_ulong cc_src;
     target_ulong cc_dst;
+    target_ulong cc_src;
+    target_ulong cc_src2;
     uint32_t cc_op;
     int32_t df; /* D flag : 1 if D = 0, -1 if D = 1 */
     uint32_t hflags; /* TB flags, see HF_xxx constants. These flags
diff --git a/target-i386/translate.c b/target-i386/translate.c
index b3ba93e..aaee393 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -60,7 +60,7 @@ 
 
 /* global register indexes */
 static TCGv_ptr cpu_env;
-static TCGv cpu_A0, cpu_cc_src, cpu_cc_dst;
+static TCGv cpu_A0, cpu_cc_src, cpu_cc_dst, cpu_cc_src2;
 static TCGv_i32 cpu_cc_op;
 static TCGv cpu_regs[CPU_NB_REGS];
 /* local temps */
@@ -185,8 +185,9 @@  enum {
 };
 
 enum {
-    USES_CC_DST = 1,
-    USES_CC_SRC = 2,
+    USES_CC_DST  = 1,
+    USES_CC_SRC  = 2,
+    USES_CC_SRC2 = 4,
 };
 
 /* Bit set if the global variable is live after setting CC_OP to X.  */
@@ -196,7 +197,7 @@  static const uint8_t cc_op_live[CC_OP_NB] = {
     [CC_OP_MULB ... CC_OP_MULQ] = USES_CC_DST | USES_CC_SRC,
     [CC_OP_ADDB ... CC_OP_ADDQ] = USES_CC_DST | USES_CC_SRC,
     [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC,
-    [CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
     [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC,
     [CC_OP_LOGICB ... CC_OP_LOGICQ] = USES_CC_DST,
     [CC_OP_INCB ... CC_OP_INCQ] = USES_CC_DST | USES_CC_SRC,
@@ -205,6 +206,12 @@  static const uint8_t cc_op_live[CC_OP_NB] = {
     [CC_OP_SARB ... CC_OP_SARQ] = USES_CC_DST | USES_CC_SRC,
 };
 
+/* Bit set if the global variable is live, but merely an optimization
+   within the TB and not used to compute EFLAGS.  */
+static const uint8_t cc_op_opt[CC_OP_NB] = {
+    [CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_SRC2,
+};
+
 static void set_cc_op(DisasContext *s, CCOp op)
 {
     int dead;
@@ -214,13 +221,20 @@  static void set_cc_op(DisasContext *s, CCOp op)
     }
 
     /* Discard CC computation that will no longer be used.  */
-    dead = cc_op_live[s->cc_op] & ~cc_op_live[op];
+    if (op == CC_OP_DYNAMIC) {
+        dead = cc_op_opt[s->cc_op];
+    } else {
+        dead = cc_op_live[s->cc_op] & ~cc_op_live[op];
+    }
     if (dead & USES_CC_DST) {
         tcg_gen_discard_tl(cpu_cc_dst);
     }
     if (dead & USES_CC_SRC) {
         tcg_gen_discard_tl(cpu_cc_src);
     }
+    if (dead & USES_CC_SRC2) {
+        tcg_gen_discard_tl(cpu_cc_src2);
+    }
 
     s->cc_op = op;
     /* The DYNAMIC setting is translator only, and should never be
@@ -869,8 +883,9 @@  static inline void gen_op_testl_T0_T1_cc(void)
 
 static void gen_op_update_neg_cc(void)
 {
-    tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]);
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+    tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]);
+    tcg_gen_movi_tl(cpu_cc_src2, 0);
 }
 
 /* compute all eflags to cc_src */
@@ -903,12 +918,12 @@  static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
 
     switch (s->cc_op) {
     case CC_OP_SUBB ... CC_OP_SUBQ:
-        /* (DATA_TYPE)(CC_DST + CC_SRC) < (DATA_TYPE)CC_SRC */
+        /* (DATA_TYPE)CC_SRC2 < (DATA_TYPE)CC_SRC */
         size = s->cc_op - CC_OP_SUBB;
         t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
         /* If no temporary was used, be careful not to alias t1 and t0.  */
         t0 = TCGV_EQUAL(t1, cpu_cc_src) ? cpu_tmp0 : reg;
-        tcg_gen_add_tl(t0, cpu_cc_dst, cpu_cc_src);
+        tcg_gen_mov_tl(t0, cpu_cc_src2);
         gen_extu(size, t0);
         goto add_sub;
 
@@ -1046,7 +1061,7 @@  static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
         size = s->cc_op - CC_OP_SUBB;
         switch (jcc_op) {
         case JCC_BE:
-            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
+            tcg_gen_mov_tl(cpu_tmp4, cpu_cc_src2);
             gen_extu(size, cpu_tmp4);
             t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
             cc = (CCPrepare) { .cond = TCG_COND_LEU, .reg = cpu_tmp4,
@@ -1059,7 +1074,7 @@  static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
         case JCC_LE:
             cond = TCG_COND_LE;
         fast_jcc_l:
-            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
+            tcg_gen_mov_tl(cpu_tmp4, cpu_cc_src2);
             gen_exts(size, cpu_tmp4);
             t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, true);
             cc = (CCPrepare) { .cond = cond, .reg = cpu_tmp4,
@@ -1414,6 +1429,10 @@  static void gen_op(DisasContext *s1, int op, int ot, int d)
         set_cc_op(s1, CC_OP_DYNAMIC);
         break;
     case OP_SBBL:
+        /*
+         * No need to store cpu_cc_src2, because it is used only
+         * when the cc_op is known.
+         */
         gen_compute_eflags_c(s1, cpu_tmp4);
         tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_tmp4);
@@ -1438,12 +1457,14 @@  static void gen_op(DisasContext *s1, int op, int ot, int d)
         set_cc_op(s1, CC_OP_ADDB + ot);
         break;
     case OP_SUBL:
+        tcg_gen_mov_tl(cpu_tmp0, cpu_T[0]);
         tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         if (d != OR_TMP0)
             gen_op_mov_reg_T0(ot, d);
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update2_cc();
+        tcg_gen_mov_tl(cpu_cc_src2, cpu_tmp0);
         set_cc_op(s1, CC_OP_SUBB + ot);
         break;
     default:
@@ -1476,6 +1497,7 @@  static void gen_op(DisasContext *s1, int op, int ot, int d)
         break;
     case OP_CMPL:
         tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
+        tcg_gen_mov_tl(cpu_cc_src2, cpu_T[0]);
         tcg_gen_sub_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
         set_cc_op(s1, CC_OP_SUBB + ot);
         break;
@@ -2799,8 +2821,9 @@  static void gen_eob(DisasContext *s)
    direct call to the next block may occur */
 static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num)
 {
+    gen_update_cc_op(s);
+    set_cc_op(s, CC_OP_DYNAMIC);
     if (s->jmp_opt) {
-        gen_update_cc_op(s);
         gen_goto_tb(s, tb_num, eip);
         s->is_jmp = DISAS_TB_JUMP;
     } else {
@@ -5017,9 +5040,10 @@  static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 rm = 0; /* avoid warning */
             }
             label1 = gen_new_label();
-            tcg_gen_sub_tl(t2, cpu_regs[R_EAX], t0);
+            tcg_gen_mov_tl(t2, cpu_regs[R_EAX]);
+            gen_extu(ot, t0);
             gen_extu(ot, t2);
-            tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
+            tcg_gen_brcond_tl(TCG_COND_EQ, t2, t0, label1);
             label2 = gen_new_label();
             if (mod == 3) {
                 gen_op_mov_reg_v(ot, R_EAX, t0);
@@ -5038,7 +5062,8 @@  static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             }
             gen_set_label(label2);
             tcg_gen_mov_tl(cpu_cc_src, t0);
-            tcg_gen_mov_tl(cpu_cc_dst, t2);
+            tcg_gen_mov_tl(cpu_cc_src2, t2);
+            tcg_gen_sub_tl(cpu_cc_dst, t2, t0);
             set_cc_op(s, CC_OP_SUBB + ot);
             tcg_temp_free(t0);
             tcg_temp_free(t1);
@@ -7746,10 +7771,12 @@  void optimize_flags_init(void)
     cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
     cpu_cc_op = tcg_global_mem_new_i32(TCG_AREG0,
                                        offsetof(CPUX86State, cc_op), "cc_op");
-    cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src),
-                                    "cc_src");
     cpu_cc_dst = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_dst),
                                     "cc_dst");
+    cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src),
+                                    "cc_src");
+    cpu_cc_src2 = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src2),
+                                     "cc_src2");
 
 #ifdef TARGET_X86_64
     cpu_regs[R_EAX] = tcg_global_mem_new_i64(TCG_AREG0,