Patchwork [14/14] i386: optimize setcc instructions

login
register
mail settings
Submitter Paolo Bonzini
Date Oct. 6, 2012, 12:30 p.m.
Message ID <1349526621-13939-15-git-send-email-pbonzini@redhat.com>
Download mbox | patch
Permalink /patch/189688/
State New
Headers show

Comments

Paolo Bonzini - Oct. 6, 2012, 12:30 p.m.
Reconstruct the arguments for complex conditions involving CC_OP_SUBx (BE,
L, LE).  In the others do it via setcond and gen_setcc_slow (which is
not that slow in many cases).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target-i386/translate.c | 93 +++++++++++++++++++------------------------------
 1 file changed, 36 insertions(+), 57 deletions(-)
Blue Swirl - Oct. 7, 2012, 7:58 p.m.
On Sat, Oct 6, 2012 at 12:30 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Reconstruct the arguments for complex conditions involving CC_OP_SUBx (BE,
> L, LE).  In the others do it via setcond and gen_setcc_slow (which is
> not that slow in many cases).

I think it would also be useful to reconstruct the arguments for add, inc
and dec along the same lines; the others are probably not used so often.

>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  target-i386/translate.c | 93 +++++++++++++++++++------------------------------
>  1 file changed, 36 insertions(+), 57 deletions(-)
>
> diff --git a/target-i386/translate.c b/target-i386/translate.c
> index 342b9ec..92e8291 100644
> --- a/target-i386/translate.c
> +++ b/target-i386/translate.c
> @@ -1063,55 +1063,55 @@ static inline void gen_setcc_slow(DisasContext *s, int jcc_op, TCGv reg, bool in
>      }
>  }
>
> -/* return true if setcc_slow is not needed (WARNING: must be kept in
> -   sync with gen_jcc1) */
> -static int is_fast_jcc_case(DisasContext *s, int b)
> +/* perform a conditional store into register 'reg' according to jump opcode
> +   value 'b'. In the fast case, T0 is guaranteed not to be used. */
> +static inline void gen_setcc1(DisasContext *s, int b, TCGv reg)
>  {
> -    int jcc_op;
> +    int inv, jcc_op, size, cond;
> +    TCGv t0;
> +
> +    inv = b & 1;
>      jcc_op = (b >> 1) & 7;
> +
>      switch(s->cc_op) {
> -        /* we optimize the cmp/jcc case */
> +        /* we optimize relational operators for the cmp/jcc case */
>      case CC_OP_SUBB:
>      case CC_OP_SUBW:
>      case CC_OP_SUBL:
>      case CC_OP_SUBQ:
> -        if (jcc_op == JCC_O || jcc_op == JCC_P)
> -            goto slow_jcc;
> -        break;
> -
> -        /* some jumps are easy to compute */
> -    case CC_OP_ADDB:
> -    case CC_OP_ADDW:
> -    case CC_OP_ADDL:
> -    case CC_OP_ADDQ:
> -
> -    case CC_OP_LOGICB:
> -    case CC_OP_LOGICW:
> -    case CC_OP_LOGICL:
> -    case CC_OP_LOGICQ:
> -
> -    case CC_OP_INCB:
> -    case CC_OP_INCW:
> -    case CC_OP_INCL:
> -    case CC_OP_INCQ:
> +        size = s->cc_op - CC_OP_SUBB;
> +        switch(jcc_op) {
> +        case JCC_BE:
> +            cond = inv ? TCG_COND_GTU : TCG_COND_LEU;
> +            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
> +            gen_extu(size, cpu_tmp4);
> +            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
> +            tcg_gen_setcond_tl(cond, reg, cpu_tmp4, t0);
> +            break;
>
> -    case CC_OP_DECB:
> -    case CC_OP_DECW:
> -    case CC_OP_DECL:
> -    case CC_OP_DECQ:
> +        case JCC_L:
> +            cond = inv ? TCG_COND_GE : TCG_COND_LT;
> +            goto fast_jcc_l;
> +        case JCC_LE:
> +            cond = inv ? TCG_COND_GT : TCG_COND_LE;
> +        fast_jcc_l:
> +            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
> +            gen_exts(size, cpu_tmp4);
> +            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, true);
> +            tcg_gen_setcond_tl(cond, reg, cpu_tmp4, t0);
> +            break;
>
> -    case CC_OP_SHLB:
> -    case CC_OP_SHLW:
> -    case CC_OP_SHLL:
> -    case CC_OP_SHLQ:
> -        if (jcc_op != JCC_Z && jcc_op != JCC_S)
> +        default:
>              goto slow_jcc;
> +        }
>          break;
> +
>      default:
>      slow_jcc:
> -        return 0;
> +        /* gen_setcc_slow actually generates good code for JC, JZ and JS */
> +        gen_setcc_slow(s, jcc_op, reg, inv);
> +        break;
>      }
> -    return 1;
>  }
>
>  /* generate a conditional jump to label 'l1' according to jump opcode
> @@ -2477,28 +2477,7 @@ static inline void gen_jcc(DisasContext *s, int b,
>
>  static void gen_setcc(DisasContext *s, int b)
>  {
> -    int inv, jcc_op, l1;
> -    TCGv t0;
> -
> -    if (is_fast_jcc_case(s, b)) {
> -        /* nominal case: we use a jump */
> -        /* XXX: make it faster by adding new instructions in TCG */
> -        t0 = tcg_temp_local_new();
> -        tcg_gen_movi_tl(t0, 0);
> -        l1 = gen_new_label();
> -        gen_jcc1(s, b ^ 1, l1);
> -        tcg_gen_movi_tl(t0, 1);
> -        gen_set_label(l1);
> -        tcg_gen_mov_tl(cpu_T[0], t0);
> -        tcg_temp_free(t0);
> -    } else {
> -        /* slow case: it is more efficient not to generate a jump,
> -           although it is questionnable whether this optimization is
> -           worth to */
> -        inv = b & 1;
> -        jcc_op = (b >> 1) & 7;
> -        gen_setcc_slow(s, jcc_op, cpu_T[0], inv);
> -    }
> +    gen_setcc1(s, b, cpu_T[0]);
>  }
>
>  static inline void gen_op_movl_T0_seg(int seg_reg)
> --
> 1.7.12.1
>
>
Richard Henderson - Oct. 9, 2012, 8:22 p.m.
On 10/06/2012 05:30 AM, Paolo Bonzini wrote:
> +static inline void gen_setcc1(DisasContext *s, int b, TCGv reg)
>  {
> +    int inv, jcc_op, size, cond;
> +    TCGv t0;
> +
> +    inv = b & 1;
>      jcc_op = (b >> 1) & 7;
> +
>      switch(s->cc_op) {
> +        /* we optimize relational operators for the cmp/jcc case */
>      case CC_OP_SUBB:
>      case CC_OP_SUBW:
>      case CC_OP_SUBL:
>      case CC_OP_SUBQ:
> +        size = s->cc_op - CC_OP_SUBB;
> +        switch(jcc_op) {
> +        case JCC_BE:
> +            cond = inv ? TCG_COND_GTU : TCG_COND_LEU;
> +            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
> +            gen_extu(size, cpu_tmp4);
> +            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
> +            tcg_gen_setcond_tl(cond, reg, cpu_tmp4, t0);
> +            break;

I don't think this patch is going in the right direction.  In particular,
this is going to be largely redundant with gen_jcc1.

Instead, cf. the DisasCompare structure now present in target-sparc/,
or a similar DisasCompare structure present in my jumbo target-s390x
patch set.  Here we use common code to generate a comparison, which
can then be fed into brcond, setcond, or movcond as desired.

I think that this Compare structure should be fed to gen_compute_eflags_*
so that a parent gen_condition routine can make use of them for simple
conditions like z/nz.

At which point gen_jcc1 and gen_setcc1 become fairly trivial routines.


r~
Paolo Bonzini - Oct. 10, 2012, 6:51 a.m.
On 09/10/2012 22:22, Richard Henderson wrote:
> On 10/06/2012 05:30 AM, Paolo Bonzini wrote:
>> +static inline void gen_setcc1(DisasContext *s, int b, TCGv reg)
>>  {
>> +    int inv, jcc_op, size, cond;
>> +    TCGv t0;
>> +
>> +    inv = b & 1;
>>      jcc_op = (b >> 1) & 7;
>> +
>>      switch(s->cc_op) {
>> +        /* we optimize relational operators for the cmp/jcc case */
>>      case CC_OP_SUBB:
>>      case CC_OP_SUBW:
>>      case CC_OP_SUBL:
>>      case CC_OP_SUBQ:
>> +        size = s->cc_op - CC_OP_SUBB;
>> +        switch(jcc_op) {
>> +        case JCC_BE:
>> +            cond = inv ? TCG_COND_GTU : TCG_COND_LEU;
>> +            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
>> +            gen_extu(size, cpu_tmp4);
>> +            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
>> +            tcg_gen_setcond_tl(cond, reg, cpu_tmp4, t0);
>> +            break;
> 
> I don't think this patch is going in the right direction.  In particular,
> this is going to be largely redundant with gen_jcc1.

Yes, it is.  That's something I had started after posting this series,
but didn't finish in time for the weekend... :)

You can look at a few more changes in the eflags2 branch of my github
repo, including:

- delaying the actual generation of conditions, so that they can be used
in setcond/brcond/movcond

- optimization of setle/setl similar to setbe (shift OF onto SF, XOR,
mask to SF or SF+ZF, after which you can already do a brcond)

There are also TCG changes that add zero-bit tracking to optimize.c to
eliminate redundant ext (leading to both better code generation and
better copy propagation).

Paolo

> Instead, c.f. the DisasCompare structure now present in target-sparc/,
> or a similar DisasCompare structure present in my jumbo target-s390x
> patch set.  Here we use common code to generate a comparison, which
> can then be fed into brcond, setcond, or movcond as desired.
> 
> I think that this Compare structure should be fed to gen_compute_eflags_*
> so that a parent gen_condition routine can make use of them for simple
> conditions like z/nz.
> 
> At which point gen_jcc1 and gen_setcc1 become fairly trivial routines.
> 
> 
> r~
> 
>

Patch

diff --git a/target-i386/translate.c b/target-i386/translate.c
index 342b9ec..92e8291 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -1063,55 +1063,55 @@  static inline void gen_setcc_slow(DisasContext *s, int jcc_op, TCGv reg, bool in
     }
 }
 
-/* return true if setcc_slow is not needed (WARNING: must be kept in
-   sync with gen_jcc1) */
-static int is_fast_jcc_case(DisasContext *s, int b)
+/* perform a conditional store into register 'reg' according to jump opcode
+   value 'b'. In the fast case, T0 is guaranteed not to be used. */
+static inline void gen_setcc1(DisasContext *s, int b, TCGv reg)
 {
-    int jcc_op;
+    int inv, jcc_op, size, cond;
+    TCGv t0;
+
+    inv = b & 1;
     jcc_op = (b >> 1) & 7;
+
     switch(s->cc_op) {
-        /* we optimize the cmp/jcc case */
+        /* we optimize relational operators for the cmp/jcc case */
     case CC_OP_SUBB:
     case CC_OP_SUBW:
     case CC_OP_SUBL:
     case CC_OP_SUBQ:
-        if (jcc_op == JCC_O || jcc_op == JCC_P)
-            goto slow_jcc;
-        break;
-
-        /* some jumps are easy to compute */
-    case CC_OP_ADDB:
-    case CC_OP_ADDW:
-    case CC_OP_ADDL:
-    case CC_OP_ADDQ:
-
-    case CC_OP_LOGICB:
-    case CC_OP_LOGICW:
-    case CC_OP_LOGICL:
-    case CC_OP_LOGICQ:
-
-    case CC_OP_INCB:
-    case CC_OP_INCW:
-    case CC_OP_INCL:
-    case CC_OP_INCQ:
+        size = s->cc_op - CC_OP_SUBB;
+        switch(jcc_op) {
+        case JCC_BE:
+            cond = inv ? TCG_COND_GTU : TCG_COND_LEU;
+            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
+            gen_extu(size, cpu_tmp4);
+            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
+            tcg_gen_setcond_tl(cond, reg, cpu_tmp4, t0);
+            break;
 
-    case CC_OP_DECB:
-    case CC_OP_DECW:
-    case CC_OP_DECL:
-    case CC_OP_DECQ:
+        case JCC_L:
+            cond = inv ? TCG_COND_GE : TCG_COND_LT;
+            goto fast_jcc_l;
+        case JCC_LE:
+            cond = inv ? TCG_COND_GT : TCG_COND_LE;
+        fast_jcc_l:
+            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
+            gen_exts(size, cpu_tmp4);
+            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, true);
+            tcg_gen_setcond_tl(cond, reg, cpu_tmp4, t0);
+            break;
 
-    case CC_OP_SHLB:
-    case CC_OP_SHLW:
-    case CC_OP_SHLL:
-    case CC_OP_SHLQ:
-        if (jcc_op != JCC_Z && jcc_op != JCC_S)
+        default:
             goto slow_jcc;
+        }
         break;
+
     default:
     slow_jcc:
-        return 0;
+        /* gen_setcc_slow actually generates good code for JC, JZ and JS */
+        gen_setcc_slow(s, jcc_op, reg, inv);
+        break;
     }
-    return 1;
 }
 
 /* generate a conditional jump to label 'l1' according to jump opcode
@@ -2477,28 +2477,7 @@  static inline void gen_jcc(DisasContext *s, int b,
 
 static void gen_setcc(DisasContext *s, int b)
 {
-    int inv, jcc_op, l1;
-    TCGv t0;
-
-    if (is_fast_jcc_case(s, b)) {
-        /* nominal case: we use a jump */
-        /* XXX: make it faster by adding new instructions in TCG */
-        t0 = tcg_temp_local_new();
-        tcg_gen_movi_tl(t0, 0);
-        l1 = gen_new_label();
-        gen_jcc1(s, b ^ 1, l1);
-        tcg_gen_movi_tl(t0, 1);
-        gen_set_label(l1);
-        tcg_gen_mov_tl(cpu_T[0], t0);
-        tcg_temp_free(t0);
-    } else {
-        /* slow case: it is more efficient not to generate a jump,
-           although it is questionnable whether this optimization is
-           worth to */
-        inv = b & 1;
-        jcc_op = (b >> 1) & 7;
-        gen_setcc_slow(s, jcc_op, cpu_T[0], inv);
-    }
+    gen_setcc1(s, b, cpu_T[0]);
 }
 
 static inline void gen_op_movl_T0_seg(int seg_reg)