Patchwork [3/6] tcg-x86_64: Implement setcond and movcond.

login
register
mail settings
Submitter Richard Henderson
Date Dec. 17, 2009, 5:32 p.m.
Message ID <b3a1f6980056160646b35d50a673a851383a3bc5.1261078375.git.rth@twiddle.net>
Download mbox | patch
Permalink /patch/41338/
State New
Headers show

Comments

Richard Henderson - Dec. 17, 2009, 5:32 p.m.
Implement conditional moves in the x86_64 backend.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/x86_64/tcg-target.c |   65 ++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 62 insertions(+), 3 deletions(-)
Laurent Desnogues - Dec. 18, 2009, 11:39 a.m.
On Thu, Dec 17, 2009 at 6:32 PM, Richard Henderson <rth@twiddle.net> wrote:
> Implement conditional moves in the x86_64 backend.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/x86_64/tcg-target.c |   65 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 62 insertions(+), 3 deletions(-)
>
> diff --git a/tcg/x86_64/tcg-target.c b/tcg/x86_64/tcg-target.c
> index 2339091..e411755 100644
> --- a/tcg/x86_64/tcg-target.c
> +++ b/tcg/x86_64/tcg-target.c
> @@ -491,9 +491,8 @@ static void tcg_out_jxx(TCGContext *s, int opc, int label_index)
>     }
>  }
>
> -static void tcg_out_brcond(TCGContext *s, int cond,
> -                           TCGArg arg1, TCGArg arg2, int const_arg2,
> -                           int label_index, int rexw)
> +static void tcg_out_cond(TCGContext *s, int cond, TCGArg arg1,
> +                         TCGArg arg2, int const_arg2, int rexw)
>  {
>     if (const_arg2) {
>         if (arg2 == 0) {
> @@ -508,9 +507,45 @@ static void tcg_out_brcond(TCGContext *s, int cond,
>     } else {
>         tcg_out_modrm(s, 0x01 | (ARITH_CMP << 3) | rexw, arg2, arg1);
>     }
> +}
> +
> +static void tcg_out_brcond(TCGContext *s, int cond,
> +                           TCGArg arg1, TCGArg arg2, int const_arg2,
> +                           int label_index, int rexw)
> +{
> +    tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw);
>     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label_index);
>  }
>
> +static void tcg_out_setcond(TCGContext *s, int cond, TCGArg arg0,
> +                            TCGArg arg1, TCGArg arg2, int const_arg2, int rexw)

Perhaps renaming arg0 to dest would make things slightly
more readable.

> +{
> +    int use_xor = (arg0 != arg1 && (const_arg2 || arg0 != arg2));
> +
> +    if (use_xor)
> +        tcg_out_movi(s, TCG_TYPE_I32, arg0, 0);
> +    tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw);
> +    tcg_out_modrm(s, 0x90 | tcg_cond_to_jcc[cond] | P_EXT | P_REXB, 0, arg0);

A comment saying this is a setcc would be nice.

Also note that tcg_out_modrm will generate an unneeded prefix
for some registers. cf. the patch I sent to the list months ago.

> +    if (!use_xor)
> +        tgen_arithi32(s, ARITH_AND, arg0, 0xff);

Wouldn't movzbl be better?

Regarding the xor optimization, I tested it on my i7 and it was
(very) slightly slower running a 64-bit SPEC2k gcc.

> +}
> +
> +static void tcg_out_movcond(TCGContext *s, int cond, TCGArg arg0,
> +                            TCGArg arg1, TCGArg arg2, int const_arg2,
> +                            TCGArg arg3, TCGArg arg4, int rexw)

Perhaps renaming arg0 to dest would make things slightly
more readable.

You should also add a note stating that arg3 != arg4.

> +{
> +    if (arg0 == arg3) {
> +        cond = tcg_invert_cond(cond);
> +        arg3 = arg4;
> +        arg4 = arg0;
> +    }
> +
> +    tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw);
> +    if (arg0 != arg4)
> +        tcg_out_mov(s, arg0, arg4);
> +    tcg_out_modrm(s, 0x40 | tcg_cond_to_jcc[cond] | P_EXT | rexw, arg0, arg3);

A comment saying this is cmovcc would be nice.

Note, I didn't check the correctness, though it looks OK. I'll
make real tests at the next iteration :-)

> +}
> +
>  #if defined(CONFIG_SOFTMMU)
>
>  #include "../../softmmu_defs.h"
> @@ -1197,6 +1232,24 @@ static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
>         tcg_out_modrm(s, 0x8b, args[0], args[1]);
>         break;
>
> +    case INDEX_op_setcond_i32:
> +        tcg_out_setcond(s, args[3], args[0], args[1], args[2],
> +                        const_args[2], 0);
> +        break;
> +    case INDEX_op_setcond_i64:
> +        tcg_out_setcond(s, args[3], args[0], args[1], args[2],
> +                        const_args[2], P_REXW);
> +        break;
> +
> +    case INDEX_op_movcond_i32:
> +        tcg_out_movcond(s, args[5], args[0], args[1], args[2],
> +                        const_args[2], args[3], args[4], 0);
> +        break;
> +    case INDEX_op_movcond_i64:
> +        tcg_out_movcond(s, args[5], args[0], args[1], args[2],
> +                        const_args[2], args[3], args[4], P_REXW);
> +        break;
> +
>     case INDEX_op_qemu_ld8u:
>         tcg_out_qemu_ld(s, args, 0);
>         break;
> @@ -1376,6 +1429,12 @@ static const TCGTargetOpDef x86_64_op_defs[] = {
>     { INDEX_op_ext16u_i64, { "r", "r"} },
>     { INDEX_op_ext32u_i64, { "r", "r"} },
>
> +    { INDEX_op_setcond_i32, { "r", "r", "re" } },
> +    { INDEX_op_setcond_i64, { "r", "r", "re" } },
> +
> +    { INDEX_op_movcond_i32, { "r", "r", "re", "r", "r" } },
> +    { INDEX_op_movcond_i64, { "r", "r", "re", "r", "r" } },

For the i32 variants, "ri" instead of "re" is enough.


Laurent

> +
>     { INDEX_op_qemu_ld8u, { "r", "L" } },
>     { INDEX_op_qemu_ld8s, { "r", "L" } },
>     { INDEX_op_qemu_ld16u, { "r", "L" } },
> --
> 1.6.5.2
>
>
Richard Henderson - Dec. 18, 2009, 5:11 p.m.
On 12/18/2009 03:39 AM, Laurent Desnogues wrote:
>> +static void tcg_out_setcond(TCGContext *s, int cond, TCGArg arg0,
>> +                            TCGArg arg1, TCGArg arg2, int const_arg2, int rexw)
>
> Perhaps renaming arg0 to dest would make things slightly
> more readable.

Ok.

> Also note that tcg_out_modrm will generate an unneeded prefix
> for some registers. cf. the patch I sent to the list months ago.

Huh.  Didn't notice since the disassembler printed what I expected to 
see.  Is fixing this at the same time a requirement for acceptance?
I'd prefer to tackle that separately, since no doubt it affects every 
use of P_REXB.

>> +        tgen_arithi32(s, ARITH_AND, arg0, 0xff);
>
> Wouldn't movzbl be better?

Handled inside tgen_arithi32:

     } else if (c == ARITH_AND && val == 0xffu) {
         /* movzbl */
         tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB, r0, r0);

I didn't feel the need to replicate that.

> Regarding the xor optimization, I tested it on my i7 and it was
> (very) slightly slower running a 64-bit SPEC2k gcc.

Huh.  It used to be recommended.  The partial word store used to stall 
the pipeline until the old value was ready, and the XOR was 
special-cased as a clear, which broke both the input dependency and also 
prevented a partial-register stall on the output.

Actually, this recommendation is still present: Section 3.5.1.6 in the 
November 2009 revision of the Intel Optimization Reference Manual.

If it's all the same, I'd prefer to keep what I have there.  All other 
things being equal, the XOR is 2 bytes and the MOVZBL is 3.

>> +static void tcg_out_movcond(TCGContext *s, int cond, TCGArg arg0,
>> +                            TCGArg arg1, TCGArg arg2, int const_arg2,
>> +                            TCGArg arg3, TCGArg arg4, int rexw)
>
> Perhaps renaming arg0 to dest would make things slightly
> more readable.

Ok.

> You should also add a note stating that arg3 != arg4.

I don't believe that's true though.  It's caught immediately when we 
emit the movcond opcode, but there's no check later once 
copy-propagation has been done within TCG.

I check for that in the i386 and sparc backends, because dest==arg3 && 
dest==arg4 would actually generate incorrect code.  Here in the x86_64 
backend, where we always use cmov it doesn't generate incorrect code, 
merely inefficient.

I could add an early out for that case, if you prefer.

>> +    { INDEX_op_setcond_i32, { "r", "r", "re" } },
>> +    { INDEX_op_setcond_i64, { "r", "r", "re" } },
>> +
>> +    { INDEX_op_movcond_i32, { "r", "r", "re", "r", "r" } },
>> +    { INDEX_op_movcond_i64, { "r", "r", "re", "r", "r" } },
>
> For the i32 variants, "ri" instead of "re" is enough.

Ah, quite right.


r~
Laurent Desnogues - Dec. 18, 2009, 5:41 p.m.
On Fri, Dec 18, 2009 at 6:11 PM, Richard Henderson <rth@twiddle.net> wrote:
>> Also note that tcg_out_modrm will generate an unneeded prefix
>> for some registers. cf. the patch I sent to the list months ago.
>
> Huh.  Didn't notice since the disassembler printed what I expected to see.
>  Is fixing this at the same time a requirement for acceptance?
> I'd prefer to tackle that separately, since no doubt it affects every use of
> P_REXB.

I agree this change can be delayed.

>>> +        tgen_arithi32(s, ARITH_AND, arg0, 0xff);
>>
>> Wouldn't movzbl be better?
>
> Handled inside tgen_arithi32:
>
>    } else if (c == ARITH_AND && val == 0xffu) {
>        /* movzbl */
>        tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB, r0, r0);
>
> I didn't feel the need to replicate that.

Oops, I compared with my code which has an explicit movzbl :)

>> Regarding the xor optimization, I tested it on my i7 and it was
>> (very) slightly slower running a 64-bit SPEC2k gcc.
>
> Huh.  It used to be recommended.  The partial word store used to stall the
> pipeline until the old value was ready, and the XOR was special-cased as a
> clear, which broke both the input dependency and also prevented a
> partial-register stall on the output.
>
> Actually, this recommendation is still present: Section 3.5.1.6 in the
> November 2009 revision of the Intel Optimization Reference Manual.
>
> If it's all the same, I'd prefer to keep what I have there.  All other
> things being equal, the XOR is 2 bytes and the MOVZBL is 3.

I agree too.  Anyway my measure is not representative enough
to mean anything.  And in that case I think shorter code is
better, so let's go for XOR.

>>> +static void tcg_out_movcond(TCGContext *s, int cond, TCGArg arg0,
>>> +                            TCGArg arg1, TCGArg arg2, int const_arg2,
>>> +                            TCGArg arg3, TCGArg arg4, int rexw)
>>
>> Perhaps renaming arg0 to dest would make things slightly
>> more readable.
>
> Ok.
>
>> You should also add a note stating that arg3 != arg4.
>
> I don't believe that's true though.  It's caught immediately when we emit
> the movcond opcode, but there's no check later once copy-propagation has
> been done within TCG.
>
> I check for that in the i386 and sparc backends, because dest==arg3 &&
> dest==arg4 would actually generate incorrect code.  Here in the x86_64
> backend, where we always use cmov it doesn't generate incorrect code, merely
> inefficient.
>
> I could add an early out for that case, if you prefer.

No, you can leave it as is unless someone else objects.


Laurent

Patch

diff --git a/tcg/x86_64/tcg-target.c b/tcg/x86_64/tcg-target.c
index 2339091..e411755 100644
--- a/tcg/x86_64/tcg-target.c
+++ b/tcg/x86_64/tcg-target.c
@@ -491,9 +491,8 @@  static void tcg_out_jxx(TCGContext *s, int opc, int label_index)
     }
 }
 
-static void tcg_out_brcond(TCGContext *s, int cond, 
-                           TCGArg arg1, TCGArg arg2, int const_arg2,
-                           int label_index, int rexw)
+static void tcg_out_cond(TCGContext *s, int cond, TCGArg arg1,
+                         TCGArg arg2, int const_arg2, int rexw)
 {
     if (const_arg2) {
         if (arg2 == 0) {
@@ -508,9 +507,45 @@  static void tcg_out_brcond(TCGContext *s, int cond,
     } else {
         tcg_out_modrm(s, 0x01 | (ARITH_CMP << 3) | rexw, arg2, arg1);
     }
+}
+
+static void tcg_out_brcond(TCGContext *s, int cond,
+                           TCGArg arg1, TCGArg arg2, int const_arg2,
+                           int label_index, int rexw)
+{
+    tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw);
     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label_index);
 }
 
+static void tcg_out_setcond(TCGContext *s, int cond, TCGArg arg0,
+                            TCGArg arg1, TCGArg arg2, int const_arg2, int rexw)
+{
+    int use_xor = (arg0 != arg1 && (const_arg2 || arg0 != arg2));
+
+    if (use_xor)
+        tcg_out_movi(s, TCG_TYPE_I32, arg0, 0);
+    tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw);
+    tcg_out_modrm(s, 0x90 | tcg_cond_to_jcc[cond] | P_EXT | P_REXB, 0, arg0);
+    if (!use_xor)
+        tgen_arithi32(s, ARITH_AND, arg0, 0xff);
+}
+
+static void tcg_out_movcond(TCGContext *s, int cond, TCGArg arg0,
+                            TCGArg arg1, TCGArg arg2, int const_arg2,
+                            TCGArg arg3, TCGArg arg4, int rexw)
+{
+    if (arg0 == arg3) {
+        cond = tcg_invert_cond(cond);
+        arg3 = arg4;
+        arg4 = arg0;
+    }
+
+    tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw);
+    if (arg0 != arg4)
+        tcg_out_mov(s, arg0, arg4);
+    tcg_out_modrm(s, 0x40 | tcg_cond_to_jcc[cond] | P_EXT | rexw, arg0, arg3);
+}
+
 #if defined(CONFIG_SOFTMMU)
 
 #include "../../softmmu_defs.h"
@@ -1197,6 +1232,24 @@  static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
         tcg_out_modrm(s, 0x8b, args[0], args[1]);
         break;
 
+    case INDEX_op_setcond_i32:
+        tcg_out_setcond(s, args[3], args[0], args[1], args[2],
+                        const_args[2], 0);
+        break;
+    case INDEX_op_setcond_i64:
+        tcg_out_setcond(s, args[3], args[0], args[1], args[2],
+                        const_args[2], P_REXW);
+        break;
+
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond(s, args[5], args[0], args[1], args[2],
+                        const_args[2], args[3], args[4], 0);
+        break;
+    case INDEX_op_movcond_i64:
+        tcg_out_movcond(s, args[5], args[0], args[1], args[2],
+                        const_args[2], args[3], args[4], P_REXW);
+        break;
+
     case INDEX_op_qemu_ld8u:
         tcg_out_qemu_ld(s, args, 0);
         break;
@@ -1376,6 +1429,12 @@  static const TCGTargetOpDef x86_64_op_defs[] = {
     { INDEX_op_ext16u_i64, { "r", "r"} },
     { INDEX_op_ext32u_i64, { "r", "r"} },
 
+    { INDEX_op_setcond_i32, { "r", "r", "re" } },
+    { INDEX_op_setcond_i64, { "r", "r", "re" } },
+
+    { INDEX_op_movcond_i32, { "r", "r", "re", "r", "r" } },
+    { INDEX_op_movcond_i64, { "r", "r", "re", "r", "r" } },
+
     { INDEX_op_qemu_ld8u, { "r", "L" } },
     { INDEX_op_qemu_ld8s, { "r", "L" } },
     { INDEX_op_qemu_ld16u, { "r", "L" } },