[16/25] tcg/i386: Handle ctz and clz opcodes

Message ID 1479324335-2074-17-git-send-email-rth@twiddle.net
State New

Commit Message

Richard Henderson Nov. 16, 2016, 7:25 p.m. UTC
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.h     |  8 ++---
 tcg/i386/tcg-target.inc.c | 83 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 78 insertions(+), 13 deletions(-)

Comments

Bastian Koppelmann Nov. 17, 2016, 4:50 p.m. UTC | #1
On 11/16/2016 08:25 PM, Richard Henderson wrote:
> +
> +    OP_32_64(clz):
> +        if (const_args[2]) {
> +            tcg_debug_assert(have_bmi1);
> +            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
> +            tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
> +        } else {
> +            /* ??? See above.  */
> +            tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);

The Intel ISA manual states that it finds the bit index of the most
significant set bit, where the least significant bit is index 0. So for
the input 0x2 this should return 1. However, this is not the number of
leading zeros.

Cheers,
    Bastian
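
For illustration, a minimal C sketch of the distinction being pointed out,
assuming a 32-bit operand width (the helpers bsr32 and clz32 are hypothetical
names, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* Bit index of the most significant set bit, as BSR computes it.
       (BSR's result is undefined for x == 0, so callers must check.) */
    static unsigned bsr32(uint32_t x)
    {
        unsigned i = 0;
        while (x >>= 1) {
            i++;
        }
        return i;
    }

    /* Count of leading zeros, which is what the clz opcode expects. */
    static unsigned clz32(uint32_t x)
    {
        return 31 - bsr32(x);   /* only valid for x != 0 */
    }

    int main(void)
    {
        assert(bsr32(0x2) == 1);    /* what BSR returns for the example */
        assert(clz32(0x2) == 30);   /* what clz should return */
        return 0;
    }

So a raw BSR result still needs the 31 - n (or 63 - n) adjustment before it
matches the clz semantics; only LZCNT returns the leading-zero count directly.
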
Richard Henderson Nov. 17, 2016, 7:53 p.m. UTC | #2
On 11/17/2016 05:50 PM, Bastian Koppelmann wrote:
> On 11/16/2016 08:25 PM, Richard Henderson wrote:
>> +
>> +    OP_32_64(clz):
>> +        if (const_args[2]) {
>> +            tcg_debug_assert(have_bmi1);
>> +            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
>> +            tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
>> +        } else {
>> +            /* ??? See above.  */
>> +            tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);
>
> The Intel ISA manual states that it finds the bit index of the most
> significant set bit, where the least significant bit is index 0. So for
> the input 0x2 this should return 1. However, this is not the number of
> leading zeros.

Oh, of course you're right.  I thought I was testing this, but while alpha does 
have this operation, it turns out it isn't used much.


r~
Richard Henderson Nov. 17, 2016, 7:59 p.m. UTC | #3
On 11/17/2016 08:53 PM, Richard Henderson wrote:
> On 11/17/2016 05:50 PM, Bastian Koppelmann wrote:
>> On 11/16/2016 08:25 PM, Richard Henderson wrote:
>>> +
>>> +    OP_32_64(clz):
>>> +        if (const_args[2]) {
>>> +            tcg_debug_assert(have_bmi1);
>>> +            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
>>> +            tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
>>> +        } else {
>>> +            /* ??? See above.  */
>>> +            tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);
>>
>> The Intel ISA manual states that it finds the bit index of the most
>> significant set bit, where the least significant bit is index 0. So for
>> the input 0x2 this should return 1. However, this is not the number of
>> leading zeros.
>
> Oh, of course you're right.  I thought I was testing this, but while alpha does
> have this operation, it turns out it isn't used much.

Alternatively, what I tested was on a Haswell machine, which takes the LZCNT
path, which *does* produce the intended results.  Just the BSR path doesn't.


r~
Bastian Koppelmann Nov. 17, 2016, 10:09 p.m. UTC | #4
On 11/17/2016 08:59 PM, Richard Henderson wrote:
> On 11/17/2016 08:53 PM, Richard Henderson wrote:
>> On 11/17/2016 05:50 PM, Bastian Koppelmann wrote:
>>> On 11/16/2016 08:25 PM, Richard Henderson wrote:
>>>> +
>>>> +    OP_32_64(clz):
>>>> +        if (const_args[2]) {
>>>> +            tcg_debug_assert(have_bmi1);
>>>> +            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
>>>> +            tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
>>>> +        } else {
>>>> +            /* ??? See above.  */
>>>> +            tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);
>>>
>>> The Intel ISA manual states that it finds the bit index of the most
>>> significant set bit, where the least significant bit is index 0. So for
>>> the input 0x2 this should return 1. However, this is not the number of
>>> leading zeros.
>>
>> Oh, of course you're right.  I thought I was testing this, but while
>> alpha does
>> have this operation, it turns out it isn't used much.
> 
> Alternatively, what I tested was on a Haswell machine, which takes the
> LZCNT path, which *does* produce the intended results.  Just the BSR
> path doesn't.

Luckily my old laptop is a Core 2 Duo without LZCNT :)

Cheers,
    Bastian
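
To make the behavior of the generated code concrete, here is a rough C
rendering of the two clz paths emitted by the patch below for the 32-bit
case; the function names and the fallback variable are hypothetical, and the
BSR variant reproduces the issue discussed above rather than correcting it:

    #include <stdint.h>

    /* LZCNT path (constant args[2] == 32): LZCNT of a zero input yields
       the operand size, which already matches the clz convention, so no
       fixup is needed. */
    static uint32_t clz32_lzcnt(uint32_t src)
    {
        return src ? __builtin_clz(src) : 32;
    }

    /* BSR + CMOV path: BSR leaves the destination unchanged on zero input
       (per the comment in the patch), and the CMOV on the resulting ZF
       substitutes the dynamic fallback value.  For non-zero input this
       yields the bit index of the MSB, not the leading-zero count. */
    static uint32_t clz32_bsr_cmov(uint32_t src, uint32_t fallback)
    {
        if (src == 0) {
            return fallback;                /* CMOVE taken on ZF from BSR */
        }
        return 31 - __builtin_clz(src);     /* == BSR(src), the MSB index */
    }

The ctz path has the same shape with BSF/TZCNT, and there BSF's result (the
index of the lowest set bit) already equals the trailing-zero count, so only
the clz side is missing the 31 - n (or 63 - n) adjustment.
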

Patch

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index f2d9955..8fff287 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -93,8 +93,8 @@  extern bool have_bmi1;
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
-#define TCG_TARGET_HAS_clz_i32          0
-#define TCG_TARGET_HAS_ctz_i32          0
+#define TCG_TARGET_HAS_clz_i32          1
+#define TCG_TARGET_HAS_ctz_i32          1
 #define TCG_TARGET_HAS_deposit_i32      1
 #define TCG_TARGET_HAS_extract_i32      1
 #define TCG_TARGET_HAS_sextract_i32     1
@@ -127,8 +127,8 @@  extern bool have_bmi1;
 #define TCG_TARGET_HAS_eqv_i64          0
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
-#define TCG_TARGET_HAS_clz_i64          0
-#define TCG_TARGET_HAS_ctz_i64          0
+#define TCG_TARGET_HAS_clz_i64          1
+#define TCG_TARGET_HAS_ctz_i64          1
 #define TCG_TARGET_HAS_deposit_i64      1
 #define TCG_TARGET_HAS_extract_i64      1
 #define TCG_TARGET_HAS_sextract_i64     0
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 39f62bd..3eeb58f 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -92,6 +92,7 @@  static const int tcg_target_call_oarg_regs[] = {
 #define TCG_CT_CONST_S32 0x100
 #define TCG_CT_CONST_U32 0x200
 #define TCG_CT_CONST_I32 0x400
+#define TCG_CT_CONST_WSZ 0x800
 
 /* Registers used with L constraint, which are the first argument 
    registers on x86_64, and two random call clobbered registers on
@@ -225,6 +226,12 @@  static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         } else {
             goto case_c;
         }
+    case 'W':
+        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
+        if (have_bmi1) {
+            ct->ct |= TCG_CT_CONST_WSZ;
+        }
+        break;
 
         /* qemu_ld/st address constraint */
     case 'L':
@@ -273,6 +280,9 @@  static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
     if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
         return 1;
     }
+    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
+        return 1;
+    }
     return 0;
 }
 
@@ -306,6 +316,8 @@  static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
 #define OPC_ANDN        (0xf2 | P_EXT38)
 #define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
+#define OPC_BSF         (0xbc | P_EXT)
+#define OPC_BSR         (0xbd | P_EXT)
 #define OPC_BSWAP	(0xc8 | P_EXT)
 #define OPC_CALL_Jz	(0xe8)
 #define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
@@ -320,6 +332,7 @@  static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_JMP_long	(0xe9)
 #define OPC_JMP_short	(0xeb)
 #define OPC_LEA         (0x8d)
+#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
 #define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
 #define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
 #define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
@@ -346,6 +359,7 @@  static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
 #define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 #define OPC_TESTL	(0x85)
+#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
 #define OPC_XCHG_ax_r32	(0x90)
 
 #define OPC_GRP3_Ev	(0xf7)
@@ -431,6 +445,11 @@  static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
     if (opc & P_ADDR32) {
         tcg_out8(s, 0x67);
     }
+    if (opc & P_SIMDF3) {
+        tcg_out8(s, 0xf3);
+    } else if (opc & P_SIMDF2) {
+        tcg_out8(s, 0xf2);
+    }
 
     rex = 0;
     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
@@ -465,6 +484,11 @@  static void tcg_out_opc(TCGContext *s, int opc)
     if (opc & P_DATA16) {
         tcg_out8(s, 0x66);
     }
+    if (opc & P_SIMDF3) {
+        tcg_out8(s, 0xf3);
+    } else if (opc & P_SIMDF2) {
+        tcg_out8(s, 0xf2);
+    }
     if (opc & (P_EXT | P_EXT38)) {
         tcg_out8(s, 0x0f);
         if (opc & P_EXT38) {
@@ -1093,13 +1117,11 @@  static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
 }
 #endif
 
-static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGArg dest,
-                              TCGArg c1, TCGArg c2, int const_c2,
-                              TCGArg v1)
+static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
+                         TCGReg dest, TCGReg v1)
 {
-    tcg_out_cmp(s, c1, c2, const_c2, 0);
     if (have_cmov) {
-        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond], dest, v1);
+        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
     } else {
         TCGLabel *over = gen_new_label();
         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
@@ -1108,13 +1130,21 @@  static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGArg dest,
     }
 }
 
+static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
+                              TCGReg c1, TCGArg c2, int const_c2,
+                              TCGReg v1)
+{
+    tcg_out_cmp(s, c1, c2, const_c2, 0);
+    tcg_out_cmov(s, cond, 0, dest, v1);
+}
+
 #if TCG_TARGET_REG_BITS == 64
-static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGArg dest,
-                              TCGArg c1, TCGArg c2, int const_c2,
-                              TCGArg v1)
+static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
+                              TCGReg c1, TCGArg c2, int const_c2,
+                              TCGReg v1)
 {
     tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
-    tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | P_REXW, dest, v1);
+    tcg_out_cmov(s, cond, P_REXW, dest, v1);
 }
 #endif
 
@@ -1993,6 +2023,37 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    OP_32_64(ctz):
+        if (const_args[2]) {
+            tcg_debug_assert(have_bmi1);
+            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
+            tcg_out_modrm(s, OPC_TZCNT + rexw, args[0], args[1]);
+        } else {
+            /* ??? The manual says that the output is undefined when the
+               input is zero, but real hardware leaves it unchanged.  As
+               noted in target-i386/translate.c, real programs depend on
+               this -- now we are one more of those.  */
+            tcg_out_modrm(s, OPC_BSF + rexw, args[0], args[1]);
+            if (args[0] != args[2]) {
+                tcg_out_cmov(s, TCG_COND_EQ, rexw, args[0], args[2]);
+            }
+        }
+        break;
+
+    OP_32_64(clz):
+        if (const_args[2]) {
+            tcg_debug_assert(have_bmi1);
+            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
+            tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
+        } else {
+            /* ??? See above.  */
+            tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);
+            if (args[0] != args[2]) {
+                tcg_out_cmov(s, TCG_COND_EQ, rexw, args[0], args[2]);
+            }
+        }
+        break;
+
     case INDEX_op_brcond_i32:
         tcg_out_brcond32(s, args[2], args[0], args[1], const_args[1],
                          arg_label(args[3]), 0);
@@ -2220,6 +2281,8 @@  static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_sar_i32, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i32, { "r", "0", "ci" } },
     { INDEX_op_rotr_i32, { "r", "0", "ci" } },
+    { INDEX_op_clz_i32, { "r", "r", "rW" } },
+    { INDEX_op_ctz_i32, { "r", "r", "rW" } },
 
     { INDEX_op_brcond_i32, { "r", "ri" } },
 
@@ -2281,6 +2344,8 @@  static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_sar_i64, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i64, { "r", "0", "ci" } },
     { INDEX_op_rotr_i64, { "r", "0", "ci" } },
+    { INDEX_op_clz_i64, { "r", "r", "rW" } },
+    { INDEX_op_ctz_i64, { "r", "r", "rW" } },
 
     { INDEX_op_brcond_i64, { "r", "re" } },
     { INDEX_op_setcond_i64, { "r", "r", "re" } },