diff mbox

[i386] : Fix PR62011, False data dependency in popcnt instruction

Message ID CAFULd4YFuNdHCemxDGK4rp2h2sf=qyXP4D1O+0O_iFYj8JsDQQ@mail.gmail.com
State New
Headers show

Commit Message

Uros Bizjak Aug. 16, 2014, 1:01 p.m. UTC
Hello!

Attached patch fixes the problem with false data dependency on output
register for popcnt, lzcnt and tzcnt insns on sandybridge and haswell
targets.

The new insn pattern shadows existing one, and after reload, the
clearing isns is split out of the insn. This way the clearing insn can
be scheduled by postreload scheduler. The new pattern takes care to
avoid live registers, so the compiler is always able to clear output
reg.

The testcase from the PR, compiled with -O3 -march=corei7 improves on
Ivybridge from:

unsigned        209717360000    3.21002 sec     16.3329 GB/s
uint64_t        209717360000    4.06517 sec     12.8971 GB/s

to (-O3 -march=corei7 -mtune-ctrl=avoid_false_dep_for_bmi):

unsigned        209717360000    3.14541 sec     16.6683 GB/s
uint64_t        209717360000    2.3663 sec      22.1564 GB/s

Due to high impact, the new tune flag is enabled by default for Intel
tunes and generic:

m_SANDYBRIDGE | m_HASWELL | m_INTEL | m_GENERIC

2014-08-16  Uros Bizjak  <ubizjak@gmail.com>

    PR target/62011
    * config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI):
    New tune flag.
    * config/i386/i386.h (TARGET_AVOID_FALSE_DEP_FOR_BMI): New define.
    * config/i386/i386.md (unspec) <UNSPEC_INSN_FALSE_DEP>: New unspec.
    (ffs<mode>2): Do not expand with tzcnt for
    TARGET_AVOID_FALSE_DEP_FOR_BMI.
    (ffssi2_no_cmove): Ditto.
    (*tzcnt<mode>_1): Disable for TARGET_AVOID_FALSE_DEP_FOR_BMI.
    (ctz<mode>2): New expander.
    (*ctz<mode>2_falsedep_1): New insn_and_split pattern.
    (*ctz<mode>2_falsedep): New insn.
    (*ctz<mode>2): Rename from ctz<mode>2.
    (clz<mode>2_lzcnt): New expander.
    (*clz<mode>2_lzcnt_falsedep_1): New insn_and_split pattern.
    (*clz<mode>2_lzcnt_falsedep): New insn.
    (*clz<mode>2): Rename from ctz<mode>2.
    (popcount<mode>2): New expander.
    (*popcount<mode>2_falsedep_1): New insn_and_split pattern.
    (*popcount<mode>2_falsedep): New insn.
    (*popcount<mode>2): Rename from ctz<mode>2.
    (*popcount<mode>2_cmp): Remove.
    (*popcountsi2_cmp_zext): Ditto.

The patch was bootstrapped and regression tested on
x86_64-pc-linux-gnu {,-m32} and will be committed to mainline SVN
after a couple of days. The patch will be also backported to 4.9
branch.

Uros.
diff mbox

Patch

Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 214059)
+++ config/i386/i386.h	(working copy)
@@ -473,6 +473,8 @@ 
 	ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS]
 #define TARGET_ADJUST_UNROLL \
     ix86_tune_features[X86_TUNE_ADJUST_UNROLL]
+#define TARGET_AVOID_FALSE_DEP_FOR_BMI \
+	ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 214059)
+++ config/i386/i386.md	(working copy)
@@ -112,6 +112,7 @@ 
   UNSPEC_XBEGIN_ABORT
   UNSPEC_STOS
   UNSPEC_PEEPSIB
+  UNSPEC_INSN_FALSE_DEP
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -12177,7 +12178,8 @@ 
       DONE;
     }
 
-  flags_mode = TARGET_BMI ? CCCmode : CCZmode;
+  flags_mode
+    = (TARGET_BMI && !TARGET_AVOID_FALSE_DEP_FOR_BMI) ? CCCmode : CCZmode;
 
   operands[2] = gen_reg_rtx (<MODE>mode);
   operands[3] = gen_rtx_REG (flags_mode, FLAGS_REG);
@@ -12203,7 +12205,8 @@ 
    (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
 	      (clobber (reg:CC FLAGS_REG))])]
 {
-  enum machine_mode flags_mode = TARGET_BMI ? CCCmode : CCZmode;
+  enum machine_mode flags_mode
+    = (TARGET_BMI && !TARGET_AVOID_FALSE_DEP_FOR_BMI) ? CCCmode : CCZmode;
 
   operands[3] = gen_lowpart (QImode, operands[2]);
   operands[4] = gen_rtx_REG (flags_mode, FLAGS_REG);
@@ -12218,7 +12221,7 @@ 
 		     (const_int 0)))
    (set (match_operand:SWI48 0 "register_operand" "=r")
 	(ctz:SWI48 (match_dup 1)))]
-  "TARGET_BMI"
+  "TARGET_BMI && !TARGET_AVOID_FALSE_DEP_FOR_BMI"
   "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
   [(set_attr "type" "alu1")
    (set_attr "prefix_0f" "1")
@@ -12239,7 +12242,52 @@ 
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "ctz<mode>2"
+(define_expand "ctz<mode>2"
+  [(parallel
+    [(set (match_operand:SWI248 0 "register_operand")
+	  (ctz:SWI248
+	    (match_operand:SWI248 1 "nonimmediate_operand")))
+     (clobber (reg:CC FLAGS_REG))])])
+
+(define_insn_and_split "*ctz<mode>2_falsedep_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=&r")
+	(ctz:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "(TARGET_BMI || TARGET_GENERIC)
+   && TARGET_AVOID_FALSE_DEP_FOR_BMI && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(parallel
+    [(set (match_dup 0)
+	  (ctz:SWI48 (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)
+     (clobber (reg:CC FLAGS_REG))])]
+  "ix86_expand_clear (operands[0]);")
+
+(define_insn "*ctz<mode>2_falsedep"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(ctz:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+	   UNSPEC_INSN_FALSE_DEP)
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+{
+  if (TARGET_BMI)
+    return "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}";
+  else if (TARGET_GENERIC)
+    /* tzcnt expands to 'rep bsf' and we can use it even if !TARGET_BMI.  */
+    return "rep%; bsf{<imodesuffix>}\t{%1, %0|%0, %1}";
+  else
+    gcc_unreachable ();
+}
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*ctz<mode>2"
   [(set (match_operand:SWI248 0 "register_operand" "=r")
 	(ctz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
    (clobber (reg:CC FLAGS_REG))]
@@ -12286,7 +12334,44 @@ 
   operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
 })
 
-(define_insn "clz<mode>2_lzcnt"
+(define_expand "clz<mode>2_lzcnt"
+  [(parallel
+    [(set (match_operand:SWI248 0 "register_operand")
+	  (clz:SWI248
+	    (match_operand:SWI248 1 "nonimmediate_operand")))
+     (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_LZCNT")
+
+(define_insn_and_split "*clz<mode>2_lzcnt_falsedep_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=&r")
+	(clz:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_LZCNT
+   && TARGET_AVOID_FALSE_DEP_FOR_BMI && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(parallel
+    [(set (match_dup 0)
+	  (clz:SWI48 (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)
+     (clobber (reg:CC FLAGS_REG))])]
+  "ix86_expand_clear (operands[0]);")
+
+(define_insn "*clz<mode>2_lzcnt_falsedep"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(clz:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+	   UNSPEC_INSN_FALSE_DEP)
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_LZCNT"
+  "lzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*clz<mode>2_lzcnt"
   [(set (match_operand:SWI248 0 "register_operand" "=r")
 	(clz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
    (clobber (reg:CC FLAGS_REG))]
@@ -12569,11 +12654,37 @@ 
    (set_attr "prefix_0f" "1")
    (set_attr "mode" "HI")])
 
-(define_insn "popcount<mode>2"
-  [(set (match_operand:SWI248 0 "register_operand" "=r")
-	(popcount:SWI248
-	  (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
+(define_expand "popcount<mode>2"
+  [(parallel
+    [(set (match_operand:SWI248 0 "register_operand")
+	  (popcount:SWI248
+	    (match_operand:SWI248 1 "nonimmediate_operand")))
+     (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_POPCNT")
+
+(define_insn_and_split "*popcount<mode>2_falsedep_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=&r")
+	(popcount:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
    (clobber (reg:CC FLAGS_REG))]
+  "TARGET_POPCNT
+   && TARGET_AVOID_FALSE_DEP_FOR_BMI && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(parallel
+    [(set (match_dup 0)
+	  (popcount:SWI48 (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)
+     (clobber (reg:CC FLAGS_REG))])]
+  "ix86_expand_clear (operands[0]);")
+
+(define_insn "*popcount<mode>2_falsedep"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(popcount:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+	   UNSPEC_INSN_FALSE_DEP)
+   (clobber (reg:CC FLAGS_REG))]
   "TARGET_POPCNT"
 {
 #if TARGET_MACHO
@@ -12586,15 +12697,12 @@ 
    (set_attr "type" "bitmanip")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*popcount<mode>2_cmp"
-  [(set (reg FLAGS_REG)
-	(compare
-	  (popcount:SWI248
-	    (match_operand:SWI248 1 "nonimmediate_operand" "rm"))
-	  (const_int 0)))
-   (set (match_operand:SWI248 0 "register_operand" "=r")
-	(popcount:SWI248 (match_dup 1)))]
-  "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
+(define_insn "*popcount<mode>2"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+	(popcount:SWI248
+	  (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_POPCNT"
 {
 #if TARGET_MACHO
   return "popcnt\t{%1, %0|%0, %1}";
@@ -12606,25 +12714,6 @@ 
    (set_attr "type" "bitmanip")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*popcountsi2_cmp_zext"
-  [(set (reg FLAGS_REG)
-        (compare
-          (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))
-          (const_int 0)))
-   (set (match_operand:DI 0 "register_operand" "=r")
-        (zero_extend:DI(popcount:SI (match_dup 1))))]
-  "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
-{
-#if TARGET_MACHO
-  return "popcnt\t{%1, %0|%0, %1}";
-#else
-  return "popcnt{l}\t{%1, %0|%0, %1}";
-#endif
-}
-  [(set_attr "prefix_rep" "1")
-   (set_attr "type" "bitmanip")
-   (set_attr "mode" "SI")])
-
 (define_expand "bswapdi2"
   [(set (match_operand:DI 0 "register_operand")
 	(bswap:DI (match_operand:DI 1 "nonimmediate_operand")))]
Index: config/i386/x86-tune.def
===================================================================
--- config/i386/x86-tune.def	(revision 214059)
+++ config/i386/x86-tune.def	(working copy)
@@ -509,6 +509,11 @@ 
 DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
           m_K8)
 
+/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
+   for bit-manipulation instructions.  */
+DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
+	  m_SANDYBRIDGE | m_HASWELL | m_INTEL | m_GENERIC)
+
 /*****************************************************************************/
 /* This never worked well before.                                            */
 /*****************************************************************************/