[2/2] Align tight & hot loops without considering max skip bytes.

Message ID 20240515030429.2575440-3-haochen.jiang@intel.com
State New
Series [1/2] Adjust generic loop alignment from 16:11:8 to 16 for Intel processors

Commit Message

Jiang, Haochen May 15, 2024, 3:04 a.m. UTC
From: liuhongt <hongtao.liu@intel.com>

When a hot loop is small enough to fit into one cacheline, we should
align the loop to a 1 << ceil_log2 (loop_size) byte boundary without
considering the maximum skip bytes.  This helps code prefetch.
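
For illustration, a minimal standalone C sketch of the intended
computation (not part of the patch; ceil_log2 mirrors GCC's hwint.h
helper, and the printed directive assumes the GAS .p2align form used
by ASM_OUTPUT_MAX_SKIP_ALIGN on ELF targets):

    #include <stdio.h>

    /* Smallest p such that (1 << p) >= x, as in GCC's ceil_log2.  */
    static int
    ceil_log2 (unsigned long x)
    {
      int p = 0;
      while ((1UL << p) < x)
        p++;
      return p;
    }

    int
    main (void)
    {
      /* A 24-byte hot loop fits into a 64-byte cacheline, so it is
         aligned to 1 << ceil_log2 (24) = 32 bytes with max skip 0,
         i.e. the padding nops are emitted unconditionally.  */
      printf (".p2align %d\n", ceil_log2 (24));
      return 0;
    }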

gcc/ChangeLog:

	* config/i386/i386.cc (ix86_avoid_jump_mispredicts): Change
	gen_pad to gen_max_skip_align.
	(ix86_align_loops): New function.
	(ix86_reorg): Call ix86_align_loops.
	* config/i386/i386.md (pad): Rename to ..
	(max_skip_align): .. this, and accept 2 operands for align and
	skip.
---
 gcc/config/i386/i386.cc | 148 +++++++++++++++++++++++++++++++++++++++-
 gcc/config/i386/i386.md |  10 +--
 2 files changed, 153 insertions(+), 5 deletions(-)

Patch

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index e67e5f62533..c617091c8e1 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23137,7 +23137,7 @@  ix86_avoid_jump_mispredicts (void)
 	  if (dump_file)
 	    fprintf (dump_file, "Padding insn %i by %i bytes!\n",
 		     INSN_UID (insn), padsize);
-          emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
+	  emit_insn_before (gen_max_skip_align (GEN_INT (4), GEN_INT (padsize)), insn);
 	}
     }
 }
@@ -23410,6 +23410,150 @@  ix86_split_stlf_stall_load ()
     }
 }
 
+/* When a hot loop fits into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+    return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *label = BB_HEAD (bb);
+      bool has_fallthru = false;
+      edge e;
+      edge_iterator ei;
+
+      if (!LABEL_P (label))
+	continue;
+
+      profile_count fallthru_count = profile_count::zero ();
+      profile_count branch_count = profile_count::zero ();
+
+      FOR_EACH_EDGE (e, ei, bb->preds)
+	{
+	  if (e->flags & EDGE_FALLTHRU)
+	    has_fallthru = true, fallthru_count += e->count ();
+	  else
+	    branch_count += e->count ();
+	}
+
+      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+	continue;
+
+      if (bb->loop_father
+	  && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+	  && (has_fallthru
+	      ? (!(single_succ_p (bb)
+		   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+		 && optimize_bb_for_speed_p (bb)
+		 && branch_count + fallthru_count > count_threshold
+		 && (branch_count > fallthru_count * param_align_loop_iterations))
+	      /* In case there's no fallthru into the loop,
+		 the inserted nops won't be executed.  */
+	      : (branch_count > count_threshold
+		 || (bb->count > bb->prev_bb->count * 10
+		     && (bb->prev_bb->count
+			 <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
+	{
+	  rtx_insn *insn, *end_insn;
+	  HOST_WIDE_INT size = 0;
+	  bool padding_p = true;
+	  basic_block tbb = bb;
+	  unsigned cond_branch_num = 0;
+	  bool detect_tight_loop_p = false;
+
+	  for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+	       i++, tbb = tbb->next_bb)
+	    {
+	      /* Only handle a contiguous cfg layout.  */
+	      if (bb->loop_father != tbb->loop_father)
+		{
+		  padding_p = false;
+		  break;
+		}
+
+	      FOR_BB_INSNS (tbb, insn)
+		{
+		  if (!NONDEBUG_INSN_P (insn))
+		    continue;
+		  size += ix86_min_insn_size (insn);
+
+		  /* We don't know the size of inline asm.
+		     Don't align loops that contain calls.  */
+		  if (asm_noperands (PATTERN (insn)) >= 0
+		      || CALL_P (insn))
+		    {
+		      size = -1;
+		      break;
+		    }
+		}
+
+	      if (size == -1 || size > ix86_cost->prefetch_block)
+		{
+		  padding_p = false;
+		  break;
+		}
+
+	      FOR_EACH_EDGE (e, ei, tbb->succs)
+		{
+		  /* An edge back to bb indicates a tight loop.  */
+		  if (e->dest == bb)
+		    {
+		      detect_tight_loop_p = true;
+		      break;
+		    }
+		}
+
+	      if (detect_tight_loop_p)
+		break;
+
+	      end_insn = BB_END (tbb);
+	      if (JUMP_P (end_insn))
+		{
+		  /* For decoded icache:
+		     1. Up to two branches are allowed per Way.
+		     2. A non-conditional branch is the last micro-op in a Way.
+		  */
+		  if (onlyjump_p (end_insn)
+		      && (any_uncondjump_p (end_insn)
+			  || single_succ_p (tbb)))
+		    {
+		      padding_p = false;
+		      break;
+		    }
+		  else if (++cond_branch_num >= 2)
+		    {
+		      padding_p = false;
+		      break;
+		    }
+		}
+
+	    }
+
+	  if (padding_p && detect_tight_loop_p)
+	    {
+	      emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
+						    GEN_INT (0)), label);
+	      /* End of function.  */
+	      if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
+		break;
+	      /* Skip bb which already fits into one cacheline.  */
+	      bb = tbb;
+	    }
+	}
+    }
+
+  loop_optimizer_finalize ();
+  free_dominance_info (CDI_DOMINATORS);
+}
+
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
 static void
@@ -23433,6 +23577,8 @@  ix86_reorg (void)
 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
       if (TARGET_FOUR_JUMP_LIMIT)
 	ix86_avoid_jump_mispredicts ();
+
+      ix86_align_loops ();
 #endif
     }
 }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 764bfe20ff2..686de0bf2ff 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -19150,16 +19150,18 @@ 
    (set_attr "length_immediate" "0")
    (set_attr "modrm" "0")])
 
-;; Pad to 16-byte boundary, max skip in op0.  Used to avoid
+;; Pad to 1 << op0 byte boundary, max skip in op1.  Used to avoid
 ;; branch prediction penalty for the third jump in a 16-byte
 ;; block on K8.
+;; It is also used to align tight loops that fit into one cacheline.
+;; This helps code prefetch and reduces DSB misses.
 
-(define_insn "pad"
-  [(unspec_volatile [(match_operand 0)] UNSPECV_ALIGN)]
+(define_insn "max_skip_align"
+  [(unspec_volatile [(match_operand 0) (match_operand 1)] UNSPECV_ALIGN)]
   ""
 {
 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
-  ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, 4, (int)INTVAL (operands[0]));
+  ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, (int)INTVAL (operands[0]), (int)INTVAL (operands[1]));
 #else
   /* It is tempting to use ASM_OUTPUT_ALIGN here, but we don't want to do that.
      The align insn is used to avoid 3 jump instructions in the row to improve