From 4817aa3894ab5f8ff3cae394cbb20de86c607ed7 Mon Sep 17 00:00:00 2001
From: Carey Williams <carey.williams@arm.com>
Date: Wed, 17 Oct 2018 11:13:40 +0100
Subject: [PATCH 1/1] AArch64: Add branch dilution pass
2018-10-30 Carey Williams <Carey.Williams@arm.com>
* cfgbuild.c (inside_basic_block_p):
* cfgrtl.c (rtl_verify_bb_layout):
* config.gcc:
* config/aarch64/aarch64-passes.def (INSERT_PASS_BEFORE):
* config/aarch64/aarch64-protos.h (struct tune_params):
(make_pass_branch_diluter):
* config/aarch64/aarch64.c:
* config/aarch64/aarch64.md:
* config/aarch64/aarch64.opt:
* config/aarch64/b-diluter.c: New file.
* config/aarch64/t-aarch64:
* coretypes.h (class rtx_def):
* emit-rtl.c (emit_filler_after):
* rtl.def (FILLERT):
* rtl.h (class GTY):
(FILLERT_P):
(test):
(emit_filler_after):
* target-insns.def (fillert):
---
gcc/cfgbuild.c | 1 +
gcc/cfgrtl.c | 17 +-
gcc/config.gcc | 2 +-
gcc/config/aarch64/aarch64-branch-dilution.c | 668 +++++++++++++++++++++
gcc/config/aarch64/aarch64-passes.def | 1 +
gcc/config/aarch64/aarch64-protos.h | 4 +
gcc/config/aarch64/aarch64.c | 28 +
gcc/config/aarch64/aarch64.md | 8 +
gcc/config/aarch64/aarch64.opt | 12 +
gcc/config/aarch64/t-aarch64 | 6 +
gcc/coretypes.h | 1 +
gcc/doc/invoke.texi | 20 +
gcc/emit-rtl.c | 14 +
gcc/rtl.def | 5 +
gcc/rtl.h | 23 +
gcc/target-insns.def | 1 +
.../gcc.target/aarch64/branch-dilution-off.c | 57 ++
.../gcc.target/aarch64/branch-dilution-on.c | 58 ++
18 files changed, 924 insertions(+), 2 deletions(-)
create mode 100644 gcc/config/aarch64/aarch64-branch-dilution.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/branch-dilution-off.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/branch-dilution-on.c
@@ -58,6 +58,7 @@ inside_basic_block_p (const rtx_insn *insn)
case JUMP_TABLE_DATA:
case BARRIER:
+ case FILLER_INSN:
case NOTE:
return false;
@@ -61,6 +61,8 @@ along with GCC; see the file COPYING3. If not see
#include "cfgloop.h"
#include "tree-pass.h"
#include "print-rtl.h"
+#include "rtl-iter.h"
+
/* Holds the interesting leading and trailing notes for the function.
Only applicable if the CFG is in cfglayout mode. */
@@ -2982,7 +2984,20 @@ rtl_verify_bb_layout (void)
break;
default:
- fatal_insn ("insn outside basic block", x);
+ /* Allow nops after branches, via FILLER_INSN. */
+ bool fail = true;
+ subrtx_iterator::array_type array;
+ FOR_EACH_SUBRTX (iter, array, x, ALL)
+ {
+ const_rtx x = *iter;
+ if (GET_CODE (x) == FILLER_INSN)
+ {
+ fail = false;
+ break;
+ }
+ }
+ if (fail)
+ fatal_insn ("insn outside basic block", x);
}
}
@@ -305,7 +305,7 @@ aarch64*-*-*)
extra_headers="arm_fp16.h arm_neon.h arm_acle.h"
c_target_objs="aarch64-c.o"
cxx_target_objs="aarch64-c.o"
- extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o"
+ extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-branch-dilution.o"
target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c"
target_has_targetm_common=yes
;;
new file mode 100644
@@ -0,0 +1,668 @@
+/* Branch dilution optimization pass for AArch64.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+ Contributed by ARM Ltd.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#define IN_TARGET_CODE 1
+
+#include "config.h"
+#define INCLUDE_LIST
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "rtl.h"
+#include "df.h"
+#include "insn-config.h"
+#include "regs.h"
+#include "memmodel.h"
+#include "emit-rtl.h"
+#include "recog.h"
+#include "cfganal.h"
+#include "insn-attr.h"
+#include "context.h"
+#include "tree-pass.h"
+#include "regrename.h"
+#include "aarch64-protos.h"
+#include "cfghooks.h"
+#include "cfgrtl.h"
+#include "cfgbuild.h"
+#include "errors.h"
+
+unsigned MAX_BRANCH = 0;
+unsigned GRANULE_SIZE = 0;
+/* Return true if INSN is a branch insn (jump, call or return).  */
+inline bool
+is_branch (rtx_insn *insn)
+{
+ if (insn != NULL)
+ return JUMP_P (insn) || CALL_P (insn) || ANY_RETURN_P (insn);
+ return false;
+}
+
+const pass_data
+pass_data_branch_dilution =
+{
+ RTL_PASS, /* type. */
+ "branch-dilution", /* name. */
+ OPTGROUP_NONE, /* optinfo_flags. */
+ TV_NONE, /* tv_id. */
+ 0, /* properties_required. */
+ 0, /* properties_provided. */
+ 0, /* properties_destroyed. */
+ 0, /* todo_flags_start. */
+ 0, /* todo_flags_finish. */
+};
+
+/* The branch dilution RTL pass.  */
+
+class pass_branch_dilution : public rtl_opt_pass
+{
+public:
+ pass_branch_dilution (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_branch_dilution, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ { return optimize && aarch64_bdilution; }
+
+ virtual unsigned execute (function *);
+
+};
+
+/* Simple wrapper for RTX insns added to a granule.
+   It aids analysis and manipulation. */
+struct insn_info
+{
+ insn_info (rtx_insn *);
+ rtx_insn *rtx; /* underlying gcc rtx insn. */
+ insn_info *next; /* next insn in the granule. */
+ insn_info *prev; /* prev insn in the granule. */
+ unsigned index; /* current position in the granule. */
+ bool is_branch; /* denotes a branch insn. */
+  bool is_unconditional; /* denotes an unconditional branch. */
+ bool is_nop; /* denotes a nop insn. */
+ bool ignore; /* to ignore unsupported branch types. */
+};
+
+insn_info::insn_info (rtx_insn *i)
+ : rtx (i), next (NULL), prev (NULL), index (-1),
+ is_branch (false), is_unconditional (false), is_nop (false),
+ ignore (false)
+{}
+
+/* A sliding-window-like abstraction that represents
+   the current view of instructions that are being
+   considered for branch dilution. */
+class insn_granule
+{
+public:
+ insn_granule ();
+ /* Debug method. */
+ void dump (const char *desc = "General");
+ /* Attempt to dilute/pad granule with nops. */
+ int dilute ();
+ /* Returns true if the granule needs diluting. */
+ bool saturated ();
+ /* Utility functions for inserting instructions into
+ the granule. */
+ void insert_insn_before (insn_info *, insn_info *);
+ void insert_insn_after (insn_info *, insn_info *);
+ void add_insn (insn_info *);
+private:
+ void remove_oldest_insn ();
+ void remove_newest_insn ();
+ void update_indexes ();
+  /* Utility functions for handling special nop
+ insertion. */
+ void insert_nop_block_after (insn_info *insn);
+ basic_block create_nop_block_after_insn (insn_info *, int);
+ /* Dilution heuristics. */
+ insn_info *get_best_branch ();
+ int branch_heuristic (insn_info *);
+
+ /* Pointers to the first/last instructions in granule. */
+ insn_info *m_first = NULL;
+ insn_info *m_last = NULL;
+
+ /* Current counts of each interesting insn type in the granule. */
+ unsigned m_insn_count;
+ unsigned m_branch_count;
+ unsigned m_ubranch_count;
+};
+
+insn_granule::insn_granule ()
+ : m_insn_count (0), m_branch_count (0), m_ubranch_count (0)
+{}
+
+/* Create a new basic block, populated with nop_count nops, after a given
+ instruction. */
+
+basic_block insn_granule::create_nop_block_after_insn
+ (insn_info *insn, int nop_count)
+{
+ gcc_assert (nop_count > 0);
+ basic_block bb = BLOCK_FOR_INSN (insn->rtx);
+ rtx_insn *nop_ptr = NULL;
+ gcc_assert (is_branch (BB_END (bb)));
+ nop_ptr = emit_insn_after (gen_nop (), BB_END (bb));
+ insn_info *nop_insn = new insn_info (nop_ptr);
+ nop_insn->is_nop = true;
+ insert_insn_after (nop_insn, insn);
+ basic_block new_bb = create_basic_block (nop_ptr, NULL, bb);
+ set_block_for_insn (nop_ptr, new_bb);
+ for (int i = 0; i < (nop_count - 1); i++)
+ {
+ nop_ptr = emit_insn_after (gen_nop (), nop_ptr);
+ nop_insn = new insn_info (nop_ptr);
+ set_block_for_insn (nop_ptr, new_bb);
+ nop_insn->is_nop = true;
+ insert_insn_after (nop_insn, insn);
+ }
+ /* Fix up block endings. */
+ BB_END (new_bb) = nop_ptr;
+ BB_END (bb) = insn->rtx;
+ return new_bb;
+}
+
+/* Dump a textual representation of the current state
+ of the insn_granule in the following format:
+
+ "===== GRANULE ====="
+ "====> {desc} <===="
+ "---> INS:?, BRA:? (?), FIRST: ?, LAST: ?"
+ "0. jump_insn > NEXT = (?), PREV = (?) -- UID: ?"
+ "1. insn (nop) > NEXT = (?), PREV = (?) -- UID: ?"
+ "2. insn > NEXT = (?), PREV = (?) -- UID: ?"
+ "3. jump_insn > NEXT = (?), PREV = (?) -- UID: ?"
+
+ Used only for debugging with fdump.
+ */
+
+void insn_granule::dump (const char *desc)
+{
+ gcc_assert (dump_file);
+ insn_info *insn = m_first;
+ fprintf (dump_file, "===== GRANULE =====\n====> %s <====\n", desc);
+ fprintf (dump_file, "---> INS:%d, BRA:%d (%d), FIRST: %d, LAST: %d\n",
+ m_insn_count, m_branch_count, m_ubranch_count,
+ m_first->index, m_last->index);
+ while (insn)
+ {
+ fprintf (dump_file, "%d. %s%s%s > NEXT = (%d), PREV = (%d) -- UID: %d\n",
+ insn->index, GET_RTX_NAME (GET_CODE (insn->rtx)),
+ any_uncondjump_p (insn->rtx) ? " (ubranch)" : "",
+ insn->is_nop ? " (nop)" : "", insn->next ? insn->next->index : -1,
+ insn->prev ? insn->prev->index : -1, INSN_UID (insn->rtx));
+ insn = insn->next;
+ }
+}
+
+/* Simple heuristic used to favor certain types of branches to pad
+   after.  For example, we prefer unconditional branches or branches
+   surrounded by other branches. */
+
+int insn_granule::branch_heuristic (insn_info *insn)
+{
+ int value = 0;
+ if (!is_branch (insn->rtx))
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "--> Ignoring insn: %d as it's not a branch\n",
+ insn->index);
+ return -1;
+ }
+ if (insn->ignore)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "--> Ignoring branch insn: %d (unsupported)\n",
+ insn->index);
+ return -1;
+ }
+ if (insn == m_last)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "--> Ignoring insn: %d as it's the last granule insn\n",
+ insn->index);
+ return -1;
+ }
+ if (insn->is_unconditional)
+ {
+ value += 2;
+ }
+ if (is_branch (prev_real_nondebug_insn (insn->rtx))
+ && is_branch (next_real_nondebug_insn (insn->rtx)))
+ {
+ value++;
+ }
+ return value;
+}
+
+/* Iterate over the granule and test a heuristic against each insn.
+ Return the insn that seems most promising to pad after, or the
+ first, in a case where no others seem suitable. */
+
+insn_info* insn_granule::get_best_branch ()
+{
+ insn_info *current_insn = m_first;
+ insn_info *best_insn = m_first;
+ int current_score = 0;
+ int best_score = 0;
+ /* Make sure we start with a branch. */
+ while (current_insn && !current_insn->is_branch)
+ {
+ current_insn = current_insn->next;
+ }
+ best_insn = current_insn;
+ while (current_insn)
+ {
+ current_score = branch_heuristic (current_insn);
+ if (dump_file)
+ fprintf (dump_file, "Evaluating insn %d (%s), score = %d\n",
+ current_insn->index,
+ GET_RTX_NAME (GET_CODE (current_insn->rtx)), current_score);
+ if (current_score > best_score)
+ {
+ best_score = current_score;
+ best_insn = current_insn;
+ }
+ current_insn = current_insn->next;
+ }
+ if (dump_file)
+ fprintf (dump_file, "Returning best insn: %d %s.\n", best_insn->index,
+ GET_RTX_NAME (GET_CODE (best_insn->rtx)));
+ return best_insn;
+}
+
+/* Insert the instruction into the granule, before the given instruction. */
+
+void insn_granule::insert_insn_before (insn_info *new_insn,
+ insn_info *current_insn)
+{
+ if (dump_file)
+ fprintf (dump_file, "Inserting insn before insn %d, at position %d\n",
+ current_insn->index, current_insn->index);
+ new_insn->index = current_insn->index;
+ if (current_insn != m_first)
+ {
+ current_insn->prev->next = new_insn;
+ new_insn->prev = current_insn->prev;
+ }
+ /* remove_newest_insn () will tidy up any loose ends, like LAST ptr. */
+ current_insn->prev = new_insn;
+ new_insn->next = current_insn;
+ m_insn_count++;
+ if (current_insn == m_first)
+ m_first = new_insn;
+ update_indexes ();
+}
+
+/* Insert the instruction into the granule, after the given instruction. */
+
+void insn_granule::insert_insn_after (insn_info *new_insn,\
+ insn_info *current_insn)
+{
+ gcc_assert (current_insn != m_last);
+ if (dump_file)
+ fprintf (dump_file, "Inserting nop after insn %d, at position %d\n",
+ current_insn->index, current_insn->index + 1);
+ new_insn->index = current_insn->index + 1;
+
+ current_insn->next->prev = new_insn;
+ new_insn->next = current_insn->next;
+ current_insn->next = new_insn;
+ new_insn->prev = current_insn;
+
+ m_insn_count++;
+ update_indexes ();
+}
+
+/* Insert a new basic block containing a nop. */
+
+void insn_granule::insert_nop_block_after (insn_info *insn)
+{
+ edge e;
+ basic_block bb = BLOCK_FOR_INSN (insn->rtx);
+ e = find_fallthru_edge (bb->succs);
+ if (e)
+ {
+ basic_block new_bb = create_nop_block_after_insn (insn, 1);
+ /* Wire up the edges and preserve the partition. */
+ make_edge (new_bb, e->dest, EDGE_FALLTHRU);
+ BB_COPY_PARTITION (new_bb, bb);
+ redirect_edge_succ_nodup (e, new_bb);
+ }
+ else
+ {
+ /* Odd special case for exit functions,
+ which have no fallthru edge. */
+ insn_info *nop_insn = new insn_info (
+ (emit_filler_after (gen_filler_insn (), insn->rtx)));
+ nop_insn->is_nop = true;
+ insert_insn_after (nop_insn, insn);
+ }
+}
+
+/* Analyse the granule's branches and insert a nop to dilute. */
+
+int insn_granule::dilute ()
+{
+ if (dump_file)
+ fprintf (dump_file, "> Starting dilution.\n");
+ insn_info *branch_insn = NULL;
+ branch_insn = get_best_branch ();
+ /* If the granule is saturated then branches should be available.
+ Inserting nops after the granule isn't going to help dilute it. */
+ gcc_assert (branch_insn && (branch_insn->is_branch
+ && (branch_insn != m_last)));
+ if (any_condjump_p (branch_insn->rtx))
+ {
+ /* We can insert a nop via a 'nop block'
+ attached to the fallthru edge. */
+ insert_nop_block_after (branch_insn);
+ }
+ else if (CALL_P (branch_insn->rtx))
+ {
+ /* Standard calls do not end the basic block,
+ so a simple emit will suffice. */
+ if (!control_flow_insn_p (branch_insn->rtx))
+ {
+ insn_info *nop_insn = new insn_info
+ (emit_insn_after (gen_nop (), branch_insn->rtx));
+ nop_insn->is_nop = true;
+ insert_insn_after (nop_insn, branch_insn);
+ }
+ else
+ {
+ /* Sibling calls and no-return calls do. */
+ insert_nop_block_after (branch_insn);
+ }
+ }
+ else if (returnjump_p (branch_insn->rtx))
+ {
+ /* Return jumps must be followed by their barrier,
+ so we emit the filler after that. */
+ insn_info *nop_insn = new insn_info (
+ (emit_filler_after (gen_filler_insn (),
+ next_nonnote_nondebug_insn (branch_insn->rtx))));
+ nop_insn->is_nop = true;
+ insert_insn_after (nop_insn, branch_insn);
+ }
+ else if (any_uncondjump_p (branch_insn->rtx))
+ {
+ /* Any remaining unconditionals can be handled by a filler. */
+ insn_info *nop_insn = new insn_info (
+ (emit_filler_after (gen_filler_insn (), branch_insn->rtx)));
+ nop_insn->is_nop = true;
+ insert_insn_after (nop_insn, branch_insn);
+ }
+ else if (pc_set (branch_insn->rtx) || JUMP_P (branch_insn->rtx))
+ {
+      /* TODO: handle pc_set jumps and asm gotos.
+ For now we'll just pretend they're not branches. */
+ branch_insn->is_branch = false;
+ m_branch_count--;
+ if (branch_insn->is_unconditional)
+ {
+ branch_insn->is_unconditional = false;
+ m_ubranch_count--;
+ }
+ branch_insn->ignore = true;
+ }
+ else
+ {
+ /* Unhandled branch type. */
+ if (dump_file)
+ {
+ fprintf (dump_file, "Error: unhandled branch type:\n");
+ print_rtl_single (dump_file, branch_insn->rtx);
+ }
+ gcc_unreachable ();
+ }
+
+ /* Trim the granule back to size after nop insertion. */
+ int rollback = 0;
+ while (m_insn_count > GRANULE_SIZE)
+ {
+ remove_newest_insn ();
+ rollback++;
+ }
+ if (dump_file)
+ fprintf (dump_file, "< End dilution.\n");
+ return rollback;
+}
+
+/* Return true if the granule needs diluting. */
+
+bool insn_granule::saturated ()
+{ return m_branch_count > MAX_BRANCH; }
+
+void
+insn_granule::update_indexes ()
+{
+ insn_info *i = m_first;
+ int n = 0;
+ i->index = n++;
+ while ((i = i->next))
+ {
+ i->index = n++;
+ }
+}
+
+/* Add an instruction to the granule. */
+
+void
+insn_granule::add_insn (insn_info *insn)
+{
+ if (m_insn_count == 0)
+ {
+ m_first = m_last = insn;
+ }
+ else
+ {
+ if (m_insn_count >= GRANULE_SIZE)
+ {
+ remove_oldest_insn ();
+ }
+ m_last->next = insn;
+ insn->prev = m_last;
+ m_last = insn;
+ }
+ insn->index = m_insn_count++;
+ if (is_branch (insn->rtx))
+ {
+ m_branch_count++;
+ insn->is_branch = true;
+ if (any_uncondjump_p (insn->rtx))
+ {
+ insn->is_unconditional = true;
+ m_ubranch_count++;
+ }
+ }
+ update_indexes ();
+}
+
+/* Remove the last added instruction from the granule. */
+
+void
+insn_granule::remove_newest_insn ()
+{
+ insn_info *to_delete = m_last;
+ if (dump_file)
+ fprintf (dump_file, "to_delete: %d UID: %d\n", to_delete->index,
+ INSN_UID (to_delete->rtx));
+ m_last = to_delete->prev;
+ m_last->next = NULL;
+ if (is_branch (to_delete->rtx) && !to_delete->ignore)
+ {
+ m_branch_count--;
+ if (any_uncondjump_p (to_delete->rtx))
+ {
+ m_ubranch_count--;
+ }
+ }
+ m_insn_count--;
+ delete to_delete;
+ update_indexes ();
+}
+
+/* Remove the first added instruction from the granule. */
+
+void
+insn_granule::remove_oldest_insn ()
+{
+ insn_info *to_delete = m_first;
+ m_first = to_delete->next;
+ m_first->prev = NULL;
+ if (is_branch (to_delete->rtx) && !to_delete->ignore)
+ {
+ m_branch_count--;
+ if (any_uncondjump_p (to_delete->rtx))
+ {
+ m_ubranch_count--;
+ }
+ }
+ m_insn_count--;
+ delete to_delete;
+ update_indexes ();
+}
+
+/* Create the branch dilution pass. */
+
+rtl_opt_pass
+*make_pass_branch_dilution (gcc::context *ctxt)
+{
+ return new pass_branch_dilution (ctxt);
+}
+
+/* Return the next instruction after START that is a "real" instruction
+ i.e. not barriers, code labels, debug insns etc. */
+
+static inline rtx_insn *
+next_space_consuming_insn (rtx_insn *start)
+{
+ rtx_insn *insn = next_real_nondebug_insn (start);
+ while (insn && (recog_memoized (insn) < 0))
+ insn = next_real_nondebug_insn (insn);
+ return insn;
+}
+
+/* Find the next branching insn in the instruction stream,
+   starting after (and excluding) START.  Return that insn,
+   or NULL if no further branch is found in the remainder
+   of the stream.  */
+
+rtx_insn *
+find_next_branch (rtx_insn *start)
+{
+ rtx_insn *insn = next_space_consuming_insn (start);
+ while (insn)
+ {
+ if (is_branch (insn))
+ {
+ return insn;
+ }
+ insn = next_space_consuming_insn (insn);
+ }
+ return NULL;
+}
+
+/* Execute the branch dilution pass. */
+
+unsigned int
+pass_branch_dilution::execute (ATTRIBUTE_UNUSED function *fun)
+{
+
+ if (global_options_set.x_aarch64_bdilution_gsize)
+ {
+ GRANULE_SIZE = (unsigned)aarch64_bdilution_gsize;
+ MAX_BRANCH = (unsigned)aarch64_bdilution_maxb;
+ }
+ else
+ {
+ GRANULE_SIZE = aarch64_tune_params.bdilution_gsize;
+ MAX_BRANCH = aarch64_tune_params.bdilution_maxb;
+ }
+
+ if (dump_file)
+ fprintf (dump_file, "BDILUTE OPTIONS: %d, %d\n", GRANULE_SIZE, MAX_BRANCH);
+
+ /* Disabled. */
+ if (GRANULE_SIZE < 1)
+ return 0;
+ /* Invalid. */
+ if (MAX_BRANCH < 1)
+ error ("branch dilution: max branch must be greater than zero");
+ else if (MAX_BRANCH >= GRANULE_SIZE)
+ {
+ error ("branch dilution: max branches (%d) must be \
+ less than granule size (%d)", MAX_BRANCH, GRANULE_SIZE);
+ }
+
+ if (dump_file)
+ fprintf (dump_file, "branch-dilution start:\n");
+
+  /* Start scanning from the first occurring branch. */
+ rtx_insn *curr_insn = find_next_branch (get_insns ());
+
+ if (!curr_insn)
+ return 0;
+
+ insn_granule granule;
+ granule.add_insn (new insn_info (curr_insn));
+
+ if (dump_file)
+ granule.dump ();
+
+ /* Iterate over the rest of the instruction stream. */
+ while ((curr_insn = next_nonnote_nondebug_insn (curr_insn)))
+ {
+ if (!LABEL_P (curr_insn) && !BARRIER_P (curr_insn))
+ {
+ granule.add_insn (new insn_info (curr_insn));
+ if (dump_file)
+ granule.dump ();
+ }
+ else
+ {
+ /* Only check for saturation if changes have been made. */
+ continue;
+ }
+ if (granule.saturated ())
+ {
+ if (dump_file)
+ fprintf (dump_file, "Granule is saturated.\n");
+ int rollback = granule.dilute ();
+ /* Now we need to re-scan any pushed out instructions. */
+ while (rollback > 0)
+ {
+ curr_insn = previous_insn (curr_insn);
+ rollback--;
+ }
+ if (dump_file)
+ granule.dump ();
+ }
+ }
+ if (dump_file)
+ fprintf (dump_file, "branch-dilution end.\n");
+ return 0;
+}
@@ -21,3 +21,4 @@
INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering);
INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation);
INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
+INSERT_PASS_BEFORE (pass_compute_alignments, 1, pass_branch_dilution);
@@ -263,6 +263,9 @@ struct tune_params
int vec_reassoc_width;
int min_div_recip_mul_sf;
int min_div_recip_mul_df;
+ /* Branch dilution. */
+ unsigned int bdilution_gsize;
+ unsigned int bdilution_maxb;
/* Value for aarch64_case_values_threshold; or 0 for the default. */
unsigned int max_case_values;
/* An enum specifying how to take into account CPU autoprefetch capabilities
@@ -623,6 +626,7 @@ std::string aarch64_get_extension_string_for_isa_flags (unsigned long,
rtl_opt_pass *make_pass_fma_steering (gcc::context *);
rtl_opt_pass *make_pass_track_speculation (gcc::context *);
rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
+rtl_opt_pass *make_pass_branch_dilution (gcc::context *ctxt);
poly_uint64 aarch64_regmode_natural_size (machine_mode);
@@ -681,6 +681,8 @@ static const struct tune_params generic_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
@@ -707,6 +709,8 @@ static const struct tune_params cortexa35_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
@@ -733,6 +737,8 @@ static const struct tune_params cortexa53_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
@@ -759,6 +765,8 @@ static const struct tune_params cortexa57_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
@@ -785,6 +793,8 @@ static const struct tune_params cortexa72_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 4, /* bdilution_gsize. */
+ 2, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
@@ -811,6 +821,8 @@ static const struct tune_params cortexa73_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
@@ -838,6 +850,8 @@ static const struct tune_params exynosm1_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
48, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
@@ -863,6 +877,8 @@ static const struct tune_params thunderxt88_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
@@ -888,6 +904,8 @@ static const struct tune_params thunderx_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
@@ -915,6 +933,8 @@ static const struct tune_params tsv110_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
@@ -940,6 +960,8 @@ static const struct tune_params xgene1_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
@@ -966,6 +988,8 @@ static const struct tune_params qdf24xx_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
@@ -994,6 +1018,8 @@ static const struct tune_params saphira_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
@@ -1020,6 +1046,8 @@ static const struct tune_params thunderx2t99_tunings =
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
+ 0, /* bdilution_gsize. */
+ 0, /* bdilution_maxb. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
@@ -730,6 +730,14 @@
[(set_attr "type" "branch")]
)
+(define_insn "filler_insn"
+ [(filler_insn)]
+ ""
+ "nop"
+ [(set_attr "type" "no_insn")
+ (set_attr "length" "4")]
+)
+
(define_insn "*cb<optab><mode>1"
[(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
(const_int 0))
@@ -153,6 +153,18 @@ msign-return-address=
Target RejectNegative Report Joined Enum(aarch64_ra_sign_scope_t) Var(aarch64_ra_sign_scope) Init(AARCH64_FUNCTION_NONE) Save
Select return address signing scope.
+mbranch-dilution
+Target Report RejectNegative Save Var(aarch64_bdilution) Init(0) Save
+Run the branch dilution pass.
+
+mbranch-dilution-granularity=
+Target RejectNegative Joined UInteger Var(aarch64_bdilution_gsize) Init(0)
+Size of instruction stream granules (window).
+
+mbranch-dilution-max-branches=
+Target RejectNegative Joined UInteger Var(aarch64_bdilution_maxb) Init(0)
+Max number of branches that should appear within an instruction granule.
+
Enum
Name(aarch64_ra_sign_scope_t) Type(enum aarch64_function_type)
Supported AArch64 return address signing scope (for use with -msign-return-address= option):
@@ -87,6 +87,12 @@ falkor-tag-collision-avoidance.o: \
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
$(srcdir)/config/aarch64/falkor-tag-collision-avoidance.c
+aarch64-branch-dilution.o: $(srcdir)/config/aarch64/aarch64-branch-dilution.c \
+ $(CONFIG_H) $(SYSTEM_H) $(RTL_BASE_H) \
+ $(srcdir)/config/aarch64/aarch64-protos.h
+ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+ $(srcdir)/config/aarch64/aarch64-branch-dilution.c
+
comma=,
MULTILIB_OPTIONS = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG))))
MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG))
@@ -83,6 +83,7 @@ class rtx_def;
class rtx_call_insn; /* CALL_P (X) */
class rtx_jump_table_data; /* JUMP_TABLE_DATA_P (X) */
class rtx_barrier; /* BARRIER_P (X) */
+ class rtx_filler_insn; /* FILLER_INSN_P (X) */
class rtx_code_label; /* LABEL_P (X) */
class rtx_note; /* NOTE_P (X) */
@@ -624,6 +624,9 @@ Objective-C and Objective-C++ Dialects}.
-msign-return-address=@var{scope} @gol
-march=@var{name} -mcpu=@var{name} -mtune=@var{name} @gol
-moverride=@var{string} -mverbose-cost-dump -mtrack-speculation}
+-mbranch-dilution @gol
+-mbranch-dilution-granularity=@var{num} @gol
+-mbranch-dilution-max-branches=@var{num}
@emph{Adapteva Epiphany Options}
@gccoptlist{-mhalf-reg-file -mprefer-short-insn-regs @gol
@@ -15209,6 +15212,23 @@ Permissible values are @samp{none}, which disables return address signing,
functions, and @samp{all}, which enables pointer signing for all functions. The
default value is @samp{none}.
+@item -mbranch-dilution
+@opindex mbranch-dilution
+Enable the branch dilution optimization pass to improve performance when
+expecting high branch density.
+
+@item -mbranch-dilution-granularity
+@opindex mbranch-dilution-granularity
+Specify the size of the granules (instruction windows) to be considered
+for branch dilution. When omitted, the tuning for the specified @option{-mcpu}
+will be used.
+
+@item -mbranch-dilution-max-branches
+@opindex mbranch-dilution-max-branches
+Specify the number of branches a granule may contain before it is considered
+saturated, requiring branch dilution. When omitted, the tuning for the
+specified @option{-mcpu} will be used.
+
@item -msve-vector-bits=@var{bits}
@opindex msve-vector-bits
Specify the number of bits in an SVE vector register. This option only has
@@ -4758,6 +4758,20 @@ emit_barrier_after (rtx_insn *after)
return insn;
}
+/* Make an insn of code FILLER_INSN to
+ pad out the instruction stream.
+ PATTERN should be from gen_filler_insn ().
+ AFTER will typically be an unconditional
+ branch at the end of a basic block. */
+
+rtx_insn *
+emit_filler_after (rtx pattern, rtx_insn *after)
+{
+ rtx_insn* i = make_insn_raw (pattern);
+ add_insn_after_nobb (i, after);
+ return i;
+}
+
/* Emit the label LABEL after the insn AFTER. */
rtx_insn *
@@ -338,6 +338,11 @@ DEF_RTL_EXPR(RETURN, "return", "", RTX_EXTRA)
conditional jumps. */
DEF_RTL_EXPR(SIMPLE_RETURN, "simple_return", "", RTX_EXTRA)
+/* Special filler type, used to pad the instruction stream. */
+
+DEF_RTL_EXPR(FILLER_INSN, "filler_insn", "", RTX_INSN)
+
+
/* Special for EH return from subroutine. */
DEF_RTL_EXPR(EH_RETURN, "eh_return", "", RTX_EXTRA)
@@ -668,6 +668,17 @@ class GTY(()) rtx_barrier : public rtx_insn
from rtl.def. */
};
+class GTY(()) rtx_filler_insn : public rtx_insn
+{
+ /* No extra fields, but adds the invariant:
+ FILLER_INSN_P (X) aka (GET_CODE (X) == FILLER_INSN)
+ i.e. a marker that indicates the INSN stream should be padded.
+
+ This is an instance of:
+ DEF_RTL_EXPR(FILLER_INSN, "filler_insn", "", RTX_INSN)
+ from rtl.def. */
+};
+
class GTY(()) rtx_code_label : public rtx_insn
{
/* No extra fields, but adds the invariant:
@@ -860,6 +871,9 @@ struct GTY(()) rtvec_def {
/* Predicate yielding nonzero iff X is a barrier insn. */
#define BARRIER_P(X) (GET_CODE (X) == BARRIER)
+/* Predicate yielding nonzero iff X is a filler insn. */
+#define FILLER_INSN_P(X) (GET_CODE (X) == FILLER_INSN)
+
/* Predicate yielding nonzero iff X is a data for a jump table. */
#define JUMP_TABLE_DATA_P(INSN) (GET_CODE (INSN) == JUMP_TABLE_DATA)
@@ -968,6 +982,14 @@ is_a_helper <rtx_barrier *>::test (rtx rt)
template <>
template <>
inline bool
+is_a_helper <rtx_filler_insn *>::test (rtx rt)
+{
+ return FILLER_INSN_P (rt);
+}
+
+template <>
+template <>
+inline bool
is_a_helper <rtx_code_label *>::test (rtx rt)
{
return LABEL_P (rt);
@@ -3257,6 +3279,7 @@ extern rtx_insn *emit_debug_insn_after (rtx, rtx_insn *);
extern rtx_insn *emit_debug_insn_after_noloc (rtx, rtx_insn *);
extern rtx_insn *emit_debug_insn_after_setloc (rtx, rtx_insn *, location_t);
extern rtx_barrier *emit_barrier_after (rtx_insn *);
+extern rtx_insn *emit_filler_after (rtx, rtx_insn *);
extern rtx_insn *emit_label_after (rtx_insn *, rtx_insn *);
extern rtx_note *emit_note_after (enum insn_note, rtx_insn *);
extern rtx_insn *emit_insn (rtx);
@@ -94,6 +94,7 @@ DEF_TARGET_INSN (sibcall_epilogue, (void))
DEF_TARGET_INSN (sibcall_value, (rtx x0, rtx x1, rtx opt2, rtx opt3,
rtx opt4))
DEF_TARGET_INSN (simple_return, (void))
+DEF_TARGET_INSN (filler_insn, (void))
DEF_TARGET_INSN (split_stack_prologue, (void))
DEF_TARGET_INSN (split_stack_space_check, (rtx x0, rtx x1))
DEF_TARGET_INSN (stack_protect_set, (rtx x0, rtx x1))
new file mode 100644
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -mcpu=cortex-a72 --param case-values-threshold=50" } */
+/* { dg-final { scan-assembler-not "\\s*b.*\n\\snop\n" } } */
+#include <stdlib.h>
+
+void
+branch (int* n)
+{
+ while((*n % 2 != 0))
+ *n=(*n * *n);
+}
+
+int
+main()
+{
+ int *i = malloc(sizeof(int));
+ *i = 5;
+ while (*i<1000) {
+ switch (*i) {
+ case 1:
+ *i += 1;
+ branch(i);
+ break;
+ case 2:
+ *i += 2;
+ branch(i);
+ break;
+ case 3:
+ *i += 3;
+ branch(i);
+ break;
+ case 4:
+ *i += 4;
+ branch(i);
+ break;
+ case 5:
+ *i += 5;
+ branch(i);
+ break;
+ case 6:
+ *i += 6;
+ branch(i);
+ break;
+ case 7:
+ *i += 7;
+ branch(i);
+ break;
+ case 8:
+ *i += 8;
+ branch(i);
+ break;
+ default:
+ branch(i);
+ }
+ }
+ return *i;
+}
new file mode 100644
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -mbranch-dilution -mcpu=cortex-a72 --param case-values-threshold=50 -fdump-rtl-branch-dilution" } */
+/* { dg-final { scan-assembler "\\s*b.*\n\\snop\n" } } */
+/* { dg-final { scan-rtl-dump "filler_insn" "branch-dilution"} } */
+#include <stdlib.h>
+
+void
+branch (int* n)
+{
+ while((*n % 2 != 0))
+ *n=(*n * *n);
+}
+
+int
+main()
+{
+ int *i = malloc(sizeof(int));
+ *i = 5;
+ while (*i<1000) {
+ switch (*i) {
+ case 1:
+ *i += 1;
+ branch(i);
+ break;
+ case 2:
+ *i += 2;
+ branch(i);
+ break;
+ case 3:
+ *i += 3;
+ branch(i);
+ break;
+ case 4:
+ *i += 4;
+ branch(i);
+ break;
+ case 5:
+ *i += 5;
+ branch(i);
+ break;
+ case 6:
+ *i += 6;
+ branch(i);
+ break;
+ case 7:
+ *i += 7;
+ branch(i);
+ break;
+ case 8:
+ *i += 8;
+ branch(i);
+ break;
+ default:
+ branch(i);
+ }
+ }
+ return *i;
+}
--
2.7.4