@@ -781,6 +781,7 @@ extern const atomic_ool_names aarch64_ool_ldeor_names;
tree aarch64_resolve_overloaded_builtin_general (location_t, tree, void *);
const char * aarch64_sls_barrier (int);
+const char * aarch64_indirect_call_asm (rtx);
extern bool aarch64_harden_sls_retbr_p (void);
extern bool aarch64_harden_sls_blr_p (void);
@@ -643,6 +643,16 @@ extern unsigned aarch64_architecture_version;
#define GP_REGNUM_P(REGNO) \
(((unsigned) (REGNO - R0_REGNUM)) <= (R30_REGNUM - R0_REGNUM))
+/* Registers known to be preserved over a BL instruction. This consists of the
+ GENERAL_REGS without x16, x17, and x30. The x30 register is changed by the BL
+ instruction itself, while the x16 and x17 registers may be used by veneers
+ which can be inserted by the linker. */
+#define STUB_REGNUM_P(REGNO) \
+ (GP_REGNUM_P (REGNO) \
+ && ((unsigned) (REGNO - R0_REGNUM)) != (R16_REGNUM - R0_REGNUM) \
+ && ((unsigned) (REGNO - R0_REGNUM)) != (R17_REGNUM - R0_REGNUM) \
+ && ((unsigned) (REGNO - R0_REGNUM)) != (R30_REGNUM - R0_REGNUM))
+
#define FP_REGNUM_P(REGNO) \
(((unsigned) (REGNO - V0_REGNUM)) <= (V31_REGNUM - V0_REGNUM))
@@ -667,6 +677,7 @@ enum reg_class
{
NO_REGS,
TAILCALL_ADDR_REGS,
+ STUB_REGS,
GENERAL_REGS,
STACK_REG,
POINTER_REGS,
@@ -689,6 +700,7 @@ enum reg_class
{ \
"NO_REGS", \
"TAILCALL_ADDR_REGS", \
+ "STUB_REGS", \
"GENERAL_REGS", \
"STACK_REG", \
"POINTER_REGS", \
@@ -708,6 +720,7 @@ enum reg_class
{ \
{ 0x00000000, 0x00000000, 0x00000000 }, /* NO_REGS */ \
{ 0x00030000, 0x00000000, 0x00000000 }, /* TAILCALL_ADDR_REGS */\
+ { 0x3ffcffff, 0x00000000, 0x00000000 }, /* STUB_REGS */ \
{ 0x7fffffff, 0x00000000, 0x00000003 }, /* GENERAL_REGS */ \
{ 0x80000000, 0x00000000, 0x00000000 }, /* STACK_REG */ \
{ 0xffffffff, 0x00000000, 0x00000003 }, /* POINTER_REGS */ \
@@ -879,6 +892,8 @@ typedef struct GTY (()) machine_function
struct aarch64_frame frame;
/* One entry for each hard register. */
bool reg_is_wrapped_separately[LAST_SAVED_REGNUM];
+ /* One entry for each general purpose register. */
+ rtx call_via[SP_REGNUM];
bool label_is_assembled;
} machine_function;
#endif
@@ -10607,6 +10607,9 @@ aarch64_label_mentioned_p (rtx x)
enum reg_class
aarch64_regno_regclass (unsigned regno)
{
+ if (STUB_REGNUM_P (regno))
+ return STUB_REGS;
+
if (GP_REGNUM_P (regno))
return GENERAL_REGS;
@@ -10869,7 +10872,7 @@ aarch64_asm_trampoline_template (FILE *f)
specific attributes to choose between hardening against straight line
speculation or not, but such function specific attributes are likely to
happen in the future. */
- output_asm_insn ("dsb\tsy\n\tisb", NULL);
+ asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
/* The trampoline needs an extra padding instruction. In case if BTI is
enabled the padding instruction is replaced by the BTI instruction at
@@ -10919,6 +10922,7 @@ aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
unsigned int nregs, vec_flags;
switch (regclass)
{
+ case STUB_REGS:
case TAILCALL_ADDR_REGS:
case POINTER_REGS:
case GENERAL_REGS:
@@ -13157,10 +13161,12 @@ aarch64_register_move_cost (machine_mode mode,
= aarch64_tune_params.regmove_cost;
/* Caller save and pointer regs are equivalent to GENERAL_REGS. */
- if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
+ if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
+ || to == STUB_REGS)
to = GENERAL_REGS;
- if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
+ if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
+ || from == STUB_REGS)
from = GENERAL_REGS;
/* Make RDFFR very expensive. In particular, if we know that the FFR
@@ -22964,6 +22970,215 @@ aarch64_sls_barrier (int mitigation_required)
: "";
}
+static GTY (()) tree aarch64_sls_shared_thunks[30];
+static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
+const char *indirect_symbol_names[30] = {
+ "__call_indirect_x0",
+ "__call_indirect_x1",
+ "__call_indirect_x2",
+ "__call_indirect_x3",
+ "__call_indirect_x4",
+ "__call_indirect_x5",
+ "__call_indirect_x6",
+ "__call_indirect_x7",
+ "__call_indirect_x8",
+ "__call_indirect_x9",
+ "__call_indirect_x10",
+ "__call_indirect_x11",
+ "__call_indirect_x12",
+ "__call_indirect_x13",
+ "__call_indirect_x14",
+ "__call_indirect_x15",
+ "", /* "__call_indirect_x16", */
+ "", /* "__call_indirect_x17", */
+ "__call_indirect_x18",
+ "__call_indirect_x19",
+ "__call_indirect_x20",
+ "__call_indirect_x21",
+ "__call_indirect_x22",
+ "__call_indirect_x23",
+ "__call_indirect_x24",
+ "__call_indirect_x25",
+ "__call_indirect_x26",
+ "__call_indirect_x27",
+ "__call_indirect_x28",
+ "__call_indirect_x29",
+};
+
+/* Function to create a BLR thunk. This thunk is used to mitigate straight
+ line speculation. Instead of a simple BLR that can be speculated past,
+ we emit a BL to this thunk, and this thunk contains a BR to the relevant
+ register. These thunks have the relevant speculation barriers put after
+ their indirect branch so that speculation is blocked.
+
+ We use such a thunk so the speculation barriers are kept off the
+ architecturally executed path in order to reduce the performance overhead.
+
+ When optimising for size we use stubs shared by the linked object.
+ When optimising for performance we emit stubs for each function in the hope
+ that the branch predictor can better train on jumps specific for a given
+ function. */
+rtx
+aarch64_sls_create_blr_label (int regnum)
+{
+ gcc_assert (regnum < 30 && regnum != 16 && regnum != 17);
+ if (optimize_function_for_size_p (cfun))
+ {
+ /* For the thunks shared between different functions in this compilation
+ unit we use a named symbol -- this is just for users to more easily
+ understand the generated assembly. */
+ aarch64_sls_shared_thunks_needed = true;
+ const char *thunk_name = indirect_symbol_names[regnum];
+ if (aarch64_sls_shared_thunks[regnum] == NULL)
+ {
+ /* Build a decl representing this function stub and record it for
+ later. We build a decl here so we can use the GCC machinery for
+ handling sections automatically (through `get_named_section` and
+ `make_decl_one_only`). That saves us a lot of trouble handling
+ the specifics of different output file formats. */
+ tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
+ get_identifier (thunk_name),
+ build_function_type_list (void_type_node,
+ NULL_TREE));
+ DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
+ NULL_TREE, void_type_node);
+ TREE_PUBLIC (decl) = 1;
+ TREE_STATIC (decl) = 1;
+ DECL_IGNORED_P (decl) = 1;
+ DECL_ARTIFICIAL (decl) = 1;
+ make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
+ resolve_unique_section (decl, 0, false);
+ aarch64_sls_shared_thunks[regnum] = decl;
+ }
+
+ return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
+ }
+
+ if (cfun->machine->call_via[regnum] == NULL)
+ cfun->machine->call_via[regnum]
+ = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
+ return cfun->machine->call_via[regnum];
+}
+
+/* Helper function for aarch64_sls_emit_blr_function_thunks and
+ aarch64_sls_emit_shared_blr_thunks below. */
+static void
+aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
+{
+ /* Save in x16 and branch to that function so this transformation does
+ not prevent jumping to `BTI c` instructions. */
+ asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
+ asm_fprintf (out_file, "\tbr\tx16\n");
+}
+
+/* Emit all BLR stubs for this particular function.
+ Here we emit all the BLR stubs needed for the current function. Since we
+ emit these stubs in a consecutive block we know there will be no speculation
+ gadgets between each stub, and hence we only emit a speculation barrier at
+ the end of the stub sequences.
+
+ This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
+void
+aarch64_sls_emit_blr_function_thunks (FILE *out_file)
+{
+ if (! aarch64_harden_sls_blr_p ())
+ return;
+
+ bool any_functions_emitted = false;
+ /* We must save and restore the current function section since this assembly
+ is emitted at the end of the function. This means it can be emitted *just
+ after* the cold section of a function. That cold part would be emitted in
+ a different section. That switch would trigger a `.cfi_endproc` directive
+ to be emitted in the original section and a `.cfi_startproc` directive to
+ be emitted in the new section. Switching to the original section without
+ restoring would mean that the `.cfi_endproc` emitted as a function ends
+ would happen in a different section -- leaving an unmatched
+ `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
+ in the standard text section. */
+ section *save_text_section = in_section;
+ switch_to_section (function_section (current_function_decl));
+ for (int regnum = 0; regnum < 30; ++regnum)
+ {
+ rtx specu_label = cfun->machine->call_via[regnum];
+ if (specu_label == NULL)
+ continue;
+
+ targetm.asm_out.print_operand (out_file, specu_label, 0);
+ asm_fprintf (out_file, ":\n");
+ aarch64_sls_emit_function_stub (out_file, regnum);
+ any_functions_emitted = true;
+ }
+ if (any_functions_emitted)
+ /* Can use the SB if needs be here, since this stub will only be used
+ by the current function, and hence for the current target. */
+ asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
+ switch_to_section (save_text_section);
+}
+
+/* Emit shared BLR stubs for the current compilation unit.
+ Over the course of compiling this unit we may have converted some BLR
+ instructions to a BL to a shared stub function. This is where we emit those
+ stub functions.
+ This function is for the stubs shared between different functions in this
+ compilation unit. We share when optimising for size instead of speed.
+
+ This function is called through the TARGET_ASM_FILE_END hook. */
+void
+aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
+{
+ if (! aarch64_sls_shared_thunks_needed)
+ return;
+
+ for (int regnum = 0; regnum < 30; ++regnum)
+ {
+ tree decl = aarch64_sls_shared_thunks[regnum];
+ if (!decl)
+ continue;
+
+ const char *name = indirect_symbol_names[regnum];
+ switch_to_section (get_named_section (decl, NULL, 0));
+ ASM_OUTPUT_ALIGN (out_file, 2);
+ targetm.asm_out.globalize_label (out_file, name);
+ /* Only emits if the compiler is configured for an assembler that can
+ handle visibility directives. */
+ targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
+ ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
+ ASM_OUTPUT_LABEL (out_file, name);
+ aarch64_sls_emit_function_stub (out_file, regnum);
+ /* Use the most conservative target to ensure it can always be used by any
+ function in the translation unit. */
+ asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
+ ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
+ }
+}
+
+/* Implement TARGET_ASM_FILE_END. */
+void
+aarch64_asm_file_end ()
+{
+ aarch64_sls_emit_shared_blr_thunks (asm_out_file);
+ /* Since this function will be called for the ASM_FILE_END hook, we ensure
+ that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
+ for FreeBSD) still gets called. */
+#ifdef TARGET_ASM_FILE_END
+ TARGET_ASM_FILE_END ();
+#endif
+}
+
+const char *
+aarch64_indirect_call_asm (rtx addr)
+{
+ gcc_assert (REG_P (addr));
+ if (aarch64_harden_sls_blr_p ())
+ {
+ rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
+ output_asm_insn ("bl\t%0", &stub_label);
+ }
+ else
+ output_asm_insn ("blr\t%0", &addr);
+ return "";
+}
+
/* Target-specific selftests. */
#if CHECKING_P
@@ -23514,6 +23729,12 @@ aarch64_libgcc_floating_mode_supported_p
#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
+#undef TARGET_ASM_FILE_END
+#define TARGET_ASM_FILE_END aarch64_asm_file_end
+
+#undef TARGET_ASM_FUNCTION_EPILOGUE
+#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-aarch64.h"
@@ -1019,16 +1019,15 @@
)
(define_insn "*call_insn"
- [(call (mem:DI (match_operand:DI 0 "aarch64_call_insn_operand" "r, Usf"))
+ [(call (mem:DI (match_operand:DI 0 "aarch64_call_insn_operand" "Ucr, Usf"))
(match_operand 1 "" ""))
(unspec:DI [(match_operand:DI 2 "const_int_operand")] UNSPEC_CALLEE_ABI)
(clobber (reg:DI LR_REGNUM))]
""
"@
- blr\\t%0
+ * return aarch64_indirect_call_asm (operands[0]);
bl\\t%c0"
- [(set_attr "type" "call, call")]
-)
+ [(set_attr "type" "call, call")])
(define_expand "call_value"
[(parallel
@@ -1047,13 +1046,13 @@
(define_insn "*call_value_insn"
[(set (match_operand 0 "" "")
- (call (mem:DI (match_operand:DI 1 "aarch64_call_insn_operand" "r, Usf"))
+ (call (mem:DI (match_operand:DI 1 "aarch64_call_insn_operand" "Ucr, Usf"))
(match_operand 2 "" "")))
(unspec:DI [(match_operand:DI 3 "const_int_operand")] UNSPEC_CALLEE_ABI)
(clobber (reg:DI LR_REGNUM))]
""
"@
- blr\\t%1
+ * return aarch64_indirect_call_asm (operands[1]);
bl\\t%c1"
[(set_attr "type" "call, call")]
)
@@ -24,6 +24,15 @@
(define_register_constraint "Ucs" "TAILCALL_ADDR_REGS"
"@internal Registers suitable for an indirect tail call")
+(define_register_constraint "Ucr"
+ "aarch64_harden_sls_blr_p () ? STUB_REGS : GENERAL_REGS"
+ "@internal Registers to be used for an indirect call.
+ This is usually the general registers, but when we are hardening against
+ Straight Line Speculation we disallow x16, x17, and x30 so we can use
+ indirection stubs. These indirection stubs cannot use the above registers
+ since they will be reached by a BL that may have to go through a linker
+ veneer.")
+
(define_register_constraint "w" "FP_REGS"
"Floating point and SIMD vector registers.")
@@ -32,7 +32,8 @@
(define_predicate "aarch64_general_reg"
(and (match_operand 0 "register_operand")
- (match_test "REGNO_REG_CLASS (REGNO (op)) == GENERAL_REGS")))
+ (match_test "REGNO_REG_CLASS (REGNO (op)) == STUB_REGS
+ || REGNO_REG_CLASS (REGNO (op)) == GENERAL_REGS")))
;; Return true if OP a (const_int 0) operand.
(define_predicate "const0_operand"
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-mharden-sls=blr -mbranch-protection=bti" } */
+/*
+ Ensure that the SLS hardening of BLR leaves no BLR instructions.
+ Here we also check that there are no BR instructions with anything except an
+ x16 or x17 register. This is because a `BTI c` instruction can be branched
+ to using a BLR instruction using any register, but can only be branched to
+ with a BR using an x16 or x17 register.
+ */
+typedef int (foo) (int, int);
+typedef void (bar) (int, int);
+struct sls_testclass {
+ foo *x;
+ bar *y;
+ int left;
+ int right;
+};
+
+/* We test both RTL patterns for a call which returns a value and a call which
+ does not. */
+int blr_call_value (struct sls_testclass x)
+{
+ int retval = x.x(x.left, x.right);
+ if (retval % 10)
+ return 100;
+ return 9;
+}
+
+int blr_call (struct sls_testclass x)
+{
+ x.y(x.left, x.right);
+ if (x.left % 10)
+ return 100;
+ return 9;
+}
+
+/* { dg-final { scan-assembler-not "\tblr\t" } } */
+/* { dg-final { scan-assembler-not "\tbr\tx(?!16|17)" } } */
+/* { dg-final { scan-assembler "\tbr\tx(16|17)" } } */
+
new file mode 100644
@@ -0,0 +1,35 @@
+/* { dg-additional-options "-mharden-sls=blr -save-temps" } */
+/*
+ Ensure that the SLS hardening of BLR leaves no BLR instructions.
+ We only test that all BLR instructions have been removed, not that the
+ resulting code makes sense.
+ */
+typedef int (foo) (int, int);
+typedef void (bar) (int, int);
+struct sls_testclass {
+ foo *x;
+ bar *y;
+ int left;
+ int right;
+};
+
+/* We test both RTL patterns for a call which returns a value and a call which
+ does not. */
+int blr_call_value (struct sls_testclass x)
+{
+ int retval = x.x(x.left, x.right);
+ if (retval % 10)
+ return 100;
+ return 9;
+}
+
+int blr_call (struct sls_testclass x)
+{
+ x.y(x.left, x.right);
+ if (x.left % 10)
+ return 100;
+ return 9;
+}
+
+/* { dg-final { scan-assembler-not "\tblr\t" } } */
+/* { dg-final { scan-assembler "\tbr\tx\[0-9\]\[0-9\]?" } } */