[RFA/RFC] Stack clash mitigation patch 07/08 - V3

Message ID 8444434d-d8d7-a7f3-113a-7f216daede8f@redhat.com

Commit Message

Jeff Law July 31, 2017, 5:45 a.m. UTC
This patch adds aarch64 support for stack clash mitigation.

Changes since V2:

Uses the new PARAMs to control the guard size and probing interval.  Uses a
64k guard by default and assumes the caller uses at most 1k of outgoing
argument space within the guard.
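
With those defaults the callee can allocate up to 63k (the 64k guard
less the 1k assumed consumed by the caller) without probing.  As a
hypothetical illustration, a frame that crosses that threshold and so
gets prologue probes (the guard size can be adjusted via
--param stack-clash-protection-guard-size, as the new tests do):

extern void consume (char *);

void
big_frame (void)
{
  char buf[65536];	/* > 63k of locals, so the prologue must probe.  */
  consume (buf);
}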

Simplified the code slightly as a result of eliminating "can't happen" cases.

Probes the last allocated word in the outgoing argument space when more than
1k of outgoing argument space was allocated.
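
For illustration, a call needing more than 1k of outgoing argument
space (a trimmed variant of the new stack-check-13.c test below):

#define ARG32(X) X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
#define ARG192(X) ARG32(X),ARG32(X),ARG32(X),ARG32(X),ARG32(X),ARG32(X)
void out1 (ARG192(__int128));

void
t3 (void)
{
  /* Most of the 192 __int128 arguments are passed on the stack,
     pushing the outgoing argument area well past 1k, so its last
     allocated word gets probed.  */
  out1 (ARG192(1));
}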

Defines STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE to force the
generic dynamic allocation code to probe the last word of the dynamic
area when necessary.
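
For illustration, the dynamic case that needs the final probe (this is
essentially the new stack-check-15.c test below):

int t1 (int);

int
t2 (int x)
{
  /* The alloca size is unknown at compile time, so the generic code
     cannot prove the most recent probe is within 1k of the final
     stack pointer and must probe the last word of the dynamic area.  */
  char *p = __builtin_alloca (x);
  x = t1 (x);
  return p[x];
}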

The epilogue no longer assumes IP0/IP1 contain usable values when
-fstack-clash-protection is active.


Adds some aarch64-specific tests for problems found by Wilco and myself.

Jeff
* config/aarch64/aarch64.c (aarch64_output_probe_stack_range): Handle
	-fstack-clash-protection probing too.
	(aarch64_allocate_and_probe_stack_space): New function.
	(aarch64_expand_prologue): Assert we never have both an initial
	adjustment and callee save adjustment.  Dump as needed when
	-fstack-clash-protection is enabled.  For an initial or final
	adjustment that is large enough, call
	aarch64_allocate_and_probe_stack_space.  For final adjustment
	that is large enough, also emit a trailing probe.
	(aarch64_expand_epilogue): Do not assume the prologue has
	set IP1_REGNUM or IP0_REGNUM when -fstack-clash-protection is
	active.
	* config/aarch64/aarch64.h
	(STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE): Define.

	* testsuite/gcc.target/aarch64/stack-check-12.c: New test.
	* testsuite/gcc.target/aarch64/stack-check-13.c: New test.
	* testsuite/gcc.target/aarch64/stack-check-14.c: New test.
	* testsuite/gcc.target/aarch64/stack-check-15.c: New test.

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0a8b40a..7d57299 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2830,6 +2830,9 @@  aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   char loop_lab[32];
   rtx xops[2];
 
+  if (flag_stack_clash_protection)
+    reg1 = stack_pointer_rtx;
+
   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
 
   /* Loop.  */
@@ -2841,7 +2844,14 @@  aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   output_asm_insn ("sub\t%0, %0, %1", xops);
 
   /* Probe at TEST_ADDR.  */
-  output_asm_insn ("str\txzr, [%0]", xops);
+  if (flag_stack_clash_protection)
+    {
+      gcc_assert (xops[0] == stack_pointer_rtx);
+      xops[1] = GEN_INT (PROBE_INTERVAL - 8);
+      output_asm_insn ("str\txzr, [%0, %1]", xops);
+    }
+  else
+    output_asm_insn ("str\txzr, [%0]", xops);
 
   /* Test if TEST_ADDR == LAST_ADDR.  */
   xops[1] = reg2;
@@ -3605,6 +3615,96 @@  aarch64_set_handled_components (sbitmap components)
       cfun->machine->reg_is_wrapped_separately[regno] = true;
 }
 
+/* Allocate SIZE bytes of stack space using SCRATCHREG as a scratch
+   register, probing as necessary.  */
+
+static void
+aarch64_allocate_and_probe_stack_space (int scratchreg, HOST_WIDE_INT size)
+{
+  HOST_WIDE_INT probe_interval
+    = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
+  HOST_WIDE_INT guard_size
+    = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+  HOST_WIDE_INT guard_used_by_caller = 1024;
+
+  /* SIZE should be large enough to require probing here, i.e., it
+     must be at least GUARD_SIZE - GUARD_USED_BY_CALLER.
+
+     We can allocate GUARD_SIZE - GUARD_USED_BY_CALLER as a single chunk
+     without any probing.  */
+  gcc_assert (size >= guard_size - guard_used_by_caller);
+  aarch64_sub_sp (scratchreg, guard_size - guard_used_by_caller, true);
+  size -= (guard_size - guard_used_by_caller);
+
+  HOST_WIDE_INT rounded_size = size & -probe_interval;
+  HOST_WIDE_INT residual = size - rounded_size;
+
+  /* We can handle a small number of allocations/probes inline.  Otherwise
+     punt to a loop.  */
+  if (rounded_size && rounded_size <= 4 * probe_interval)
+    {
+      /* We don't use aarch64_sub_sp here because we don't want to
+	 repeatedly load SCRATCHREG.  */
+      rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
+      if (probe_interval > ARITH_FACTOR)
+	emit_move_insn (scratch_rtx, GEN_INT (-probe_interval));
+      else
+	scratch_rtx = GEN_INT (-probe_interval);
+
+      for (HOST_WIDE_INT i = 0; i < rounded_size; i += probe_interval)
+	{
+	  rtx_insn *insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
+						     scratch_rtx));
+          add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
+
+	  if (probe_interval > ARITH_FACTOR)
+	    {
+	      RTX_FRAME_RELATED_P (insn) = 1;
+	      rtx adj = plus_constant (Pmode, stack_pointer_rtx, rounded_size);
+	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
+			    gen_rtx_SET (stack_pointer_rtx, adj));
+	    }
+
+	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+					   (probe_interval
+					    - GET_MODE_SIZE (word_mode))));
+	  emit_insn (gen_blockage ());
+	}
+      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
+    }
+  else if (rounded_size)
+    {
+      /* Compute the ending address.  */
+      rtx temp = gen_rtx_REG (word_mode, scratchreg);
+      emit_move_insn (temp, GEN_INT (-rounded_size));
+      emit_insn (gen_add3_insn (temp, stack_pointer_rtx, temp));
+
+      /* This allocates and probes the stack.
+
+	 It almost certainly does not update CFIs correctly.
+
+	 It also probes at a 4k interval regardless of the value of
+	 PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL.  */
+      emit_insn (gen_probe_stack_range (temp, temp, temp));
+      emit_insn (gen_blockage ());
+      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
+    }
+  else
+    dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
+
+
+  /* Handle any residuals.
+     Note that any residual must be probed.  */
+  if (residual)
+    {
+      aarch64_sub_sp (scratchreg, residual, true);
+      emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+				       (residual - GET_MODE_SIZE (word_mode))));
+      emit_insn (gen_blockage ());
+    }
+  return;
+}
+
 /* AArch64 stack frames generated by this compiler look like:
 
 	+-------------------------------+
@@ -3686,7 +3786,65 @@  aarch64_expand_prologue (void)
 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
     }
 
-  aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
+  /* We do not fully protect aarch64 against stack clash style attacks
+     as doing so would be prohibitively expensive, with diminishing
+     utility over time as newer compilers are deployed.
+
+     We assume the guard is at least 64k.  Furthermore, we assume that
+     the caller has not pushed the stack pointer more than 1k into
+     the guard.  A caller that pushes the stack pointer more than 1k
+     into the guard is considered invalid.
+
+     Note that the caller's ability to push the stack pointer into the
+     guard is a function of the number and size of outgoing arguments and/or
+     dynamic stack allocations due to the mandatory save of the link register
+     in the caller's frame.
+
+     With those assumptions the callee can allocate up to 63k of stack
+     space without probing.
+
+     When probing is needed, we emit a probe at the start of the prologue
+     and every PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL bytes thereafter.
+
+     We have to track how much space has been allocated, but we do not
+     track stores into the stack as implicit probes except for the
+     fp/lr store.  */
+  HOST_WIDE_INT guard_size
+    = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+  HOST_WIDE_INT guard_used_by_caller = 1024;
+  if (flag_stack_clash_protection)
+    {
+      if (frame_size == 0)
+	dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
+      else if (initial_adjust < guard_size - guard_used_by_caller
+	       && final_adjust < guard_size - guard_used_by_caller)
+	dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+      else
+	{
+	  /* This probes into the red zone, which is sub-optimal, but we
+	     allow it to avoid the async signal race.  Valgrind may complain
+	     about this probe.  */
+	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+					   - GET_MODE_SIZE (word_mode)));
+	}
+    }
+
+  /* In theory we should never have both an initial adjustment
+     and a callee save adjustment.  Verify that is the case since the
+     code below does not handle it for -fstack-clash-protection.  */
+  gcc_assert (initial_adjust == 0 || callee_adjust == 0);
+
+  /* We have to track the offset to the last probe in the stack so that
+     we know when to emit probes for stack clash protection.
+
+     If this function needs probes (most do not), then the code above
+     already emitted one.  Thus we can consider the last probe into
+     the stack to have been at offset zero.  */
+  if (flag_stack_clash_protection
+      && initial_adjust >= guard_size - guard_used_by_caller)
+    aarch64_allocate_and_probe_stack_space (IP0_REGNUM, initial_adjust);
+  else
+    aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
 
   if (callee_adjust != 0)
     aarch64_push_regs (reg1, reg2, callee_adjust);
@@ -3707,7 +3865,26 @@  aarch64_expand_prologue (void)
 			     callee_adjust != 0 || frame_pointer_needed);
   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
 			     callee_adjust != 0 || frame_pointer_needed);
-  aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+
+  if (flag_stack_clash_protection && final_adjust != 0)
+    {
+      if (final_adjust >= guard_size - guard_used_by_caller)
+	aarch64_allocate_and_probe_stack_space (IP1_REGNUM, final_adjust);
+      else
+	aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+
+      /* We must probe if the final adjustment is larger than the guard
+	 that is assumed used by the caller.  */
+      if (final_adjust >= guard_used_by_caller)
+	{
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "Stack clash aarch64 large outgoing arg, probing\n");
+	  emit_stack_probe (stack_pointer_rtx);
+	}
+    }
+  else
+    aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
 }
 
 /* Return TRUE if we can use a simple_return insn.
@@ -3773,7 +3950,11 @@  aarch64_expand_epilogue (bool for_sibcall)
       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
     }
   else
-    aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
+    aarch64_add_sp (IP1_REGNUM, final_adjust,
+		    /* A stack clash protection prologue may not have
+		       left IP1_REGNUM in a usable state.  */
+		    (flag_stack_clash_protection
+		     || df_regs_ever_live_p (IP1_REGNUM)));
 
   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 				callee_adjust != 0, &cfi_ops);
@@ -3796,7 +3977,11 @@  aarch64_expand_epilogue (bool for_sibcall)
       cfi_ops = NULL;
     }
 
-  aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
+  /* A stack clash protection prologue may not have left IP0_REGNUM
+     in a usable state.  */
+  aarch64_add_sp (IP0_REGNUM, initial_adjust,
+		  (flag_stack_clash_protection
+		   || df_regs_ever_live_p (IP0_REGNUM)));
 
   if (cfi_ops)
     {
@@ -8939,6 +9124,12 @@  aarch64_override_options_internal (struct gcc_options *opts)
       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
     opts->x_flag_prefetch_loop_arrays = 1;
 
+  /* We assume the guard page is 64k.  */
+  maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
+			 65536,
+			 opts->x_param_values,
+			 global_options_set.x_param_values);
+
   aarch64_override_options_after_change_1 (opts);
 }
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 106cf3a..d2d84dd 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -969,4 +969,12 @@  extern const char *host_detect_local_cpu (int argc, const char **argv);
 extern tree aarch64_fp16_type_node;
 extern tree aarch64_fp16_ptr_type_node;
 
+/* The prologue assumes that the most recent probe is within 1k of the
+   incoming stack pointer.  We may need an additional probe for
+   dynamically allocated stack space.  */
+#define STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE(RESIDUAL) \
+  ((RESIDUAL) == CONST0_RTX (Pmode) \
+   || GET_CODE (RESIDUAL) != CONST_INT \
+   || INTVAL (RESIDUAL) >= 1024)
+
 #endif /* GCC_AARCH64_H */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-12.c b/gcc/testsuite/gcc.target/aarch64/stack-check-12.c
new file mode 100644
index 0000000..9ad4133
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-12.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=4096" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+extern void arf (unsigned long int *, unsigned long int *);
+void
+frob ()
+{
+  unsigned long int num[1000];
+  unsigned long int den[1000];
+  arf (den, num);
+}
+
+/* This verifies that the scheduler did not break the dependencies
+   by adjusting the offsets within the probe and did not reorder
+   around the stack probes.  */
+/* { dg-final { scan-assembler-times "sub\\tsp, sp, #4096\\n\\tstr\\txzr, .sp, 4088." 3 } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-13.c b/gcc/testsuite/gcc.target/aarch64/stack-check-13.c
new file mode 100644
index 0000000..2404777
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-13.c
@@ -0,0 +1,28 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=4096" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#define ARG32(X) X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
+#define ARG192(X) ARG32(X),ARG32(X),ARG32(X),ARG32(X),ARG32(X),ARG32(X)
+void out1(ARG192(__int128));
+int t1(int);
+
+int t3(int x)
+{
+  if (x < 1000)
+    return t1 (x) + 1;
+
+  out1 (ARG192(1));
+  return 0;
+}
+
+
+
+/* This test creates a large (> 1k) outgoing argument area that needs
+   to be probed.  We don't test the exact size of the space or the
+   exact offset to make the test a little less sensitive to trivial
+   output changes.  */
+/* { dg-final { scan-assembler-times "sub\\tsp, sp, #....\\n\\tstr\\txzr, \\\[sp" 1 } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-14.c b/gcc/testsuite/gcc.target/aarch64/stack-check-14.c
new file mode 100644
index 0000000..38ff5b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-14.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=4096" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+int t1(int);
+
+int t2(int x)
+{
+  char *p = __builtin_alloca (4050);
+  x = t1 (x);
+  return p[x];
+}
+
+
+/* This test has a constant sized alloca that is smaller than the
+   probe interval.  But it actually requires two probes instead
+   of one because of the optimistic assumptions we made in the
+   aarch64 prologue code with respect to probing state.
+
+   The form can change quite a bit so we just check for two
+   probes without looking at the actual address.  */
+/* { dg-final { scan-assembler-times "str\\txzr," 2 } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-15.c b/gcc/testsuite/gcc.target/aarch64/stack-check-15.c
new file mode 100644
index 0000000..302bbe8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-15.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=4096" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+int t1(int);
+
+int t2(int x)
+{
+  char *p = __builtin_alloca (x);
+  x = t1 (x);
+  return p[x];
+}
+
+
+/* This test has a variable sized alloca.  It requires 3 probes:
+   one in the loop, one for the residual allocation, and one at
+   the end of the alloca area.
+
+   The form can change quite a bit so we just check for three
+   probes without looking at the actual address.  */
+/* { dg-final { scan-assembler-times "str\\txzr," 3 } } */
+
+
+