diff mbox

Ping: C-family stack check for threads

Message ID 4E384DA4.40500@web.de
State New
Headers show

Commit Message

Thomas Klein Aug. 2, 2011, 7:19 p.m. UTC
Hello

Here is my next attempt at moving the stack check into RTL at the prologue stage.
It was not as easy as I had hoped.
I had some small problems getting push/pop and the compare/jump to work.
I hope the approach I chose is acceptable.
With RTL, no extra pool to hold pointer or size values is required any more.
That's fine.
So this move to RTL does make sense.

Regards
   Thomas Klein


  ;; being inserted into the upper 16 bits of the register.
  (define_insn "*arm_movtas_ze"
diff mbox

Patch

Index: gcc/opts.c
===================================================================
--- gcc/opts.c    (revision 176974)
+++ gcc/opts.c    (working copy)
@@ -1644,6 +1644,12 @@  common_handle_option (struct gcc_options *opts,
                 : STACK_CHECK_STATIC_BUILTIN
                   ? STATIC_BUILTIN_STACK_CHECK
                   : GENERIC_STACK_CHECK;
+      else if (!strcmp (arg, "indirect"))
+    /* This is an other stack checking method.  */
+    opts->x_flag_stack_check = INDIRECT_STACK_CHECK;
+      else if (!strcmp (arg, "direct"))
+    /* This is an other stack checking method.  */
+    opts->x_flag_stack_check = DIRECT_STACK_CHECK;
        else
      warning_at (loc, 0, "unknown stack check parameter \"%s\"", arg);
        break;
Index: gcc/flag-types.h
===================================================================
--- gcc/flag-types.h    (revision 176974)
+++ gcc/flag-types.h    (working copy)
@@ -153,7 +153,15 @@  enum stack_check_type

    /* Check the stack and entirely rely on the target configuration
       files, i.e. do not use the generic mechanism at all.  */
-  FULL_BUILTIN_STACK_CHECK
+  FULL_BUILTIN_STACK_CHECK,
+
+  /* Check the stack (if possible) before allocation of local variables at
+     each function entry. The stack limit is directly given e.g. by address
+     of a symbol */
+  DIRECT_STACK_CHECK,
+  /* Check the stack (if possible) before allocation of local variables at
+     each function entry. The stack limit is given by global variable. */
+  INDIRECT_STACK_CHECK
  };

  /* Names for the different levels of -Wstrict-overflow=N.  The numeric
Index: gcc/explow.c
===================================================================
--- gcc/explow.c    (revision 176974)
+++ gcc/explow.c    (working copy)
@@ -1358,7 +1358,12 @@  allocate_dynamic_stack_space (rtx size, unsigned s

    /* If needed, check that we have the required amount of stack.  Take 
into
       account what has already been checked.  */
-  if (STACK_CHECK_MOVING_SP)
+  if (  STACK_CHECK_MOVING_SP
+#ifdef HAVE_generic_limit_check_stack
+     || crtl->limit_stack
+#endif
+     || flag_stack_check == DIRECT_STACK_CHECK
+     || flag_stack_check == INDIRECT_STACK_CHECK)
      ;
    else if (flag_stack_check == GENERIC_STACK_CHECK)
      probe_stack_range (STACK_OLD_CHECK_PROTECT + 
STACK_CHECK_MAX_FRAME_SIZE,
@@ -1392,19 +1397,32 @@  allocate_dynamic_stack_space (rtx size, unsigned s
        /* Check stack bounds if necessary.  */
        if (crtl->limit_stack)
      {
+          rtx limit_rtx;
        rtx available;
        rtx space_available = gen_label_rtx ();
+          if (  GET_CODE (stack_limit_rtx) == SYMBOL_REF
+ && flag_stack_check == INDIRECT_STACK_CHECK)
+            limit_rtx = expand_unop (Pmode, mov_optab,
+                    gen_rtx_MEM (Pmode, stack_limit_rtx),
+                    NULL_RTX, 1);
+          else
+            limit_rtx = stack_limit_rtx;
  #ifdef STACK_GROWS_DOWNWARD
        available = expand_binop (Pmode, sub_optab,
-                    stack_pointer_rtx, stack_limit_rtx,
+                    stack_pointer_rtx, limit_rtx,
                      NULL_RTX, 1, OPTAB_WIDEN);
  #else
        available = expand_binop (Pmode, sub_optab,
-                    stack_limit_rtx, stack_pointer_rtx,
+                    limit_rtx, stack_pointer_rtx,
                      NULL_RTX, 1, OPTAB_WIDEN);
  #endif
        emit_cmp_and_jump_insns (available, size, GEU, NULL_RTX, Pmode, 1,
                     space_available);
+#ifdef HAVE_stack_failure
+      if (HAVE_stack_failure)
+        emit_insn (gen_stack_failure ());
+      else
+#endif
  #ifdef HAVE_trap
        if (HAVE_trap)
          emit_insn (gen_trap ());
@@ -1547,6 +1565,13 @@  probe_stack_range (HOST_WIDE_INT first, rtx size)
      return;
      }
  #endif
+#ifdef HAVE_generic_limit_check_stack
+  else if (HAVE_generic_limit_check_stack)
+    {
+      rtx addr = memory_address (Pmode,stack_pointer_rtx);
+      emit_insn (gen_generic_limit_check_stack (addr));
+    }
+#endif

    /* Otherwise we have to generate explicit probes.  If we have a constant
       small number of them to generate, that's the easy case.  */
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c    (revision 176974)
+++ gcc/config/arm/arm.c    (working copy)
@@ -15809,6 +15809,299 @@  thumb_set_frame_pointer (arm_stack_offsets *offset
    RTX_FRAME_RELATED_P (insn) = 1;
  }

+/* Choose the two work registers used by the prologue stack check and
+   store them in WORKREG[0] and WORKREG[1].
+   Return how many of them (0, 1 or 2) can be used without an extra
+   push/pop; the remaining slots are filled with argument registers
+   that the caller has to save and restore itself.  */
+
+static int
+stack_check_work_registers (rtx *workreg)
+{
+  int regno, found, free_count, next_fallback, arg_regs_used;
+
+  /* Number of core registers occupied by incoming arguments.  */
+  arg_regs_used = crtl->args.info.nregs;
+  if (crtl->args.info.pcs_variant <= ARM_PCS_AAPCS_LOCAL)
+    arg_regs_used = crtl->args.info.aapcs_next_ncrn;
+
+  found = 0;
+  next_fallback = 0;
+
+  /* First choice: argument registers r0..r3 that hold no data.  */
+  for (regno = 0; regno <= LAST_ARG_REGNUM && found < 2; regno++)
+    {
+      if (df_regs_ever_live_p (regno))
+        {
+          if (cfun->machine->uses_anonymous_args)
+            {
+              if (!(crtl->args.pretend_args_size
+                    > (LAST_ARG_REGNUM - regno) * UNITS_PER_WORD))
+                continue;
+            }
+          else if (regno >= arg_regs_used)
+            ;
+          else
+            continue;
+        }
+
+      workreg[found] = gen_rtx_REG (SImode, regno);
+      found++;
+      /* Remember where the fallback selection below should start.  */
+      next_fallback = (regno + 1) % 4;
+    }
+
+  /* Second choice: low registers r4..r7 that are live, not fixed and
+     not the frame pointer.  */
+  for (regno = LAST_ARG_REGNUM + 1;
+       regno <= LAST_LO_REGNUM && found < 2;
+       regno++)
+    {
+      if (!df_regs_ever_live_p (regno))
+        continue;
+      if (fixed_regs[regno] || regno == FP_REGNUM)
+        continue;
+      workreg[found++] = gen_rtx_REG (SImode, regno);
+    }
+
+  /* ARM and Thumb-2 can use high regs as well.  */
+  if (TARGET_32BIT)
+    for (regno = FIRST_HI_REGNUM;
+         regno <= LAST_HI_REGNUM && found < 2;
+         regno++)
+      {
+        if (!df_regs_ever_live_p (regno))
+          continue;
+        if (fixed_regs[regno] || regno == FP_REGNUM)
+          continue;
+        workreg[found++] = gen_rtx_REG (SImode, regno);
+      }
+
+  /* Everything found so far needs no extra push.  */
+  free_count = found;
+
+  /* Not enough found: take the next argument registers, starting at
+     NEXT_FALLBACK.  Only when FREE_COUNT is 0 are two registers pushed
+     later, and only in that case are they guaranteed to be sorted.  */
+  while (found < 2)
+    workreg[found++] = gen_rtx_REG (SImode, next_fallback++);
+
+  return free_count;
+}
+
+/* Emit one frame-related insn that pushes the NUM_TO_PUSH registers in
+   REG[0..NUM_TO_PUSH-1].  The insn is a PARALLEL whose first element
+   stores an UNSPEC_PUSH_MULT of REG[0] to a pre-modified stack slot; the
+   other registers are listed as USEs.  The stack pointer adjustment of
+   -4 * NUM_TO_PUSH is attached as a REG_FRAME_RELATED_EXPR note.
+   Nothing is emitted when NUM_TO_PUSH is outside 1..15.  */
+static void
+emit_push_regs(int num_to_push, rtx *reg)
+{
+  int i;
+  rtvec tmpvec;
+  rtx par[16], dwarf, tmp, insn;
+
+  /* Zero has to be rejected too: reg[0] is read unconditionally below
+     and the PARALLEL would be built from an empty vector.  */
+  if (num_to_push > 15 || num_to_push < 1)
+    return;
+
+  tmpvec = gen_rtvec (1, reg[0]);
+  par[0] = gen_rtx_UNSPEC (BLKmode, tmpvec, UNSPEC_PUSH_MULT);
+  for (i=1; i<num_to_push; i++)
+    par[i] = gen_rtx_USE (VOIDmode, reg[i]);
+
+  /* sp - 4 * n is used both for the unwind note and for the address.  */
+  tmp = plus_constant (stack_pointer_rtx, -4 * num_to_push);
+  dwarf = gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp);
+  tmp = gen_rtx_PRE_MODIFY (Pmode, stack_pointer_rtx, tmp);
+  tmp = gen_frame_mem (BLKmode, tmp);
+  par[0]= gen_rtx_SET (VOIDmode, tmp, par[0]);
+  tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (num_to_push, par));
+  insn = emit_insn (tmp);
+  RTX_FRAME_RELATED_P (insn) = 1;
+  add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
+  return;
+}
+
+/* Counterpart of emit_push_regs: emit one frame-related insn intended to
+   restore the NUM_TO_POP registers in REG[0..NUM_TO_POP-1], and record a
+   stack pointer adjustment of +4 * NUM_TO_POP in its
+   REG_FRAME_RELATED_EXPR note.  Nothing is emitted when NUM_TO_POP is
+   outside 1..15.
+
+   NOTE(review): the pattern built here has the same shape as the one in
+   emit_push_regs (a SET of a BLKmode frame MEM from UNSPEC_PUSH_MULT),
+   with only the sign of the offset changed.  Confirm that the backend
+   really matches this as a load/pop and not as another store.  */
+static void
+emit_pop_regs(const int num_to_pop, rtx *reg)
+{
+  int i;
+  rtvec tmpvec;
+  rtx par[16], dwarf, tmp, insn;
+
+  /* Zero has to be rejected too: reg[0] is read unconditionally below
+     and the PARALLEL would be built from an empty vector.  */
+  if (num_to_pop > 15 || num_to_pop < 1)
+    return;
+
+  tmpvec = gen_rtvec (1, reg[0]);
+  par[0] = gen_rtx_UNSPEC (BLKmode, tmpvec, UNSPEC_PUSH_MULT);
+  for (i=1; i<num_to_pop; i++)
+    par[i] = gen_rtx_USE (VOIDmode, reg[i]);
+  /* sp + 4 * n is used both for the unwind note and for the address.  */
+  tmp = plus_constant (stack_pointer_rtx, 4 * num_to_pop);
+  dwarf = gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp);
+  tmp = gen_rtx_PRE_MODIFY (Pmode, stack_pointer_rtx, tmp);
+  tmp = gen_frame_mem (BLKmode, tmp);
+  par[0] = gen_rtx_SET (VOIDmode, tmp, par[0]);
+  tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (num_to_pop, par));
+  insn = emit_insn (tmp);
+  RTX_FRAME_RELATED_P (insn) = 1;
+  add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
+  return;
+}
+
+/*
+ * Emit RTL instructions for stack check at prologue stage.
+ *
+ * AMOUNT is the number of bytes that is about to be allocated; the check
+ * compares the stack pointer against stack_limit_rtx plus AMOUNT.
+ * LR_NOT_YET_PUSHED is nonzero when LR has not been saved yet; in that
+ * case LR is pushed right before the stack_failure insn.
+ *
+ * Sequence: pick two work registers, push those that are needed but not
+ * free, compute the limit (plus AMOUNT) into reg[0], branch over the
+ * failure call if sp >= limit (unsigned), then pop whatever was pushed.
+ *
+ * For Thumb this may look like this:
+ *   push {rsym,ramn}
+ *   ldr rsym, =symbol_addr_of(stack_limit_rtx)
+ *   ldr rsym, [rsym]
+ *   ldr ramn, =length_of(amount)
+ *   add rsym, rsym, ramn
+ *   cmp sp, rsym
+ *   bhs .LSPCHK0
+ *   push {lr}
+ *   bl __thumb_stack_failure
+ * .LSPCHK0:
+ *   pop {rsym,ramn}
+ */
+static void
+emit_stack_check_insns (HOST_WIDE_INT amount, int lr_not_yet_pushed)
+{
+  unsigned numregs;
+  unsigned amount_needsreg;
+  bool amount_const_ok, is_non_opt_thumb2, is_thumb2_hi_reg[2];
+  bool issym=false;
+  rtx reg[2], cmp_reg, amount_rtx;
+  rtx dwarf, tmp, insn;
+  rtx jump, label;
+
+  /* numregs = how many of reg[0..1] are usable without push/pop.  */
+  numregs = stack_check_work_registers(reg);
+
+  if (TARGET_THUMB1)
+    amount_const_ok = (amount < 256);
+  else
+    amount_const_ok = const_ok_for_arm (amount);
+
+  if (GET_CODE (stack_limit_rtx) == SYMBOL_REF) /*stack_limit_rtx*/
+    {
+      issym = true;
+      amount_needsreg = !amount_const_ok;
+    }
+  else
+    amount_needsreg = (amount != 0);
+
+  is_non_opt_thumb2 = (TARGET_THUMB2 && !(optimize_size || optimize >= 2));
+  /* reg[] holds REG rtxes, so the register number has to be read with
+     REGNO; INTVAL is only valid on CONST_INT.  */
+  is_thumb2_hi_reg[0] = (TARGET_THUMB2 && REGNO (reg[0]) > LAST_LO_REGNUM);
+  is_thumb2_hi_reg[1] = (TARGET_THUMB2 && REGNO (reg[1]) > LAST_LO_REGNUM);
+
+  /* push as many as needed */
+  if (issym && amount_needsreg) /*need two temp regs for limit and amount*/
+    {
+      if (numregs >= 2)
+        ; /*have 2 regs => no need to push*/
+      else if (numregs == 1)
+        {
+          /*have one reg but need two regs => push temp reg for amount*/
+          emit_push_regs (1, &reg[1]); /*push {reg1}*/
+          /*due to additional push try to correct amount*/
+          if (amount >= 4)
+            amount -= 4;
+        }
+      else
+        {
+          /*have no reg but need two => push temp regs for limit and amount*/
+          emit_push_regs (2, &reg[0]); /*push {reg0,reg1}*/
+          /*due to additional push try to correct amount*/
+          if (amount >= 8)
+            amount -= 8;
+        }
+    }
+  else if ((issym || amount_needsreg) && numregs == 0)
+    { /*push temp reg either for limit or amount*/
+      emit_push_regs (1, &reg[0]); /*push {reg0}*/
+      /*due to additional push try to correct amount*/
+      if (amount >= 4)
+        {
+          if (amount_const_ok)
+            {
+              if (TARGET_THUMB1 || const_ok_for_arm(amount - 4))
+                amount -= 4;
+              /*on Thumb2 or ARM may not corrected; shouldn't hurt*/
+            }
+          else /*will be loaded from pool*/
+            amount -= 4;
+        }
+    }
+
+  amount_rtx = GEN_INT (amount);
+
+  /* move limit plus amount to cmp_reg e.g. reg[0] */
+  if (issym)
+    {
+      if (is_non_opt_thumb2 || is_thumb2_hi_reg[0])
+           arm_emit_movpair(reg[0], stack_limit_rtx);
+      else
+        emit_move_insn(reg[0], stack_limit_rtx);
+
+      /* Indirect method: the symbol names a variable holding the limit.  */
+      if (flag_stack_check == INDIRECT_STACK_CHECK)
+           emit_insn (gen_movsi (reg[0], gen_rtx_MEM (SImode, reg[0])));
+      if (amount)
+        {
+          if (amount_const_ok)
+            emit_insn(gen_addsi3(reg[0], reg[0], amount_rtx));
+          else
+            {
+              if (is_non_opt_thumb2 || is_thumb2_hi_reg[1])
+                arm_emit_movpair(reg[1], amount_rtx);
+              else
+                emit_insn (gen_movsi (reg[1], amount_rtx));
+              emit_insn(gen_addsi3(reg[0], reg[0], reg[1]));
+            }
+        }
+      cmp_reg = reg[0];
+    }
+  else if (amount)
+    {
+      if (amount_const_ok)
+        emit_move_insn(reg[0], amount_rtx);
+      else
+        {
+          if (is_non_opt_thumb2 || is_thumb2_hi_reg[0])
+            arm_emit_movpair(reg[0], amount_rtx);
+          else
+            emit_insn (gen_movsi (reg[0], amount_rtx));
+        }
+      emit_insn(gen_addsi3(reg[0], reg[0], stack_limit_rtx));
+      cmp_reg = reg[0];
+    }
+  else
+    cmp_reg = stack_limit_rtx;
+
+  /*compare and jump*/
+  emit_insn (gen_blockage ());
+  label = gen_label_rtx ();
+  do_compare_rtx_and_jump (stack_pointer_rtx, cmp_reg, GEU, 1, Pmode,
+          NULL_RTX, NULL_RTX, label, -1);
+  jump = get_last_insn ();
+  gcc_assert (JUMP_P (jump));
+  JUMP_LABEL (jump) = label;
+  LABEL_NUSES (label)++;
+  if (lr_not_yet_pushed) /*push LR if not already done*/
+    {
+      rtx lr = gen_rtx_REG (SImode, LR_REGNUM);
+      emit_push_regs (1, &lr);
+    }
+  insn = emit_insn (gen_stack_failure ());
+  if (lr_not_yet_pushed)
+    {
+      /*the trap will not come back; but tell it has restored the stack*/
+      tmp = plus_constant (stack_pointer_rtx, 4);
+      dwarf = gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp);
+      RTX_FRAME_RELATED_P (insn) = 1;
+      add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
+    }
+  emit_label (label);
+
+  /*restore registers; must mirror the push decisions made above*/
+  if (issym && amount_needsreg) /*pop temp regs used by limit and amount*/
+    {
+      if (numregs >= 2)
+        ; /*no need to pop*/
+      else if (numregs == 1)
+        emit_pop_regs (1, &reg[1]); /*pop {reg1}*/
+      else
+        emit_pop_regs (2, &reg[0]); /*pop {reg0, reg1}*/
+    }
+  else if ((issym || amount_needsreg) && numregs == 0)
+    { /*pop temp reg used by limit or amount*/
+      emit_pop_regs (1, &reg[0]); /*pop {reg0}*/
+    }
+
+  return;
+}
+
  /* Generate the prologue instructions for entry into an ARM or Thumb-2
     function.  */
  void
@@ -16060,6 +16353,17 @@  arm_expand_prologue (void)
      current_function_static_stack_size
        = offsets->outgoing_args - offsets->saved_args;

+  if (  crtl->limit_stack
+ && !(IS_INTERRUPT (func_type))
+ && (  flag_stack_check == DIRECT_STACK_CHECK
+        || flag_stack_check == INDIRECT_STACK_CHECK)
+ && (offsets->outgoing_args - offsets->saved_args) > 0
+     )
+    {
+      emit_stack_check_insns (offsets->outgoing_args - saved_regs
+            - offsets->saved_args, !(live_regs_mask & (1<<LR_REGNUM)) );
+    }
+
    if (offsets->outgoing_args != offsets->saved_args + saved_regs)
      {
        /* This add can produce multiple insns for a large constant, so we
@@ -21261,6 +21565,16 @@  thumb1_expand_prologue (void)

    amount = offsets->outgoing_args - offsets->saved_regs;
    amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
+
+  if(  crtl->limit_stack
+ && (  flag_stack_check == DIRECT_STACK_CHECK
+       || flag_stack_check == INDIRECT_STACK_CHECK)
+ && (offsets->outgoing_args - offsets->saved_args)
+    )
+    {
+      emit_stack_check_insns (amount, !(l_mask & (1<<LR_REGNUM)));
+    }
+
    if (amount)
      {
        if (amount < 512)
@@ -21420,6 +21734,7 @@  thumb1_output_interwork (void)
    asm_fprintf (f, "%s%U%s:\n", STUB_NAME, name);

    return "";
+
  }

  /* Handle the case of a double word load into a low register from
Index: gcc/config/arm/arm.md
===================================================================
--- gcc/config/arm/arm.md    (revision 176974)
+++ gcc/config/arm/arm.md    (working copy)
@@ -103,6 +103,7 @@ 
    UNSPEC_SYMBOL_OFFSET  ; The offset of the start of the symbol from
                          ; another symbolic address.
    UNSPEC_MEMORY_BARRIER ; Represent a memory barrier.
+  UNSPEC_PROBE_STACK    ; probe stack memory reference
  ])

  ;; UNSPEC_VOLATILE Usage:
@@ -6669,8 +6670,8 @@ 
  (define_insn "cbranchsi4_insn"
    [(set (pc) (if_then_else
            (match_operator 0 "arm_comparison_operator"
-           [(match_operand:SI 1 "s_register_operand" "l,l*h")
-            (match_operand:SI 2 "thumb1_cmp_operand" "lI*h,*r")])
+           [(match_operand:SI 1 "s_register_operand" "l,l*h,k")
+            (match_operand:SI 2 "thumb1_cmp_operand" "lI*h,*r,r")])
            (label_ref (match_operand 3 "" ""))
            (pc)))]
    "TARGET_THUMB1"
@@ -7216,17 +7217,18 @@ 

  (define_insn "*arm_cmpsi_insn"
    [(set (reg:CC CC_REGNUM)
-    (compare:CC (match_operand:SI 0 "s_register_operand" "l,r,r,r")
-            (match_operand:SI 1 "arm_add_operand"    "Py,r,rI,L")))]
+    (compare:CC (match_operand:SI 0 "s_register_operand" "l,r,k,r,r")
+            (match_operand:SI 1 "arm_add_operand"    "Py,r,r,rI,L")))]
    "TARGET_32BIT"
    "@
     cmp%?\\t%0, %1
     cmp%?\\t%0, %1
     cmp%?\\t%0, %1
+   cmp%?\\t%0, %1
     cmn%?\\t%0, #%n1"
    [(set_attr "conds" "set")
-   (set_attr "arch" "t2,t2,any,any")
-   (set_attr "length" "2,2,4,4")]
+   (set_attr "arch" "t2,t2,any,any,any")
+   (set_attr "length" "2,2,4,4,4")]
  )

  (define_insn "*cmpsi_shiftsi"
@@ -10756,6 +10758,69 @@ 

  ;;

+;; Expand a probe of the stack location given by operand 0.
+;; When flag_stack_check is DIRECT_STACK_CHECK or INDIRECT_STACK_CHECK
+;; nothing is emitted.  Otherwise zero is stored to operand 0, followed
+;; by a probe_stack_done marker insn and a blockage.
+(define_expand "probe_stack"
+  [(match_operand 0 "memory_operand" "")]
+  "TARGET_EITHER"
+{
+  if (  flag_stack_check == DIRECT_STACK_CHECK
+     || flag_stack_check == INDIRECT_STACK_CHECK)
+    ;
+  else
+    {
+      emit_move_insn (operands[0], const0_rtx);
+      emit_insn (gen_probe_stack_done ());
+      emit_insn (gen_blockage ());
+    }
+  DONE;
+}
+)
+
+;; Marker insn emitted by the probe_stack expander after the probing
+;; store.  It is an unspec_volatile, outputs only the text
+;; "@ probe stack done" and is declared with length 0.
+(define_insn "probe_stack_done"
+  [(unspec_volatile [(const_int 0)] UNSPEC_PROBE_STACK)]
+  "TARGET_EITHER"
+  {return \"@ probe stack done\";}
+  [(set_attr "type" "store1")
+   (set_attr "length" "0")]
+)
+
+;; Compare stack_limit_rtx against the value derived from operand 0 and
+;; emit the stack_failure insn unless limit <= value (unsigned).  The
+;; whole sequence is bracketed by blockage insns.
+;; Only available when crtl->limit_stack is set and flag_stack_check is
+;; neither DIRECT_STACK_CHECK nor INDIRECT_STACK_CHECK.
+;; NOTE(review): the MINUS of (const_int 0) does not change the value;
+;; presumably a placeholder for an offset -- confirm.
+;; NOTE(review): the predicate is memory_operand, yet the operand is
+;; compared directly as an address -- verify against the caller.
+(define_expand "generic_limit_check_stack"
+  [(match_operand 0 "memory_operand" "")]
+  "crtl->limit_stack
+ && flag_stack_check != DIRECT_STACK_CHECK
+ && flag_stack_check != INDIRECT_STACK_CHECK"
+{
+  rtx label = gen_label_rtx ();
+  rtx addr = copy_rtx (operands[0]);
+  addr = gen_rtx_fmt_ee (MINUS, Pmode, addr, GEN_INT (0));
+  addr = force_operand (addr, NULL_RTX);
+  emit_insn (gen_blockage ());
+  emit_cmp_and_jump_insns (stack_limit_rtx, addr, LEU, NULL_RTX, Pmode, 1,
+                           label);
+  emit_insn (gen_stack_failure ());
+  emit_label (label);
+  emit_insn (gen_blockage ());
+  DONE;
+}
+)
+
+;; Unconditional trap used when a stack check fails.  It is output as a
+;; call: "bl __arm_stack_failure" when TARGET_ARM, otherwise
+;; "bl __thumb_stack_failure".  The pattern clobbers LR and the
+;; condition-code register.
+;; NOTE(review): length is declared as 8 for a single bl; presumably a
+;; conservative upper bound -- confirm.
+(define_insn "stack_failure"
+  [(trap_if (const_int 1) (const_int 0))
+   (clobber (reg:SI LR_REGNUM))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_EITHER"
+  "*
+  {
+    if (TARGET_ARM)
+      output_asm_insn (\"bl\\t__arm_stack_failure\\t%@ trap call\", 
operands);
+    else
+      output_asm_insn (\"bl\\t__thumb_stack_failure\\t%@ trap call\", 
operands);
+  }
+  return \"\";
+  "
+  [(set_attr "conds" "clob")
+    (set_attr "length" "8")]
+)
+
  ;; We only care about the lower 16 bits of the constant