
[AArch64] Add support for SVE stack clash probing [patch (2/7)]

Message ID 20180828121925.GA31072@arm.com
State New
Series [AArch64] Add support for SVE stack clash probing [patch (2/7)]

Commit Message

Tamar Christina Aug. 28, 2018, 12:19 p.m. UTC
Hi all,

This patch adds basic support for SVE stack clash protection.
It is a first implementation and will use a loop to do the
probing and stack adjustments.

An example sequence is:

	.cfi_startproc
	mov	x15, sp
	cntb	x16, all, mul #11
	add	x16, x16, 304
	.cfi_def_cfa_register 15
.SVLPSRL0:
	cmp	x16, 4096
	b.lt	.BRRCHK0
	sub	sp, sp, 4096
	str	xzr, [sp, 1024]
	sub	x16, x16, 4096
	b	.SVLPSRL0
.BRRCHK0:
	sub	sp, sp, x16
	cmp	sp, 1024
	b.lt	.BERCHK0
	str	xzr, [sp, 1024]
.BERCHK0:
	.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22

This has about the same semantics as alloca.
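
(For reference, the .cfi_escape above should decode as a DW_CFA_def_cfa_expression
giving CFA = sp + VG * 88 + 304, where VG is DWARF register 46, the number of
64-bit granules in an SVE vector.  That matches the cntb x16, all, mul #11 /
add x16, x16, 304 computation above, assuming I've decoded the DWARF ops correctly.)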


Bootstrapped and regtested on aarch64-none-linux-gnu with no issues in the SVE testsuite.
Target was tested with stack clash on and off by default.

Ok for trunk?

Thanks,
Tamar

gcc/
2018-08-28  Tamar Christina  <tamar.christina@arm.com>

	PR target/86486
	* config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New.
	* config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash): New.
	(aarch64_allocate_and_probe_stack_space): Add SVE specific section.
	* config/aarch64/aarch64.md (probe_sve_stack_clash): New.

gcc/testsuite/
2018-08-28  Tamar Christina  <tamar.christina@arm.com>

	PR target/86486
	* gcc.target/aarch64/stack-check-prologue-12.c: New test.
	* gcc.target/aarch64/stack-check-cfa-3.c: New test.

--

Comments

Richard Sandiford Aug. 28, 2018, 8:40 p.m. UTC | #1
I'll leave the AArch64 maintainers to review, but some comments.

Tamar Christina <tamar.christina@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 06451f38b11822ea77323438fe8c7e373eb9e614..e7efde79bb111e820f4df44a276f6f73070ecd17 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -3970,6 +3970,90 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
>    return "";
>  }
 
> +/* Emit the probe loop for doing stack clash probes and stack adjustments for
> +   SVE.  This emits probes from BASE to BASE + ADJUSTMENT based on a guard size
> +   of GUARD_SIZE and emits a probe when at least LIMIT bytes are allocated.  By
> +   the end of this function BASE = BASE + ADJUSTMENT.  */
> +
> +const char *
> +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment, rtx limit,
> +				      rtx guard_size)
> +{
> +  /* This function is not allowed to use any instruction generation function
> +     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
> +     so instead emit the code you want using output_asm_insn.  */
> +  gcc_assert (flag_stack_clash_protection);
> +  gcc_assert (CONST_INT_P (limit) && CONST_INT_P (guard_size));
> +  gcc_assert (aarch64_uimm12_shift (INTVAL (limit)));
> +  gcc_assert (aarch64_uimm12_shift (INTVAL (guard_size)));
> +
> +  static int labelno = 0;
> +  char loop_start_lab[32];
> +  char loop_res_lab[32];
> +  char loop_end_lab[32];
> +  rtx xops[2];
> +
> +  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSRL", labelno);
> +  ASM_GENERATE_INTERNAL_LABEL (loop_res_lab, "BRRCHK", labelno);
> +  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "BERCHK", labelno++);
> +
> +  /* Emit loop start label.  */
> +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
> +
> +  /* Test if ADJUSTMENT < GUARD_SIZE.  */
> +  xops[0] = adjustment;
> +  xops[1] = guard_size;
> +  output_asm_insn ("cmp\t%0, %1", xops);
> +
> +  /* Branch to residual loop if it is.  */
> +  fputs ("\tb.lt\t", asm_out_file);
> +  assemble_name_raw (asm_out_file, loop_res_lab);
> +  fputc ('\n', asm_out_file);
> +
> +  /* BASE = BASE - GUARD_SIZE.  */
> +  xops[0] = base;
> +  xops[1] = guard_size;
> +  output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> +  /* Probe at BASE + LIMIT.  */
> +  xops[1] = limit;
> +  output_asm_insn ("str\txzr, [%0, %1]", xops);
> +
> +  /* ADJUSTMENT = ADJUSTMENT - GUARD_SIZE.  */
> +  xops[0] = adjustment;
> +  xops[1] = guard_size;
> +  output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> +  /* Branch to loop start.  */
> +  fputs ("\tb\t", asm_out_file);
> +  assemble_name_raw (asm_out_file, loop_start_lab);
> +  fputc ('\n', asm_out_file);
> +
> +  /* Emit residual check label.  */
> +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_res_lab);
> +
> +  /* BASE = BASE - ADJUSTMENT.  */
> +  xops[0] = base;
> +  xops[1] = adjustment;
> +  output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> +  /* Test if BASE < LIMIT.  */
> +  xops[1] = limit;
> +  output_asm_insn ("cmp\t%0, %1", xops);

Think this should be ADJUSTMENT < LIMIT.

> +  /* Branch to end.  */
> +  fputs ("\tb.lt\t", asm_out_file);
> +  assemble_name_raw (asm_out_file, loop_end_lab);
> +  fputc ('\n', asm_out_file);
> +
> +  /* Probe at BASE + LIMIT.  */
> +  output_asm_insn ("str\txzr, [%0, %1]", xops);

It looks like this would probe at LIMIT when ADJUSTMENT is exactly LIMIT,
which could clobber the caller's frame.

> +
> +  /* No probe leave.  */
> +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
> +  return "";

With the CFA stuff and constant load, I think this works out as:

---------------------------------------------
# 12 insns
	mov	r15, base
	mov	adjustment, N
1:
	cmp	adjustment, guard_size
	b.lt    2f
	sub	base, base, guard_size
	str	xzr, [base, limit]
	sub     adjustment, adjustment, guard_size
	b	1b
2:
	sub     base, base, adjustment
	cmp	adjustment, limit
	b.le	3f
	str	xzr, [base, limit]
3:
---------------------------------------------

What do you think about something like:

---------------------------------------------
# 10 insns
	mov	adjustment, N
	sub	r15, base, adjustment
	subs	adjustment, adjustment, min_probe_threshold
	b.lo	2f
1:
	add	base, x15, adjustment
	str	xzr, [base, 0]
	subs	adjustment, adjustment, 16
	and	adjustment, adjustment, ~(guard_size-1)
	b.hs	1b
2:
	mov	base, r15
---------------------------------------------

or (with different trade-offs):

---------------------------------------------
# 11 insns
	mov	adjustment, N
	sub	r15, base, adjustment
	subs	adjustment, adjustment, min_probe_threshold
	b.lo	2f
	# Might be 0, leading to a double probe
	and	r14, adjustment, guard_size-1
1:
	add	base, x15, adjustment
	str	xzr, [base, 0]
	subs	adjustment, adjustment, r14
	mov	r14, guard_size
	b.hs	1b
2:
	mov	base, r15
---------------------------------------------

or (longer, but with a simpler loop):

---------------------------------------------
# 12 insns
	mov	adjustment, N
	sub	r15, base, adjustment
	subs	adjustment, adjustment, min_probe_threshold
	b.lo	2f
	str	xzr, [base, -16]!
	sub	adjustment, adjustment, 32
	and	adjustment, adjustment, -(guard_size-1)
1:
	add	base, x15, adjustment
	str	xzr, [base, 0]
	subs	adjustment, adjustment, guard_size
	b.hs	1b
2:
	mov	base, r15
---------------------------------------------

with the CFA based on r15+offset?

These loops probe more often than necessary in some cases,
but they only need a single branch in the common case that
ADJUSTMENT <= MIN_PROBE_THRESHOLD.

> @@ -4826,22 +4910,30 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>  	}
>      }
> 
> -  HOST_WIDE_INT size;
> +  /* GCC's initialization analysis is broken so initialize size.  */
> +  HOST_WIDE_INT size = 0;

It's not broken in this case. :-)  is_constant only modifies its argument
when returning true.  In other cases the variable keeps whatever value
it had originally.  And the code does use "size" when !is_constant,
so an explicit initialisation is necessary.

>    /* If SIZE is not large enough to require probing, just adjust the stack and
>       exit.  */
> -  if (!poly_size.is_constant (&size)
> -      || known_lt (poly_size, min_probe_threshold)
> +  if ((poly_size.is_constant (&size)
> +       && known_lt (poly_size, min_probe_threshold))
>        || !flag_stack_clash_protection)

No need for the is_constant here, just known_lt is enough.

>      {
>        aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
>        return;
>      }
> 
> -  if (dump_file)
> +  if (dump_file && poly_size.is_constant ())
>      fprintf (dump_file,
>  	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
>  	     ", probing will be required.\n", size);
> 
> +  if (dump_file && !poly_size.is_constant ())
> +    {
> +      fprintf (dump_file, "Stack clash SVE prologue: ");
> +      dump_dec (MSG_NOTE, poly_size);

This should be print_dec (poly_size, dump_file);

> +      fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> +    }
> +
>    /* Round size to the nearest multiple of guard_size, and calculate the
>       residual as the difference between the original size and the rounded
>       size.  */
> @@ -4850,7 +4942,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> 
>    /* We can handle a small number of allocations/probes inline.  Otherwise
>       punt to a loop.  */
> -  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
> +  if (poly_size.is_constant ()
> +      && rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
>      {
>        for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
>  	{
> @@ -4861,7 +4954,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>  	}
>        dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
>      }
> -  else
> +  else if (poly_size.is_constant ())
>      {
>        /* Compute the ending address.  */
>        aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
> @@ -4910,6 +5003,48 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>        emit_insn (gen_blockage ());
>        dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
>      }
> +  else
> +    {

It would probably be better to handle "!poly_size.is_constant ()"
after the "!flag_stack_clash_protection" if statement and exit early,
so that we don't do calculations based on "size" when "size" has a
fairly meaningless value.  It would also avoid repeated checks for
is_constant.

> +      rtx probe_const = gen_rtx_CONST_INT (Pmode, STACK_CLASH_CALLER_GUARD);
> +      rtx guard_const = gen_rtx_CONST_INT (Pmode, guard_size);

CONST_INTs don't have a recorded mode, so this should either be GEN_INT or
(better) gen_int_mode.

Thanks,
Richard
Tamar Christina Sept. 7, 2018, 4:05 p.m. UTC | #2
Hi Richard,

Here's the updated patch and some comments inline below.

An example sequence is:

        .cfi_startproc
        mov     x15, sp
        cntb    x16, all, mul #11
        add     x16, x16, 304
        .cfi_def_cfa_register 15
.SVLPSRL0:
        cmp     x16, 65536
        b.lt    .BRRCHK0
        sub     sp, sp, 65536
        str     xzr, [sp, 1024]
        sub     x16, x16, 65536
        b       .SVLPSRL0
.BRRCHK0:
        sub     sp, sp, x16
        cmp     x16, 2048
        b.lt    .BERCHK0
        str     xzr, [sp, 1024]
.BERCHK0:
        .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22
        stp     x29, x30, [sp]

Ok for trunk?

Thanks,
Tamar

gcc/
2018-09-07  Tamar Christina  <tamar.christina@arm.com>

	PR target/86486
	* config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New.
	* config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash): New.
	(aarch64_allocate_and_probe_stack_space): Add SVE specific section.
	* config/aarch64/aarch64.md (probe_sve_stack_clash): New.

gcc/testsuite/
2018-09-07  Tamar Christina  <tamar.christina@arm.com>

	PR target/86486
	* gcc.target/aarch64/stack-check-prologue-16.c: New test.
	* gcc.target/aarch64/stack-check-cfa-3.c: New test.

The 08/28/2018 21:40, Richard Sandiford wrote:
> I'll leave the AArch64 maintainers to review, but some comments.
> 
> Tamar Christina <tamar.christina@arm.com> writes:
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index 06451f38b11822ea77323438fe8c7e373eb9e614..e7efde79bb111e820f4df44a276f6f73070ecd17 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -3970,6 +3970,90 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
> >    return "";
> >  }
> > +
> > +  /* Test if BASE < LIMIT.  */
> > +  xops[1] = limit;
> > +  output_asm_insn ("cmp\t%0, %1", xops);
> 
> Think this should be ADJUSTMENT < LIMIT.

Actually it should be 2KB in this case. I've explained why in the updated patch.

> 
> > +  /* Branch to end.  */
> > +  fputs ("\tb.lt\t", asm_out_file);
> > +  assemble_name_raw (asm_out_file, loop_end_lab);
> > +  fputc ('\n', asm_out_file);
> > +
> > +  /* Probe at BASE + LIMIT.  */
> > +  output_asm_insn ("str\txzr, [%0, %1]", xops);
> 
> It looks like this would probe at LIMIT when ADJUSTMENT is exactly LIMIT,
> which could clobber the caller's frame.
> 

Yeah, the comparison should have been a bit larger. Thanks.

> > +
> > +  /* No probe leave.  */
> > +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
> > +  return "";
> 
> With the CFA stuff and constant load, I think this works out as:
> 
> ---------------------------------------------
> # 12 insns
> 	mov	r15, base
> 	mov	adjustment, N
> 1:
> 	cmp	adjustment, guard_size
> 	b.lt    2f
> 	sub	base, base, guard_size
> 	str	xzr, [base, limit]
> 	sub     adjustment, adjustment, guard_size
> 	b	1b
> 2:
> 	sub     base, base, adjustment
> 	cmp	adjustment, limit
> 	b.le	3f
> 	str	xzr, [base, limit]
> 3:
> ---------------------------------------------
> 
> What do you think about something like:
> 
> ---------------------------------------------
> # 10 insns
> 	mov	adjustment, N
> 	sub	r15, base, adjustment
> 	subs	adjustment, adjustment, min_probe_threshold
> 	b.lo	2f
> 1:
> 	add	base, x15, adjustment
> 	str	xzr, [base, 0]
> 	subs	adjustment, adjustment, 16
> 	and	adjustment, adjustment, ~(guard_size-1)
> 	b.hs	1b
> 2:
> 	mov	base, r15
> ---------------------------------------------
> 
> or (with different trade-offs):
> 
> ---------------------------------------------
> # 11 insns
> 	mov	adjustment, N
> 	sub	r15, base, adjustment
> 	subs	adjustment, adjustment, min_probe_threshold
> 	b.lo	2f
> 	# Might be 0, leading to a double probe
> 	and	r14, adjustment, guard_size-1
> 1:
> 	add	base, x15, adjustment
> 	str	xzr, [base, 0]
> 	subs	adjustment, adjustment, r14
> 	mov	r14, guard_size
> 	b.hs	1b
> 2:
> 	mov	base, r15
> ---------------------------------------------
> 
> or (longer, but with a simpler loop):
> 
> ---------------------------------------------
> # 12 insns
> 	mov	adjustment, N
> 	sub	r15, base, adjustment
> 	subs	adjustment, adjustment, min_probe_threshold
> 	b.lo	2f
> 	str	xzr, [base, -16]!
> 	sub	adjustment, adjustment, 32
> 	and	adjustment, adjustment, -(guard_size-1)
> 1:
> 	add	base, x15, adjustment
> 	str	xzr, [base, 0]
> 	subs	adjustment, adjustment, guard_size
> 	b.hs	1b
> 2:
> 	mov	base, r15
> ---------------------------------------------
> 
> with the CFA based on r15+offset?
> 
> These loops probe more often than necessary in some cases,
> but they only need a single branch in the common case that
> ADJUSTMENT <= MIN_PROBE_THRESHOLD.

I haven't changed the loop yet because I'm a bit on the fence about
whether the implementation difficulties would outweigh the benefits.
We are planning on doing something smarter for SVE, so optimizing these
loops only to replace them later may not be time well spent now.

The problem is that supporting both 4KB and 64KB pages means instructions
such as subs would require different immediates and shifts.  Granted, we
technically only support these two, so I could hardcode the values, but
that would make these functions less general than the rest.

If you think it would be worthwhile, I'd be happy to use one of these loops instead.

> 
> > @@ -4826,22 +4910,30 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> >  	}
> >      }
> > 
> > -  HOST_WIDE_INT size;
> > +  /* GCC's initialization analysis is broken so initialize size.  */
> > +  HOST_WIDE_INT size = 0;
> 
> It's not broken in this case. :-)  is_constant only modifies its argument
> when returning true.  In other cases the variable keeps whatever value
> it had originally.  And the code does use "size" when !is_constant,
> so an explicit initialisation is necessary.

ah, ok. Thanks!

> 
> >    /* If SIZE is not large enough to require probing, just adjust the stack and
> >       exit.  */
> > -  if (!poly_size.is_constant (&size)
> > -      || known_lt (poly_size, min_probe_threshold)
> > +  if ((poly_size.is_constant (&size)
> > +       && known_lt (poly_size, min_probe_threshold))
> >        || !flag_stack_clash_protection)
> 
> No need for the is_constant here, just known_lt is enough.
>

The is_constant is used to extract the size value safely.
 
> >      {
> >        aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
> >        return;
> >      }
> > 
> > -  if (dump_file)
> > +  if (dump_file && poly_size.is_constant ())
> >      fprintf (dump_file,
> >  	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> >  	     ", probing will be required.\n", size);
> > 
> > +  if (dump_file && !poly_size.is_constant ())
> > +    {
> > +      fprintf (dump_file, "Stack clash SVE prologue: ");
> > +      dump_dec (MSG_NOTE, poly_size);
> 
> This should be print_dec (poly_size, dump_file);
> 

done.

> > +      fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> > +    }
> > +
> >    /* Round size to the nearest multiple of guard_size, and calculate the
> >       residual as the difference between the original size and the rounded
> >       size.  */
> > @@ -4850,7 +4942,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> > 
> >    /* We can handle a small number of allocations/probes inline.  Otherwise
> >       punt to a loop.  */
> > -  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
> > +  if (poly_size.is_constant ()
> > +      && rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
> >      {
> >        for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
> >  	{
> > @@ -4861,7 +4954,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> >  	}
> >        dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
> >      }
> > -  else
> > +  else if (poly_size.is_constant ())
> >      {
> >        /* Compute the ending address.  */
> >        aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
> > @@ -4910,6 +5003,48 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> >        emit_insn (gen_blockage ());
> >        dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
> >      }
> > +  else
> > +    {
> 
> It would probably be better to handle "!poly_size.is_constant ()"
> after the "!flag_stack_clash_protection" if statement and exit early,
> so that we don't do calculations based on "size" when "size" has a
> fairly meaningless value.  It would also avoid repeated checks for
> is_constant.
> 

done

> > +      rtx probe_const = gen_rtx_CONST_INT (Pmode, STACK_CLASH_CALLER_GUARD);
> > +      rtx guard_const = gen_rtx_CONST_INT (Pmode, guard_size);
> 
> CONST_INTs don't have a recorded mode, so this should either be GEN_INT or
> (better) gen_int_mode.
>

done.

> Thanks,
> Richard

--
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
 void aarch64_cpu_cpp_builtins (cpp_reader *);
 const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
 const char * aarch64_output_probe_stack_range (rtx, rtx);
+const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index cf278f4b9eb420d12f46461d4d090df42aa1980c..aaf5f4e106d0024c967462b6717d2d58a1c44457 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3973,6 +3973,105 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   return "";
 }
 
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+   SVE.  This emits probes from BASE to BASE + ADJUSTMENT based on a guard size
+   of GUARD_SIZE.  When a probe is emitted it is done at PROBE_OFFSET bytes from
+   the current BASE.  By the end of this function BASE = BASE + ADJUSTMENT.  */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
+				      rtx probe_offset, rtx guard_size)
+{
+
+  /* The minimum required allocation before the residual requires probing.
+     See comment at usage site for more.  */
+  const HOST_WIDE_INT residual_probe_guard = 1 << 11;
+
+  /* This function is not allowed to use any instruction generation function
+     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
+     so instead emit the code you want using output_asm_insn.  */
+  gcc_assert (flag_stack_clash_protection);
+  gcc_assert (CONST_INT_P (probe_offset) && CONST_INT_P (guard_size));
+  gcc_assert (aarch64_uimm12_shift (INTVAL (probe_offset)));
+  gcc_assert (aarch64_uimm12_shift (INTVAL (guard_size)));
+  gcc_assert (INTVAL (guard_size) > INTVAL (probe_offset));
+  gcc_assert (INTVAL (guard_size) > residual_probe_guard);
+
+  static int labelno = 0;
+  char loop_start_lab[32];
+  char loop_res_lab[32];
+  char loop_end_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSRL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_res_lab, "BRRCHK", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "BERCHK", labelno++);
+
+  /* Emit loop start label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+  /* Test if ADJUSTMENT < GUARD_SIZE.  */
+  xops[0] = adjustment;
+  xops[1] = guard_size;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to residual loop if it is.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_res_lab);
+  fputc ('\n', asm_out_file);
+
+  /* BASE = BASE - GUARD_SIZE.  */
+  xops[0] = base;
+  xops[1] = guard_size;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at BASE + PROBE_OFFSET.  */
+  xops[1] = probe_offset;
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* ADJUSTMENT = ADJUSTMENT - GUARD_SIZE.  */
+  xops[0] = adjustment;
+  xops[1] = guard_size;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Branch to loop start.  */
+  fputs ("\tb\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_start_lab);
+  fputc ('\n', asm_out_file);
+
+  /* Emit residual check label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_res_lab);
+
+  /* BASE = BASE - ADJUSTMENT.  */
+  xops[0] = base;
+  xops[1] = adjustment;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Test if ADJUSTMENT < RESIDUAL_PROBE_GUARD.  In principle any power of two
+     larger than 1024B would work, but we need one that works for all supported
+     guard-sizes.  What we actually want to check is guard-size - 1KB, but this
+     immediate won't fit inside a cmp without requiring a temporary, so instead
+     we just accept a smaller immediate that doesn't need one.  We may probe a
+     bit more often, but that doesn't matter much in the long run.  */
+  xops[0] = adjustment;
+  xops[1] = gen_int_mode (residual_probe_guard, Pmode);
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to end.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* Probe at BASE + PROBE_OFFSET.  */
+  xops[0] = base;
+  xops[1] = probe_offset;
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* No probe leave.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+  return "";
+}
+
 /* Determine whether a frame chain needs to be generated.  */
 static bool
 aarch64_needs_frame_chain (void)
@@ -4830,11 +4929,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 	}
     }
 
-  HOST_WIDE_INT size;
+  HOST_WIDE_INT size = 0;
   /* If SIZE is not large enough to require probing, just adjust the stack and
      exit.  */
-  if (!poly_size.is_constant (&size)
-      || known_lt (poly_size, min_probe_threshold)
+  if ((poly_size.is_constant (&size)
+       && known_lt (poly_size, min_probe_threshold))
       || !flag_stack_clash_protection)
     {
       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
@@ -4842,9 +4941,64 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     }
 
   if (dump_file)
-    fprintf (dump_file,
-	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
-	     ", probing will be required.\n", size);
+    {
+      if (poly_size.is_constant ())
+	fprintf (dump_file,
+		 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
+		 " bytes, probing will be required.\n", size);
+      else
+	{
+	  fprintf (dump_file, "Stack clash SVE prologue: ");
+	  print_dec (poly_size, dump_file);
+	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+	}
+    }
+
+  /* Handle the SVE non-constant case first.  */
+  if (!poly_size.is_constant ())
+    {
+      /* First calculate the amount of bytes we're actually spilling.  */
+      aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),
+			  poly_size, temp1, temp2, false, true);
+
+      rtx_insn *insn = get_last_insn ();
+
+      if (frame_related_p)
+	{
+	  /* This is done to provide unwinding information for the stack
+	     adjustments we're about to do, however to prevent the optimizers
+	     from removing the R15 move and leaving the CFA note (which would be
+	     very wrong) we tie the old and new stack pointer together.
+	     The tie will expand to nothing but the optimizers will not touch
+	     the instruction.  */
+	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+	  /* We want the CFA independent of the stack pointer for the
+	     duration of the loop.  */
+	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      rtx probe_const = gen_int_mode (guard_used_by_caller, Pmode);
+      rtx guard_const = gen_int_mode (guard_size, Pmode);
+
+      insn = emit_insn (gen_probe_sve_stack_clash (stack_pointer_rtx,
+						   stack_pointer_rtx, temp1,
+						   probe_const, guard_const));
+
+      /* Now reset the CFA register if needed.  */
+      if (frame_related_p)
+	{
+	  add_reg_note (insn, REG_CFA_DEF_CFA,
+			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+				      gen_int_mode (poly_size, Pmode)));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      return;
+    }
 
   /* Round size to the nearest multiple of guard_size, and calculate the
      residual as the difference between the original size and the rounded
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b8da13f14fa9990e8fdc3c71ed407c8afc65a324..4901f55478eb0ea26a36f15d51aaf9779a8efaf4 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6464,6 +6464,25 @@
   [(set_attr "length" "32")]
 )
 
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop.  We can't change the control flow during prologue and epilogue
+;; code generation.  So we must emit a volatile unspec and expand it later on.
+
+(define_insn "probe_sve_stack_clash"
+  [(set (match_operand:DI 0 "register_operand" "=rk")
+	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
+			     (match_operand:DI 2 "register_operand" "r")
+			     (match_operand:DI 3 "aarch64_plus_immediate" "L")
+			     (match_operand:DI 4 "aarch64_plus_immediate" "L")]
+			      UNSPECV_PROBE_STACK_RANGE))]
+  "TARGET_SVE"
+{
+  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+					       operands[3], operands[4]);
+}
+  [(set_attr "length" "40")]
+)
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include <stdint.h>
+
+#define N 20040
+
+void __attribute__ ((noinline, noclone))
+test (int8_t *restrict dest, int8_t *restrict src)
+{
+  for (int i = 0; i < N; i+=8)
+    {
+      dest[i] += src[i * 4];
+      dest[i+1] += src[i * 4 + 1];
+      dest[i+2] += src[i * 4 + 2];
+      dest[i+3] += src[i * 4 + 3];
+      dest[i+4] += src[i * 4 + 4];
+      dest[i+5] += src[i * 4 + 5];
+      dest[i+6] += src[i * 4 + 6];
+      dest[i+7] += src[i * 4 + 7];
+    }
+}
+/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+   need to make sure we can unwind correctly before the frame is set up.  So
+   check that we're setting r15 to a copy of sp and setting the CFA there.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa8327b9f48ebba64b3e55206435bdbdb6f5ac18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -ftree-vectorize" } */
+
+
+#include <stdint.h>
+
+#define N 20040
+
+void __attribute__ ((noinline, noclone))
+test (int8_t *restrict dest, int8_t *restrict src)
+{
+  for (int i = 0; i < N; i+=8)
+    {
+      dest[i] += src[i * 4];
+      dest[i+1] += src[i * 4 + 1];
+      dest[i+2] += src[i * 4 + 2];
+      dest[i+3] += src[i * 4 + 3];
+      dest[i+4] += src[i * 4 + 4];
+      dest[i+5] += src[i * 4 + 5];
+      dest[i+6] += src[i * 4 + 6];
+      dest[i+7] += src[i * 4 + 7];
+    }
+}
+
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 1024\]} 2 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 2048} 1 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 65536} 1 } } */
+
+/* SVE spill, requires probing as vector size is unknown at compile time.  */
+
Richard Sandiford Sept. 11, 2018, 3:20 p.m. UTC | #3
Tamar Christina <Tamar.Christina@arm.com> writes:
>> > +
>> > +  /* No probe leave.  */
>> > +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
>> > +  return "";
>> 
>> With the CFA stuff and constant load, I think this works out as:
>> 
>> ---------------------------------------------
>> # 12 insns
>> 	mov	r15, base
>> 	mov	adjustment, N
>> 1:
>> 	cmp	adjustment, guard_size
>> 	b.lt    2f
>> 	sub	base, base, guard_size
>> 	str	xzr, [base, limit]
>> 	sub     adjustment, adjustment, guard_size
>> 	b	1b
>> 2:
>> 	sub     base, base, adjustment
>> 	cmp	adjustment, limit
>> 	b.le	3f
>> 	str	xzr, [base, limit]
>> 3:
>> ---------------------------------------------
>> 
>> What do you think about something like:
>> 
>> ---------------------------------------------
>> # 10 insns
>> 	mov	adjustment, N
>> 	sub	r15, base, adjustment
>> 	subs	adjustment, adjustment, min_probe_threshold
>> 	b.lo	2f
>> 1:
>> 	add	base, x15, adjustment
>> 	str	xzr, [base, 0]
>> 	subs	adjustment, adjustment, 16
>> 	and	adjustment, adjustment, ~(guard_size-1)
>> 	b.hs	1b
>> 2:
>> 	mov	base, r15
>> ---------------------------------------------
>> 
>> or (with different trade-offs):
>> 
>> ---------------------------------------------
>> # 11 insns
>> 	mov	adjustment, N
>> 	sub	r15, base, adjustment
>> 	subs	adjustment, adjustment, min_probe_threshold
>> 	b.lo	2f
>> 	# Might be 0, leading to a double probe
>> 	and	r14, adjustment, guard_size-1
>> 1:
>> 	add	base, x15, adjustment
>> 	str	xzr, [base, 0]
>> 	subs	adjustment, adjustment, r14
>> 	mov	r14, guard_size
>> 	b.hs	1b
>> 2:
>> 	mov	base, r15
>> ---------------------------------------------
>> 
>> or (longer, but with a simpler loop):
>> 
>> ---------------------------------------------
>> # 12 insns
>> 	mov	adjustment, N
>> 	sub	r15, base, adjustment
>> 	subs	adjustment, adjustment, min_probe_threshold
>> 	b.lo	2f
>> 	str	xzr, [base, -16]!
>> 	sub	adjustment, adjustment, 32
>> 	and	adjustment, adjustment, -(guard_size-1)
>> 1:
>> 	add	base, x15, adjustment
>> 	str	xzr, [base, 0]
>> 	subs	adjustment, adjustment, guard_size
>> 	b.hs	1b
>> 2:
>> 	mov	base, r15
>> ---------------------------------------------
>> 
>> with the CFA based on r15+offset?
>> 
>> These loops probe more often than necessary in some cases,
>> but they only need a single branch in the common case that
>> ADJUSTMENT <= MIN_PROBE_THRESHOLD.
>
> I haven't changed the loop yet because I'm a bit on the edge about
> whether the implementation difficulties would outweigh the benefits.
> We are planning on doing something smarter for SVE so optimizing these
> loops only to replace them later may not be time well spent now.
>
> The problem is that to support both 4KB and 64KB pages, instructions such
> as subs would require different immediates and shifts. Granted we technically
> only support these two so I could hardcode the values, but that would mean
> these functions are less general than the rest.

Because of the min_probe_threshold?  You could conservatively clamp it
to the next lowest value that's in range, which we could do without
having to hard-code specific values.  I think it would be better
to do that even with the current code, since hard-coding 2048 with:

  /* Test if ADJUSTMENT < RESIDUAL_PROBE_GUARD, in principle any power of two
     larger than 1024B would work, but we need one that works for all supported
     guard-sizes.  What we actually want to check is guard-size - 1KB, but this
     immediate won't fit inside a cmp without requiring a tempory, so instead we
     just accept a smaller immediate that doesn't, we may probe a bit more often
     but that doesn't matter much on the long run.  */

seems a bit of a hack.

> If you think it would be worthwhile, I'd be happy to use one of these
> loops instead.

Yeah, I still think we should do this unless we can commit to doing
the optimised version by a specific date, and that date is soon enough
that the optimisation could reasonably be backported to GCC 8.

> @@ -4830,11 +4929,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>  	}
>      }
> 
> -  HOST_WIDE_INT size;
> +  HOST_WIDE_INT size = 0;
>    /* If SIZE is not large enough to require probing, just adjust the stack and
>       exit.  */
> -  if (!poly_size.is_constant (&size)
> -      || known_lt (poly_size, min_probe_threshold)
> +  if ((poly_size.is_constant (&size)
> +       && known_lt (poly_size, min_probe_threshold))
>        || !flag_stack_clash_protection)
>      {
>        aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);

I still think we should remove this poly_size.is_constant, and instead:

> @@ -4842,9 +4941,64 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>      }
 
>    if (dump_file)
> -    fprintf (dump_file,
> -	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> -	     ", probing will be required.\n", size);
> +    {
> +      if (poly_size.is_constant ())
> +	fprintf (dump_file,
> +		 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
> +		 " bytes, probing will be required.\n", size);
> +      else
> +	{
> +	  fprintf (dump_file, "Stack clash SVE prologue: ");
> +	  print_dec (poly_size, dump_file);
> +	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> +	}
> +    }
> +
> +  /* Handle the SVE non-constant case first.  */
> +  if (!poly_size.is_constant ())

...use is_constant (&size) here, and put the dump messages for the
constant and non-constant cases in their respective constant and
non-constant blocks.  That way each use of "size" is directly protected
by an is_constant call, and there's no need to initialise size to 0.

The non-constant case doesn't have the new special handling of
final_adjustment_p, so I think the !is_constant block should assert
!final_adjustment_p.
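
Something like this shape, i.e. (untested sketch of the structure only,
after the early-exit check above, not meant to be the exact code):

  HOST_WIDE_INT size;
  if (!poly_size.is_constant (&size))
    {
      /* SVE path: "size" has no meaningful value here.  */
      gcc_assert (!final_adjustment_p);
      if (dump_file)
        {
          fprintf (dump_file, "Stack clash SVE prologue: ");
          print_dec (poly_size, dump_file);
          fprintf (dump_file, " bytes, dynamic probing will be required.\n");
        }
      /* ...emit the R15 copy, the probe loop and the CFA notes, then return.  */
    }
  /* From here on "size" is valid and the existing constant-size handling
     (including the final_adjustment_p special case) applies.  */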

Thanks,
Richard
Tamar Christina Sept. 20, 2018, 9:23 a.m. UTC | #4
Hi Richard,

The 09/11/2018 16:20, Richard Sandiford wrote:
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> > +
> >> > +  /* No probe leave.  */
> >> > +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
> >> > +  return "";
> >> 
> >> With the CFA stuff and constant load, I think this works out as:
> >> 
> >> ---------------------------------------------
> >> # 12 insns
> >> 	mov	r15, base
> >> 	mov	adjustment, N
> >> 1:
> >> 	cmp	adjustment, guard_size
> >> 	b.lt    2f
> >> 	sub	base, base, guard_size
> >> 	str	xzr, [base, limit]
> >> 	sub     adjustment, adjustment, guard_size
> >> 	b	1b
> >> 2:
> >> 	sub     base, base, adjustment
> >> 	cmp	adjustment, limit
> >> 	b.le	3f
> >> 	str	xzr, [base, limit]
> >> 3:
> >> ---------------------------------------------
> >> 
> >> What do you think about something like:
> >> 
> >> ---------------------------------------------
> >> # 10 insns
> >> 	mov	adjustment, N
> >> 	sub	r15, base, adjustment
> >> 	subs	adjustment, adjustment, min_probe_threshold
> >> 	b.lo	2f
> >> 1:
> >> 	add	base, x15, adjustment
> >> 	str	xzr, [base, 0]
> >> 	subs	adjustment, adjustment, 16
> >> 	and	adjustment, adjustment, ~(guard_size-1)
> >> 	b.hs	1b
> >> 2:
> >> 	mov	base, r15
> >> ---------------------------------------------
> >> 
> >> or (with different trade-offs):
> >> 
> >> ---------------------------------------------
> >> # 11 insns
> >> 	mov	adjustment, N
> >> 	sub	r15, base, adjustment
> >> 	subs	adjustment, adjustment, min_probe_threshold
> >> 	b.lo	2f
> >> 	# Might be 0, leading to a double probe
> >> 	and	r14, adjustment, guard_size-1
> >> 1:
> >> 	add	base, x15, adjustment
> >> 	str	xzr, [base, 0]
> >> 	subs	adjustment, adjustment, r14
> >> 	mov	r14, guard_size
> >> 	b.hs	1b
> >> 2:
> >> 	mov	base, r15
> >> ---------------------------------------------
> >> 
> >> or (longer, but with a simpler loop):
> >> 
> >> ---------------------------------------------
> >> # 12 insns
> >> 	mov	adjustment, N
> >> 	sub	r15, base, adjustment
> >> 	subs	adjustment, adjustment, min_probe_threshold
> >> 	b.lo	2f
> >> 	str	xzr, [base, -16]!
> >> 	sub	adjustment, adjustment, 32
> >> 	and	adjustment, adjustment, -(guard_size-1)
> >> 1:
> >> 	add	base, x15, adjustment
> >> 	str	xzr, [base, 0]
> >> 	subs	adjustment, adjustment, guard_size
> >> 	b.hs	1b
> >> 2:
> >> 	mov	base, r15
> >> ---------------------------------------------
> >> 
> >> with the CFA based on r15+offset?
> >> 
> >> These loops probe more often than necessary in some cases,
> >> but they only need a single branch in the common case that
> >> ADJUSTMENT <= MIN_PROBE_THRESHOLD.
> >
> > I haven't changed the loop yet because I'm a bit on the edge about
> > whether the implementation difficulties would outweigh the benefits.
> > We are planning on doing something smarter for SVE so optimizing these
> > loops only to replace them later may not be time well spent now.
> >
> > The problem is that to support both 4KB and 64KB pages, instructions such
> > as subs would require different immediates and shifts. Granted we technically
> > only support these two so I could hardcode the values, but that would mean
> > these functions are less general than the rest.
> 
> Because of the min_probe_threshold?  You could conservatively clamp it
> to the next lowest value that's in range, which we could do without
> having to hard-code specific values.  I think it would be better
> to do that even with the current code, since hard-coding 2048 with:
> 
>   /* Test if ADJUSTMENT < RESIDUAL_PROBE_GUARD, in principle any power of two
>      larger than 1024B would work, but we need one that works for all supported
>      guard-sizes.  What we actually want to check is guard-size - 1KB, but this
>      immediate won't fit inside a cmp without requiring a tempory, so instead we
>      just accept a smaller immediate that doesn't, we may probe a bit more often
>      but that doesn't matter much on the long run.  */
> 
> seems a bit of a hack.
> 
> > If you think it would be worthwhile, I'd be happy to use one of these
> > loops instead.
> 
> Yeah, I still think we should do this unless we can commit to doing
> the optimised version by a specific date, and that date is soon enough
> that the optimisation could reasonably be backported to GCC 8.
> 

While implementing these loops I found them a bit hard to follow, or rather a bit
difficult to prove correct: to someone looking at the code it may not be trivially
clear what they do.  I believe the main concern here is that the common case
(spills small enough not to require a probe) isn't short-circuited.  So how about:

	mov	r15, base
	mov	adjustment, N
	cmp	adjustment, nearest(min_probe_threshold)
	b.lt	end
begin:
	sub	base, base, nearest(min_probe_threshold)
	str	xzr, [base, 0]
	subs	size, size, nearest(min_probe_threshold)
	b.hs	begin
end:
	sub	base, base, size

as an alternative?  It is 9 insns, but it is also much simpler and follows the same
semantics as the other probing code.  This has the downside that we probe a bit more
often than we need to, but in the average case you'd likely not enter the loop more
than once, so I'd expect the number of probes in real-world usage to be the same as
with the previous code, since you'd have to spill a significant number of SVE vectors
to enter the loop at all, let alone iterate.

This is still safe, as the only invariant we have to hold is that we never drop the SP
by more than a page at a time; dropping by less than a page is fine.

nearest just rounds down to the nearest value that fits in a 12-bit shifted immediate.
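
E.g. a rough sketch of what I mean by nearest (hypothetical helper, just to
illustrate; it assumes the usual 12-bit, optionally 12-bit-shifted, immediate
encoding accepted by cmp/sub):

static HOST_WIDE_INT
nearest_uimm12_shift (HOST_WIDE_INT val)
{
  if (val < 4096)
    return val;                             /* Fits as an unshifted imm12.  */
  if (val < (HOST_WIDE_INT) 4096 << 12)
    return val & ~(HOST_WIDE_INT) 0xfff;    /* Round down to imm12 << 12.  */
  return (HOST_WIDE_INT) 0xfff << 12;       /* Clamp to the largest encodable.  */
}

i.e. it always returns the largest encodable value that is <= val.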


Thanks,
Tamar

> > @@ -4830,11 +4929,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> >  	}
> >      }
> > 
> > -  HOST_WIDE_INT size;
> > +  HOST_WIDE_INT size = 0;
> >    /* If SIZE is not large enough to require probing, just adjust the stack and
> >       exit.  */
> > -  if (!poly_size.is_constant (&size)
> > -      || known_lt (poly_size, min_probe_threshold)
> > +  if ((poly_size.is_constant (&size)
> > +       && known_lt (poly_size, min_probe_threshold))
> >        || !flag_stack_clash_protection)
> >      {
> >        aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
> 
> I still think we should remove this poly_size.is_constant, and instead:
> 
> > @@ -4842,9 +4941,64 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> >      }
>  
> >    if (dump_file)
> > -    fprintf (dump_file,
> > -	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> > -	     ", probing will be required.\n", size);
> > +    {
> > +      if (poly_size.is_constant ())
> > +	fprintf (dump_file,
> > +		 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
> > +		 " bytes, probing will be required.\n", size);
> > +      else
> > +	{
> > +	  fprintf (dump_file, "Stack clash SVE prologue: ");
> > +	  print_dec (poly_size, dump_file);
> > +	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> > +	}
> > +    }
> > +
> > +  /* Handle the SVE non-constant case first.  */
> > +  if (!poly_size.is_constant ())
> 
> ...use is_constant (&size) here, and put the dump messages for the
> constant and non-constant cases in their respective constant and
> non-constant blocks.  That way each use of "size" is directly protected
> by an is_constant call, and there's no need to initialise size to 0.
> 
> The non-constant case doesn't have the new special handling of
> final_adjustment_p, so I think the !is_constant block should assert
> !final_adjustment_p.
> 
> Thanks,
> Richard

--
Tamar Christina Sept. 26, 2018, 8:20 a.m. UTC | #5
Hi Richard,

I've added a new loop that should also exit early as described in my previous email.

An example sequence is:

        .cfi_startproc
        mov     x15, sp
        cntb    x16, all, mul #11
        add     x16, x16, 304
        .cfi_def_cfa_register 15
        cmp     x16, 61440
        b.lt    .SVLPEND0
.SVLPSPL0:
        sub     sp, sp, 61440
        str     xzr, [sp, 0]
        subs    x16, x16, 61440
        b.hs    .SVLPSPL0
        add     x16, x16, 61440
.SVLPEND0:
        sub     sp, sp, x16
        .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22

for a 64KB guard size, and for a 4KB guard size

        .cfi_startproc
        mov     x15, sp
        cntb    x16, all, mul #11
        add     x16, x16, 304
        .cfi_def_cfa_register 15
        cmp     x16, 3072
        b.lt    .SVLPEND0
.SVLPSPL0:
        sub     sp, sp, 3072
        str     xzr, [sp, 0]
        subs    x16, x16, 3072
        b.hs    .SVLPSPL0
        add     x16, x16, 3072
.SVLPEND0:
        sub     sp, sp, x16
        .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22


This has about the same semantics as alloca, except we prioritize the common case
where no probe is required.  We also change the amount we adjust the stack and
the probing interval to be the nearest value to `guard size - abi buffer` that
fits in the 12-bit shifted immediate used by cmp.

While this means we may probe a bit more often than strictly required, in practice
you would need to spill a significant number of SVE vectors before probing at all,
and even more to make the loop iterate more than once.
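
(For concreteness: with a 64KB guard the target interval is 65536 - 1024 = 64512,
which doesn't fit a 12-bit shifted immediate and so gets rounded down to 61440,
while with a 4KB guard 4096 - 1024 = 3072 already fits; hence the constants in
the two sequences above.)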

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues in the SVE testsuite.
Target was tested with stack clash on and off by default.

Ok for trunk?

Thanks,
Tamar

gcc/
2018-09-26  Tamar Christina  <tamar.christina@arm.com>

	PR target/86486
	* config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New.
	* config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash,
	aarch64_uimm12_nearest_value): New.
	(aarch64_allocate_and_probe_stack_space): Add SVE specific section.
	* config/aarch64/aarch64.md (probe_sve_stack_clash): New.

gcc/testsuite/
2018-09-26  Tamar Christina  <tamar.christina@arm.com>

	PR target/86486
	* gcc.target/aarch64/stack-check-prologue-16.c: New test.
	* gcc.target/aarch64/stack-check-cfa-3.c: New test.


The 09/20/2018 10:23, Tamar Christina wrote:
> Hi Richard,
> 
> The 09/11/2018 16:20, Richard Sandiford wrote:
> > Tamar Christina <Tamar.Christina@arm.com> writes:
> > >> > +
> > >> > +  /* No probe leave.  */
> > >> > +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
> > >> > +  return "";
> > >> 
> > >> With the CFA stuff and constant load, I think this works out as:
> > >> 
> > >> ---------------------------------------------
> > >> # 12 insns
> > >> 	mov	r15, base
> > >> 	mov	adjustment, N
> > >> 1:
> > >> 	cmp	adjustment, guard_size
> > >> 	b.lt    2f
> > >> 	sub	base, base, guard_size
> > >> 	str	xzr, [base, limit]
> > >> 	sub     adjustment, adjustment, guard_size
> > >> 	b	1b
> > >> 2:
> > >> 	sub     base, base, adjustment
> > >> 	cmp	adjustment, limit
> > >> 	b.le	3f
> > >> 	str	xzr, [base, limit]
> > >> 3:
> > >> ---------------------------------------------
> > >> 
> > >> What do you think about something like:
> > >> 
> > >> ---------------------------------------------
> > >> # 10 insns
> > >> 	mov	adjustment, N
> > >> 	sub	r15, base, adjustment
> > >> 	subs	adjustment, adjustment, min_probe_threshold
> > >> 	b.lo	2f
> > >> 1:
> > >> 	add	base, x15, adjustment
> > >> 	str	xzr, [base, 0]
> > >> 	subs	adjustment, adjustment, 16
> > >> 	and	adjustment, adjustment, ~(guard_size-1)
> > >> 	b.hs	1b
> > >> 2:
> > >> 	mov	base, r15
> > >> ---------------------------------------------
> > >> 
> > >> or (with different trade-offs):
> > >> 
> > >> ---------------------------------------------
> > >> # 11 insns
> > >> 	mov	adjustment, N
> > >> 	sub	r15, base, adjustment
> > >> 	subs	adjustment, adjustment, min_probe_threshold
> > >> 	b.lo	2f
> > >> 	# Might be 0, leading to a double probe
> > >> 	and	r14, adjustment, guard_size-1
> > >> 1:
> > >> 	add	base, x15, adjustment
> > >> 	str	xzr, [base, 0]
> > >> 	subs	adjustment, adjustment, r14
> > >> 	mov	r14, guard_size
> > >> 	b.hs	1b
> > >> 2:
> > >> 	mov	base, r15
> > >> ---------------------------------------------
> > >> 
> > >> or (longer, but with a simpler loop):
> > >> 
> > >> ---------------------------------------------
> > >> # 12 insns
> > >> 	mov	adjustment, N
> > >> 	sub	r15, base, adjustment
> > >> 	subs	adjustment, adjustment, min_probe_threshold
> > >> 	b.lo	2f
> > >> 	str	xzr, [base, -16]!
> > >> 	sub	adjustment, adjustment, 32
> > >> 	and	adjustment, adjustment, -(guard_size-1)
> > >> 1:
> > >> 	add	base, x15, adjustment
> > >> 	str	xzr, [base, 0]
> > >> 	subs	adjustment, adjustment, guard_size
> > >> 	b.hs	1b
> > >> 2:
> > >> 	mov	base, r15
> > >> ---------------------------------------------
> > >> 
> > >> with the CFA based on r15+offset?
> > >> 
> > >> These loops probe more often than necessary in some cases,
> > >> but they only need a single branch in the common case that
> > >> ADJUSTMENT <= MIN_PROBE_THRESHOLD.
> > >
> > > I haven't changed the loop yet because I'm a bit on the edge about
> > > whether the implementation difficulties would outweigh the benefits.
> > > We are planning on doing something smarter for SVE so optimizing these
> > > loops only to replace them later may not be time well spent now.
> > >
> > > The problem is that to support both 4KB and 64KB pages, instructions such
> > > as subs would require different immediates and shifts. Granted we technically
> > > only support these two so I could hardcode the values, but that would mean
> > > these functions are less general than the rest.
> > 
> > Because of the min_probe_threshold?  You could conservatively clamp it
> > to the next lowest value that's in range, which we could do without
> > having to hard-code specific values.  I think it would be better
> > to do that even with the current code, since hard-coding 2048 with:
> > 
> >   /* Test if ADJUSTMENT < RESIDUAL_PROBE_GUARD, in principle any power of two
> >      larger than 1024B would work, but we need one that works for all supported
> >      guard-sizes.  What we actually want to check is guard-size - 1KB, but this
> >      immediate won't fit inside a cmp without requiring a tempory, so instead we
> >      just accept a smaller immediate that doesn't, we may probe a bit more often
> >      but that doesn't matter much on the long run.  */
> > 
> > seems a bit of a hack.
> > 
> > > If you think it would be worthwhile, I'd be happy to use one of these
> > > loops instead.
> > 
> > Yeah, I still think we should do this unless we can commit to doing
> > the optimised version by a specific date, and that date is soon enough
> > that the optimisation could reasonably be backported to GCC 8.
> > 
> 
> While implementing these loops I found them a bit hard to follow, or rather a bit
> difficult to prove correct, to someone looking at the code it may not be trivially clear
> what it does. I believe the main concern here is that the common case
> isn't shortcutted? e.g. spills small enough not to require a probe. So how about
> 
> 	mov	r15, base
> 	mov	adjustment, N
> 	cmp	adjustment, nearest(min_probe_threshold)
> 	b.lt	end
> begin:
> 	sub	base, base, nearest(min_probe_threshold)
> 	str	xzr, [base, 0]
> 	subs	size, size, nearest(min_probe_threshold)
> 	b.hs	begin
> end:
> 	sub	base, base, size
> 
> as an alternative? Which is 9 insn but also much simpler and follows the same semantics
> as the other probing codes.  This has the downside that we probe a bit more often than we need to
> but on the average case you'd likely not enter the loop more than once, so I'd expect in real world usage
> the amount of probes to be the same as the previous code, since you'd have to spill a significant amount of SVE
> vectors in order to enter the loop, let alone iterate.
> 
> This is still safe as the only invariant we have to hold is not to drop the SP by more than a page at a time,
> doing less than a page it fine.
> 
> nearest just rounds down to the nearest value that fits in a 12-bit shifted immediate.
> 
> 
> Thanks,
> Tamar
> 
> > > @@ -4830,11 +4929,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> > >  	}
> > >      }
> > > 
> > > -  HOST_WIDE_INT size;
> > > +  HOST_WIDE_INT size = 0;
> > >    /* If SIZE is not large enough to require probing, just adjust the stack and
> > >       exit.  */
> > > -  if (!poly_size.is_constant (&size)
> > > -      || known_lt (poly_size, min_probe_threshold)
> > > +  if ((poly_size.is_constant (&size)
> > > +       && known_lt (poly_size, min_probe_threshold))
> > >        || !flag_stack_clash_protection)
> > >      {
> > >        aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
> > 
> > I still think we should remove this poly_size.is_constant, and instead:
> > 
> > > @@ -4842,9 +4941,64 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> > >      }
> >  
> > >    if (dump_file)
> > > -    fprintf (dump_file,
> > > -	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> > > -	     ", probing will be required.\n", size);
> > > +    {
> > > +      if (poly_size.is_constant ())
> > > +	fprintf (dump_file,
> > > +		 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
> > > +		 " bytes, probing will be required.\n", size);
> > > +      else
> > > +	{
> > > +	  fprintf (dump_file, "Stack clash SVE prologue: ");
> > > +	  print_dec (poly_size, dump_file);
> > > +	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> > > +	}
> > > +    }
> > > +
> > > +  /* Handle the SVE non-constant case first.  */
> > > +  if (!poly_size.is_constant ())
> > 
> > ...use is_constant (&size) here, and put the dump messages for the
> > constant and non-constant cases in their respective constant and
> > non-constant blocks.  That way each use of "size" is directly protected
> > by an is_constant call, and there's no need to initialise size to 0.
> > 
> > The non-constant case doesn't have the new special handling of
> > final_adjustment_p, so I think the !is_constant block should assert
> > !final_adjustment_p.
> > 
> > Thanks,
> > Richard
> 
> -- 

--
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
 void aarch64_cpu_cpp_builtins (cpp_reader *);
 const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
 const char * aarch64_output_probe_stack_range (rtx, rtx);
+const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d4b13d48d852a70848fc7c51fd867e776efb5e55..d189198a377e698964d34ef03a4c1a92fe1be4f0 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
+static HOST_WIDE_INT aarch64_uimm12_nearest_value (HOST_WIDE_INT val);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -3973,6 +3974,89 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   return "";
 }
 
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
+   of GUARD_SIZE.  When a probe is emitted it is done at MIN_PROBE_OFFSET bytes
+   from the current BASE at an interval of MIN_PROBE_OFFSET.  By the end of this
+   function BASE = BASE - ADJUSTMENT.  */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
+				      rtx min_probe_threshold, rtx guard_size)
+{
+  /* This function is not allowed to use any instruction generation function
+     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
+     so instead emit the code you want using output_asm_insn.  */
+  gcc_assert (flag_stack_clash_protection);
+  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
+  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
+
+  /* The minimum required allocation before the residual requires probing.  */
+  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
+
+  /* Clamp the value down to the nearest value that can be used with a cmp.  */
+  residual_probe_guard = aarch64_uimm12_nearest_value (residual_probe_guard);
+  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
+
+  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
+  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
+
+  static int labelno = 0;
+  char loop_start_lab[32];
+  char loop_end_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
+
+  /* ADJUSTMENT == RESIDUAL_PROBE_GUARD.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to end if not enough adjustment to probe.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* Emit loop start label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
+  xops[0] = base;
+  xops[1] = gen_int_mode (residual_probe_guard, Pmode);
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at BASE.  */
+  xops[1] = const0_rtx;
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("subs\t%0, %0, %1", xops);
+
+  /* Branch to start if still more bytes to allocate.  */
+  fputs ("\tb.hs\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_start_lab);
+  fputc ('\n', asm_out_file);
+
+  /* ADJUSTMENT = ADJUSTMENT + RESIDUAL_PROBE_GUARD, we need to undo the last
+     subtract in order to know how much to drop the stack by.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("add\t%0, %0, %1", xops);
+
+  /* No probe leave.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+
+  /* BASE = BASE - ADJUSTMENT.  */
+  xops[0] = base;
+  xops[1] = adjustment;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+  return "";
+}
+
 /* Determine whether a frame chain needs to be generated.  */
 static bool
 aarch64_needs_frame_chain (void)
@@ -4835,21 +4919,76 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 	}
     }
 
-  HOST_WIDE_INT size;
   /* If SIZE is not large enough to require probing, just adjust the stack and
      exit.  */
-  if (!poly_size.is_constant (&size)
-      || known_lt (poly_size, min_probe_threshold)
+  if (known_lt (poly_size, min_probe_threshold)
       || !flag_stack_clash_protection)
     {
       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
       return;
     }
 
+  HOST_WIDE_INT size;
+  /* Handle the SVE non-constant case first.  */
+  if (!poly_size.is_constant (&size))
+    {
+
+     if (dump_file)
+      {
+	  fprintf (dump_file, "Stack clash SVE prologue: ");
+	  print_dec (poly_size, dump_file);
+	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+      }
+
+      /* First calculate the amount of bytes we're actually spilling.  */
+      aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),
+			  poly_size, temp1, temp2, false, true);
+
+      rtx_insn *insn = get_last_insn ();
+
+      if (frame_related_p)
+	{
+	  /* This is done to provide unwinding information for the stack
+	     adjustments we're about to do, however to prevent the optimizers
+	     from removing the R15 move and leaving the CFA note (which would be
+	     very wrong) we tie the old and new stack pointer together.
+	     The tie will expand to nothing but the optimizers will not touch
+	     the instruction.  */
+	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+	  /* We want the CFA independent of the stack pointer for the
+	     duration of the loop.  */
+	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      rtx probe_const = gen_int_mode (min_probe_threshold, DImode);
+      rtx guard_const = gen_int_mode (guard_size, DImode);
+
+      insn = emit_insn (gen_probe_sve_stack_clash (stack_pointer_rtx,
+						   stack_pointer_rtx, temp1,
+						   probe_const, guard_const));
+
+      /* Now reset the CFA register if needed.  */
+      if (frame_related_p)
+	{
+	  add_reg_note (insn, REG_CFA_DEF_CFA,
+			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+				      gen_int_mode (poly_size, Pmode)));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      return;
+    }
+
   if (dump_file)
-    fprintf (dump_file,
-	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
-	     ", probing will be required.\n", size);
+    {
+      fprintf (dump_file,
+	       "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
+	       " bytes, probing will be required.\n", size);
+    }
 
   /* Round size to the nearest multiple of guard_size, and calculate the
      residual as the difference between the original size and the rounded
@@ -5458,6 +5597,16 @@ aarch64_uimm12_shift (HOST_WIDE_INT val)
 	  );
 }
 
+/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
+   that can be created with a left shift of 0 or 12.  */
+static HOST_WIDE_INT
+aarch64_uimm12_nearest_value (HOST_WIDE_INT val)
+{
+  if ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val)
+    return val;
+
+  return val & (((HOST_WIDE_INT) 0xfff) << 12);
+}
 
 /* Return true if val is an immediate that can be loaded into a
    register by a MOVZ instruction.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b8da13f14fa9990e8fdc3c71ed407c8afc65a324..b422713019e5063babec1fb81d0dfc7b50c76038 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6464,6 +6464,25 @@
   [(set_attr "length" "32")]
 )
 
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop.  We can't change the control flow during prologue and epilogue
+;; code generation.  So we must emit a volatile unspec and expand it later on.
+
+(define_insn "probe_sve_stack_clash"
+  [(set (match_operand:DI 0 "register_operand" "=rk")
+	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
+			     (match_operand:DI 2 "register_operand" "r")
+			     (match_operand:DI 3 "const_int_operand" "n")
+			     (match_operand:DI 4 "aarch64_plus_immediate" "L")]
+			      UNSPECV_PROBE_STACK_RANGE))]
+  "TARGET_SVE"
+{
+  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+					       operands[3], operands[4]);
+}
+  [(set_attr "length" "32")]
+)
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include <stdint.h>
+
+#define N 20040
+
+void __attribute__ ((noinline, noclone))
+test (int8_t *restrict dest, int8_t *restrict src)
+{
+  for (int i = 0; i < N; i+=8)
+    {
+      dest[i] += src[i * 4];
+      dest[i+1] += src[i * 4 + 1];
+      dest[i+2] += src[i * 4 + 2];
+      dest[i+3] += src[i * 4 + 3];
+      dest[i+4] += src[i * 4 + 4];
+      dest[i+5] += src[i * 4 + 5];
+      dest[i+6] += src[i * 4 + 6];
+      dest[i+7] += src[i * 4 + 7];
+    }
+}
+/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+   need to make sure we can unwind correctly before the frame is set up.  So
+   check that we're emitting r15 with a copy of sp and setting the CFA there.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
new file mode 100644
index 0000000000000000000000000000000000000000..fd139bb09274509bd4faeef324520927a1fe3d3c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -ftree-vectorize" } */
+
+
+#include <stdint.h>
+
+#define N 20040
+
+void __attribute__ ((noinline, noclone))
+test (int8_t *restrict dest, int8_t *restrict src)
+{
+  for (int i = 0; i < N; i+=8)
+    {
+      dest[i] += src[i * 4];
+      dest[i+1] += src[i * 4 + 1];
+      dest[i+2] += src[i * 4 + 2];
+      dest[i+3] += src[i * 4 + 3];
+      dest[i+4] += src[i * 4 + 4];
+      dest[i+5] += src[i * 4 + 5];
+      dest[i+6] += src[i * 4 + 6];
+      dest[i+7] += src[i * 4 + 7];
+    }
+}
+
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
+/* { dg-final { scan-assembler-times {subs\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
+
+/* SVE spill, requires probing as vector size is unknown at compile time.  */
+
Tamar Christina Sept. 27, 2018, 10:02 a.m. UTC | #6
Hi All,

It turns out the testsuite didn't have a case in it which would cause a
significant enough spill to enter the loop.  After creating one I noticed a bug
in the loop and fixed it.

The loops are now

        .cfi_startproc
        mov     x15, sp
        cntb    x16, all, mul #11
        add     x16, x16, 304
        .cfi_def_cfa_register 15
.SVLPSPL0:
        cmp     x16, 61440
        b.lt    .SVLPEND0
        sub     sp, sp, 61440
        str     xzr, [sp, 0]
        subs    x16, x16, 61440
        b      .SVLPSPL0
.SVLPEND0:
        sub     sp, sp, x16
        .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22

for a 64KB guard size.
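
For reference, the 61440 in the sequence above is just the clamped probe
interval: as discussed earlier in the thread the residual probe threshold
starts out as guard size minus the 1KB caller-protection buffer (65536 - 1024
= 64512 for the 64KB guard), and since that doesn't fit the 12-bit shifted
immediate that cmp can take, aarch64_uimm12_nearest_value rounds it down to
0xf000 = 61440.  A stand-alone sketch of that rounding, for illustration only
(not part of the patch):

  /* Illustration only: mirrors the rounding done by the patch's
     aarch64_uimm12_nearest_value.  */
  #include <stdio.h>

  static long
  uimm12_nearest_value (long val)
  {
    if ((val & 0xfff) == val)
      return val;
    return val & (0xfffL << 12);
  }

  int
  main (void)
  {
    /* 64KB guard: 65536 - 1024 = 64512 is clamped down to 61440 (0xf000).  */
    printf ("%ld\n", uimm12_nearest_value (64 * 1024 - 1024));
    /* 4KB guard: 4096 - 1024 = 3072 already fits, so it is used as-is.  */
    printf ("%ld\n", uimm12_nearest_value (4 * 1024 - 1024));
    return 0;
  }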

I'm also adding a new testcase that causes a large enough spill to enter the loop.

Ok for trunk?

Thanks,
Tamar

gcc/
2018-09-27  Tamar Christina  <tamar.christina@arm.com>

	PR target/86486
	* config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New.
	* config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash,
	aarch64_uimm12_nearest_value): New.
	(aarch64_allocate_and_probe_stack_space): Add SVE specific section.
	* config/aarch64/aarch64.md (probe_sve_stack_clash): New.

gcc/testsuite/
2018-09-27  Tamar Christina  <tamar.christina@arm.com>

	PR target/86486
	* gcc.target/aarch64/stack-check-prologue-16.c: New test.
	* gcc.target/aarch64/stack-check-cfa-3.c: New test.
	* gcc.target/aarch64/sve/struct_vect_24.c: New test.
	* gcc.target/aarch64/sve/struct_vect_24_run.c: New test.

The 09/26/2018 09:20, Tamar Christina wrote:
> Hi Richard,
> 
> I've added a new loop that should also exit early as described in my previous email.
> 
> An example sequence is:
> 
>         .cfi_startproc
>         mov     x15, sp
>         cntb    x16, all, mul #11
>         add     x16, x16, 304
>         .cfi_def_cfa_register 15
>         cmp     x16, 61440
>         b.lt    .SVLPEND0
> .SVLPSPL0:
>         sub     sp, sp, 61440
>         str     xzr, [sp, 0]
>         subs    x16, x16, 61440
>         b.hs    .SVLPSPL0
>         add     x16, x16, 61440
> .SVLPEND0:
>         sub     sp, sp, x16
>         .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22
> 
> for a 64KB guard size, and for a 4KB guard size
> 
>         .cfi_startproc
>         mov     x15, sp
>         cntb    x16, all, mul #11
>         add     x16, x16, 304
>         .cfi_def_cfa_register 15
>         cmp     x16, 3072
>         b.lt    .SVLPEND0
> .SVLPSPL0:
>         sub     sp, sp, 3072
>         str     xzr, [sp, 0]
>         subs    x16, x16, 3072
>         b.hs    .SVLPSPL0
>         add     x16, x16, 3072
> .SVLPEND0:
>         sub     sp, sp, x16
>         .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22
> 
> 
> This has about the same semantics as alloca, except we prioritize the common case
> where no probe is required.  We also change the amount we adjust the stack and
> the probing interval to be the nearest value to `guard size - abi buffer` that
> fits in the 12-bit shifted immediate used by cmp.
> 
> While this means we may probe a bit more often than strictly required, in practice the
> number of SVE vectors you'd need to spill to enter the loop at all is significant,
> and even more so to iterate it more than once.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues in sve testsuite.
> Target was tested with stack clash on and off by default.
> 
> Ok for trunk?
> 
> Thanks,
> Tamar
> 
> gcc/
> 2018-09-26  Tamar Christina  <tamar.christina@arm.com>
> 
> 	PR target/86486
> 	* config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New.
> 	* config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash,
> 	aarch64_uimm12_nearest_value): New.
> 	(aarch64_allocate_and_probe_stack_space): Add SVE specific section.
> 	* config/aarch64/aarch64.md (probe_sve_stack_clash): New.
> 
> gcc/testsuite/
> 2018-09-26  Tamar Christina  <tamar.christina@arm.com>
> 
> 	PR target/86486
> 	* gcc.target/aarch64/stack-check-prologue-16.c: New test
> 	* gcc.target/aarch64/stack-check-cfa-3.c: New test.
> 
> 
> The 09/20/2018 10:23, Tamar Christina wrote:
> > Hi Richard,
> > 
> > The 09/11/2018 16:20, Richard Sandiford wrote:
> > > Tamar Christina <Tamar.Christina@arm.com> writes:
> > > >> > +
> > > >> > +  /* No probe leave.  */
> > > >> > +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
> > > >> > +  return "";
> > > >> 
> > > >> With the CFA stuff and constant load, I think this works out as:
> > > >> 
> > > >> ---------------------------------------------
> > > >> # 12 insns
> > > >> 	mov	r15, base
> > > >> 	mov	adjustment, N
> > > >> 1:
> > > >> 	cmp	adjustment, guard_size
> > > >> 	b.lt    2f
> > > >> 	sub	base, base, guard_size
> > > >> 	str	xzr, [base, limit]
> > > >> 	sub     adjustment, adjustment, guard_size
> > > >> 	b	1b
> > > >> 2:
> > > >> 	sub     base, base, adjustment
> > > >> 	cmp	adjustment, limit
> > > >> 	b.le	3f
> > > >> 	str	xzr, [base, limit]
> > > >> 3:
> > > >> ---------------------------------------------
> > > >> 
> > > >> What do you think about something like:
> > > >> 
> > > >> ---------------------------------------------
> > > >> # 10 insns
> > > >> 	mov	adjustment, N
> > > >> 	sub	r15, base, adjustment
> > > >> 	subs	adjustment, adjustment, min_probe_threshold
> > > >> 	b.lo	2f
> > > >> 1:
> > > >> 	add	base, x15, adjustment
> > > >> 	str	xzr, [base, 0]
> > > >> 	subs	adjustment, adjustment, 16
> > > >> 	and	adjustment, adjustment, ~(guard_size-1)
> > > >> 	b.hs	1b
> > > >> 2:
> > > >> 	mov	base, r15
> > > >> ---------------------------------------------
> > > >> 
> > > >> or (with different trade-offs):
> > > >> 
> > > >> ---------------------------------------------
> > > >> # 11 insns
> > > >> 	mov	adjustment, N
> > > >> 	sub	r15, base, adjustment
> > > >> 	subs	adjustment, adjustment, min_probe_threshold
> > > >> 	b.lo	2f
> > > >> 	# Might be 0, leading to a double probe
> > > >> 	and	r14, adjustment, guard_size-1
> > > >> 1:
> > > >> 	add	base, x15, adjustment
> > > >> 	str	xzr, [base, 0]
> > > >> 	subs	adjustment, adjustment, r14
> > > >> 	mov	r14, guard_size
> > > >> 	b.hs	1b
> > > >> 2:
> > > >> 	mov	base, r15
> > > >> ---------------------------------------------
> > > >> 
> > > >> or (longer, but with a simpler loop):
> > > >> 
> > > >> ---------------------------------------------
> > > >> # 12 insns
> > > >> 	mov	adjustment, N
> > > >> 	sub	r15, base, adjustment
> > > >> 	subs	adjustment, adjustment, min_probe_threshold
> > > >> 	b.lo	2f
> > > >> 	str	xzr, [base, -16]!
> > > >> 	sub	adjustment, adjustment, 32
> > > >> 	and	adjustment, adjustment, -(guard_size-1)
> > > >> 1:
> > > >> 	add	base, x15, adjustment
> > > >> 	str	xzr, [base, 0]
> > > >> 	subs	adjustment, adjustment, guard_size
> > > >> 	b.hs	1b
> > > >> 2:
> > > >> 	mov	base, r15
> > > >> ---------------------------------------------
> > > >> 
> > > >> with the CFA based on r15+offset?
> > > >> 
> > > >> These loops probe more often than necessary in some cases,
> > > >> but they only need a single branch in the common case that
> > > >> ADJUSTMENT <= MIN_PROBE_THRESHOLD.
> > > >
> > > > I haven't changed the loop yet because I'm a bit on the edge about
> > > > whether the implementation difficulties would outweigh the benefits.
> > > > We are planning on doing something smarter for SVE so optimizing these
> > > > loops only to replace them later may not be time well spent now.
> > > >
> > > > The problem is that to support both 4KB and 64KB pages, instructions such
> > > > as subs would require different immediates and shifts. Granted we technically
> > > > only support these two so I could hardcode the values, but that would mean
> > > > these functions are less general than the rest.
> > > 
> > > Because of the min_probe_threshold?  You could conservatively clamp it
> > > to the next lowest value that's in range, which we could do without
> > > having to hard-code specific values.  I think it would be better
> > > to do that even with the current code, since hard-coding 2048 with:
> > > 
> > >   /* Test if ADJUSTMENT < RESIDUAL_PROBE_GUARD, in principle any power of two
> > >      larger than 1024B would work, but we need one that works for all supported
> > >      guard-sizes.  What we actually want to check is guard-size - 1KB, but this
> > >      immediate won't fit inside a cmp without requiring a temporary, so instead we
> > >      just accept a smaller immediate that doesn't, we may probe a bit more often
> > >      but that doesn't matter much on the long run.  */
> > > 
> > > seems a bit of a hack.
> > > 
> > > > If you think it would be worthwhile, I'd be happy to use one of these
> > > > loops instead.
> > > 
> > > Yeah, I still think we should do this unless we can commit to doing
> > > the optimised version by a specific date, and that date is soon enough
> > > that the optimisation could reasonably be backported to GCC 8.
> > > 
> > 
> > While implementing these loops I found them a bit hard to follow, or rather a bit
> > difficult to prove correct: to someone looking at the code it may not be trivially clear
> > what they do.  I believe the main concern here is that the common case, i.e. spills
> > small enough not to require a probe, isn't short-circuited?  So how about
> > 
> > 	mov	r15, base
> > 	mov	adjustment, N
> > 	cmp	adjustment, nearest(min_probe_threshold)
> > 	b.lt	end
> > begin:
> > 	sub	base, base, nearest(min_probe_threshold)
> > 	str	xzr, [base, 0]
> > 	subs	adjustment, adjustment, nearest(min_probe_threshold)
> > 	b.hs	begin
> > end:
> > 	sub	base, base, adjustment
> > 
> > as an alternative?  It is 9 insns but also much simpler and follows the same semantics
> > as the other probing code.  This has the downside that we probe a bit more often than we need to,
> > but in the average case you'd likely not enter the loop more than once, so I'd expect the number
> > of probes in real-world usage to be the same as with the previous code, since you'd have to spill
> > a significant number of SVE vectors in order to enter the loop at all, let alone iterate.
> > 
> > This is still safe, as the only invariant we have to hold is not to drop the SP by more than a page at a time;
> > dropping by less than a page is fine.
> > 
> > nearest just rounds down to the nearest value that fits in a 12-bit shifted immediate.
> > 
> > 
> > Thanks,
> > Tamar
> > 
> > > > @@ -4830,11 +4929,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> > > >  	}
> > > >      }
> > > > 
> > > > -  HOST_WIDE_INT size;
> > > > +  HOST_WIDE_INT size = 0;
> > > >    /* If SIZE is not large enough to require probing, just adjust the stack and
> > > >       exit.  */
> > > > -  if (!poly_size.is_constant (&size)
> > > > -      || known_lt (poly_size, min_probe_threshold)
> > > > +  if ((poly_size.is_constant (&size)
> > > > +       && known_lt (poly_size, min_probe_threshold))
> > > >        || !flag_stack_clash_protection)
> > > >      {
> > > >        aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
> > > 
> > > I still think we should remove this poly_size.is_constant, and instead:
> > > 
> > > > @@ -4842,9 +4941,64 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> > > >      }
> > >  
> > > >    if (dump_file)
> > > > -    fprintf (dump_file,
> > > > -	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> > > > -	     ", probing will be required.\n", size);
> > > > +    {
> > > > +      if (poly_size.is_constant ())
> > > > +	fprintf (dump_file,
> > > > +		 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
> > > > +		 " bytes, probing will be required.\n", size);
> > > > +      else
> > > > +	{
> > > > +	  fprintf (dump_file, "Stack clash SVE prologue: ");
> > > > +	  print_dec (poly_size, dump_file);
> > > > +	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> > > > +	}
> > > > +    }
> > > > +
> > > > +  /* Handle the SVE non-constant case first.  */
> > > > +  if (!poly_size.is_constant ())
> > > 
> > > ...use is_constant (&size) here, and put the dump messages for the
> > > constant and non-constant cases in their respective constant and
> > > non-constant blocks.  That way each use of "size" is directly protected
> > > by an is_constant call, and there's no need to initialise size to 0.
> > > 
> > > The non-constant case doesn't have the new special handling of
> > > final_adjustment_p, so I think the !is_constant block should assert
> > > !final_adjustment_p.
> > > 
> > > Thanks,
> > > Richard
> > 
> > -- 
> 
> -- 

> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
>  void aarch64_cpu_cpp_builtins (cpp_reader *);
>  const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
>  const char * aarch64_output_probe_stack_range (rtx, rtx);
> +const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
>  void aarch64_err_no_fpadvsimd (machine_mode);
>  void aarch64_expand_epilogue (bool);
>  void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index d4b13d48d852a70848fc7c51fd867e776efb5e55..d189198a377e698964d34ef03a4c1a92fe1be4f0 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
>  static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
>  static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
>  					    aarch64_addr_query_type);
> +static HOST_WIDE_INT aarch64_uimm12_nearest_value (HOST_WIDE_INT val);
>  
>  /* Major revision number of the ARM Architecture implemented by the target.  */
>  unsigned aarch64_architecture_version;
> @@ -3973,6 +3974,89 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
>    return "";
>  }
>  
> +/* Emit the probe loop for doing stack clash probes and stack adjustments for
> +   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
> +   of GUARD_SIZE.  When a probe is emitted it is done at MIN_PROBE_OFFSET bytes
> +   from the current BASE at an interval of MIN_PROBE_OFFSET.  By the end of this
> +   function BASE = BASE - ADJUSTMENT.  */
> +
> +const char *
> +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
> +				      rtx min_probe_threshold, rtx guard_size)
> +{
> +  /* This function is not allowed to use any instruction generation function
> +     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
> +     so instead emit the code you want using output_asm_insn.  */
> +  gcc_assert (flag_stack_clash_protection);
> +  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
> +  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
> +
> +  /* The minimum required allocation before the residual requires probing.  */
> +  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
> +
> +  /* Clamp the value down to the nearest value that can be used with a cmp.  */
> +  residual_probe_guard = aarch64_uimm12_nearest_value (residual_probe_guard);
> +  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
> +
> +  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
> +  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
> +
> +  static int labelno = 0;
> +  char loop_start_lab[32];
> +  char loop_end_lab[32];
> +  rtx xops[2];
> +
> +  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
> +  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
> +
> +  /* ADJUSTMENT == RESIDUAL_PROBE_GUARD.  */
> +  xops[0] = adjustment;
> +  xops[1] = probe_offset_value_rtx;
> +  output_asm_insn ("cmp\t%0, %1", xops);
> +
> +  /* Branch to end if not enough adjustment to probe.  */
> +  fputs ("\tb.lt\t", asm_out_file);
> +  assemble_name_raw (asm_out_file, loop_end_lab);
> +  fputc ('\n', asm_out_file);
> +
> +  /* Emit loop start label.  */
> +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
> +
> +  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
> +  xops[0] = base;
> +  xops[1] = gen_int_mode (residual_probe_guard, Pmode);
> +  output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> +  /* Probe at BASE.  */
> +  xops[1] = const0_rtx;
> +  output_asm_insn ("str\txzr, [%0, %1]", xops);
> +
> +  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
> +  xops[0] = adjustment;
> +  xops[1] = probe_offset_value_rtx;
> +  output_asm_insn ("subs\t%0, %0, %1", xops);
> +
> +  /* Branch to start if still more bytes to allocate.  */
> +  fputs ("\tb.hs\t", asm_out_file);
> +  assemble_name_raw (asm_out_file, loop_start_lab);
> +  fputc ('\n', asm_out_file);
> +
> +  /* ADJUSTMENT = ADJUSTMENT + RESIDUAL_PROBE_GUARD, we need to undo the last
> +     subtract in order to know how much to drop the stack by.  */
> +  xops[0] = adjustment;
> +  xops[1] = probe_offset_value_rtx;
> +  output_asm_insn ("add\t%0, %0, %1", xops);
> +
> +  /* No probe leave.  */
> +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
> +
> +  /* BASE = BASE - ADJUSTMENT.  */
> +  xops[0] = base;
> +  xops[1] = adjustment;
> +  output_asm_insn ("sub\t%0, %0, %1", xops);
> +  return "";
> +}
> +
>  /* Determine whether a frame chain needs to be generated.  */
>  static bool
>  aarch64_needs_frame_chain (void)
> @@ -4835,21 +4919,76 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>  	}
>      }
>  
> -  HOST_WIDE_INT size;
>    /* If SIZE is not large enough to require probing, just adjust the stack and
>       exit.  */
> -  if (!poly_size.is_constant (&size)
> -      || known_lt (poly_size, min_probe_threshold)
> +  if (known_lt (poly_size, min_probe_threshold)
>        || !flag_stack_clash_protection)
>      {
>        aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
>        return;
>      }
>  
> +  HOST_WIDE_INT size;
> +  /* Handle the SVE non-constant case first.  */
> +  if (!poly_size.is_constant (&size))
> +    {
> +
> +     if (dump_file)
> +      {
> +	  fprintf (dump_file, "Stack clash SVE prologue: ");
> +	  print_dec (poly_size, dump_file);
> +	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> +      }
> +
> +      /* First calculate the amount of bytes we're actually spilling.  */
> +      aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),
> +			  poly_size, temp1, temp2, false, true);
> +
> +      rtx_insn *insn = get_last_insn ();
> +
> +      if (frame_related_p)
> +	{
> +	  /* This is done to provide unwinding information for the stack
> +	     adjustments we're about to do, however to prevent the optimizers
> +	     from removing the R15 move and leaving the CFA note (which would be
> +	     very wrong) we tie the old and new stack pointer together.
> +	     The tie will expand to nothing but the optimizers will not touch
> +	     the instruction.  */
> +	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
> +	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
> +	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
> +
> +	  /* We want the CFA independent of the stack pointer for the
> +	     duration of the loop.  */
> +	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
> +	  RTX_FRAME_RELATED_P (insn) = 1;
> +	}
> +
> +      rtx probe_const = gen_int_mode (min_probe_threshold, DImode);
> +      rtx guard_const = gen_int_mode (guard_size, DImode);
> +
> +      insn = emit_insn (gen_probe_sve_stack_clash (stack_pointer_rtx,
> +						   stack_pointer_rtx, temp1,
> +						   probe_const, guard_const));
> +
> +      /* Now reset the CFA register if needed.  */
> +      if (frame_related_p)
> +	{
> +	  add_reg_note (insn, REG_CFA_DEF_CFA,
> +			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
> +				      gen_int_mode (poly_size, Pmode)));
> +	  RTX_FRAME_RELATED_P (insn) = 1;
> +	}
> +
> +      return;
> +    }
> +
>    if (dump_file)
> -    fprintf (dump_file,
> -	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> -	     ", probing will be required.\n", size);
> +    {
> +      fprintf (dump_file,
> +	       "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
> +	       " bytes, probing will be required.\n", size);
> +    }
>  
>    /* Round size to the nearest multiple of guard_size, and calculate the
>       residual as the difference between the original size and the rounded
> @@ -5458,6 +5597,16 @@ aarch64_uimm12_shift (HOST_WIDE_INT val)
>  	  );
>  }
>  
> +/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
> +   that can be created with a left shift of 0 or 12.  */
> +static HOST_WIDE_INT
> +aarch64_uimm12_nearest_value (HOST_WIDE_INT val)
> +{
> +  if ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val)
> +    return val;
> +
> +  return val & (((HOST_WIDE_INT) 0xfff) << 12);
> +}
>  
>  /* Return true if val is an immediate that can be loaded into a
>     register by a MOVZ instruction.  */
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index b8da13f14fa9990e8fdc3c71ed407c8afc65a324..b422713019e5063babec1fb81d0dfc7b50c76038 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -6464,6 +6464,25 @@
>    [(set_attr "length" "32")]
>  )
>  
> +;; This instruction is used to generate the stack clash stack adjustment and
> +;; probing loop.  We can't change the control flow during prologue and epilogue
> +;; code generation.  So we must emit a volatile unspec and expand it later on.
> +
> +(define_insn "probe_sve_stack_clash"
> +  [(set (match_operand:DI 0 "register_operand" "=rk")
> +	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> +			     (match_operand:DI 2 "register_operand" "r")
> +			     (match_operand:DI 3 "const_int_operand" "n")
> +			     (match_operand:DI 4 "aarch64_plus_immediate" "L")]
> +			      UNSPECV_PROBE_STACK_RANGE))]
> +  "TARGET_SVE"
> +{
> +  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
> +					       operands[3], operands[4]);
> +}
> +  [(set_attr "length" "32")]
> +)
> +
>  ;; Named pattern for expanding thread pointer reference.
>  (define_expand "get_thread_pointerdi"
>    [(match_operand:DI 0 "register_operand" "=r")]
> diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
> +/* { dg-require-effective-target supports_stack_clash_protection } */
> +
> +#include <stdint.h>
> +
> +#define N 20040
> +
> +void __attribute__ ((noinline, noclone))
> +test (int8_t *restrict dest, int8_t *restrict src)
> +{
> +  for (int i = 0; i < N; i+=8)
> +    {
> +      dest[i] += src[i * 4];
> +      dest[i+1] += src[i * 4 + 1];
> +      dest[i+2] += src[i * 4 + 2];
> +      dest[i+3] += src[i * 4 + 3];
> +      dest[i+4] += src[i * 4 + 4];
> +      dest[i+5] += src[i * 4 + 5];
> +      dest[i+6] += src[i * 4 + 6];
> +      dest[i+7] += src[i * 4 + 7];
> +    }
> +}
> +/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
> +/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
> +/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
> +
> +/* Checks that the CFA notes are correct for every sp adjustment, but we also
> +   need to make sure we can unwind correctly before the frame is set up.  So
> +   check that we're emitting r15 with a copy of sp and setting the CFA there.  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..fd139bb09274509bd4faeef324520927a1fe3d3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> @@ -0,0 +1,32 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target supports_stack_clash_protection } */
> +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -ftree-vectorize" } */
> +
> +
> +#include <stdint.h>
> +
> +#define N 20040
> +
> +void __attribute__ ((noinline, noclone))
> +test (int8_t *restrict dest, int8_t *restrict src)
> +{
> +  for (int i = 0; i < N; i+=8)
> +    {
> +      dest[i] += src[i * 4];
> +      dest[i+1] += src[i * 4 + 1];
> +      dest[i+2] += src[i * 4 + 2];
> +      dest[i+3] += src[i * 4 + 3];
> +      dest[i+4] += src[i * 4 + 4];
> +      dest[i+5] += src[i * 4 + 5];
> +      dest[i+6] += src[i * 4 + 6];
> +      dest[i+7] += src[i * 4 + 7];
> +    }
> +}
> +
> +
> +/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
> +/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
> +/* { dg-final { scan-assembler-times {subs\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
> +
> +/* SVE spill, requires probing as vector size is unknown at compile time.  */
> +
> 


--
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
 void aarch64_cpu_cpp_builtins (cpp_reader *);
 const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
 const char * aarch64_output_probe_stack_range (rtx, rtx);
+const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d4b13d48d852a70848fc7c51fd867e776efb5e55..245fd6832ec0afe27c42a242c901a2e13024f935 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
+static HOST_WIDE_INT aarch64_uimm12_nearest_value (HOST_WIDE_INT val);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -3973,6 +3974,83 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   return "";
 }
 
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
+   of GUARD_SIZE.  When a probe is emitted it is done at MIN_PROBE_OFFSET bytes
+   from the current BASE at an interval of MIN_PROBE_OFFSET.  By the end of this
+   function BASE = BASE - ADJUSTMENT.  */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
+				      rtx min_probe_threshold, rtx guard_size)
+{
+  /* This function is not allowed to use any instruction generation function
+     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
+     so instead emit the code you want using output_asm_insn.  */
+  gcc_assert (flag_stack_clash_protection);
+  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
+  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
+
+  /* The minimum required allocation before the residual requires probing.  */
+  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
+
+  /* Clamp the value down to the nearest value that can be used with a cmp.  */
+  residual_probe_guard = aarch64_uimm12_nearest_value (residual_probe_guard);
+  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
+
+  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
+  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
+
+  static int labelno = 0;
+  char loop_start_lab[32];
+  char loop_end_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
+
+  /* Emit loop start label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+  /* ADJUSTMENT == RESIDUAL_PROBE_GUARD.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to end if not enough adjustment to probe.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
+  xops[0] = base;
+  xops[1] = gen_int_mode (residual_probe_guard, Pmode);
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at BASE.  */
+  xops[1] = const0_rtx;
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Branch to start if still more bytes to allocate.  */
+  fputs ("\tb\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_start_lab);
+  fputc ('\n', asm_out_file);
+
+  /* No probe leave.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+
+  /* BASE = BASE - ADJUSTMENT.  */
+  xops[0] = base;
+  xops[1] = adjustment;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+  return "";
+}
+
 /* Determine whether a frame chain needs to be generated.  */
 static bool
 aarch64_needs_frame_chain (void)
@@ -4835,21 +4913,76 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 	}
     }
 
-  HOST_WIDE_INT size;
   /* If SIZE is not large enough to require probing, just adjust the stack and
      exit.  */
-  if (!poly_size.is_constant (&size)
-      || known_lt (poly_size, min_probe_threshold)
+  if (known_lt (poly_size, min_probe_threshold)
       || !flag_stack_clash_protection)
     {
       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
       return;
     }
 
+  HOST_WIDE_INT size;
+  /* Handle the SVE non-constant case first.  */
+  if (!poly_size.is_constant (&size))
+    {
+
+     if (dump_file)
+      {
+	  fprintf (dump_file, "Stack clash SVE prologue: ");
+	  print_dec (poly_size, dump_file);
+	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+      }
+
+      /* First calculate the amount of bytes we're actually spilling.  */
+      aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),
+			  poly_size, temp1, temp2, false, true);
+
+      rtx_insn *insn = get_last_insn ();
+
+      if (frame_related_p)
+	{
+	  /* This is done to provide unwinding information for the stack
+	     adjustments we're about to do, however to prevent the optimizers
+	     from removing the R15 move and leaving the CFA note (which would be
+	     very wrong) we tie the old and new stack pointer together.
+	     The tie will expand to nothing but the optimizers will not touch
+	     the instruction.  */
+	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+	  /* We want the CFA independent of the stack pointer for the
+	     duration of the loop.  */
+	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      rtx probe_const = gen_int_mode (min_probe_threshold, DImode);
+      rtx guard_const = gen_int_mode (guard_size, DImode);
+
+      insn = emit_insn (gen_probe_sve_stack_clash (stack_pointer_rtx,
+						   stack_pointer_rtx, temp1,
+						   probe_const, guard_const));
+
+      /* Now reset the CFA register if needed.  */
+      if (frame_related_p)
+	{
+	  add_reg_note (insn, REG_CFA_DEF_CFA,
+			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+				      gen_int_mode (poly_size, Pmode)));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      return;
+    }
+
   if (dump_file)
-    fprintf (dump_file,
-	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
-	     ", probing will be required.\n", size);
+    {
+      fprintf (dump_file,
+	       "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
+	       " bytes, probing will be required.\n", size);
+    }
 
   /* Round size to the nearest multiple of guard_size, and calculate the
      residual as the difference between the original size and the rounded
@@ -5458,6 +5591,16 @@ aarch64_uimm12_shift (HOST_WIDE_INT val)
 	  );
 }
 
+/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
+   that can be created with a left shift of 0 or 12.  */
+static HOST_WIDE_INT
+aarch64_uimm12_nearest_value (HOST_WIDE_INT val)
+{
+  if ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val)
+    return val;
+
+  return val & (((HOST_WIDE_INT) 0xfff) << 12);
+}
 
 /* Return true if val is an immediate that can be loaded into a
    register by a MOVZ instruction.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b8da13f14fa9990e8fdc3c71ed407c8afc65a324..100bd2cc603656d2b8ba97f905c5eff16c59793b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6464,6 +6464,25 @@
   [(set_attr "length" "32")]
 )
 
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop.  We can't change the control flow during prologue and epilogue
+;; code generation.  So we must emit a volatile unspec and expand it later on.
+
+(define_insn "probe_sve_stack_clash"
+  [(set (match_operand:DI 0 "register_operand" "=rk")
+	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
+			     (match_operand:DI 2 "register_operand" "r")
+			     (match_operand:DI 3 "const_int_operand" "n")
+			     (match_operand:DI 4 "aarch64_plus_immediate" "L")]
+			      UNSPECV_PROBE_STACK_RANGE))]
+  "TARGET_SVE"
+{
+  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+					       operands[3], operands[4]);
+}
+  [(set_attr "length" "28")]
+)
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include <stdint.h>
+
+#define N 20040
+
+void __attribute__ ((noinline, noclone))
+test (int8_t *restrict dest, int8_t *restrict src)
+{
+  for (int i = 0; i < N; i+=8)
+    {
+      dest[i] += src[i * 4];
+      dest[i+1] += src[i * 4 + 1];
+      dest[i+2] += src[i * 4 + 2];
+      dest[i+3] += src[i * 4 + 3];
+      dest[i+4] += src[i * 4 + 4];
+      dest[i+5] += src[i * 4 + 5];
+      dest[i+6] += src[i * 4 + 6];
+      dest[i+7] += src[i * 4 + 7];
+    }
+}
+/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+   need to make sure we can unwind correctly before the frame is set up.  So
+   check that we're emitting r15 with a copy of sp and setting the CFA there.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
new file mode 100644
index 0000000000000000000000000000000000000000..fd0e987597eba406fa7351433fe7157743aeca42
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -ftree-vectorize" } */
+
+
+#include <stdint.h>
+
+#define N 20040
+
+void __attribute__ ((noinline, noclone))
+test (int8_t *restrict dest, int8_t *restrict src)
+{
+  for (int i = 0; i < N; i+=8)
+    {
+      dest[i] += src[i * 4];
+      dest[i+1] += src[i * 4 + 1];
+      dest[i+2] += src[i * 4 + 2];
+      dest[i+3] += src[i * 4 + 3];
+      dest[i+4] += src[i * 4 + 4];
+      dest[i+5] += src[i * 4 + 5];
+      dest[i+6] += src[i * 4 + 6];
+      dest[i+7] += src[i * 4 + 7];
+    }
+}
+
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
+
+/* SVE spill, requires probing as vector size is unknown at compile time.  */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
new file mode 100644
index 0000000000000000000000000000000000000000..4199e391881f1e260535c3fbdb6c41e428d54fbf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O2 -ftree-vectorize -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include <stdint.h>
+
+#define N 2000
+#define S 2 * 64 * 1024
+
+#define TEST_LOOP(NAME, TYPE)					\
+  void __attribute__ ((noinline, noclone))			\
+  NAME (TYPE *restrict dest, TYPE *restrict src)		\
+  {								\
+    volatile char foo[S];					\
+    foo[S-1]=1;							\
+    for (int i = 0; i < N; i=i+4)				\
+      {								\
+	dest[i] += src[i * 4];					\
+	dest[i+1] += src[(i+1) * 4];				\
+	dest[i+2] += src[(i+2) * 4];				\
+	dest[i+3] += src[(i+3) * 4];				\
+      }								\
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i8, int8_t) \
+  TEST_LOOP (NAME##_i16, uint16_t) \
+  TEST_LOOP (NAME##_f32, float) \
+  TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop for stack clash probing.  */
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 2 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 2 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
new file mode 100644
index 0000000000000000000000000000000000000000..cbf1bdfe1e3f23d342042c5b1bb5994714a65cec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O2 -ftree-vectorize -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include "struct_vect_22.c"
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE)				\
+  {							\
+    TYPE out[N];					\
+    TYPE in[N * 4];					\
+    for (int i = 0; i < N; ++i)				\
+      {							\
+	out[i] = i * 7 / 2;				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+    for (int i = 0; i < N * 4; ++i)			\
+      {							\
+	in[i] = i * 9 / 2;				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+    NAME (out, in);					\
+    for (int i = 0; i < N; ++i)				\
+      {							\
+	TYPE expected = i * 7 / 2 + in[i * 4];		\
+	if (out[i] != expected)				\
+	  __builtin_abort ();				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST (test);
+  return 0;
+}
Richard Sandiford Sept. 27, 2018, 11:11 a.m. UTC | #7
> It turns out the testsuite didn't have a case in it which would cause a
> significant enough spill to enter the loop.  After creating one I noticed a bug
> in the loop and fixed it.
>
> The loops are now
>
>         .cfi_startproc
>         mov     x15, sp
>         cntb    x16, all, mul #11
>         add     x16, x16, 304
>         .cfi_def_cfa_register 15
> .SVLPSPL0:
>         cmp     x16, 61440
>         b.lt    .SVLPEND0
>         sub     sp, sp, 61440
>         str     xzr, [sp, 0]
>         subs    x16, x16, 61440

(The code uses sub rather than subs here)

>         b      .SVLPSPL0
> .SVLPEND0:
>         sub     sp, sp, x16
>         .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22
>
> for a 64KB guard size.

That's OK with me.  Like you say, the main goal was to make the common
case of no probe as fast as possible.

> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
>  void aarch64_cpu_cpp_builtins (cpp_reader *);
>  const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
>  const char * aarch64_output_probe_stack_range (rtx, rtx);
> +const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
>  void aarch64_err_no_fpadvsimd (machine_mode);
>  void aarch64_expand_epilogue (bool);
>  void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index d4b13d48d852a70848fc7c51fd867e776efb5e55..245fd6832ec0afe27c42a242c901a2e13024f935 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
>  static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
>  static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
>  					    aarch64_addr_query_type);
> +static HOST_WIDE_INT aarch64_uimm12_nearest_value (HOST_WIDE_INT val);
 
>  /* Major revision number of the ARM Architecture implemented by the target.  */
>  unsigned aarch64_architecture_version;
> @@ -3973,6 +3974,83 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
>    return "";
>  }
 
> +/* Emit the probe loop for doing stack clash probes and stack adjustments for
> +   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
> +   of GUARD_SIZE.  When a probe is emitted it is done at MIN_PROBE_OFFSET bytes
> +   from the current BASE at an interval of MIN_PROBE_OFFSET.  By the end of this

MIN_PROBE_THRESHOLD in both cases (or rename the var to min_probe_offset,
either's fine).  Probably "at most MIN_PROBE..." given the round down.

> +   function BASE = BASE - ADJUSTMENT.  */
> +
> +const char *
> +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
> +				      rtx min_probe_threshold, rtx guard_size)
> +{
> +  /* This function is not allowed to use any instruction generation function
> +     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
> +     so instead emit the code you want using output_asm_insn.  */
> +  gcc_assert (flag_stack_clash_protection);
> +  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
> +  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
> +
> +  /* The minimum required allocation before the residual requires probing.  */
> +  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
> +
> +  /* Clamp the value down to the nearest value that can be used with a cmp.  */
> +  residual_probe_guard = aarch64_uimm12_nearest_value (residual_probe_guard);

Maybe aarch64_clamp_to_uimm12_shift or aarch64_round_down_to_uimm12_shift
would be better; nearest implies that "0x1ff0" should become "0x2000"
rather than "0x1000".

> +  /* ADJUSTMENT == RESIDUAL_PROBE_GUARD.  */
> +  xops[0] = adjustment;
> +  xops[1] = probe_offset_value_rtx;
> +  output_asm_insn ("cmp\t%0, %1", xops);

< rather than == (or just "Compare ...")

> +  /* Branch to end if not enough adjustment to probe.  */
> +  fputs ("\tb.lt\t", asm_out_file);
> +  assemble_name_raw (asm_out_file, loop_end_lab);
> +  fputc ('\n', asm_out_file);
> +
> +  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
> +  xops[0] = base;
> +  xops[1] = gen_int_mode (residual_probe_guard, Pmode);

probe_offset_value_rtx

> +  HOST_WIDE_INT size;
> +  /* Handle the SVE non-constant case first.  */
> +  if (!poly_size.is_constant (&size))
> +    {
> +

Excess blank line.

> +     if (dump_file)
> +      {
> +	  fprintf (dump_file, "Stack clash SVE prologue: ");
> +	  print_dec (poly_size, dump_file);
> +	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> +      }
> +
> +      /* First calculate the amount of bytes we're actually spilling.  */
> +      aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),

Might as well use Pmode for the CONST0_RTX too, for consistency with the
first argument to aarch64_add_offset.

> +			  poly_size, temp1, temp2, false, true);
> +
> +      rtx_insn *insn = get_last_insn ();
> +
> +      if (frame_related_p)
> +	{
> +	  /* This is done to provide unwinding information for the stack
> +	     adjustments we're about to do, however to prevent the optimizers
> +	     from removing the R15 move and leaving the CFA note (which would be
> +	     very wrong) we tie the old and new stack pointer together.
> +	     The tie will expand to nothing but the optimizers will not touch
> +	     the instruction.  */
> +	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
> +	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
> +	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
> +
> +	  /* We want the CFA independent of the stack pointer for the
> +	     duration of the loop.  */
> +	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
> +	  RTX_FRAME_RELATED_P (insn) = 1;
> +	}
> +
> +      rtx probe_const = gen_int_mode (min_probe_threshold, DImode);
> +      rtx guard_const = gen_int_mode (guard_size, DImode);

Pmode in both cases.  (No practical difference, but it makes everything
agree on the mode.)

>    if (dump_file)
> -    fprintf (dump_file,
> -	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> -	     ", probing will be required.\n", size);
> +    {
> +      fprintf (dump_file,
> +	       "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
> +	       " bytes, probing will be required.\n", size);
> +    }

Not needed (previous formatting without { ... } was right).

> +/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
> +   that can be created with a left shift of 0 or 12.  */
> +static HOST_WIDE_INT
> +aarch64_uimm12_nearest_value (HOST_WIDE_INT val)
> +{
> +  if ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val)
> +    return val;
> +
> +  return val & (((HOST_WIDE_INT) 0xfff) << 12);
> +}

Are these HOST_WIDE_INT casts needed?

Probably worth asserting that (val & 0xffffff) == val, or handle
the case in which it isn't by returning 0xfff000.
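
To see why the extra check is worth having, worked through for a
hypothetical guard value wider than 24 bits:

	0x1000000 & (0xfff << 12) = 0

i.e. the helper would silently return a probe interval of 0; the assert
(or a fall-back to 0xfff000) rules that out.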
 
> +;; This instruction is used to generate the stack clash stack adjustment and
> +;; probing loop.  We can't change the control flow during prologue and epilogue
> +;; code generation.  So we must emit a volatile unspec and expand it later on.
> +
> +(define_insn "probe_sve_stack_clash"
> +  [(set (match_operand:DI 0 "register_operand" "=rk")
> +	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> +			     (match_operand:DI 2 "register_operand" "r")
> +			     (match_operand:DI 3 "const_int_operand" "n")
> +			     (match_operand:DI 4 "aarch64_plus_immediate" "L")]
> +			      UNSPECV_PROBE_STACK_RANGE))]
> +  "TARGET_SVE"
> +{
> +  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
> +					       operands[3], operands[4]);
> +}
> +  [(set_attr "length" "28")]
> +)

Think this will break for ILP32.  We probably need :P instead of :DI and

  "@probe_sve_stack_clash_<mode>"

  gen_probe_sve_stack_clash (Pmode, ...)

> diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
> +/* { dg-require-effective-target supports_stack_clash_protection } */
> +
> +#include <stdint.h>
> +
> +#define N 20040
> +
> +void __attribute__ ((noinline, noclone))
> +test (int8_t *restrict dest, int8_t *restrict src)
> +{
> +  for (int i = 0; i < N; i+=8)
> +    {
> +      dest[i] += src[i * 4];
> +      dest[i+1] += src[i * 4 + 1];
> +      dest[i+2] += src[i * 4 + 2];
> +      dest[i+3] += src[i * 4 + 3];
> +      dest[i+4] += src[i * 4 + 4];
> +      dest[i+5] += src[i * 4 + 5];
> +      dest[i+6] += src[i * 4 + 6];
> +      dest[i+7] += src[i * 4 + 7];
> +    }
> +}

I think we should use something that has a higher guarantee of
spilling, since we shouldn't really need to spill for the above.
See g++.target/aarch64/sve/catch_1.C for one possibility.

> +/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
> +/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
> +/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
> +
> +/* Checks that the CFA notes are correct for every sp adjustment, but we also
> +   need to make sure we can unwind correctly before the frame is set up.  So
> +   check that we're emitting r15 with a copy of sp and setting the CFA there.  */

Think this comment belongs above the dg-finals -- seems odd to have it at
the end of the file.

I'll take your word that the cfi_escape is correct, but it looks like
it matches the full calculation, including the VG multiple.  It would
be better to leave out that part of the encoding, since the number of
SVE vectors spilled could vary quite easily.
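
For reference, a byte-by-byte reading of that escape (decoded by hand from
the DWARF opcode numbers, so treat it as a sketch rather than anything
authoritative):

	0xf            DW_CFA_def_cfa_expression
	0xc            expression is 12 bytes long
	0x8f 0         DW_OP_breg31 (sp), offset 0
	0x92 0x2e 0    DW_OP_bregx, register 46 (the SVE VG register), offset 0
	0x8 0x58       DW_OP_const1u 88
	0x1e           DW_OP_mul
	0x23 0xb0 0x2  DW_OP_plus_uconst 304 (0xb0 0x2 is LEB128 for 304)
	0x22           DW_OP_plus

i.e. CFA = sp + VG * 88 + 304, and the "88" multiplier is exactly what
changes when a different number of SVE registers gets spilled.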

> diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..fd0e987597eba406fa7351433fe7157743aeca42
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> @@ -0,0 +1,32 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target supports_stack_clash_protection } */
> +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -ftree-vectorize" } */
> +
> +
> +#include <stdint.h>

Excess blank line before include.

> +#define N 20040
> +
> +void __attribute__ ((noinline, noclone))
> +test (int8_t *restrict dest, int8_t *restrict src)
> +{
> +  for (int i = 0; i < N; i+=8)
> +    {
> +      dest[i] += src[i * 4];
> +      dest[i+1] += src[i * 4 + 1];
> +      dest[i+2] += src[i * 4 + 2];
> +      dest[i+3] += src[i * 4 + 3];
> +      dest[i+4] += src[i * 4 + 4];
> +      dest[i+5] += src[i * 4 + 5];
> +      dest[i+6] += src[i * 4 + 6];
> +      dest[i+7] += src[i * 4 + 7];
> +    }
> +}
> +
> +
> +/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
> +/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
> +/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
> +
> +/* SVE spill, requires probing as vector size is unknown at compile time.  */

Same comments as above about forcing spilling and putting the comment
before the dg-finals.

Thanks,
Richard
Tamar Christina Sept. 28, 2018, 4:40 p.m. UTC | #8
Hi Richard,

Here's the updated patch with all the feedback processed.

I have also run the compile tests through with -mabi=ilp32 as well.

Ok for trunk?

Thanks,
Tamar

The 09/27/2018 12:11, Richard Sandiford wrote:
> > It turns out the testsuite didn't have a case in it which would cause a
> > significant enough spill to enter the loop.  After creating one I noticed a bug
> > in the loop and fixed it.
> >
> > The loops are now
> >
> >         .cfi_startproc
> >         mov     x15, sp
> >         cntb    x16, all, mul #11
> >         add     x16, x16, 304
> >         .cfi_def_cfa_register 15
> > .SVLPSPL0:
> >         cmp     x16, 61440
> >         b.lt    .SVLPEND0
> >         sub     sp, sp, 61440
> >         str     xzr, [sp, 0]
> >         subs    x16, x16, 61440
> 
> (The code uses sub rather than subs here)
> 
> >         b      .SVLPSPL0
> > .SVLPEND0:
> >         sub     sp, sp, x16
> >         .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22
> >
> > for a 64KB guard size.
> 
> That's OK with me.  Like you say, the main goal was to make the common
> case of no probe as fast as possible.
> 
> > diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> > index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
> > --- a/gcc/config/aarch64/aarch64-protos.h
> > +++ b/gcc/config/aarch64/aarch64-protos.h
> > @@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
> >  void aarch64_cpu_cpp_builtins (cpp_reader *);
> >  const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
> >  const char * aarch64_output_probe_stack_range (rtx, rtx);
> > +const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
> >  void aarch64_err_no_fpadvsimd (machine_mode);
> >  void aarch64_expand_epilogue (bool);
> >  void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index d4b13d48d852a70848fc7c51fd867e776efb5e55..245fd6832ec0afe27c42a242c901a2e13024f935 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
> >  static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
> >  static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
> >  					    aarch64_addr_query_type);
> > +static HOST_WIDE_INT aarch64_uimm12_nearest_value (HOST_WIDE_INT val);
>  
> >  /* Major revision number of the ARM Architecture implemented by the target.  */
> >  unsigned aarch64_architecture_version;
> > @@ -3973,6 +3974,83 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
> >    return "";
> >  }
>  
> > +/* Emit the probe loop for doing stack clash probes and stack adjustments for
> > +   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
> > +   of GUARD_SIZE.  When a probe is emitted it is done at MIN_PROBE_OFFSET bytes
> > +   from the current BASE at an interval of MIN_PROBE_OFFSET.  By the end of this
> 
> MIN_PROBE_THRESHOLD in both cases (or rename the var to min_probe_offset,
> either's fine).  Probably "at most MIN_PROBE..." given the round down.
> 
> > +   function BASE = BASE - ADJUSTMENT.  */
> > +
> > +const char *
> > +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
> > +				      rtx min_probe_threshold, rtx guard_size)
> > +{
> > +  /* This function is not allowed to use any instruction generation function
> > +     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
> > +     so instead emit the code you want using output_asm_insn.  */
> > +  gcc_assert (flag_stack_clash_protection);
> > +  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
> > +  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
> > +
> > +  /* The minimum required allocation before the residual requires probing.  */
> > +  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
> > +
> > +  /* Clamp the value down to the nearest value that can be used with a cmp.  */
> > +  residual_probe_guard = aarch64_uimm12_nearest_value (residual_probe_guard);
> 
> Maybe aarch64_clamp_to_uimm12_shift or aarch64_round_down_to_uimm12_shift
> would be better; nearest implies that "0x1ff0" should become "0x2000"
> rather than "0x1000".
> 
> > +  /* ADJUSTMENT == RESIDUAL_PROBE_GUARD.  */
> > +  xops[0] = adjustment;
> > +  xops[1] = probe_offset_value_rtx;
> > +  output_asm_insn ("cmp\t%0, %1", xops);
> 
> < rather than == (or just "Compare ...")
> 
> > +  /* Branch to end if not enough adjustment to probe.  */
> > +  fputs ("\tb.lt\t", asm_out_file);
> > +  assemble_name_raw (asm_out_file, loop_end_lab);
> > +  fputc ('\n', asm_out_file);
> > +
> > +  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
> > +  xops[0] = base;
> > +  xops[1] = gen_int_mode (residual_probe_guard, Pmode);
> 
> probe_offset_value_rtx
> 
> > +  HOST_WIDE_INT size;
> > +  /* Handle the SVE non-constant case first.  */
> > +  if (!poly_size.is_constant (&size))
> > +    {
> > +
> 
> Excess blank line.
> 
> > +     if (dump_file)
> > +      {
> > +	  fprintf (dump_file, "Stack clash SVE prologue: ");
> > +	  print_dec (poly_size, dump_file);
> > +	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> > +      }
> > +
> > +      /* First calculate the amount of bytes we're actually spilling.  */
> > +      aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),
> 
> Might as well use Pmode for the CONST0_RTX too, for consistency with the
> first argument to aarch64_add_offset.
> 
> > +			  poly_size, temp1, temp2, false, true);
> > +
> > +      rtx_insn *insn = get_last_insn ();
> > +
> > +      if (frame_related_p)
> > +	{
> > +	  /* This is done to provide unwinding information for the stack
> > +	     adjustments we're about to do, however to prevent the optimizers
> > +	     from removing the R15 move and leaving the CFA note (which would be
> > +	     very wrong) we tie the old and new stack pointer together.
> > +	     The tie will expand to nothing but the optimizers will not touch
> > +	     the instruction.  */
> > +	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
> > +	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
> > +	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
> > +
> > +	  /* We want the CFA independent of the stack pointer for the
> > +	     duration of the loop.  */
> > +	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
> > +	  RTX_FRAME_RELATED_P (insn) = 1;
> > +	}
> > +
> > +      rtx probe_const = gen_int_mode (min_probe_threshold, DImode);
> > +      rtx guard_const = gen_int_mode (guard_size, DImode);
> 
> Pmode in both cases.  (No practical difference, but it makes everything
> agree on the mode.)
> 
> >    if (dump_file)
> > -    fprintf (dump_file,
> > -	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> > -	     ", probing will be required.\n", size);
> > +    {
> > +      fprintf (dump_file,
> > +	       "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
> > +	       " bytes, probing will be required.\n", size);
> > +    }
> 
> Not needed (previous formatting without { ... } was right).
> 
> > +/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
> > +   that can be created with a left shift of 0 or 12.  */
> > +static HOST_WIDE_INT
> > +aarch64_uimm12_nearest_value (HOST_WIDE_INT val)
> > +{
> > +  if ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val)
> > +    return val;
> > +
> > +  return val & (((HOST_WIDE_INT) 0xfff) << 12);
> > +}
> 
> Are these HOST_WIDE_INT casts needed?
> 
> Probably worth asserting that (val & 0xffffff) == val, or handle
> the case in which it isn't by returning 0xfff000.
>  
> > +;; This instruction is used to generate the stack clash stack adjustment and
> > +;; probing loop.  We can't change the control flow during prologue and epilogue
> > +;; code generation.  So we must emit a volatile unspec and expand it later on.
> > +
> > +(define_insn "probe_sve_stack_clash"
> > +  [(set (match_operand:DI 0 "register_operand" "=rk")
> > +	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> > +			     (match_operand:DI 2 "register_operand" "r")
> > +			     (match_operand:DI 3 "const_int_operand" "n")
> > +			     (match_operand:DI 4 "aarch64_plus_immediate" "L")]
> > +			      UNSPECV_PROBE_STACK_RANGE))]
> > +  "TARGET_SVE"
> > +{
> > +  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
> > +					       operands[3], operands[4]);
> > +}
> > +  [(set_attr "length" "28")]
> > +)
> 
> Think this will break for ILP32.  We probably need :P instead of :DI and
> 
>   "@probe_sve_stack_clash_<mode>"
> 
>   gen_probe_sve_stack_clash (Pmode, ...)
> 
> > diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
> > +/* { dg-require-effective-target supports_stack_clash_protection } */
> > +
> > +#include <stdint.h>
> > +
> > +#define N 20040
> > +
> > +void __attribute__ ((noinline, noclone))
> > +test (int8_t *restrict dest, int8_t *restrict src)
> > +{
> > +  for (int i = 0; i < N; i+=8)
> > +    {
> > +      dest[i] += src[i * 4];
> > +      dest[i+1] += src[i * 4 + 1];
> > +      dest[i+2] += src[i * 4 + 2];
> > +      dest[i+3] += src[i * 4 + 3];
> > +      dest[i+4] += src[i * 4 + 4];
> > +      dest[i+5] += src[i * 4 + 5];
> > +      dest[i+6] += src[i * 4 + 6];
> > +      dest[i+7] += src[i * 4 + 7];
> > +    }
> > +}
> 
> I think we should use something that has a higher guarantee of
> spilling, since we shouldn't really need to spill for the above.
> See g++.target/aarch64/sve/catch_1.C for one possibility.
> 
> > +/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
> > +/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
> > +/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
> > +
> > +/* Checks that the CFA notes are correct for every sp adjustment, but we also
> > +   need to make sure we can unwind correctly before the frame is set up.  So
> > +   check that we're emitting r15 with a copy of sp and setting the CFA there.  */
> 
> Think this comment belongs above the dg-finals -- seems odd to have it at
> the end of the file.
> 
> I'll take your word that the cfi_escape is correct, but it looks like
> it matches the full calculation, including the VG multiple.  It would
> be better to leave out that part of the encoding, since the number of
> SVE vectors spilled could vary quite easily.
> 
> > diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..fd0e987597eba406fa7351433fe7157743aeca42
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> > @@ -0,0 +1,32 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target supports_stack_clash_protection } */
> > +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -ftree-vectorize" } */
> > +
> > +
> > +#include <stdint.h>
> 
> Excess blank line before include.
> 
> > +#define N 20040
> > +
> > +void __attribute__ ((noinline, noclone))
> > +test (int8_t *restrict dest, int8_t *restrict src)
> > +{
> > +  for (int i = 0; i < N; i+=8)
> > +    {
> > +      dest[i] += src[i * 4];
> > +      dest[i+1] += src[i * 4 + 1];
> > +      dest[i+2] += src[i * 4 + 2];
> > +      dest[i+3] += src[i * 4 + 3];
> > +      dest[i+4] += src[i * 4 + 4];
> > +      dest[i+5] += src[i * 4 + 5];
> > +      dest[i+6] += src[i * 4 + 6];
> > +      dest[i+7] += src[i * 4 + 7];
> > +    }
> > +}
> > +
> > +
> > +/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
> > +/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
> > +/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
> > +
> > +/* SVE spill, requires probing as vector size is unknown at compile time.  */
> 
> Same comments as above about forcing spilling and putting the comment
> before the dg-finals.
> 
> Thanks,
> Richard

--
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
 void aarch64_cpu_cpp_builtins (cpp_reader *);
 const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
 const char * aarch64_output_probe_stack_range (rtx, rtx);
+const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d4b13d48d852a70848fc7c51fd867e776efb5e55..8c901e9d8c00d392a2df62d9b63ce5b865b48e50 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
+static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -3973,6 +3974,84 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   return "";
 }
 
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
+   of GUARD_SIZE.  When a probe is emitted it is done at most
+   MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
+   at most MIN_PROBE_THRESHOLD.  By the end of this function
+   BASE = BASE - ADJUSTMENT.  */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
+				      rtx min_probe_threshold, rtx guard_size)
+{
+  /* This function is not allowed to use any instruction generation function
+     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
+     so instead emit the code you want using output_asm_insn.  */
+  gcc_assert (flag_stack_clash_protection);
+  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
+  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
+
+  /* The minimum required allocation before the residual requires probing.  */
+  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
+
+  /* Clamp the value down to the nearest value that can be used with a cmp.  */
+  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
+  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
+
+  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
+  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
+
+  static int labelno = 0;
+  char loop_start_lab[32];
+  char loop_end_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
+
+  /* Emit loop start label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to end if not enough adjustment to probe.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
+  xops[0] = base;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at BASE.  */
+  xops[1] = const0_rtx;
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Branch to start if still more bytes to allocate.  */
+  fputs ("\tb\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_start_lab);
+  fputc ('\n', asm_out_file);
+
+  /* No probe leave.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+
+  /* BASE = BASE - ADJUSTMENT.  */
+  xops[0] = base;
+  xops[1] = adjustment;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+  return "";
+}
+
 /* Determine whether a frame chain needs to be generated.  */
 static bool
 aarch64_needs_frame_chain (void)
@@ -4835,21 +4914,73 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 	}
     }
 
-  HOST_WIDE_INT size;
   /* If SIZE is not large enough to require probing, just adjust the stack and
      exit.  */
-  if (!poly_size.is_constant (&size)
-      || known_lt (poly_size, min_probe_threshold)
+  if (known_lt (poly_size, min_probe_threshold)
       || !flag_stack_clash_protection)
     {
       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
       return;
     }
 
+  HOST_WIDE_INT size;
+  /* Handle the SVE non-constant case first.  */
+  if (!poly_size.is_constant (&size))
+    {
+     if (dump_file)
+      {
+	fprintf (dump_file, "Stack clash SVE prologue: ");
+	print_dec (poly_size, dump_file);
+	fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+      }
+
+      /* First calculate the amount of bytes we're actually spilling.  */
+      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
+			  poly_size, temp1, temp2, false, true);
+
+      rtx_insn *insn = get_last_insn ();
+
+      if (frame_related_p)
+	{
+	  /* This is done to provide unwinding information for the stack
+	     adjustments we're about to do, however to prevent the optimizers
+	     from removing the R15 move and leaving the CFA note (which would be
+	     very wrong) we tie the old and new stack pointer together.
+	     The tie will expand to nothing but the optimizers will not touch
+	     the instruction.  */
+	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+	  /* We want the CFA independent of the stack pointer for the
+	     duration of the loop.  */
+	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
+      rtx guard_const = gen_int_mode (guard_size, Pmode);
+
+      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
+						   stack_pointer_rtx, temp1,
+						   probe_const, guard_const));
+
+      /* Now reset the CFA register if needed.  */
+      if (frame_related_p)
+	{
+	  add_reg_note (insn, REG_CFA_DEF_CFA,
+			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+				      gen_int_mode (poly_size, Pmode)));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      return;
+    }
+
   if (dump_file)
     fprintf (dump_file,
-	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
-	     ", probing will be required.\n", size);
+	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
+	     " bytes, probing will be required.\n", size);
 
   /* Round size to the nearest multiple of guard_size, and calculate the
      residual as the difference between the original size and the rounded
@@ -5458,6 +5589,20 @@ aarch64_uimm12_shift (HOST_WIDE_INT val)
 	  );
 }
 
+/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
+   that can be created with a left shift of 0 or 12.  */
+static HOST_WIDE_INT
+aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
+{
+  /* Check to see if the value fits in 24 bits, as that is the maximum we can
+     handle correctly.  */
+  gcc_assert ((val & 0xffffff) == val);
+
+  if (((val & 0xfff) << 0) == val)
+    return val;
+
+  return val & (0xfff << 12);
+}
 
 /* Return true if val is an immediate that can be loaded into a
    register by a MOVZ instruction.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b8da13f14fa9990e8fdc3c71ed407c8afc65a324..22eb026f0631958536ab0c33c4d234d0156dc120 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6464,6 +6464,25 @@
   [(set_attr "length" "32")]
 )
 
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop.  We can't change the control flow during prologue and epilogue
+;; code generation.  So we must emit a volatile unspec and expand it later on.
+
+(define_insn "@probe_sve_stack_clash_<mode>"
+  [(set (match_operand:P 0 "register_operand" "=rk")
+	(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
+			    (match_operand:P 2 "register_operand" "r")
+			    (match_operand:P 3 "const_int_operand" "n")
+			    (match_operand:P 4 "aarch64_plus_immediate" "L")]
+			     UNSPECV_PROBE_STACK_RANGE))]
+  "TARGET_SVE"
+{
+  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+					       operands[3], operands[4]);
+}
+  [(set_attr "length" "28")]
+)
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..41579f26ba9156f3e500f090d132ba9cf28364d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include "stack-check-prologue-16.c"
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+   need to make sure we can unwind correctly before the frame is set up.  So
+   check that we're emitting r15 with a copy of sp and setting the CFA there.  */
+
+/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,.*} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
new file mode 100644
index 0000000000000000000000000000000000000000..d92ef47a57ddda556c563e36ad8aaf4acdeabc57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+/* Invoke X (P##n) for n in [0, 7].  */
+#define REPEAT8(X, P) \
+  X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7)
+
+/* Invoke X (n) for all octal n in [0, 39].  */
+#define REPEAT40(X) \
+  REPEAT8 (X, 0) REPEAT8 (X, 1)  REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+/* Expect vector work to be done, with spilling of vector registers.  */
+void
+f2 (int x[40][100], int *y)
+{
+  /* Try to force some spilling.  */
+#define DECLARE(N) int y##N = y[N];
+  REPEAT40 (DECLARE);
+#pragma omp simd
+  for (int i = 0; i < 100; ++i)
+    {
+#define INC(N) x[N][i] += y##N;
+      REPEAT40 (INC);
+    }
+}
+
+/* SVE spill, requires probing as vector size is unknown at compile time.  */
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
new file mode 100644
index 0000000000000000000000000000000000000000..68a9d5e3d2e74cb331dff0ef3bcd612f8bb0d0f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include <stdint.h>
+
+#define N 50
+#define S 2 * 64 * 1024
+
+/* Invoke X (P##n) for n in [0, 9].  */
+#define REPEAT8(X, P) \
+  X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) \
+  X (P##8)  X (P##9)
+
+/* Invoke X (n) for all n in [0, 49].  */
+#define REPEAT50(X) \
+  REPEAT8 (X, ) REPEAT8 (X, 1)  REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+  /* Try to force some spilling.  */
+#define DECLARE(N) int src##N = src[N * 4];
+#define INC(N) dest[i] += src##N;
+
+#define TEST_LOOP(NAME, TYPE)				\
+  void __attribute__ ((noinline, noclone, simd))	\
+  NAME (TYPE *restrict dest, TYPE *restrict src)	\
+  {							\
+    REPEAT50 (DECLARE);					\
+    volatile char foo[S];				\
+    foo[S-1]=1;						\
+    for (int i = 0; i < N; i++)				\
+      {							\
+	REPEAT50 (INC);					\
+      }							\
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i32, int32_t) \
+  TEST_LOOP (NAME##_i64, int64_t) \
+  TEST_LOOP (NAME##_f32, float) \
+  TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop for stack clash probing.  */
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 4 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 4 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
new file mode 100644
index 0000000000000000000000000000000000000000..e764476faccded380102dfbc759be7cf6be88345
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include "struct_vect_24.c"
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE)				\
+  {							\
+    TYPE out[N];					\
+    TYPE in[N * 4];					\
+    for (int i = 0; i < N; ++i)				\
+      {							\
+	out[i] = i * 7 / 2;				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+    for (int i = 0; i < N * 4; ++i)			\
+      {							\
+	in[i] = i * 9 / 2;				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+    NAME (out, in);					\
+    for (int i = 0; i < N; ++i)				\
+      {							\
+	TYPE expected = i * 7 / 2;			\
+	if (out[i] != out[0] + expected)		\
+	  __builtin_abort ();				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+  }
+
+int __attribute__ ((optimize (0)))
+main (void)
+{
+  TEST (test);
+  return 0;
+}
Richard Sandiford Sept. 28, 2018, 5:17 p.m. UTC | #9
Tamar Christina <Tamar.Christina@arm.com> writes:
> Hi Richard,
>
> Here's the updated patch with all the feedback processed.
>
> I have also run the compile tests through with -mabi=ilp32 as well.
>
> Ok for trunk?

OK.  Thanks for your patience through all the reviews.

Richard
Tamar Christina Oct. 9, 2018, 6:37 a.m. UTC | #10
Hi All,

I'm looking for permission to backport this patch to the GCC-8 branch
to fix PR86486.

OK for backport?

Thanks,
Tamar

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Friday, September 28, 2018 18:18
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; James Greenhalgh
> <James.Greenhalgh@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH][GCC][AArch64] Add support for SVE stack clash probing
> [patch (2/7)]
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> > Hi Richard,
> >
> > Here's the updated patch with all the feedback processed.
> >
> > I have also run the compile tests through with -mabi=ilp32 as well.
> >
> > Ok for trunk?
> 
> OK.  Thanks for your patience through all the reviews.
> 
> Richard
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -453,6 +453,7 @@  void aarch64_asm_output_labelref (FILE *, const char *);
 void aarch64_cpu_cpp_builtins (cpp_reader *);
 const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
 const char * aarch64_output_probe_stack_range (rtx, rtx);
+const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 06451f38b11822ea77323438fe8c7e373eb9e614..e7efde79bb111e820f4df44a276f6f73070ecd17 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3970,6 +3970,90 @@  aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   return "";
 }
 
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+   SVE.  This emits probes from BASE to BASE + ADJUSTMENT based on a guard size
+   of GUARD_SIZE and emits a probe when at least LIMIT bytes are allocated.  By
+   the end of this function BASE = BASE + ADJUSTMENT.  */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment, rtx limit,
+				      rtx guard_size)
+{
+  /* This function is not allowed to use any instruction generation function
+     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
+     so instead emit the code you want using output_asm_insn.  */
+  gcc_assert (flag_stack_clash_protection);
+  gcc_assert (CONST_INT_P (limit) && CONST_INT_P (guard_size));
+  gcc_assert (aarch64_uimm12_shift (INTVAL (limit)));
+  gcc_assert (aarch64_uimm12_shift (INTVAL (guard_size)));
+
+  static int labelno = 0;
+  char loop_start_lab[32];
+  char loop_res_lab[32];
+  char loop_end_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSRL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_res_lab, "BRRCHK", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "BERCHK", labelno++);
+
+  /* Emit loop start label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+  /* Test if ADJUSTMENT < GUARD_SIZE.  */
+  xops[0] = adjustment;
+  xops[1] = guard_size;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to residual loop if it is.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_res_lab);
+  fputc ('\n', asm_out_file);
+
+  /* BASE = BASE - GUARD_SIZE.  */
+  xops[0] = base;
+  xops[1] = guard_size;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at BASE + LIMIT.  */
+  xops[1] = limit;
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* ADJUSTMENT = ADJUSTMENT - GUARD_SIZE.  */
+  xops[0] = adjustment;
+  xops[1] = guard_size;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Branch to loop start.  */
+  fputs ("\tb\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_start_lab);
+  fputc ('\n', asm_out_file);
+
+  /* Emit residual check label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_res_lab);
+
+  /* BASE = BASE - ADJUSTMENT.  */
+  xops[0] = base;
+  xops[1] = adjustment;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Test if BASE < LIMIT.  */
+  xops[1] = limit;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to end.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* Probe at BASE + LIMIT.  */
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* No probe leave.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+  return "";
+}
+
 /* Determine whether a frame chain needs to be generated.  */
 static bool
 aarch64_needs_frame_chain (void)
@@ -4826,22 +4910,30 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 	}
     }
 
-  HOST_WIDE_INT size;
+  /* GCC's initialization analysis is broken so initialize size.  */
+  HOST_WIDE_INT size = 0;
   /* If SIZE is not large enough to require probing, just adjust the stack and
      exit.  */
-  if (!poly_size.is_constant (&size)
-      || known_lt (poly_size, min_probe_threshold)
+  if ((poly_size.is_constant (&size)
+       && known_lt (poly_size, min_probe_threshold))
       || !flag_stack_clash_protection)
     {
       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
       return;
     }
 
-  if (dump_file)
+  if (dump_file && poly_size.is_constant ())
     fprintf (dump_file,
 	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
 	     ", probing will be required.\n", size);
 
+  if (dump_file && !poly_size.is_constant ())
+    {
+      fprintf (dump_file, "Stack clash SVE prologue: ");
+      dump_dec (MSG_NOTE, poly_size);
+      fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+    }
+
   /* Round size to the nearest multiple of guard_size, and calculate the
      residual as the difference between the original size and the rounded
      size.  */
@@ -4850,7 +4942,8 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 
   /* We can handle a small number of allocations/probes inline.  Otherwise
      punt to a loop.  */
-  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
+  if (poly_size.is_constant ()
+      && rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
     {
       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
 	{
@@ -4861,7 +4954,7 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 	}
       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
     }
-  else
+  else if (poly_size.is_constant ())
     {
       /* Compute the ending address.  */
       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
@@ -4910,6 +5003,48 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
       emit_insn (gen_blockage ());
       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
     }
+  else
+    {
+      /* First calculate the amount of bytes we're actually spilling.  */
+      aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),
+			  poly_size, temp1, temp2, false, true);
+
+      rtx_insn *insn = get_last_insn ();
+
+      if (frame_related_p)
+	{
+	  /* This is done to provide unwinding information for the stack
+	     adjustments we're about to do, however to prevent the optimizers
+	     from removing the R15 move and leaving the CFA note (which would be
+	     very wrong) we tie the old and new stack pointer together.
+	     The tie will expand to nothing but the optimizers will not touch
+	     the instruction.  */
+	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+	  /* We want the CFA independent of the stack pointer for the
+	     duration of the loop.  */
+	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      rtx probe_const = gen_rtx_CONST_INT (Pmode, STACK_CLASH_CALLER_GUARD);
+      rtx guard_const = gen_rtx_CONST_INT (Pmode, guard_size);
+
+      insn = emit_insn (gen_probe_sve_stack_clash (stack_pointer_rtx,
+						   stack_pointer_rtx, temp1,
+						   probe_const, guard_const));
+
+      /* Now reset the CFA register if needed.  */
+      if (frame_related_p)
+	{
+	  add_reg_note (insn, REG_CFA_DEF_CFA,
+			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+				      gen_int_mode (poly_size, Pmode)));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+    }
 
   /* Handle any residuals.  Residuals of at least min_probe_threshold have to
      be probed.  This maintains the requirement that each page is probed at
@@ -4922,7 +5057,7 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
      probed by the saving of FP/LR either by this function or any callees.  If
      we don't have any callees then we won't have more stack adjustments and so
      are still safe.  */
-  if (residual)
+  if (poly_size.is_constant () && residual)
     {
       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
       if (residual >= min_probe_threshold)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b8da13f14fa9990e8fdc3c71ed407c8afc65a324..4901f55478eb0ea26a36f15d51aaf9779a8efaf4 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6464,6 +6464,25 @@ 
   [(set_attr "length" "32")]
 )
 
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop.  We can't change the control flow during prologue and epilogue
+;; code generation.  So we must emit a volatile unspec and expand it later on.
+
+(define_insn "probe_sve_stack_clash"
+  [(set (match_operand:DI 0 "register_operand" "=rk")
+	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
+			     (match_operand:DI 2 "register_operand" "r")
+			     (match_operand:DI 3 "aarch64_plus_immediate" "L")
+			     (match_operand:DI 4 "aarch64_plus_immediate" "L")]
+			      UNSPECV_PROBE_STACK_RANGE))]
+  "TARGET_SVE"
+{
+  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+					       operands[3], operands[4]);
+}
+  [(set_attr "length" "40")]
+)
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
@@ -0,0 +1,30 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include <stdint.h>
+
+#define N 20040
+
+void __attribute__ ((noinline, noclone))
+test (int8_t *restrict dest, int8_t *restrict src)
+{
+  for (int i = 0; i < N; i+=8)
+    {
+      dest[i] += src[i * 4];
+      dest[i+1] += src[i * 4 + 1];
+      dest[i+2] += src[i * 4 + 2];
+      dest[i+3] += src[i * 4 + 3];
+      dest[i+4] += src[i * 4 + 4];
+      dest[i+5] += src[i * 4 + 5];
+      dest[i+6] += src[i * 4 + 6];
+      dest[i+7] += src[i * 4 + 7];
+    }
+}
+/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+   need to make sure we can unwind correctly before the frame is set up.  So
+   check that we're emitting r15 with a copy of sp and setting the CFA there.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-12.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-12.c
new file mode 100644
index 0000000000000000000000000000000000000000..d66a2c19f4aec7b1121651da315442303cc1ed54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-12.c
@@ -0,0 +1,32 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -ftree-vectorize" } */
+
+
+#include <stdint.h>
+
+#define N 20040
+
+void __attribute__ ((noinline, noclone))
+test (int8_t *restrict dest, int8_t *restrict src)
+{
+  for (int i = 0; i < N; i+=8)
+    {
+      dest[i] += src[i * 4];
+      dest[i+1] += src[i * 4 + 1];
+      dest[i+2] += src[i * 4 + 2];
+      dest[i+3] += src[i * 4 + 3];
+      dest[i+4] += src[i * 4 + 4];
+      dest[i+5] += src[i * 4 + 5];
+      dest[i+6] += src[i * 4 + 6];
+      dest[i+7] += src[i * 4 + 7];
+    }
+}
+
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 1024\]} 2 } } */
+/* { dg-final { scan-assembler-times {cmp\s+sp, 1024} 1 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 65536} 1 } } */
+
+/* SVE spill, requires probing as vector size is unknown at compile time.
+   Dynamic loop expected.  */