diff mbox series

[7/7] : Enable clobber high for tls descs on Aarch64

Message ID 7DBAE2ED-5C4C-4C97-851C-6118D22C3B32@arm.com
State New
Headers show
Series [1/7] : SVE: Add CLOBBER_HIGH expression | expand

Commit Message

Alan Hayward Nov. 16, 2017, 12:35 p.m. UTC
This final patch adds the clobber high expressions to tls_desc for aarch64.
It also adds three tests.

In addition I also tested by taking the gcc torture test suite and making
all global variables __thread. I then amended the suite to compile with -fpic,
save the .s file, and run at only one given O level.
I ran this before and after the patch and compared the resulting .s files,
ensuring that there were no ASM changes.
I discarded the 10% of tests that failed to compile (due to the code in
the test now being invalid C).
I did this for O0,O2,O3 on both x86 and aarch64 and observed no difference
between ASM files before and after the patch.

Alan.

2017-11-16  Alan Hayward  <alan.hayward@arm.com>

gcc/
	* config/aarch64/aarch64.md: Add clobber highs to tls_desc.

gcc/testsuite/	
	* gcc.target/aarch64/sve_tls_preserve_1.c: New test.
	* gcc.target/aarch64/sve_tls_preserve_2.c: New test.
	* gcc.target/aarch64/sve_tls_preserve_3.c: New test.

Comments

Andrew Pinski Nov. 16, 2017, 7:32 p.m. UTC | #1
On Thu, Nov 16, 2017 at 4:35 AM, Alan Hayward <Alan.Hayward@arm.com> wrote:
> This final patch adds the clobber high expressions to tls_desc for aarch64.
> It also adds three tests.
>
> In addition I also tested by taking the gcc torture test suite and making
> all global variables __thread. Then emended the suite to compile with -fpic,
> save the .s file and only for one given O level.
> I ran this before and after the patch and compared the resulting .s files,
> ensuring that there were no ASM changes.
> I discarded the 10% of tests that failed to compile (due to the code in
> the test now being invalid C).
> I did this for O0,O2,O3 on both x86 and aarch64 and observed no difference
> between ASM files before and after the patch.

Isn't the ABI defined as non-clobbering the lower 64bits for normal
function calls?  Or is the TLS function "special" in that it
saves/restores the 128bit registers; is that documented anywhere?  The
main reason why I am asking is because glibc is not the only libc out
there and someone could have a slightly different ABI here.

Thanks,
Andrew Pinski

>
> Alan.
>
> 2017-11-16  Alan Hayward  <alan.hayward@arm.com>
>
> gcc/
>         * config/aarch64/aarch64.md: Add clobber highs to tls_desc.
>
> gcc/testsuite/
>         * gcc.target/aarch64/sve_tls_preserve_1.c: New test.
>         * gcc.target/aarch64/sve_tls_preserve_2.c: New test.
>         * gcc.target/aarch64/sve_tls_preserve_3.c: New test.
>
>
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 6a15ff0b61d775cf30189b8503cfa45987701228..1f332b254fe0e37954efbe92982f214100d7046f 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -57,7 +57,36 @@
>      (LR_REGNUM         30)
>      (SP_REGNUM         31)
>      (V0_REGNUM         32)
> +    (V1_REGNUM         33)
> +    (V2_REGNUM         34)
> +    (V3_REGNUM         35)
> +    (V4_REGNUM         36)
> +    (V5_REGNUM         37)
> +    (V6_REGNUM         38)
> +    (V7_REGNUM         39)
> +    (V8_REGNUM         40)
> +    (V9_REGNUM         41)
> +    (V10_REGNUM                42)
> +    (V11_REGNUM                43)
> +    (V12_REGNUM                44)
> +    (V13_REGNUM                45)
> +    (V14_REGNUM                46)
>      (V15_REGNUM                47)
> +    (V16_REGNUM                48)
> +    (V17_REGNUM                49)
> +    (V18_REGNUM                50)
> +    (V19_REGNUM                51)
> +    (V20_REGNUM                52)
> +    (V21_REGNUM                53)
> +    (V22_REGNUM                54)
> +    (V23_REGNUM                55)
> +    (V24_REGNUM                56)
> +    (V25_REGNUM                57)
> +    (V26_REGNUM                58)
> +    (V27_REGNUM                59)
> +    (V28_REGNUM                60)
> +    (V29_REGNUM                61)
> +    (V30_REGNUM                62)
>      (V31_REGNUM                63)
>      (LAST_SAVED_REGNUM 63)
>      (SFP_REGNUM                64)
> @@ -5745,6 +5774,38 @@
>                    UNSPEC_TLSDESC))
>     (clobber (reg:DI LR_REGNUM))
>     (clobber (reg:CC CC_REGNUM))
> +   (clobber_high (reg:TI V0_REGNUM))
> +   (clobber_high (reg:TI V1_REGNUM))
> +   (clobber_high (reg:TI V2_REGNUM))
> +   (clobber_high (reg:TI V3_REGNUM))
> +   (clobber_high (reg:TI V4_REGNUM))
> +   (clobber_high (reg:TI V5_REGNUM))
> +   (clobber_high (reg:TI V6_REGNUM))
> +   (clobber_high (reg:TI V7_REGNUM))
> +   (clobber_high (reg:TI V8_REGNUM))
> +   (clobber_high (reg:TI V9_REGNUM))
> +   (clobber_high (reg:TI V10_REGNUM))
> +   (clobber_high (reg:TI V11_REGNUM))
> +   (clobber_high (reg:TI V12_REGNUM))
> +   (clobber_high (reg:TI V13_REGNUM))
> +   (clobber_high (reg:TI V14_REGNUM))
> +   (clobber_high (reg:TI V15_REGNUM))
> +   (clobber_high (reg:TI V16_REGNUM))
> +   (clobber_high (reg:TI V17_REGNUM))
> +   (clobber_high (reg:TI V18_REGNUM))
> +   (clobber_high (reg:TI V19_REGNUM))
> +   (clobber_high (reg:TI V20_REGNUM))
> +   (clobber_high (reg:TI V21_REGNUM))
> +   (clobber_high (reg:TI V22_REGNUM))
> +   (clobber_high (reg:TI V23_REGNUM))
> +   (clobber_high (reg:TI V24_REGNUM))
> +   (clobber_high (reg:TI V25_REGNUM))
> +   (clobber_high (reg:TI V26_REGNUM))
> +   (clobber_high (reg:TI V27_REGNUM))
> +   (clobber_high (reg:TI V28_REGNUM))
> +   (clobber_high (reg:TI V29_REGNUM))
> +   (clobber_high (reg:TI V30_REGNUM))
> +   (clobber_high (reg:TI V31_REGNUM))
>     (clobber (match_scratch:DI 1 "=r"))]
>    "TARGET_TLS_DESC"
>    "adrp\\tx0, %A0\;ldr\\t%<w>1, [x0, #%L0]\;add\\t<w>0, <w>0, %L0\;.tlsdesccall\\t%0\;blr\\t%1"
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_1.c b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..5bad829568130181ef1ab386545bd3ee164c322e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_1.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fpic -march=armv8-a+sve" } */
> +
> +/* Clobber highs do not need to be spilled around tls usage.  */
> +
> +typedef float v4si __attribute__ ((vector_size (16)));
> +
> +__thread v4si tx;
> +
> +v4si foo (v4si a, v4si b, v4si c)
> +{
> +  v4si y;
> +
> +  y = a + tx + b + c;
> +
> +  return y + 7;
> +}
> +
> +/* { dg-final { scan-assembler-not {\tstr\t} } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_2.c b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..69e8829287b8418c28f8c227391c4f8d2186ea63
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fpic -march=armv8-a+sve -msve-vector-bits=256 -fno-schedule-insns" } */
> +
> +/* Clobber highs must be spilled around tls usage.  */
> +
> +typedef float v8si __attribute__ ((vector_size (32)));
> +
> +__thread v8si tx;
> +
> +v8si foo (v8si a, v8si b, v8si c)
> +{
> +  v8si y;
> +
> +  /* There is nothing stopping the compiler from making the tls call before
> +     loading the input variables off the stack.  However, there appears to
> +     be no way in C of enforcing this.  Thankfully the compiler doesn't
> +     do this reordering.  */
> +
> +  y = a + tx + b + c;
> +
> +  return y + 7;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tstr\tz[0-9]+,} 3 } } */
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_3.c b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b6aa59a3c7393d7e9ca419167d13b624a9ffafcc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_3.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fpic -march=armv8-a+sve -msve-vector-bits=512 -fno-schedule-insns" } */
> +
> +/* Clobber highs must be spilled around tls usage.  */
> +
> +typedef float v16si __attribute__ ((vector_size (64)));
> +
> +__thread v16si tx;
> +
> +v16si foo (v16si a, v16si b, v16si c)
> +{
> +  v16si y;
> +
> +  /* There is nothing stopping the compiler from making the tls call before
> +     loading the input variables off the stack.  However, there appears to
> +     be no way in C of enforcing this.  Thankfully the compiler doesn't
> +     do this reordering.  */
> +
> +  y = a + tx + b + c;
> +
> +  return y + 7;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tstr\tz[0-9]+,} 3 } } */
>
<div id="DAB4FAD8-2DD7-40BB-A1B8-4E2AA1F9FDF2"><br />
<table style="border-top: 1px solid #D3D4DE;">
	<tr>
        <td style="width: 55px; padding-top: 13px;"><a
href="http://www.avg.com/email-signature?utm_medium=email&utm_source=link&utm_campaign=sig-email&utm_content=webmail"
target="_blank"><img
src="https://ipmcdn.avast.com/images/icons/icon-envelope-tick-green-avg-v1.png"
alt="" width="46" height="29" style="width: 46px; height: 29px;"
/></a></td>
		<td style="width: 470px; padding-top: 12px; color: #41424e;
font-size: 13px; font-family: Arial, Helvetica, sans-serif;
line-height: 18px;">Virus-free. <a
href="http://www.avg.com/email-signature?utm_medium=email&utm_source=link&utm_campaign=sig-email&utm_content=webmail"
target="_blank" style="color: #4453ea;">www.avg.com</a>
		</td>
	</tr>
</table><a href="#DAB4FAD8-2DD7-40BB-A1B8-4E2AA1F9FDF2" width="1"
height="1"></a></div>
Alan Hayward Nov. 17, 2017, 8:21 a.m. UTC | #2
> On 16 Nov 2017, at 19:32, Andrew Pinski <pinskia@gmail.com> wrote:

> 

> On Thu, Nov 16, 2017 at 4:35 AM, Alan Hayward <Alan.Hayward@arm.com> wrote:

>> This final patch adds the clobber high expressions to tls_desc for aarch64.

>> It also adds three tests.

>> 

>> In addition I also tested by taking the gcc torture test suite and making

>> all global variables __thread. Then emended the suite to compile with -fpic,

>> save the .s file and only for one given O level.

>> I ran this before and after the patch and compared the resulting .s files,

>> ensuring that there were no ASM changes.

>> I discarded the 10% of tests that failed to compile (due to the code in

>> the test now being invalid C).

>> I did this for O0,O2,O3 on both x86 and aarch64 and observed no difference

>> between ASM files before and after the patch.

> 

> Isn't the ABI defined as non-clobbering the lower 64bits for normal

> function calls?  Or is the TLS function "special" in that it

> saves/restores the 128bit registers; is that documented anywhere?  The

> main reason why I am asking is because glibc is not the only libc out

> there and someone could have a slightly different ABI here.

> 


In NEON all the SIMD registers are preserved around TLS calls - all
128bits of each register. That’s standard ABI behaviour for NEON.

SVE doesn’t have any explicit preserving of its SIMD registers.

However, the NEON and SVE registers share the same silicon - the lower
128bits of each SVE register are the same as the corresponding NEON
register. The side effect of this is that the lower 128bits of the SVE registers
are getting backed up.

Neither glibc nor any other library needs updating to support this.
But compilers do need to be aware of this.


Alan.
Andrew Pinski Nov. 17, 2017, 8:42 a.m. UTC | #3
On Fri, Nov 17, 2017 at 12:21 AM, Alan Hayward <Alan.Hayward@arm.com> wrote:
>
>> On 16 Nov 2017, at 19:32, Andrew Pinski <pinskia@gmail.com> wrote:
>>
>> On Thu, Nov 16, 2017 at 4:35 AM, Alan Hayward <Alan.Hayward@arm.com> wrote:
>>> This final patch adds the clobber high expressions to tls_desc for aarch64.
>>> It also adds three tests.
>>>
>>> In addition I also tested by taking the gcc torture test suite and making
>>> all global variables __thread. Then emended the suite to compile with -fpic,
>>> save the .s file and only for one given O level.
>>> I ran this before and after the patch and compared the resulting .s files,
>>> ensuring that there were no ASM changes.
>>> I discarded the 10% of tests that failed to compile (due to the code in
>>> the test now being invalid C).
>>> I did this for O0,O2,O3 on both x86 and aarch64 and observed no difference
>>> between ASM files before and after the patch.
>>
>> Isn't the ABI defined as non-clobbering the lower 64bits for normal
>> function calls?  Or is the TLS function "special" in that it
>> saves/restores the 128bit registers; is that documented anywhere?  The
>> main reason why I am asking is because glibc is not the only libc out
>> there and someone could have a slightly different ABI here.
>>
>
> In NEON all the register SIMD registers are preserved around TLS calls - all
> 128bits of each register. That’s standard ABI behaviour for NEON.
>
> SVE doesn’t have any explicit preserving of it’s SIMD registers.
>
> However, the NEON and SVE registers share the same silicon - the lower
> 128bits of each SVE register is the same as the corresponding NEON
> register. The side effect of this is that the lower 128bits of the SVE registers
> are getting backed up.
>
> Neither glibc or any libraries need updating to support this.
> But, compilers do need to aware of this.

I had a different question.  I asked whether this requirement that TLS
calls must not clobber the lower 128bits of the SIMD registers is
documented anywhere.  As I was trying to say, I am in the middle of
writing a libc and did not know of this requirement until I saw this
thread.

Thanks,
Andrew

>
>
> Alan.
Szabolcs Nagy Nov. 17, 2017, 3:45 p.m. UTC | #4
On 17/11/17 08:42, Andrew Pinski wrote:
> On Fri, Nov 17, 2017 at 12:21 AM, Alan Hayward <Alan.Hayward@arm.com> wrote:
>>
>>> On 16 Nov 2017, at 19:32, Andrew Pinski <pinskia@gmail.com> wrote:
>>>
>>> On Thu, Nov 16, 2017 at 4:35 AM, Alan Hayward <Alan.Hayward@arm.com> wrote:
>>>> This final patch adds the clobber high expressions to tls_desc for aarch64.
>>>> It also adds three tests.
>>>>
>>>> In addition I also tested by taking the gcc torture test suite and making
>>>> all global variables __thread. Then emended the suite to compile with -fpic,
>>>> save the .s file and only for one given O level.
>>>> I ran this before and after the patch and compared the resulting .s files,
>>>> ensuring that there were no ASM changes.
>>>> I discarded the 10% of tests that failed to compile (due to the code in
>>>> the test now being invalid C).
>>>> I did this for O0,O2,O3 on both x86 and aarch64 and observed no difference
>>>> between ASM files before and after the patch.
>>>
>>> Isn't the ABI defined as non-clobbering the lower 64bits for normal
>>> function calls?  Or is the TLS function "special" in that it
>>> saves/restores the 128bit registers; is that documented anywhere?  The
>>> main reason why I am asking is because glibc is not the only libc out
>>> there and someone could have a slightly different ABI here.
>>>
>>
>> In NEON all the register SIMD registers are preserved around TLS calls - all
>> 128bits of each register. That’s standard ABI behaviour for NEON.
>>
>> SVE doesn’t have any explicit preserving of it’s SIMD registers.
>>
>> However, the NEON and SVE registers share the same silicon - the lower
>> 128bits of each SVE register is the same as the corresponding NEON
>> register. The side effect of this is that the lower 128bits of the SVE registers
>> are getting backed up.
>>
>> Neither glibc or any libraries need updating to support this.
>> But, compilers do need to aware of this.
> 
> I had a different question.  I asked if this specification of the TLS
> calls requiring not to clobber the lower 128bits of the SIMD registers
> documented anywhere.  As I was trying to say I am in the middle of
> writing a libc and did not know of this requirement until I saw this
> thread.
> 

nothing is clobbered just like on x86 and arm:
http://www.fsfla.org/~lxoliva/writeups/TLS/RFC-TLSDESC-x86.txt
http://www.fsfla.org/~lxoliva/writeups/TLS/RFC-TLSDESC-ARM.txt

there is no equivalent spec for aarch64 (yet)
but the behaviour is consistent with other
tlsdesc abis.

you could argue that it's suboptimal that
the libc has to preserve everything, but
it only affects the slow path of dynamic
tlsdesc which does __tls_get_addr and thus
may clobber call-clobber registers there.
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6a15ff0b61d775cf30189b8503cfa45987701228..1f332b254fe0e37954efbe92982f214100d7046f 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -57,7 +57,36 @@ 
     (LR_REGNUM		30)
     (SP_REGNUM		31)
     (V0_REGNUM		32)
+    (V1_REGNUM		33)
+    (V2_REGNUM		34)
+    (V3_REGNUM		35)
+    (V4_REGNUM		36)
+    (V5_REGNUM		37)
+    (V6_REGNUM		38)
+    (V7_REGNUM		39)
+    (V8_REGNUM		40)
+    (V9_REGNUM		41)
+    (V10_REGNUM		42)
+    (V11_REGNUM		43)
+    (V12_REGNUM		44)
+    (V13_REGNUM		45)
+    (V14_REGNUM		46)
     (V15_REGNUM		47)
+    (V16_REGNUM		48)
+    (V17_REGNUM		49)
+    (V18_REGNUM		50)
+    (V19_REGNUM		51)
+    (V20_REGNUM		52)
+    (V21_REGNUM		53)
+    (V22_REGNUM		54)
+    (V23_REGNUM		55)
+    (V24_REGNUM		56)
+    (V25_REGNUM		57)
+    (V26_REGNUM		58)
+    (V27_REGNUM		59)
+    (V28_REGNUM		60)
+    (V29_REGNUM		61)
+    (V30_REGNUM		62)
     (V31_REGNUM		63)
     (LAST_SAVED_REGNUM	63)
     (SFP_REGNUM		64)
@@ -5745,6 +5774,38 @@ 
 		   UNSPEC_TLSDESC))
    (clobber (reg:DI LR_REGNUM))
    (clobber (reg:CC CC_REGNUM))
+   (clobber_high (reg:TI V0_REGNUM))
+   (clobber_high (reg:TI V1_REGNUM))
+   (clobber_high (reg:TI V2_REGNUM))
+   (clobber_high (reg:TI V3_REGNUM))
+   (clobber_high (reg:TI V4_REGNUM))
+   (clobber_high (reg:TI V5_REGNUM))
+   (clobber_high (reg:TI V6_REGNUM))
+   (clobber_high (reg:TI V7_REGNUM))
+   (clobber_high (reg:TI V8_REGNUM))
+   (clobber_high (reg:TI V9_REGNUM))
+   (clobber_high (reg:TI V10_REGNUM))
+   (clobber_high (reg:TI V11_REGNUM))
+   (clobber_high (reg:TI V12_REGNUM))
+   (clobber_high (reg:TI V13_REGNUM))
+   (clobber_high (reg:TI V14_REGNUM))
+   (clobber_high (reg:TI V15_REGNUM))
+   (clobber_high (reg:TI V16_REGNUM))
+   (clobber_high (reg:TI V17_REGNUM))
+   (clobber_high (reg:TI V18_REGNUM))
+   (clobber_high (reg:TI V19_REGNUM))
+   (clobber_high (reg:TI V20_REGNUM))
+   (clobber_high (reg:TI V21_REGNUM))
+   (clobber_high (reg:TI V22_REGNUM))
+   (clobber_high (reg:TI V23_REGNUM))
+   (clobber_high (reg:TI V24_REGNUM))
+   (clobber_high (reg:TI V25_REGNUM))
+   (clobber_high (reg:TI V26_REGNUM))
+   (clobber_high (reg:TI V27_REGNUM))
+   (clobber_high (reg:TI V28_REGNUM))
+   (clobber_high (reg:TI V29_REGNUM))
+   (clobber_high (reg:TI V30_REGNUM))
+   (clobber_high (reg:TI V31_REGNUM))
    (clobber (match_scratch:DI 1 "=r"))]
   "TARGET_TLS_DESC"
   "adrp\\tx0, %A0\;ldr\\t%<w>1, [x0, #%L0]\;add\\t<w>0, <w>0, %L0\;.tlsdesccall\\t%0\;blr\\t%1"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_1.c b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..5bad829568130181ef1ab386545bd3ee164c322e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_1.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fpic -march=armv8-a+sve" } */
+
+/* Clobber highs do not need to be spilled around tls usage.  */
+
+typedef float v4si __attribute__ ((vector_size (16)));
+
+__thread v4si tx;
+
+v4si foo (v4si a, v4si b, v4si c)
+{
+  v4si y;
+
+  y = a + tx + b + c;
+
+  return y + 7;
+}
+
+/* { dg-final { scan-assembler-not {\tstr\t} } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_2.c b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..69e8829287b8418c28f8c227391c4f8d2186ea63
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_2.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fpic -march=armv8-a+sve -msve-vector-bits=256 -fno-schedule-insns" } */
+
+/* Clobber highs must be spilled around tls usage.  */
+
+typedef float v8si __attribute__ ((vector_size (32)));
+
+__thread v8si tx;
+
+v8si foo (v8si a, v8si b, v8si c)
+{
+  v8si y;
+
+  /* There is nothing stopping the compiler from making the tls call before
+     loading the input variables off the stack.  However, there appears to
+     be no way in C of enforcing this.  Thankfully the compiler doesn't
+     do this reordering.  */
+
+  y = a + tx + b + c;
+
+  return y + 7;
+}
+
+/* { dg-final { scan-assembler-times {\tstr\tz[0-9]+,} 3 } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_3.c b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..b6aa59a3c7393d7e9ca419167d13b624a9ffafcc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_tls_preserve_3.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fpic -march=armv8-a+sve -msve-vector-bits=512 -fno-schedule-insns" } */
+
+/* Clobber highs must be spilled around tls usage.  */
+
+typedef float v16si __attribute__ ((vector_size (64)));
+
+__thread v16si tx;
+
+v16si foo (v16si a, v16si b, v16si c)
+{
+  v16si y;
+
+  /* There is nothing stopping the compiler from making the tls call before
+     loading the input variables off the stack.  However, there appears to
+     be no way in C of enforcing this.  Thankfully the compiler doesn't
+     do this reordering.  */
+
+  y = a + tx + b + c;
+
+  return y + 7;
+}
+
+/* { dg-final { scan-assembler-times {\tstr\tz[0-9]+,} 3 } } */