
[1/7] : SVE: Add CLOBBER_HIGH expression

Message ID FE15054D-7A89-40CD-93EA-136C9C6F69D0@arm.com
State New
Series [1/7] : SVE: Add CLOBBER_HIGH expression

Commit Message

Alan Hayward Nov. 16, 2017, 12:34 p.m. UTC
This is a set of patches aimed at supporting aarch64 SVE register
preservation around TLS calls.

Across a TLS call, AArch64 SVE does not explicitly preserve the
SVE vector registers. However, the Neon vector registers are preserved.
Because the registers overlap, the lower 128 bits of all
SVE vector registers will be preserved.

The existing GCC code currently, and incorrectly, assumes that all of the
SVE registers are preserved across such calls.

This patch introduces a CLOBBER_HIGH expression. This behaves a bit like
a CLOBBER expression. CLOBBER_HIGH can only refer to a single register.
The mode of the expression indicates the size of the low part of the
register which will be preserved. If the register contains a value bigger
than this mode then the code will treat the register as clobbered.

This means that, in order to evaluate whether a clobber high is relevant,
we need to ensure the mode of the existing value in a register is tracked.
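
As an illustration (the register names and the surrounding pattern are
placeholders, not the actual aarch64.md change in this series), a TLS
descriptor call pattern using the new expression would contain something
like:

  (parallel
    [(set (reg:DI R0_REGNUM)
          (unspec:DI [(match_operand 0 "" "")] UNSPEC_TLSDESC))
     (clobber (reg:DI LR_REGNUM))
     ;; Bits 0-127 (TImode) of V0 are preserved; any wider value
     ;; held in V0 is treated as clobbered.
     (clobber_high (reg:TI V0_REGNUM))])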

The following patches in this series add support for the CLOBBER_HIGH,
with the final patch adding CLOBBER_HIGHs around TLS_DESC calls for
aarch64. The testing performed on these patches is also detailed in the
final patch.

These patches are based on top of the linaro-dev/sve branch.

A simpler alternative to this patch would be to assume all Neon and SVE
registers are clobbered across TLS calls; however, this would be a
performance regression against all AArch64 targets.

Alan.


2017-11-16  Alan Hayward  <alan.hayward@arm.com>

	* doc/rtl.texi (clobber_high): Add.
	(parallel): Add in clobber high.
	* rtl.c (rtl_check_failed_code3): Add function.
	* rtl.def (CLOBBER_HIGH): Add expression.
	* rtl.h (RTL_CHECKC3): Add macro.
	(rtl_check_failed_code3): Add declaration.
	(XC3EXP): Add macro.


 /* Methods of rtx_expr_list.  */
@@ -2551,7 +2565,7 @@ do {							        \

 /* For a SET rtx, SET_DEST is the place that is set
    and SET_SRC is the value it is set to.  */
-#define SET_DEST(RTX) XC2EXP (RTX, 0, SET, CLOBBER)
+#define SET_DEST(RTX) XC3EXP (RTX, 0, SET, CLOBBER, CLOBBER_HIGH)
 #define SET_SRC(RTX) XCEXP (RTX, 1, SET)
 #define SET_IS_RETURN_P(RTX)						\
   (RTL_FLAG_CHECK1 ("SET_IS_RETURN_P", (RTX), SET)->jump)

Comments

Jeff Law Nov. 16, 2017, 6:05 p.m. UTC | #1
On 11/16/2017 05:34 AM, Alan Hayward wrote:
> This is a set of patches aimed at supporting aarch64 SVE register
> preservation around TLS calls.
> 
> Across a TLS call, Aarch64 SVE does not explicitly preserve the
> SVE vector registers. However, the Neon vector registers are preserved.
> Due to overlapping of registers, this means the lower 128bits of all
> SVE vector registers will be preserved.
> 
> The existing GCC code will currently incorrectly assume preservation
> of all of the SVE registers.
> 
> This patch introduces a CLOBBER_HIGH expression. This behaves a bit like
> a CLOBBER expression. CLOBBER_HIGH can only refer to a single register.
> The mode of the expression indicates the size of the lower bits which
> will be preserved. If the register contains a value bigger than this
> mode then the code will treat the register as clobbered.
> 
> The means in order to evaluate if a clobber high is relevant, we need to ensure
> the mode of the existing value in a register is tracked.
> 
> The following patches in this series add support for the CLOBBER_HIGH,
> with the final patch adding CLOBBER_HIGHs around TLS_DESC calls for
> aarch64. The testing performed on these patches is also detailed in the
> final patch.
> 
> These patches are based on top of the linaro-dev/sve branch.
> 
> A simpler alternative to this patch would be to assume all Neon and SVE
> registers are clobbered across TLS calls, however this would be a
> performance regression against all Aarch64 targets.
So just a couple design questions.

Presumably there's no reasonable way to set up GCC's view of the
register file to avoid this problem?  ISTM that if the SVE register was
split into two, one for the part that overlapped with the neon register
and one that did not, then this could be handled via standard mechanisms?

Alternately would it be easier to clobber a subreg representing the high
part of the register?  Hmm, probably not.

Jeff
Richard Biener Nov. 16, 2017, 6:24 p.m. UTC | #2
On November 16, 2017 7:05:30 PM GMT+01:00, Jeff Law <law@redhat.com> wrote:
>On 11/16/2017 05:34 AM, Alan Hayward wrote:
>> This is a set of patches aimed at supporting aarch64 SVE register
>> preservation around TLS calls.
>> 
>> Across a TLS call, Aarch64 SVE does not explicitly preserve the
>> SVE vector registers. However, the Neon vector registers are
>preserved.
>> Due to overlapping of registers, this means the lower 128bits of all
>> SVE vector registers will be preserved.
>> 
>> The existing GCC code will currently incorrectly assume preservation
>> of all of the SVE registers.
>> 
>> This patch introduces a CLOBBER_HIGH expression. This behaves a bit
>like
>> a CLOBBER expression. CLOBBER_HIGH can only refer to a single
>register.
>> The mode of the expression indicates the size of the lower bits which
>> will be preserved. If the register contains a value bigger than this
>> mode then the code will treat the register as clobbered.
>> 
>> The means in order to evaluate if a clobber high is relevant, we need
>to ensure
>> the mode of the existing value in a register is tracked.
>> 
>> The following patches in this series add support for the
>CLOBBER_HIGH,
>> with the final patch adding CLOBBER_HIGHs around TLS_DESC calls for
>> aarch64. The testing performed on these patches is also detailed in
>the
>> final patch.
>> 
>> These patches are based on top of the linaro-dev/sve branch.
>> 
>> A simpler alternative to this patch would be to assume all Neon and
>SVE
>> registers are clobbered across TLS calls, however this would be a
>> performance regression against all Aarch64 targets.
>So just a couple design questions.
>
>Presumably there's no reasonable way to set up GCC's view of the
>register file to avoid this problem?  ISTM that if the SVE register was
>split into two, one for the part that overlapped with the neon register
>and one that did not, then this could be handled via standard
>mechanisms?
>
>Alternately would it be easier to clobber a subreg representing the
>high
>part of the register?  Hmm, probably not.

I thought of a set of the preserved part to itself that leaves the upper part undefined. Not sure if we have such a thing or if it would work in all places that a clobber does.
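
Concretely, that would be something along the lines of (mode and register
purely illustrative):

  (set (reg:TI V0_REGNUM) (reg:TI V0_REGNUM))

in the call pattern, with TImode standing for the preserved Neon-sized low
part of the SVE register.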

Richard. 

>Jeff
Alan Hayward Nov. 16, 2017, 6:50 p.m. UTC | #3
> On 16 Nov 2017, at 18:24, Richard Biener <richard.guenther@gmail.com> wrote:

> 

> On November 16, 2017 7:05:30 PM GMT+01:00, Jeff Law <law@redhat.com> wrote:

>> On 11/16/2017 05:34 AM, Alan Hayward wrote:

>>> This is a set of patches aimed at supporting aarch64 SVE register

>>> preservation around TLS calls.

>>> 

>>> Across a TLS call, Aarch64 SVE does not explicitly preserve the

>>> SVE vector registers. However, the Neon vector registers are

>> preserved.

>>> Due to overlapping of registers, this means the lower 128bits of all

>>> SVE vector registers will be preserved.

>>> 

>>> The existing GCC code will currently incorrectly assume preservation

>>> of all of the SVE registers.

>>> 

>>> This patch introduces a CLOBBER_HIGH expression. This behaves a bit

>> like

>>> a CLOBBER expression. CLOBBER_HIGH can only refer to a single

>> register.

>>> The mode of the expression indicates the size of the lower bits which

>>> will be preserved. If the register contains a value bigger than this

>>> mode then the code will treat the register as clobbered.

>>> 

>>> The means in order to evaluate if a clobber high is relevant, we need

>> to ensure

>>> the mode of the existing value in a register is tracked.

>>> 

>>> The following patches in this series add support for the

>> CLOBBER_HIGH,

>>> with the final patch adding CLOBBER_HIGHs around TLS_DESC calls for

>>> aarch64. The testing performed on these patches is also detailed in

>> the

>>> final patch.

>>> 

>>> These patches are based on top of the linaro-dev/sve branch.

>>> 

>>> A simpler alternative to this patch would be to assume all Neon and

>> SVE

>>> registers are clobbered across TLS calls, however this would be a

>>> performance regression against all Aarch64 targets.

>> So just a couple design questions.

>> 

>> Presumably there's no reasonable way to set up GCC's view of the

>> register file to avoid this problem?  ISTM that if the SVE register was

>> split into two, one for the part that overlapped with the neon register

>> and one that did not, then this could be handled via standard

>> mechanisms?

>> 


Yes, that was an early alternative option for the patch.

With that it would affect every operation that uses SVE registers. A simple
add of two registers now has four inputs and two outputs. It would get in the
way when debugging any sve dumps and be generally annoying.
It’s possible that the code for that would all be in the aarch64 target
(making everyone else happy!), but I suspect that there would still be
strange dependency issues that’d need sorting in the common code.

Whereas with this patch, there are no new oddities in non-tls compiles/dumps.
Although the patch touches a lot of files, the changes are mostly restricted
to places where standard clobbers were already being checked.


>> Alternately would it be easier to clobber a subreg representing the

>> high

>> part of the register?  Hmm, probably not.

> 

> I thought of a set of the preserved part to itself that leaves the upper part undefined. Not sure if we have such thing or if it would work in all places that a clobber does.


I’ve not seen such a thing in the code. But it would need specific handling in
all the existing clobber code.


Alan.
Jeff Law Nov. 17, 2017, 7:31 p.m. UTC | #4
On 11/16/2017 11:50 AM, Alan Hayward wrote:
> 
>> On 16 Nov 2017, at 18:24, Richard Biener <richard.guenther@gmail.com> wrote:
>>
>> On November 16, 2017 7:05:30 PM GMT+01:00, Jeff Law <law@redhat.com> wrote:
>>> On 11/16/2017 05:34 AM, Alan Hayward wrote:
>>>> This is a set of patches aimed at supporting aarch64 SVE register
>>>> preservation around TLS calls.
>>>>
>>>> Across a TLS call, Aarch64 SVE does not explicitly preserve the
>>>> SVE vector registers. However, the Neon vector registers are
>>> preserved.
>>>> Due to overlapping of registers, this means the lower 128bits of all
>>>> SVE vector registers will be preserved.
>>>>
>>>> The existing GCC code will currently incorrectly assume preservation
>>>> of all of the SVE registers.
>>>>
>>>> This patch introduces a CLOBBER_HIGH expression. This behaves a bit
>>> like
>>>> a CLOBBER expression. CLOBBER_HIGH can only refer to a single
>>> register.
>>>> The mode of the expression indicates the size of the lower bits which
>>>> will be preserved. If the register contains a value bigger than this
>>>> mode then the code will treat the register as clobbered.
>>>>
>>>> The means in order to evaluate if a clobber high is relevant, we need
>>> to ensure
>>>> the mode of the existing value in a register is tracked.
>>>>
>>>> The following patches in this series add support for the
>>> CLOBBER_HIGH,
>>>> with the final patch adding CLOBBER_HIGHs around TLS_DESC calls for
>>>> aarch64. The testing performed on these patches is also detailed in
>>> the
>>>> final patch.
>>>>
>>>> These patches are based on top of the linaro-dev/sve branch.
>>>>
>>>> A simpler alternative to this patch would be to assume all Neon and
>>> SVE
>>>> registers are clobbered across TLS calls, however this would be a
>>>> performance regression against all Aarch64 targets.
>>> So just a couple design questions.
>>>
>>> Presumably there's no reasonable way to set up GCC's view of the
>>> register file to avoid this problem?  ISTM that if the SVE register was
>>> split into two, one for the part that overlapped with the neon register
>>> and one that did not, then this could be handled via standard
>>> mechanisms?
>>>
> 
> Yes, that was an early alternative option for the patch.
> 
> With that it would effect every operation that uses SVE registers. A simple
> add of two registers now has 4 inputs and two outputs. It would get in the
> way when debugging any sve dumps and be generally annoying.
> Possible that the code for that in would all be in the aarch64 target,
> (making everyone else happy!) But I suspect that there would be still be
> strange dependency issues that’d need sorting in the common code.
> 
> Whereas with this patch, there are no new oddities in non-tls compiles/dumps.
> Although the patch touches a lot of files, the changes are mostly restricted
> to places where standard clobbers were already being checked.
I'm not entirely sure that it would require doubling the number of
inputs/outputs.  It's not conceptually much different than how we
describe DImode operations on 32bit targets.  The mode selects one or
more consecutive registers, so you don't actually need anything weird in
your patterns.  This is pretty standard stuff.
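
For instance (illustrative only), a DImode add on a 32-bit target is still
written with single DImode operands; the mode alone implies that each
operand occupies two consecutive 32-bit hard registers:

  ;; hard regs 0-1 and 2-3 each form one DImode operand
  (set (reg:DI 0) (plus:DI (reg:DI 0) (reg:DI 2)))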


It would be painful in that the Neon regs would have to interleave with
the upper part of the SVE regs in terms of register numbers.  It would
also mean that you couldn't tie together multiple neon regs into
something wider.  I'm not sure if the latter would be an issue or not.

You might also look at TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  I'd
totally forgotten about it.  And in fact it seems to come pretty close
to what you need...

> 
> 
>>> Alternately would it be easier to clobber a subreg representing the
>>> high
>>> part of the register?  Hmm, probably not.
>>
>> I thought of a set of the preserved part to itself that leaves the upper part undefined. Not sure if we have such thing or if it would work in all places that a clobber does.
> 
> I’ve not seen such a thing in the code. But it would need specific handling in
> the all the existing clobber code.
It could probably be done with a set of the low part via a subreg or
somesuch (rather than trying to clobber the upper part which was my
initial flawed idea).
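
Roughly, purely as an illustration (with "SVEMODE" standing in for whatever
mode the full SVE register has):

  (set (subreg:TI (reg:SVEMODE V0_REGNUM) 0)
       (subreg:TI (reg:SVEMODE V0_REGNUM) 0))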

jeff
Alan Hayward Nov. 20, 2017, 3:04 p.m. UTC | #5
> On 17 Nov 2017, at 19:31, Jeff Law <law@redhat.com> wrote:

> 

> On 11/16/2017 11:50 AM, Alan Hayward wrote:

>> 

>>> On 16 Nov 2017, at 18:24, Richard Biener <richard.guenther@gmail.com> wrote:

>>> 

>>> On November 16, 2017 7:05:30 PM GMT+01:00, Jeff Law <law@redhat.com> wrote:

>>>> On 11/16/2017 05:34 AM, Alan Hayward wrote:

>>>>> This is a set of patches aimed at supporting aarch64 SVE register

>>>>> preservation around TLS calls.

>>>>> 

>>>>> Across a TLS call, Aarch64 SVE does not explicitly preserve the

>>>>> SVE vector registers. However, the Neon vector registers are

>>>> preserved.

>>>>> Due to overlapping of registers, this means the lower 128bits of all

>>>>> SVE vector registers will be preserved.

>>>>> 

>>>>> The existing GCC code will currently incorrectly assume preservation

>>>>> of all of the SVE registers.

>>>>> 

>>>>> This patch introduces a CLOBBER_HIGH expression. This behaves a bit

>>>> like

>>>>> a CLOBBER expression. CLOBBER_HIGH can only refer to a single

>>>> register.

>>>>> The mode of the expression indicates the size of the lower bits which

>>>>> will be preserved. If the register contains a value bigger than this

>>>>> mode then the code will treat the register as clobbered.

>>>>> 

>>>>> The means in order to evaluate if a clobber high is relevant, we need

>>>> to ensure

>>>>> the mode of the existing value in a register is tracked.

>>>>> 

>>>>> The following patches in this series add support for the

>>>> CLOBBER_HIGH,

>>>>> with the final patch adding CLOBBER_HIGHs around TLS_DESC calls for

>>>>> aarch64. The testing performed on these patches is also detailed in

>>>> the

>>>>> final patch.

>>>>> 

>>>>> These patches are based on top of the linaro-dev/sve branch.

>>>>> 

>>>>> A simpler alternative to this patch would be to assume all Neon and

>>>> SVE

>>>>> registers are clobbered across TLS calls, however this would be a

>>>>> performance regression against all Aarch64 targets.

>>>> So just a couple design questions.

>>>> 

>>>> Presumably there's no reasonable way to set up GCC's view of the

>>>> register file to avoid this problem?  ISTM that if the SVE register was

>>>> split into two, one for the part that overlapped with the neon register

>>>> and one that did not, then this could be handled via standard

>>>> mechanisms?

>>>> 

>> 

>> Yes, that was an early alternative option for the patch.

>> 

>> With that it would effect every operation that uses SVE registers. A simple

>> add of two registers now has 4 inputs and two outputs. It would get in the

>> way when debugging any sve dumps and be generally annoying.

>> Possible that the code for that in would all be in the aarch64 target,

>> (making everyone else happy!) But I suspect that there would be still be

>> strange dependency issues that’d need sorting in the common code.

>> 

>> Whereas with this patch, there are no new oddities in non-tls compiles/dumps.

>> Although the patch touches a lot of files, the changes are mostly restricted

>> to places where standard clobbers were already being checked.

> I'm not entirely sure that it would require doubling the number of

> inputs/outputs.  It's not conceptually much different than how we

> describe DImode operations on 32bit targets.  The mode selects one or

> more consecutive registers, so you don't actually need anything weird in

> your patterns.  This is pretty standard stuff.


Ok, fair enough.

> 

> 

> It would be painful in that the Neon regs would have to interleave with

> the upper part of the SVE regs in terms of register numbers.  It would

> also mean that you couldn't tie together multiple neon regs into

> something wider.  I'm not sure if the latter would be an issue or not.


And there’s also the weirdness that the register would not be split evenly - it’ll be a TI
reg followed by a reg of the size of multiple TIs.

All of that has the potential to complicate all non-sve aarch64 code.

> 

> You might also look at TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  I'd

> totally forgotten about it.  And in fact it seems to come pretty close

> to what you need…


Yes, some of the code is similar to the way
TARGET_HARD_REGNO_CALL_PART_CLOBBERED works. Both that code and the
CLOBBER expr code served as a starting point for writing the patch. The main difference
here is that _PART_CLOBBERED applies around all calls and is not tied to a specific instruction;
it’s part of the calling ABI. Whereas clobber_high is explicitly tied to an expression (tls_desc).
It meant there wasn’t really any opportunity to reuse any existing code.


Alan.
Jeff Law Nov. 21, 2017, 3:13 a.m. UTC | #6
On 11/20/2017 08:04 AM, Alan Hayward wrote:
>>>
>>> Yes, that was an early alternative option for the patch.
>>>
>>> With that it would effect every operation that uses SVE registers. A simple
>>> add of two registers now has 4 inputs and two outputs. It would get in the
>>> way when debugging any sve dumps and be generally annoying.
>>> Possible that the code for that in would all be in the aarch64 target,
>>> (making everyone else happy!) But I suspect that there would be still be
>>> strange dependency issues that’d need sorting in the common code.
>>>
>>> Whereas with this patch, there are no new oddities in non-tls compiles/dumps.
>>> Although the patch touches a lot of files, the changes are mostly restricted
>>> to places where standard clobbers were already being checked.
>> I'm not entirely sure that it would require doubling the number of
>> inputs/outputs.  It's not conceptually much different than how we
>> describe DImode operations on 32bit targets.  The mode selects one or
>> more consecutive registers, so you don't actually need anything weird in
>> your patterns.  This is pretty standard stuff.
> 
> Ok, fair enough.
> 
>>
>>
>> It would be painful in that the Neon regs would have to interleave with
>> the upper part of the SVE regs in terms of register numbers.  It would
>> also mean that you couldn't tie together multiple neon regs into
>> something wider.  I'm not sure if the latter would be an issue or not.
> 
> And there’s also the weirdness that the register would not be split evenly - it’ll be a TI
> reg followed by a reg of the size of multiple TIs.
> 
> All of that has the potential to complicate all non-sve aarch64 code.
Agreed.  Let's drop this line of exploration.


> 
>>
>> You might also look at TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  I'd
>> totally forgotten about it.  And in fact it seems to come pretty close
>> to what you need…
> 
> Yes, some of the code is similar to the way
> TARGET_HARD_REGNO_CALL_PART_CLOBBERED works. Both that code and the
> CLOBBER expr code served as a starting point for writing the patch. The main difference
> here, is that _PART_CLOBBERED is around all calls and is not tied to a specific Instruction,
> it’s part of the calling abi. Whereas clobber_high is explicitly tied to an expression (tls_desc).
> It meant there wasn’t really any opportunity to resume any existing code.
Understood.  Though your first patch mentions that you're trying to
describe partial preservation "around TLS calls".  Presumably those are
represented as normal insns, not call_insn.

That brings me back to Richi's idea of exposing a set of the low subreg
to itself using whatever mode is wide enough to cover the neon part of
the register.

That should tell the generic parts of the compiler that you're just
clobbering the upper part and at least in theory you can implement in
the aarch64 backend and the rest of the compiler should "just work"
because that's the existing semantics of a subreg store.

The only worry would be if a pass tried to get overly smart and
considered that kind of set a nop -- but I think I'd argue that's simply
wrong given the semantics of a partial store.

jeff
Alan Hayward Nov. 22, 2017, 11:31 a.m. UTC | #7
> On 21 Nov 2017, at 03:13, Jeff Law <law@redhat.com> wrote:

>> 

>>> 

>>> You might also look at TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  I'd

>>> totally forgotten about it.  And in fact it seems to come pretty close

>>> to what you need…

>> 

>> Yes, some of the code is similar to the way

>> TARGET_HARD_REGNO_CALL_PART_CLOBBERED works. Both that code and the

>> CLOBBER expr code served as a starting point for writing the patch. The main difference

>> here, is that _PART_CLOBBERED is around all calls and is not tied to a specific Instruction,

>> it’s part of the calling abi. Whereas clobber_high is explicitly tied to an expression (tls_desc).

>> It meant there wasn’t really any opportunity to resume any existing code.

> Understood.  Though your first patch mentions that you're trying to

> describe partial preservation "around TLS calls".  Presumably those are

> represented as normal insns, not call_insn.

> 

> That brings me back to Richi's idea of exposing a set of the low subreg

> to itself using whatever mode is wide enough to cover the neon part of

> the register.

> 

> That should tell the generic parts of the compiler that you're just

> clobbering the upper part and at least in theory you can implement in

> the aarch64 backend and the rest of the compiler should "just work"

> because that's the existing semantics of a subreg store.

> 

> The only worry would be if a pass tried to get overly smart and

> considered that kind of set a nop -- but I think I'd argue that's simply

> wrong given the semantics of a partial store.

> 


So, instead of using clobber_high(reg X), use set(reg X, reg X).
It’s something we considered, and then dismissed.

The problem then is you are now using SET semantics on those registers, and it
would make the register live around the function, which might not be the case.
Whereas clobber semantics will just make the register dead - which is exactly
what we want (but only conditionally).


Alan.
Jeff Law Nov. 22, 2017, 5:33 p.m. UTC | #8
On 11/22/2017 04:31 AM, Alan Hayward wrote:
> 
>> On 21 Nov 2017, at 03:13, Jeff Law <law@redhat.com> wrote:
>>>
>>>>
>>>> You might also look at TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  I'd
>>>> totally forgotten about it.  And in fact it seems to come pretty close
>>>> to what you need…
>>>
>>> Yes, some of the code is similar to the way
>>> TARGET_HARD_REGNO_CALL_PART_CLOBBERED works. Both that code and the
>>> CLOBBER expr code served as a starting point for writing the patch. The main difference
>>> here, is that _PART_CLOBBERED is around all calls and is not tied to a specific Instruction,
>>> it’s part of the calling abi. Whereas clobber_high is explicitly tied to an expression (tls_desc).
>>> It meant there wasn’t really any opportunity to resume any existing code.
>> Understood.  Though your first patch mentions that you're trying to
>> describe partial preservation "around TLS calls".  Presumably those are
>> represented as normal insns, not call_insn.
>>
>> That brings me back to Richi's idea of exposing a set of the low subreg
>> to itself using whatever mode is wide enough to cover the neon part of
>> the register.
>>
>> That should tell the generic parts of the compiler that you're just
>> clobbering the upper part and at least in theory you can implement in
>> the aarch64 backend and the rest of the compiler should "just work"
>> because that's the existing semantics of a subreg store.
>>
>> The only worry would be if a pass tried to get overly smart and
>> considered that kind of set a nop -- but I think I'd argue that's simply
>> wrong given the semantics of a partial store.
>>
> 
> So, the instead of using clobber_high(reg X), to use set(reg X, reg X).
> It’s something we considered, and then dismissed.
> 
> The problem then is you are now using SET semantics on those registers, and it
> would make the register live around the function, which might not be the case.
> Whereas clobber semantics will just make the register dead - which is exactly
> what we want (but only conditionally).
?!?  A set of the subreg is the *exact* semantics you want.  It says the
low part is preserved while the upper part is clobbered across the TLS
insns.

jeff
Alan Hayward Nov. 23, 2017, 11:11 a.m. UTC | #9
> On 22 Nov 2017, at 17:33, Jeff Law <law@redhat.com> wrote:

> 

> On 11/22/2017 04:31 AM, Alan Hayward wrote:

>> 

>>> On 21 Nov 2017, at 03:13, Jeff Law <law@redhat.com> wrote:

>>>> 

>>>>> 

>>>>> You might also look at TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  I'd

>>>>> totally forgotten about it.  And in fact it seems to come pretty close

>>>>> to what you need…

>>>> 

>>>> Yes, some of the code is similar to the way

>>>> TARGET_HARD_REGNO_CALL_PART_CLOBBERED works. Both that code and the

>>>> CLOBBER expr code served as a starting point for writing the patch. The main difference

>>>> here, is that _PART_CLOBBERED is around all calls and is not tied to a specific Instruction,

>>>> it’s part of the calling abi. Whereas clobber_high is explicitly tied to an expression (tls_desc).

>>>> It meant there wasn’t really any opportunity to resume any existing code.

>>> Understood.  Though your first patch mentions that you're trying to

>>> describe partial preservation "around TLS calls". Presumably those are

>>> represented as normal insns, not call_insn.

>>> 

>>> That brings me back to Richi's idea of exposing a set of the low subreg

>>> to itself using whatever mode is wide enough to cover the neon part of

>>> the register.

>>> 

>>> That should tell the generic parts of the compiler that you're just

>>> clobbering the upper part and at least in theory you can implement in

>>> the aarch64 backend and the rest of the compiler should "just work"

>>> because that's the existing semantics of a subreg store.

>>> 

>>> The only worry would be if a pass tried to get overly smart and

>>> considered that kind of set a nop -- but I think I'd argue that's simply

>>> wrong given the semantics of a partial store.

>>> 

>> 

>> So, the instead of using clobber_high(reg X), to use set(reg X, reg X).

>> It’s something we considered, and then dismissed.

>> 

>> The problem then is you are now using SET semantics on those registers, and it

>> would make the register live around the function, which might not be the case.

>> Whereas clobber semantics will just make the register dead - which is exactly

>> what we want (but only conditionally).

> ?!?  A set of the subreg is the *exact* semantics you want.  It says the

> low part is preserved while the upper part is clobbered across the TLS

> insns.

> 

> jeff


Consider the case where the TLS call is inside a loop. The compiler would normally want
to hoist that out of the loop. By adding a set(x,x) into the parallel of the tls_desc we
are now making x live across the loop, x is dependent on the value from the previous
iteration, and the tls_desc can no longer be hoisted.

Or consider a stream of code containing two tls_desc calls (ok, the compiler might
optimise one of the tls calls away, but this approach should be reusable for other exprs).
Between the two set(x,x)’s x is considered live so the register allocator can’t use that
register.
Given that we are applying this to all the neon registers, the register allocator now throws
an ICE because it can’t find any free hard neon registers to use.


Alan.
Jeff Law Nov. 27, 2017, 5:29 p.m. UTC | #10
On 11/23/2017 04:11 AM, Alan Hayward wrote:
> 
>> On 22 Nov 2017, at 17:33, Jeff Law <law@redhat.com> wrote:
>>
>> On 11/22/2017 04:31 AM, Alan Hayward wrote:
>>>
>>>> On 21 Nov 2017, at 03:13, Jeff Law <law@redhat.com> wrote:
>>>>>
>>>>>>
>>>>>> You might also look at TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  I'd
>>>>>> totally forgotten about it.  And in fact it seems to come pretty close
>>>>>> to what you need…
>>>>>
>>>>> Yes, some of the code is similar to the way
>>>>> TARGET_HARD_REGNO_CALL_PART_CLOBBERED works. Both that code and the
>>>>> CLOBBER expr code served as a starting point for writing the patch. The main difference
>>>>> here, is that _PART_CLOBBERED is around all calls and is not tied to a specific Instruction,
>>>>> it’s part of the calling abi. Whereas clobber_high is explicitly tied to an expression (tls_desc).
>>>>> It meant there wasn’t really any opportunity to resume any existing code.
>>>> Understood.  Though your first patch mentions that you're trying to
>>>> describe partial preservation "around TLS calls". Presumably those are
>>>> represented as normal insns, not call_insn.
>>>>
>>>> That brings me back to Richi's idea of exposing a set of the low subreg
>>>> to itself using whatever mode is wide enough to cover the neon part of
>>>> the register.
>>>>
>>>> That should tell the generic parts of the compiler that you're just
>>>> clobbering the upper part and at least in theory you can implement in
>>>> the aarch64 backend and the rest of the compiler should "just work"
>>>> because that's the existing semantics of a subreg store.
>>>>
>>>> The only worry would be if a pass tried to get overly smart and
>>>> considered that kind of set a nop -- but I think I'd argue that's simply
>>>> wrong given the semantics of a partial store.
>>>>
>>>
>>> So, the instead of using clobber_high(reg X), to use set(reg X, reg X).
>>> It’s something we considered, and then dismissed.
>>>
>>> The problem then is you are now using SET semantics on those registers, and it
>>> would make the register live around the function, which might not be the case.
>>> Whereas clobber semantics will just make the register dead - which is exactly
>>> what we want (but only conditionally).
>> ?!?  A set of the subreg is the *exact* semantics you want.  It says the
>> low part is preserved while the upper part is clobbered across the TLS
>> insns.
>>
>> jeff
> 
> Consider where the TLS call is inside a loop. The compiler would normally want
> to hoist that out of the loop. By adding a set(x,x) into the parallel of the tls_desc we
> are now making x live across the loop, x is dependant on the value from the previous
> iteration, and the tls_desc can no longer be hoisted.
Hmm.  I think I see the problem you're trying to point out.  Let me
restate it and see if you agree.

The low subreg set does clearly indicate the upper part of the SVE
register is clobbered.  The problem is from a liveness standpoint the
compiler is considering the low part live, even though it's a self-set.

In fact, if that is the case, then a single TLS call (independent of a
loop) would make the low part of the register globally live.  This
should be testable.  Include one of these low part self sets on the
existing TLS calls and compile a little test function and let's look at
the liveness data.


Now it could be the case that various local analyses could sub-optimally
handle things.  You mention LICM.  I know our original LICM did have a
problem in that if it saw a use of a hard reg in a loop without seeing a
set of that hard reg it considered the register varying within the loop.
I have no idea if we carried that forward when the loop code was
rewritten (when I looked at this it was circa 1992).


> 
> Or consider a stream of code containing two tls_desc calls (ok, the compiler might
> optimise one of the tls calls away, but this approach should be reusable for other exprs).
> Between the two set(x,x)’s x is considered live so the register allocator can’t use that
> register.
> Given that we are applying this to all the neon registers, the register allocator now throws
> an ICE because it can’t find any free hard neon registers to use.
Given your statements it sounds like the liveness infrastructure is
making those neon regs globally live when it sees the low part subreg
self-set.  Let's confirm that one way or the other and see where it
takes us.

Jeff
Richard Biener Nov. 28, 2017, 11:55 a.m. UTC | #11
On Mon, Nov 27, 2017 at 6:29 PM, Jeff Law <law@redhat.com> wrote:
> On 11/23/2017 04:11 AM, Alan Hayward wrote:
>>
>>> On 22 Nov 2017, at 17:33, Jeff Law <law@redhat.com> wrote:
>>>
>>> On 11/22/2017 04:31 AM, Alan Hayward wrote:
>>>>
>>>>> On 21 Nov 2017, at 03:13, Jeff Law <law@redhat.com> wrote:
>>>>>>
>>>>>>>
>>>>>>> You might also look at TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  I'd
>>>>>>> totally forgotten about it.  And in fact it seems to come pretty close
>>>>>>> to what you need…
>>>>>>
>>>>>> Yes, some of the code is similar to the way
>>>>>> TARGET_HARD_REGNO_CALL_PART_CLOBBERED works. Both that code and the
>>>>>> CLOBBER expr code served as a starting point for writing the patch. The main difference
>>>>>> here, is that _PART_CLOBBERED is around all calls and is not tied to a specific Instruction,
>>>>>> it’s part of the calling abi. Whereas clobber_high is explicitly tied to an expression (tls_desc).
>>>>>> It meant there wasn’t really any opportunity to resume any existing code.
>>>>> Understood.  Though your first patch mentions that you're trying to
>>>>> describe partial preservation "around TLS calls". Presumably those are
>>>>> represented as normal insns, not call_insn.
>>>>>
>>>>> That brings me back to Richi's idea of exposing a set of the low subreg
>>>>> to itself using whatever mode is wide enough to cover the neon part of
>>>>> the register.
>>>>>
>>>>> That should tell the generic parts of the compiler that you're just
>>>>> clobbering the upper part and at least in theory you can implement in
>>>>> the aarch64 backend and the rest of the compiler should "just work"
>>>>> because that's the existing semantics of a subreg store.
>>>>>
>>>>> The only worry would be if a pass tried to get overly smart and
>>>>> considered that kind of set a nop -- but I think I'd argue that's simply
>>>>> wrong given the semantics of a partial store.
>>>>>
>>>>
>>>> So, the instead of using clobber_high(reg X), to use set(reg X, reg X).
>>>> It’s something we considered, and then dismissed.
>>>>
>>>> The problem then is you are now using SET semantics on those registers, and it
>>>> would make the register live around the function, which might not be the case.
>>>> Whereas clobber semantics will just make the register dead - which is exactly
>>>> what we want (but only conditionally).
>>> ?!?  A set of the subreg is the *exact* semantics you want.  It says the
>>> low part is preserved while the upper part is clobbered across the TLS
>>> insns.
>>>
>>> jeff
>>
>> Consider where the TLS call is inside a loop. The compiler would normally want
>> to hoist that out of the loop. By adding a set(x,x) into the parallel of the tls_desc we
>> are now making x live across the loop, x is dependant on the value from the previous
>> iteration, and the tls_desc can no longer be hoisted.
> Hmm.  I think I see the problem you're trying to point out.  Let me
> restate it and see if you agree.
>
> The low subreg set does clearly indicate the upper part of the SVE
> register is clobbered.  The problem is from a liveness standpoint the
> compiler is considering the low part live, even though it's a self-set.
>
> In fact, if that is the case, then a single TLS call (independent of a
> loop) would make the low part of the register globally live.  This
> should be testable.  Include one of these low part self sets on the
> existing TLS calls and compile a little test function and let's look at
> the liveness data.
>
>
> Now it could be the case that various local analysis could sub-optimally
> handle things.  You mention LICM.  I know our original LICM did have a
> problem in that if it saw a use of a hard reg in a loop without seeing a
> set of that hard reg it considered the register varying within the loop.
>  I have no idea if we carried that forward when the loop code was
> rewritten (when I looked at this it was circa 1992).
>
>
>>
>> Or consider a stream of code containing two tls_desc calls (ok, the compiler might
>> optimise one of the tls calls away, but this approach should be reusable for other exprs).
>> Between the two set(x,x)’s x is considered live so the register allocator can’t use that
>> register.
>> Given that we are applying this to all the neon registers, the register allocator now throws
>> an ICE because it can’t find any free hard neon registers to use.
> Given your statements it sounds like the liveness infrastructure is
> making those neon regs globally live when it sees the low part subreg
> self-set.  Let's confirm that one way or the other and see where it
> takes us.

Indeed, in (set (subreg:neon reg1) (subreg:neon reg1)) it appears that
the lowpart of reg1 is used and thus it is live, but liveness analysis
can (and should) simply ignore such sets.

> Jeff
Jeff Law Nov. 28, 2017, 3:07 p.m. UTC | #12
On 11/28/2017 04:55 AM, Richard Biener wrote:

>>> Or consider a stream of code containing two tls_desc calls (ok, the compiler might
>>> optimise one of the tls calls away, but this approach should be reusable for other exprs).
>>> Between the two set(x,x)’s x is considered live so the register allocator can’t use that
>>> register.
>>> Given that we are applying this to all the neon registers, the register allocator now throws
>>> an ICE because it can’t find any free hard neon registers to use.
>> Given your statements it sounds like the liveness infrastructure is
>> making those neon regs globally live when it sees the low part subreg
>> self-set.  Let's confirm that one way or the other and see where it
>> takes us.
> 
> Indeed in (set (subreg:neon reg1) (subreg:neon reg1)) it appears that
> the lowpart of reg1
> is used and thus it is live but liveness analysis can (and should)
> simply ignore such sets.
My suggestion was going to be to peek a bit at the life analysis code if
indeed my suspicion was true.

Jeff
Alan Hayward Nov. 30, 2017, 11:03 a.m. UTC | #13
> On 27 Nov 2017, at 17:29, Jeff Law <law@redhat.com> wrote:

> 

> On 11/23/2017 04:11 AM, Alan Hayward wrote:

>> 

>>> On 22 Nov 2017, at 17:33, Jeff Law <law@redhat.com> wrote:

>>> 

>>> On 11/22/2017 04:31 AM, Alan Hayward wrote:

>>>> 

>>>>> On 21 Nov 2017, at 03:13, Jeff Law <law@redhat.com> wrote:

>>>>>> 

>>>>>>> 

>>>>>>> You might also look at TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  I'd

>>>>>>> totally forgotten about it.  And in fact it seems to come pretty close

>>>>>>> to what you need…

>>>>>> 

>>>>>> Yes, some of the code is similar to the way

>>>>>> TARGET_HARD_REGNO_CALL_PART_CLOBBERED works. Both that code and the

>>>>>> CLOBBER expr code served as a starting point for writing the patch. The main difference

>>>>>> here, is that _PART_CLOBBERED is around all calls and is not tied to a specific Instruction,

>>>>>> it’s part of the calling abi. Whereas clobber_high is explicitly tied to an expression (tls_desc).

>>>>>> It meant there wasn’t really any opportunity to resume any existing code.

>>>>> Understood.  Though your first patch mentions that you're trying to

>>>>> describe partial preservation "around TLS calls". Presumably those are

>>>>> represented as normal insns, not call_insn.

>>>>> 

>>>>> That brings me back to Richi's idea of exposing a set of the low subreg

>>>>> to itself using whatever mode is wide enough to cover the neon part of

>>>>> the register.

>>>>> 

>>>>> That should tell the generic parts of the compiler that you're just

>>>>> clobbering the upper part and at least in theory you can implement in

>>>>> the aarch64 backend and the rest of the compiler should "just work"

>>>>> because that's the existing semantics of a subreg store.

>>>>> 

>>>>> The only worry would be if a pass tried to get overly smart and

>>>>> considered that kind of set a nop -- but I think I'd argue that's simply

>>>>> wrong given the semantics of a partial store.

>>>>> 

>>>> 

>>>> So, the instead of using clobber_high(reg X), to use set(reg X, reg X).

>>>> It’s something we considered, and then dismissed.

>>>> 

>>>> The problem then is you are now using SET semantics on those registers, and it

>>>> would make the register live around the function, which might not be the case.

>>>> Whereas clobber semantics will just make the register dead - which is exactly

>>>> what we want (but only conditionally).

>>> ?!?  A set of the subreg is the *exact* semantics you want.  It says the

>>> low part is preserved while the upper part is clobbered across the TLS

>>> insns.

>>> 

>>> jeff

>> 

>> Consider where the TLS call is inside a loop. The compiler would normally want

>> to hoist that out of the loop. By adding a set(x,x) into the parallel of the tls_desc we

>> are now making x live across the loop, x is dependant on the value from the previous

>> iteration, and the tls_desc can no longer be hoisted.

> Hmm.  I think I see the problem you're trying to point out.  Let me

> restate it and see if you agree.

> 

> The low subreg set does clearly indicate the upper part of the SVE

> register is clobbered.  The problem is from a liveness standpoint the

> compiler is considering the low part live, even though it's a self-set.

> 


Yes.

> In fact, if that is the case, then a single TLS call (independent of a

> loop) would make the low part of the register globally live.  This

> should be testable.  Include one of these low part self sets on the

> existing TLS calls and compile a little test function and let's look at

> the liveness data.

> 



Ok, so I’ve put 
(set (reg:TI V0_REGNUM) (reg:TI V0_REGNUM))
(set (reg:TI V1_REGNUM) (reg:TI V1_REGNUM))
etc up to V31 in the UNSPEC_TLSDESC pattern in aarch64.md, using up all the neon registers.

In an ideal world, this change would do nothing as neon reg size is TI.

First I compiled up sve_tls_preserve_1.c (Test1 below).

Without the SET changes, gcc does not back up any neon registers before the tlsdesc call (good).
With the SET changes, gcc backs up all the neon registers (not ideal).

Without the SET changes, dump logs give:

cse1:
;;  regs ever live 	 0 [x0] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 66 [cc]

cprop1:
;; lr  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]
;; lr  use 	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]
;; lr  def 	 0 [x0] 30 [x30] 32 [v0] 66 [cc] 79 80 81 82 84 85 86 87 88 89
;; live  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]
;; live  gen 	 0 [x0] 32 [v0] 78 79 80 81 82 83 84 85 86 87 88 89
;; live  kill	 30 [x30] 66 [cc]
;; lr  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]
;; live  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]

reload:
;;  regs ever live 	 0 [x0] 1 [x1] 2 [x2] 29 [x29] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 66 [cc]

With the set changes:

cse1:
;;  regs ever live 	 0 [x0] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 66 [cc]

cprop1:
;; lr  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 64 [sfp] 65 [ap]
;; lr  use 	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 64 [sfp] 65 [ap]
;; lr  def 	 0 [x0] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 66 [cc] 79 80 81 82 84 85 86 87 88 89
;; live  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 64 [sfp] 65 [ap]
;; live  gen 	 0 [x0] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 78 79 80 81 82 83 84 85 86 87 88 89
;; live  kill	 30 [x30] 66 [cc]
;; lr  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]
;; live  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]

reload:
;;  regs ever live 	 0 [x0] 1 [x1] 2 [x2] 29 [x29] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 66 [cc]


Is there any extra dump data you think would be useful?


For completeness, I also ran through some more tests…..



Second, I tried putting the tls call inside a simple for loop (Test2 below).

Both before and after the SET changes, gcc hoisted the tlsdesc call out of the loop.
This has already happened before the expand pass.
(and as before, with the SET changes, gcc backs up all the neon registers before the tlsdesc call).


Third, I made the tls variable an array, and accessed elements from it in the loop. (Test3 below)

Both before and after the SET changes, gcc fails to hoist the tlsdesc call.
That’s a missed optimisation chance - I’m not sure how tricky or worthwhile that’d be to fix (at the
tree level the tls load looks just like a MEM base plus index access. I guess you’d need something
specific in an rtl pass to do the hoist).
Anyway, we now have a tlsdesc inside a loop, which is what I wanted.
Without the SET changes, nothing is backed up whilst inside the loop.
With the SET changes, gcc tries to back up the live neon registers, and ICEs with
“error: unable to find a register to spill” in lra-assigns.c.


Fourth, I took the original test and made two tls accesses in a row with an asm volatile in-between (Test4 below).

With the SET changes, gcc ICEs in the same way.
Looking at the dump files: without the SET changes, the dump logs for the final test are similar to the very first test:

;;  regs ever live 	 0 [x0] 1 [x1] 2 [x2] 29 [x29] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 66 [cc]

;; lr  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]
;; lr  use 	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]
;; lr  def 	 0 [x0] 30 [x30] 32 [v0] 66 [cc] 81 84 85 86 88 89 90 91 95 96 97 98 99 100
;; live  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]
;; live  gen 	 0 [x0] 32 [v0] 81 84 85 86 88 89 90 91 92 95 96 97 98 99 100
;; live  kill	 30 [x30] 66 [cc]
;; lr  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]
;; live  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]


With the SET changes:

;;  regs ever live 	 0 [x0] 1 [x1] 2 [x2] 29 [x29] 30 [x30] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 66 [cc]

;; lr  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 64 [sfp] 65 [ap]
;; lr  use 	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 64 [sfp] 65 [ap]
;; lr  def 	 0 [x0] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 66 [cc] 81 84 85 86 88 89 90 91 95 96 97 98 99 100
;; live  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 64 [sfp] 65 [ap]
;; live  gen 	 0 [x0] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 81 84 85 86 88 89 90 91 92 95 96 97 98 99 100
;; live  kill	 30 [x30] 66 [cc]
;; lr  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]
;; live  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]




Test1:

/* Assumed definition, not shown in the original mail:
   typedef int v4si __attribute__ ((vector_size (16)));  */
__thread v4si tx;

v4si foo (v4si a, v4si b, v4si c)
{
  v4si y;

  y = a + tx + b + c;

  return y + 7;
}


Test2:


__thread v4si tx;

v4si foo (v4si a, v4si *b, v4si *c)
{
  v4si y;

  for(int i=0; i<MAX; i++)
    y += a + tx + b[i] + c[i];

  return y + 7;
}


Test3:

__thread v4si tx[MAX];

v4si foo (v4si a, v4si *b, v4si *c)
{
  v4si y;

  for(int i=0; i<MAX; i++)
    y += a + tx[i] + b[i] + c[i];

  return y + 7;
}


Test4:

__thread v4si tx;
__thread v4si ty;

v4si foo (v4si a, v4si b, v4si c)
{
  v4si y;

  y = a + tx + b + c;

  asm volatile ("" : : : "memory");

  y += a + ty + b + c;

  return y + 7;
}


> 

> Now it could be the case that various local analysis could sub-optimally

> handle things.  You mention LICM.  I know our original LICM did have a

> problem in that if it saw a use of a hard reg in a loop without seeing a

> set of that hard reg it considered the register varying within the loop.

> I have no idea if we carried that forward when the loop code was

> rewritten (when I looked at this it was circa 1992).

> 

> 

>> 

>> Or consider a stream of code containing two tls_desc calls (ok, the compiler might

>> optimise one of the tls calls away, but this approach should be reusable for other exprs).

>> Between the two set(x,x)’s x is considered live so the register allocator can’t use that

>> register.

>> Given that we are applying this to all the neon registers, the register allocator now throws

>> an ICE because it can’t find any free hard neon registers to use.

> Given your statements it sounds like the liveness infrastructure is

> making those neon regs globally live when it sees the low part subreg

> self-set.  Let's confirm that one way or the other and see where it

> takes us.

> 

> Jeff
Alan Hayward Dec. 12, 2017, 11:11 a.m. UTC | #14
Ping.

> On 30 Nov 2017, at 11:03, Alan Hayward <Alan.Hayward@arm.com> wrote:

> 

> 

>> On 27 Nov 2017, at 17:29, Jeff Law <law@redhat.com> wrote:

>> 

>> On 11/23/2017 04:11 AM, Alan Hayward wrote:

>>> 

>>>> On 22 Nov 2017, at 17:33, Jeff Law <law@redhat.com> wrote:

>>>> 

>>>> On 11/22/2017 04:31 AM, Alan Hayward wrote:

>>>>> 

>>>>>> On 21 Nov 2017, at 03:13, Jeff Law <law@redhat.com> wrote:

>>>>>>> 

>>>>>>>> 

>>>>>>>> You might also look at TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  I'd

>>>>>>>> totally forgotten about it.  And in fact it seems to come pretty close

>>>>>>>> to what you need…

>>>>>>> 

>>>>>>> Yes, some of the code is similar to the way

>>>>>>> TARGET_HARD_REGNO_CALL_PART_CLOBBERED works. Both that code and the

>>>>>>> CLOBBER expr code served as a starting point for writing the patch. The main difference

>>>>>>> here, is that _PART_CLOBBERED is around all calls and is not tied to a specific Instruction,

>>>>>>> it’s part of the calling abi. Whereas clobber_high is explicitly tied to an expression (tls_desc).

>>>>>>> It meant there wasn’t really any opportunity to resume any existing code.

>>>>>> Understood.  Though your first patch mentions that you're trying to

>>>>>> describe partial preservation "around TLS calls". Presumably those are

>>>>>> represented as normal insns, not call_insn.

>>>>>> 

>>>>>> That brings me back to Richi's idea of exposing a set of the low subreg

>>>>>> to itself using whatever mode is wide enough to cover the neon part of

>>>>>> the register.

>>>>>> 

>>>>>> That should tell the generic parts of the compiler that you're just

>>>>>> clobbering the upper part and at least in theory you can implement in

>>>>>> the aarch64 backend and the rest of the compiler should "just work"

>>>>>> because that's the existing semantics of a subreg store.

>>>>>> 

>>>>>> The only worry would be if a pass tried to get overly smart and

>>>>>> considered that kind of set a nop -- but I think I'd argue that's simply

>>>>>> wrong given the semantics of a partial store.

>>>>>> 

>>>>> 

>>>>> So, the instead of using clobber_high(reg X), to use set(reg X, reg X).

>>>>> It’s something we considered, and then dismissed.

>>>>> 

>>>>> The problem then is you are now using SET semantics on those registers, and it

>>>>> would make the register live around the function, which might not be the case.

>>>>> Whereas clobber semantics will just make the register dead - which is exactly

>>>>> what we want (but only conditionally).

>>>> ?!?  A set of the subreg is the *exact* semantics you want.  It says the

>>>> low part is preserved while the upper part is clobbered across the TLS

>>>> insns.

>>>> 

>>>> jeff

>>> 

>>> Consider where the TLS call is inside a loop. The compiler would normally want

>>> to hoist that out of the loop. By adding a set(x,x) into the parallel of the tls_desc we

>>> are now making x live across the loop, x is dependant on the value from the previous

>>> iteration, and the tls_desc can no longer be hoisted.

>> Hmm.  I think I see the problem you're trying to point out.  Let me

>> restate it and see if you agree.

>> 

>> The low subreg set does clearly indicate the upper part of the SVE

>> register is clobbered.  The problem is from a liveness standpoint the

>> compiler is considering the low part live, even though it's a self-set.

>> 

> 

> Yes.

> 

>> In fact, if that is the case, then a single TLS call (independent of a

>> loop) would make the low part of the register globally live.  This

>> should be testable.  Include one of these low part self sets on the

>> existing TLS calls and compile a little test function and let's look at

>> the liveness data.

>> 

> 

> 

> Ok, so I’ve put 

> (set (reg:TI V0_REGNUM) (reg:TI V0_REGNUM))

> (set (reg:TI V1_REGNUM) (reg:TI V1_REGNUM))

> etc up to V31 in the UNSPEC_TLSDESC pattern in aarch64.md, using up all the neon registers.

> 

> In an ideal world, this change would do nothing as neon reg size is TI.

> 

> First, I compiled up sve_tls_preserve_1.c (Test1 below).

> 

> Without the SET changes, gcc does not back up any neon registers before the tlsdesc call (good).

> With the SET changes, gcc backs up all the neon registers (not ideal).

> 

> Without the SET changes, dump logs give:

> 

> cse1:

> ;;  regs ever live 	 0 [x0] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 66 [cc]

> 

> cprop1:

> ;; lr  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]

> ;; lr  use 	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]

> ;; lr  def 	 0 [x0] 30 [x30] 32 [v0] 66 [cc] 79 80 81 82 84 85 86 87 88 89

> ;; live  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]

> ;; live  gen 	 0 [x0] 32 [v0] 78 79 80 81 82 83 84 85 86 87 88 89

> ;; live  kill	 30 [x30] 66 [cc]

> ;; lr  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]

> ;; live  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]

> 

> reload:

> ;;  regs ever live 	 0 [x0] 1 [x1] 2 [x2] 29 [x29] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 66 [cc]

> 

> With the set changes:

> 

> cse1:

> ;;  regs ever live 	 0 [x0] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 66 [cc]

> 

> cprop1:

> ;; lr  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 64 [sfp] 65 [ap]

> ;; lr  use 	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 64 [sfp] 65 [ap]

> ;; lr  def 	 0 [x0] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 66 [cc] 79 80 81 82 84 85 86 87 88 89

> ;; live  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 64 [sfp] 65 [ap]

> ;; live  gen 	 0 [x0] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 78 79 80 81 82 83 84 85 86 87 88 89

> ;; live  kill	 30 [x30] 66 [cc]

> ;; lr  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]

> ;; live  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]

> 

> reload:

> ;;  regs ever live 	 0 [x0] 1 [x1] 2 [x2] 29 [x29] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 66 [cc]

> 

> 

> Is there any extra dump data you think would be useful?

> 

> 

> For completeness, I also ran through some more tests…..

> 

> 

> 

> Second, I tried putting the tls call inside a simple for loop (Test2 below).

> 

> Both before and after the SET changes, gcc hoisted the tlsdesc call out of the loop.

> This has already happened before the expand pass.

> (and as before, with the SET changes, gcc backs up all the neon registers before the tlsdesc call).

> 

> 

> Third, I made the tls variable an array, and accessed elements from it in the loop. (Test3 below)

> 

> Both before and after the SET changes, gcc fails to hoist the tlsdesc call.

> That’s a missed optimisation chance - I’m not sure how tricky or worthwhile that’d be to fix (at the

> tree level the tls load looks just like a MEM base plus index access. I guess you’d need something

> specific in an rtl pass to do the hoist).

> Anyway, we now have a tlsdesc inside a loop, which is what I wanted.

> Without the SET changes, nothing is backed up whilst inside the loop.

> With the SET changes, gcc tries to back up the live neon registers, and ICEs with

> "error: unable to find a register to spill” in lra-assigns.c.

> 

> 

> Fourth, I took the original test and made two tls accesses in a row with an asm volatile in-between (Test4 below).

> 

> With the SET changes, gcc ICEs in the same way.

> Looking at the dump files:

> 

> Without the SET changes, dump logs for the final test are similar to the very first test:

> 

> ;;  regs ever live 	 0 [x0] 1 [x1] 2 [x2] 29 [x29] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 66 [cc]

> 

> ;; lr  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]

> ;; lr  use 	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]

> ;; lr  def 	 0 [x0] 30 [x30] 32 [v0] 66 [cc] 81 84 85 86 88 89 90 91 95 96 97 98 99 100

> ;; live  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 64 [sfp] 65 [ap]

> ;; live  gen 	 0 [x0] 32 [v0] 81 84 85 86 88 89 90 91 92 95 96 97 98 99 100

> ;; live  kill	 30 [x30] 66 [cc]

> ;; lr  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]

> ;; live  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]

> 

> 

> With the SET changes:

> 

> ;;  regs ever live 	 0 [x0] 1 [x1] 2 [x2] 29 [x29] 30 [x30] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 66 [cc]

> 

> ;; lr  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 64 [sfp] 65 [ap]

> ;; lr  use 	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 64 [sfp] 65 [ap]

> ;; lr  def 	 0 [x0] 30 [x30] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 66 [cc] 81 84 85 86 88 89 90 91 95 96 97 98 99 100

> ;; live  in  	 29 [x29] 31 [sp] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 64 [sfp] 65 [ap]

> ;; live  gen 	 0 [x0] 32 [v0] 33 [v1] 34 [v2] 35 [v3] 36 [v4] 37 [v5] 38 [v6] 39 [v7] 40 [v8] 41 [v9] 42 [v10] 43 [v11] 44 [v12] 45 [v13] 46 [v14] 47 [v15] 48 [v16] 49 [v17] 50 [v18] 51 [v19] 52 [v20] 53 [v21] 54 [v22] 55 [v23] 56 [v24] 57 [v25] 58 [v26] 59 [v27] 60 [v28] 61 [v29] 62 [v30] 63 [v31] 81 84 85 86 88 89 90 91 92 95 96 97 98 99 100

> ;; live  kill	 30 [x30] 66 [cc]

> ;; lr  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]

> ;; live  out 	 29 [x29] 31 [sp] 32 [v0] 64 [sfp] 65 [ap]

> 

> 

> 

> 

> Test1:

> 

> __thread v4si tx;

> 

> v4si foo (v4si a, v4si b, v4si c)

> {

>  v4si y;

> 

>  y = a + tx + b + c;

> 

>  return y + 7;

> }

> 

> 

> Test2:

> 

> 

> __thread v4si tx;

> 

> v4si foo (v4si a, v4si *b, v4si *c)

> {

>  v4si y;

> 

>  for(int i=0; i<MAX; i++)

>    y += a + tx + b[i] + c[i];

> 

>  return y + 7;

> }

> 

> 

> Test3:

> 

> __thread v4si tx[MAX];

> 

> v4si foo (v4si a, v4si *b, v4si *c)

> {

>  v4si y;

> 

>  for(int i=0; i<MAX; i++)

>    y += a + tx[i] + b[i] + c[i];

> 

>  return y + 7;

> }

> 

> 

> Test4:

> 

> __thread v4si tx;

> __thread v4si ty;

> 

> v4si foo (v4si a, v4si b, v4si c)

> {

>  v4si y;

> 

>  y = a + tx + b + c;

> 

>  asm volatile ("" : : : "memory");

> 

>  y += a + ty + b + c;

> 

>  return y + 7;

> }

> 

> 

>> 

>> Now it could be the case that various local analysis could sub-optimally

>> handle things.  You mention LICM.  I know our original LICM did have a

>> problem in that if it saw a use of a hard reg in a loop without seeing a

>> set of that hard reg it considered the register varying within the loop.

>> I have no idea if we carried that forward when the loop code was

>> rewritten (when I looked at this it was circa 1992).

>> 

>> 

>>> 

>>> Or consider a stream of code containing two tls_desc calls (ok, the compiler might

>>> optimise one of the tls calls away, but this approach should be reusable for other exprs).

>>> Between the two set(x,x)’s x is considered live so the register allocator can’t use that

>>> register.

>>> Given that we are applying this to all the neon registers, the register allocator now throws

>>> an ICE because it can’t find any free hard neon registers to use.

>> Given your statements it sounds like the liveness infrastructure is

>> making those neon regs globally live when it sees the low part subreg

>> self-set.  Let's confirm that one way or the other and see where it

>> takes us.

>> 

>> Jeff

>
Alan Hayward Dec. 19, 2017, 10:12 a.m. UTC | #15
Ping ping.
I think there should be enough information in the first test to show that any “set to self”
registers become live. Let me know if there’s anything I’ve missed.


Thanks,
Alan.

> On 12 Dec 2017, at 11:11, Alan Hayward <Alan.Hayward@arm.com> wrote:

> 

> Ping.

> 

Jeff Law Dec. 19, 2017, 4:27 p.m. UTC | #16
On 12/19/2017 03:12 AM, Alan Hayward wrote:
> Ping ping.
> I think there should be enough information in the first test to show that any "set to self”
> registers become live. Let me know if there’s anything I’ve missed.
I think that both Richi and I would like you to investigate fixing the df
infrastructure so that a self-set doesn't make things go live.  Sorry if
we weren't clear about that.  I.e., the level of complexity and likely fallout,
so that we can evaluate that vs CLOBBER_HIGH.



jeff
Alan Hayward Jan. 12, 2018, 11:58 a.m. UTC | #17
> On 19 Dec 2017, at 16:27, Jeff Law <law@redhat.com> wrote:

> 

> On 12/19/2017 03:12 AM, Alan Hayward wrote:

>> Ping ping.

>> I think there should be enough information in the first test to show that any "set to self”

>> registers become live. Let me know if there’s anything I’ve missed.

> I think that both Richi and I would like you investigate fixing the df

> infrastructure so that a self-set doesn't make things go live.  Sorry if

> we weren't clear about that.  ie, level of complexity and likely fallout

> so that we can evaluate that vs CLOBBER_HIGH.

> 

> 

> 

> jeff

> 



Right, sorry, I misunderstood. Ok, so I’ve been looking at trying to do this.

To summarise: we need to check for 1) all sets of a register to itself where 2) the existing value of the register fits within the mode of the new set. In those cases we then want to (in effect) pretend the set doesn’t exist.
To test this, I add a set to self in a tls_desc call for every V register (e.g. (set (reg:TI V0_REGNUM) (reg:TI V0_REGNUM)), (set (reg:TI V1_REGNUM) (reg:TI V1_REGNUM)) and so on). If the patch works, then these registers will not be backed up around a tls call.

First, I added some checks to df-scan and successfully stopped the sets to self from being added to the live and use lists.  [To simplify the issue for now I ignored the existing value of the register and just assumed it fits.]
However, running my test case, the vector registers still get backed up around the tls call, even though the dumps show that they are no longer live.
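
(For illustration only: this isn't code from the patch and the helper name is made up. The condition the df-scan checks key off is roughly the following, assuming it sits somewhere the usual rtl headers are already included.)

/* Sketch: recognise a SET expression that just stores a hard register back
   to itself, i.e. the kind of set we'd like df-scan to stop recording as a
   use and a def.  */

static bool
set_is_hard_reg_self_set_p (rtx set)
{
  if (GET_CODE (set) != SET)
    return false;

  rtx dest = SET_DEST (set);
  rtx src = SET_SRC (set);

  return (REG_P (dest) && HARD_REGISTER_P (dest)
          && REG_P (src) && REGNO (dest) == REGNO (src));
}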

Debugging further, I find that the register backing up is happening as part of combine.c. Ok, I can add a check for a reg set to self in there too.
But as part of my clobber_high patch I already had code in combine.c that checked for CLOBBER_HIGH and then checked the mode of the previous value in the register. My new code ends up looking very similar to my clobber_high patch.

Instead of:

if (GET_CODE (setter) == CLOBBER_HIGH
    && reg_is_clobbered_by_clobber_high (REGNO (dest), GET_MODE (rsp->last_set_value)))

Now becomes something like:

if (GET_CODE (setter) == SET
    && REG_P (dest) && HARD_REGISTER_P (dest) && REG_P (src)
    && REGNO (dest) == REGNO (src)
    && reg_is_clobbered_by_self_set (REGNO (dest), GET_MODE (rsp->last_set_value)))
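
(Again, just a rough sketch rather than code from the patch: reg_is_clobbered_by_self_set would boil down to a mode-size comparison, something like the below. Every self-set in the experiment is TImode, so the preserved width is hard-coded here; the variable-length SVE modes would need the poly_int comparisons rather than a plain '>'.)

/* Sketch: return true if a value of mode LIVE_MODE, currently tracked in
   hard register REGNO, is wider than the TImode bits preserved by the
   self-set and so must be treated as clobbered.  */

static bool
reg_is_clobbered_by_self_set (unsigned int regno ATTRIBUTE_UNUSED,
                              machine_mode live_mode)
{
  return GET_MODE_SIZE (live_mode) > GET_MODE_SIZE (TImode);
}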

I then need to find the next pass that has similar checks, and again it’ll be the same places where I already have clobber_high code.

I suspect in the end I’ll be able to remove the df-scan changes because, as I effectively found out with clobber_high, they aren’t what causes the register backups to happen.

Ok, in the new patch we do save a bit of code because there is no new expression to add. But that was a small part of the full patch.

I could rewrite the patch in this way, but personally I feel it’s now exploiting a side effect of a set to self, rather than being explicit in what it is trying to do. 


Alan.
Alan Hayward Jan. 24, 2018, 11:31 a.m. UTC | #18
Ping.
Any comments on this?
The one-line summary is that using self-sets instead of clobber_high would result in
roughly the same patch, but with different condition checks.
It depends on whether people think it really is useful for self-sets not to be live.

Given that we are at stage 4 now, and this can’t go in until stage 1, I’m happy to
leave the discussion until stage 1, but would appreciate any suggestions before then.

Thanks,
Alan.

diff mbox series

Patch

diff --git a/gcc/doc/rtl.texi b/gcc/doc/rtl.texi
index f583940b9441b2111c8d65a00a064e89bdd2ffaf..951322258ddbb57900225bd501bd23a8a9970ead 100644
--- a/gcc/doc/rtl.texi
+++ b/gcc/doc/rtl.texi
@@ -3209,6 +3209,18 @@  There is one other known use for clobbering a pseudo register in a
 clobbered by the insn.  In this case, using the same pseudo register in
 the clobber and elsewhere in the insn produces the expected results.

+@findex clobber_high
+@item (clobber_high @var{x})
+Represents the storing or possible storing of an unpredictable,
+undescribed value into the upper parts of @var{x}.  The mode of the expression
+represents the lower parts of the register which will not be overwritten.
+@var{x} must be a @code{reg} expression.
+
+One place this is used is when calling into functions where the registers are
+preserved, but only up to a given number of bits.  For example when using
+Aarch64 SVE, calling a TLS descriptor will cause only the lower 128 bits of
+each of the vector registers to be preserved.
+
 @findex use
 @item (use @var{x})
 Represents the use of the value of @var{x}.  It indicates that the
@@ -3262,7 +3274,8 @@  Represents several side effects performed in parallel.  The square
 brackets stand for a vector; the operand of @code{parallel} is a
 vector of expressions.  @var{x0}, @var{x1} and so on are individual
 side effect expressions---expressions of code @code{set}, @code{call},
-@code{return}, @code{simple_return}, @code{clobber} or @code{use}.
+@code{return}, @code{simple_return}, @code{clobber}, @code{use} or
+@code{clobber_high}.

 ``In parallel'' means that first all the values used in the individual
 side-effects are computed, and second all the actual side-effects are
diff --git a/gcc/rtl.c b/gcc/rtl.c
index 3b2728be8b506fb3c14a20297cf92368caa5ca3b..6db84f99627bb8617c6e227892ca44076f4e729b 100644
--- a/gcc/rtl.c
+++ b/gcc/rtl.c
@@ -860,6 +860,17 @@  rtl_check_failed_code2 (const_rtx r, enum rtx_code code1, enum rtx_code code2,
 }

 void
+rtl_check_failed_code3 (const_rtx r, enum rtx_code code1, enum rtx_code code2,
+			enum rtx_code code3, const char *file, int line,
+			const char *func)
+{
+  internal_error
+    ("RTL check: expected code '%s', '%s' or '%s', have '%s' in %s, at %s:%d",
+     GET_RTX_NAME (code1), GET_RTX_NAME (code2), GET_RTX_NAME (code3),
+     GET_RTX_NAME (GET_CODE (r)), func, trim_filename (file), line);
+}
+
+void
 rtl_check_failed_code_mode (const_rtx r, enum rtx_code code, machine_mode mode,
 			    bool not_mode, const char *file, int line,
 			    const char *func)
diff --git a/gcc/rtl.def b/gcc/rtl.def
index 83bcfcaadcacc45cce352bf7fba33fbbc87ccd58..a6c4d4a46c4eb4f6cb0eca66a3f6a558f94acc8a 100644
--- a/gcc/rtl.def
+++ b/gcc/rtl.def
@@ -312,6 +312,16 @@  DEF_RTL_EXPR(USE, "use", "e", RTX_EXTRA)
    is considered undeletable before reload.  */
 DEF_RTL_EXPR(CLOBBER, "clobber", "e", RTX_EXTRA)

+/* Indicate that the upper parts of something are clobbered in a way that we
+   don't want to explain.  The MODE references the lower bits that will be
+   preserved.  Anything above that size will be clobbered.
+
+   CLOBBER_HIGH only occurs as the operand of a PARALLEL rtx.  It cannot appear
+   in other contexts, and unlike CLOBBER, it cannot appear on its own.
+   CLOBBER_HIGH can only be used with fixed register rtxes.  */
+
+DEF_RTL_EXPR(CLOBBER_HIGH, "clobber_high", "e", RTX_EXTRA)
+
 /* Call a subroutine.
    Operand 1 is the address to call.
    Operand 2 is the number of arguments.  */
diff --git a/gcc/rtl.h b/gcc/rtl.h
index ec5cf314a9e516e7e855e5d897a9a26c4ce36c20..71621bdfd67c4ce3dcccc5279456cae841371f97 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -1083,6 +1083,14 @@  is_a_helper <rtx_note *>::test (rtx_insn *insn)
 			       __FUNCTION__); \
      &_rtx->u.fld[_n]; }))

+#define RTL_CHECKC3(RTX, N, C1, C2, C3) __extension__			\
+(*({ __typeof (RTX) const _rtx = (RTX); const int _n = (N);		\
+     const enum rtx_code _code = GET_CODE (_rtx);			\
+     if (_code != (C1) && _code != (C2) && _code != (C3))		\
+       rtl_check_failed_code3 (_rtx, (C1), (C2), (C3), __FILE__,	\
+			       __LINE__, __FUNCTION__);			\
+     &_rtx->u.fld[_n]; }))
+
 #define RTVEC_ELT(RTVEC, I) __extension__				\
 (*({ __typeof (RTVEC) const _rtvec = (RTVEC); const int _i = (I);	\
      if (_i < 0 || _i >= GET_NUM_ELEM (_rtvec))				\
@@ -1173,6 +1181,10 @@  extern void rtl_check_failed_code1 (const_rtx, enum rtx_code, const char *,
 extern void rtl_check_failed_code2 (const_rtx, enum rtx_code, enum rtx_code,
 				    const char *, int, const char *)
     ATTRIBUTE_NORETURN ATTRIBUTE_COLD;
+extern void rtl_check_failed_code3 (const_rtx, enum rtx_code, enum rtx_code,
+				    enum rtx_code, const char *, int,
+				    const char *)
+    ATTRIBUTE_NORETURN ATTRIBUTE_COLD;
 extern void rtl_check_failed_code_mode (const_rtx, enum rtx_code, machine_mode,
 					bool, const char *, int, const char *)
     ATTRIBUTE_NORETURN ATTRIBUTE_COLD;
@@ -1191,6 +1203,7 @@  extern void rtvec_check_failed_bounds (const_rtvec, int, const char *, int,
 #define RTL_CHECK2(RTX, N, C1, C2)  ((RTX)->u.fld[N])
 #define RTL_CHECKC1(RTX, N, C)	    ((RTX)->u.fld[N])
 #define RTL_CHECKC2(RTX, N, C1, C2) ((RTX)->u.fld[N])
+#define RTL_CHECKC3(RTX, N, C1, C2, C3) ((RTX)->u.fld[N])
 #define RTVEC_ELT(RTVEC, I)	    ((RTVEC)->elem[I])
 #define XWINT(RTX, N)		    ((RTX)->u.hwint[N])
 #define CWI_ELT(RTX, I)		    ((RTX)->u.hwiv.elem[I])
@@ -1345,6 +1358,7 @@  extern void rtl_check_failed_flag (const char *, const_rtx, const char *,
 #define XCVECLEN(RTX, N, C)	GET_NUM_ELEM (XCVEC (RTX, N, C))

 #define XC2EXP(RTX, N, C1, C2)      (RTL_CHECKC2 (RTX, N, C1, C2).rt_rtx)
+#define XC3EXP(RTX, N, C1, C2, C3)  (RTL_CHECKC3 (RTX, N, C1, C2, C3).rt_rtx)