diff mbox series

[v4] xtensa: Eliminate the use of callee-saved register that saves and restores only once

Message ID 465b0cbe-73ca-f5a0-661d-d34217e29b4d@yahoo.co.jp
State New
Headers show
Series [v4] xtensa: Eliminate the use of callee-saved register that saves and restores only once | expand

Commit Message

Takayuki 'January June' Suwa Jan. 19, 2023, 3:50 a.m. UTC
In the previous patch, if insn is JUMP_INSN or CALL_INSN, it bypasses the reg check (possibly FAIL).

=====
In the case of the CALL0 ABI, values that must be retained before and
after function calls are placed in the callee-saved registers (A12
through A15) and referenced later.  However, it is often the case that
the save and the reference are each only once and a simple register-
register move (the frame pointer is needed to recover the stack pointer
and must be excluded).

e.g. in the following example, if there are no other occurrences of
register A14:

;; before
	; prologue {
  ...
	s32i.n	a14, sp, 16
  ...
	; } prologue
  ...
	mov.n	a14, a6
  ...
	call0	foo
  ...
	mov.n	a8, a14
  ...
	; epilogue {
  ...
	l32i.n	a14, sp, 16
  ...
	; } epilogue

It can be possible like this:

;; after
	; prologue {
  ...
	(deleted)
  ...
	; } prologue
  ...
	s32i.n	a6, sp, 16
  ...
	call0	foo
  ...
	l32i.n	a8, sp, 16
  ...
	; epilogue {
  ...
	(deleted)
  ...
	; } epilogue

This patch introduces a new peephole2 pattern that implements the above.

gcc/ChangeLog:

	* config/xtensa/xtensa.md: New peephole2 pattern that eliminates
	the use of callee-saved register that saves and restores only once
	for other register, by using its stack slot directly.
---
 gcc/config/xtensa/xtensa.md | 62 +++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

Comments

Max Filippov Jan. 20, 2023, 3:14 p.m. UTC | #1
Hi Suwa-san,

On Wed, Jan 18, 2023 at 7:50 PM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
>
> In the previous patch, if insn is JUMP_INSN or CALL_INSN, it bypasses the reg check (possibly FAIL).
>
> =====
> In the case of the CALL0 ABI, values that must be retained before and
> after function calls are placed in the callee-saved registers (A12
> through A15) and referenced later.  However, it is often the case that
> the save and the reference are each only once and a simple register-
> register move (the frame pointer is needed to recover the stack pointer
> and must be excluded).
>
> e.g. in the following example, if there are no other occurrences of
> register A14:
>
> ;; before
>         ; prologue {
>   ...
>         s32i.n  a14, sp, 16
>   ...
>         ; } prologue
>   ...
>         mov.n   a14, a6
>   ...
>         call0   foo
>   ...
>         mov.n   a8, a14
>   ...
>         ; epilogue {
>   ...
>         l32i.n  a14, sp, 16
>   ...
>         ; } epilogue
>
> It can be possible like this:
>
> ;; after
>         ; prologue {
>   ...
>         (deleted)
>   ...
>         ; } prologue
>   ...
>         s32i.n  a6, sp, 16
>   ...
>         call0   foo
>   ...
>         l32i.n  a8, sp, 16
>   ...
>         ; epilogue {
>   ...
>         (deleted)
>   ...
>         ; } epilogue
>
> This patch introduces a new peephole2 pattern that implements the above.
>
> gcc/ChangeLog:
>
>         * config/xtensa/xtensa.md: New peephole2 pattern that eliminates
>         the use of callee-saved register that saves and restores only once
>         for other register, by using its stack slot directly.
> ---
>  gcc/config/xtensa/xtensa.md | 62 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 62 insertions(+)

There are still issues with this change in the libgomp:

FAIL: libgomp.c/examples-4/target-1.c execution test
FAIL: libgomp.c/examples-4/target-2.c execution test

They come from the following function:

code produced before the change:
       .literal_position
       .literal .LC8, init@PLT
       .literal .LC9, 400000
       .literal .LC10, 100000
       .literal .LC11, -800000
       .literal .LC12, 800000
       .align  4
       .global vec_mult_ref
       .type   vec_mult_ref, @function
vec_mult_ref:
       l32r    a9, .LC11
       addi    sp, sp, -16
       l32r    a10, .LC9
       s32i.n  a12, sp, 8
       s32i.n  a13, sp, 4
       s32i.n  a0, sp, 12
       add.n   sp, sp, a9
       add.n   a12, sp, a10
       l32r    a9, .LC8
       mov.n   a13, a2
       mov.n   a3, sp
       mov.n   a2, a12
       callx0  a9
       l32r    a7, .LC10
       mov.n   a10, a12
       mov.n   a11, sp
       mov.n   a2, a13
       loop    a7, .L17_LEND
.L17:
       l32i.n  a9, a10, 0
       l32i.n  a6, a11, 0
       addi.n  a10, a10, 4
       mull    a9, a9, a6
       addi.n  a11, a11, 4
       s32i.n  a9, a2, 0
       addi.n  a2, a2, 4
       .L17_LEND:
       l32r    a9, .LC12
       add.n   sp, sp, a9
       l32i.n  a0, sp, 12
       l32i.n  a12, sp, 8
       l32i.n  a13, sp, 4
       addi    sp, sp, 16
       ret.n

--------------------

with the change:
       .literal_position
       .literal .LC8, init@PLT
       .literal .LC9, 400000
       .literal .LC10, 100000
       .literal .LC11, -800000
       .literal .LC12, 800000
       .align  4
       .global vec_mult_ref
       .type   vec_mult_ref, @function
vec_mult_ref:
       l32r    a9, .LC11
       l32r    a10, .LC9
       addi    sp, sp, -16
       s32i.n  a12, sp, 8
       s32i.n  a0, sp, 12
       add.n   sp, sp, a9
       add.n   a12, sp, a10
       l32r    a9, .LC8
       s32i.n  a2, sp, 4
       mov.n   a3, sp
       mov.n   a2, a12
       callx0  a9
       l32r    a7, .LC10
       l32i.n  a2, sp, 4
       mov.n   a10, a12
       mov.n   a11, sp
       loop    a7, .L17_LEND
.L17:
       l32i.n  a9, a10, 0
       l32i.n  a6, a11, 0
       addi.n  a10, a10, 4
       mull    a9, a9, a6
       addi.n  a11, a11, 4
       s32i.n  a9, a2, 0
       addi.n  a2, a2, 4
       .L17_LEND:
       l32r    a9, .LC12
       add.n   sp, sp, a9
       l32i.n  a0, sp, 12
       l32i.n  a12, sp, 8
       addi    sp, sp, 16
       ret.n

the stack pointer is modified after saving callee-saved registers,
but the stack offset where a2 is stored and reloaded does not take
this into an account.

After having this many attempts and getting to the issues that are
really hard to detect I wonder if the target backend is the right place
for this optimization?
Takayuki 'January June' Suwa Jan. 21, 2023, 4:39 a.m. UTC | #2
On 2023/01/21 0:14, Max Filippov wrote:
> Hi Suwa-san,
Hi!

> 
> On Wed, Jan 18, 2023 at 7:50 PM Takayuki 'January June' Suwa
> <jjsuwa_sys3175@yahoo.co.jp> wrote:
>>
>> In the previous patch, if insn is JUMP_INSN or CALL_INSN, it bypasses the reg check (possibly FAIL).
>>
>> =====
>> In the case of the CALL0 ABI, values that must be retained before and
>> after function calls are placed in the callee-saved registers (A12
>> through A15) and referenced later.  However, it is often the case that
>> the save and the reference are each only once and a simple register-
>> register move (the frame pointer is needed to recover the stack pointer
>> and must be excluded).
>>
>> e.g. in the following example, if there are no other occurrences of
>> register A14:
>>
>> ;; before
>>         ; prologue {
>>   ...
>>         s32i.n  a14, sp, 16
>>   ...
>>         ; } prologue
>>   ...
>>         mov.n   a14, a6
>>   ...
>>         call0   foo
>>   ...
>>         mov.n   a8, a14
>>   ...
>>         ; epilogue {
>>   ...
>>         l32i.n  a14, sp, 16
>>   ...
>>         ; } epilogue
>>
>> It can be possible like this:
>>
>> ;; after
>>         ; prologue {
>>   ...
>>         (deleted)
>>   ...
>>         ; } prologue
>>   ...
>>         s32i.n  a6, sp, 16
>>   ...
>>         call0   foo
>>   ...
>>         l32i.n  a8, sp, 16
>>   ...
>>         ; epilogue {
>>   ...
>>         (deleted)
>>   ...
>>         ; } epilogue
>>
>> This patch introduces a new peephole2 pattern that implements the above.
>>
>> gcc/ChangeLog:
>>
>>         * config/xtensa/xtensa.md: New peephole2 pattern that eliminates
>>         the use of callee-saved register that saves and restores only once
>>         for other register, by using its stack slot directly.
>> ---
>>  gcc/config/xtensa/xtensa.md | 62 +++++++++++++++++++++++++++++++++++++
>>  1 file changed, 62 insertions(+)
> 
> There are still issues with this change in the libgomp:
> 
> FAIL: libgomp.c/examples-4/target-1.c execution test
> FAIL: libgomp.c/examples-4/target-2.c execution test
> 
> They come from the following function:
> 
> code produced before the change:
>        .literal_position
>        .literal .LC8, init@PLT
>        .literal .LC9, 400000
>        .literal .LC10, 100000
>        .literal .LC11, -800000
>        .literal .LC12, 800000
>        .align  4
>        .global vec_mult_ref
>        .type   vec_mult_ref, @function
> vec_mult_ref:
>        l32r    a9, .LC11
>        addi    sp, sp, -16
>        l32r    a10, .LC9
>        s32i.n  a12, sp, 8
>        s32i.n  a13, sp, 4
>        s32i.n  a0, sp, 12
>        add.n   sp, sp, a9
>        add.n   a12, sp, a10
>        l32r    a9, .LC8
>        mov.n   a13, a2
>        mov.n   a3, sp
>        mov.n   a2, a12
>        callx0  a9
>        l32r    a7, .LC10
>        mov.n   a10, a12
>        mov.n   a11, sp
>        mov.n   a2, a13
>        loop    a7, .L17_LEND
> .L17:
>        l32i.n  a9, a10, 0
>        l32i.n  a6, a11, 0
>        addi.n  a10, a10, 4
>        mull    a9, a9, a6
>        addi.n  a11, a11, 4
>        s32i.n  a9, a2, 0
>        addi.n  a2, a2, 4
>        .L17_LEND:
>        l32r    a9, .LC12
>        add.n   sp, sp, a9
>        l32i.n  a0, sp, 12
>        l32i.n  a12, sp, 8
>        l32i.n  a13, sp, 4
>        addi    sp, sp, 16
>        ret.n
> 
> --------------------
> 
> with the change:
>        .literal_position
>        .literal .LC8, init@PLT
>        .literal .LC9, 400000
>        .literal .LC10, 100000
>        .literal .LC11, -800000
>        .literal .LC12, 800000
>        .align  4
>        .global vec_mult_ref
>        .type   vec_mult_ref, @function
> vec_mult_ref:
>        l32r    a9, .LC11
>        l32r    a10, .LC9
>        addi    sp, sp, -16
>        s32i.n  a12, sp, 8
>        s32i.n  a0, sp, 12
>        add.n   sp, sp, a9
>        add.n   a12, sp, a10
>        l32r    a9, .LC8
>        s32i.n  a2, sp, 4
>        mov.n   a3, sp
>        mov.n   a2, a12
>        callx0  a9
>        l32r    a7, .LC10
>        l32i.n  a2, sp, 4
>        mov.n   a10, a12
>        mov.n   a11, sp
>        loop    a7, .L17_LEND
> .L17:
>        l32i.n  a9, a10, 0
>        l32i.n  a6, a11, 0
>        addi.n  a10, a10, 4
>        mull    a9, a9, a6
>        addi.n  a11, a11, 4
>        s32i.n  a9, a2, 0
>        addi.n  a2, a2, 4
>        .L17_LEND:
>        l32r    a9, .LC12
>        add.n   sp, sp, a9
>        l32i.n  a0, sp, 12
>        l32i.n  a12, sp, 8
>        addi    sp, sp, 16
>        ret.n
> 
> the stack pointer is modified after saving callee-saved registers,
> but the stack offset where a2 is stored and reloaded does not take
> this into an account.
> 
> After having this many attempts and getting to the issues that are
> really hard to detect I wonder if the target backend is the right place
> for this optimization?
> 
I guess they are not hard to detect but just issues I didn't anticipate (and I just need a little more work).
And where else should it be done?  What about implementing a target-specific pass just for one-point optimization?
Max Filippov Jan. 22, 2023, 3:45 p.m. UTC | #3
On Fri, Jan 20, 2023 at 8:39 PM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
> On 2023/01/21 0:14, Max Filippov wrote:
> > After having this many attempts and getting to the issues that are
> > really hard to detect I wonder if the target backend is the right place
> > for this optimization?
> >
> I guess they are not hard to detect

I mean, on the testing side. check-gcc testsuite passed without new
regressions with this change, linux kernel smoke test passed, I was
almost convinced that it's ok to commit.

> but just issues I didn't anticipate (and I just need a little more work).

Looking at other peephole2 patterns I see that their code transformations
are much more compact and they don't need to track additional properties
of unrelated instructions.

> And where else should it be done?  What about implementing a
> target-specific pass just for one-point optimization?

I don't even understand what's target-specific in this optimization?
It looks very generic to me.
Takayuki 'January June' Suwa Jan. 23, 2023, 3:33 a.m. UTC | #4
On 2023/01/23 0:45, Max Filippov wrote:
> On Fri, Jan 20, 2023 at 8:39 PM Takayuki 'January June' Suwa
> <jjsuwa_sys3175@yahoo.co.jp> wrote:
>> On 2023/01/21 0:14, Max Filippov wrote:
>>> After having this many attempts and getting to the issues that are
>>> really hard to detect I wonder if the target backend is the right place
>>> for this optimization?
>>>
>> I guess they are not hard to detect
> 
> I mean, on the testing side. check-gcc testsuite passed without new
> regressions with this change, linux kernel smoke test passed, I was
> almost convinced that it's ok to commit.
> 
>> but just issues I didn't anticipate (and I just need a little more work).
> 
> Looking at other peephole2 patterns I see that their code transformations
> are much more compact and they don't need to track additional properties
> of unrelated instructions.
> 
>> And where else should it be done?  What about implementing a
>> target-specific pass just for one-point optimization?
> 
> I don't even understand what's target-specific in this optimization?
> It looks very generic to me.
> 

Ah, I seem to have misunderstood what you meant, sorry.

Now, what this patch is trying to do depends on whether register moves can be converted to stack pointer indirect loads/stores with offsets, and whether there is any benefit in doing so, but they are not target dependent. Is it?

If we want the target-independent part to do something like this, we will need a mechanism (macro, hook, etc.) to write appropriate information in the machine description and pass it on.

For example, offset ranges for register indirect loads and stores, or whether the ABI requires that callee-saved registers always be associated with stack slots, or even the need for stack frame construction...

I totally agree that the peephole2 pattern is not the best implementation location.
diff mbox series

Patch

diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 4f1e8fd13..ac04ef6f0 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -3029,3 +3029,65 @@  FALLTHRU:;
   operands[1] = GEN_INT (imm0);
   operands[2] = GEN_INT (imm1);
 })
+
+(define_peephole2
+  [(set (match_operand:SI 0 "register_operand")
+	(match_operand:SI 1 "reload_operand"))]
+  "!TARGET_WINDOWED_ABI && df
+   && epilogue_contains (insn)
+   && ! call_used_or_fixed_reg_p (REGNO (operands[0]))
+   && (!frame_pointer_needed
+       || REGNO (operands[0]) != HARD_FRAME_POINTER_REGNUM)"
+  [(const_int 0)]
+{
+  rtx reg = operands[0], pattern;
+  rtx_insn *insnP = NULL, *insnS = NULL, *insnR = NULL;
+  df_ref ref;
+  rtx_insn *insn;
+  for (ref = DF_REG_DEF_CHAIN (REGNO (reg));
+       ref; ref = DF_REF_NEXT_REG (ref))
+    if (DF_REF_CLASS (ref) != DF_REF_REGULAR
+	|| DEBUG_INSN_P (insn = DF_REF_INSN (ref)))
+      continue;
+    else if (insn == curr_insn)
+      continue;
+    else if (GET_CODE (pattern = PATTERN (insn)) == SET
+	     && rtx_equal_p (SET_DEST (pattern), reg)
+	     && REG_P (SET_SRC (pattern)))
+      {
+	if (insnS)
+	  FAIL;
+	insnS = insn;
+	continue;
+      }
+    else
+      FAIL;
+  for (ref = DF_REG_USE_CHAIN (REGNO (reg));
+       ref; ref = DF_REF_NEXT_REG (ref))
+    if (DF_REF_CLASS (ref) != DF_REF_REGULAR
+	|| DEBUG_INSN_P (insn = DF_REF_INSN (ref)))
+      continue;
+    else if (prologue_contains (insn))
+      {
+	insnP = insn;
+	continue;
+      }
+    else if (GET_CODE (pattern = PATTERN (insn)) == SET
+	     && rtx_equal_p (SET_SRC (pattern), reg)
+	     && REG_P (SET_DEST (pattern)))
+      {
+	if (insnR)
+	  FAIL;
+	insnR = insn;
+	continue;
+      }
+    else
+      FAIL;
+  if (!insnP || !insnS || !insnR)
+    FAIL;
+  SET_DEST (PATTERN (insnS)) = copy_rtx (operands[1]);
+  df_insn_rescan (insnS);
+  SET_SRC (PATTERN (insnR)) = copy_rtx (operands[1]);
+  df_insn_rescan (insnR);
+  set_insn_deleted (insnP);
+})