diff mbox series

[v2,2/2] RISC-V: avoid LUI based const mat in prologue/epilogue expansion [PR/105733]

Message ID 20240513184932.662109-3-vineetg@rivosinc.com
State New
Headers show
Series RISC-V improve stack/array access by constant mat tweak | expand

Commit Message

Vineet Gupta May 13, 2024, 6:49 p.m. UTC
If the constant used for stack offset can be expressed as sum of two S12
values, the constant need not be materialized (in a reg) and instead the
two S12 bits can be added to instructions involved with frame pointer.
This avoids burning a register and more importantly can often get down
to be 2 insn vs. 3.

The prev patches to generally avoid LUI based const materialization didn't
fix this PR and need this directed fix in funcion prologue/epilogue
expansion.

This fix doesn't move the neddle for SPEC, at all, but it is still a
win considering gcc generates one insn fewer than llvm for the test ;-)

   gcc-13.1 release   |      gcc 230823     |                   |
                      |    g6619b3d4c15c    |   This patch      |  clang/llvm
---------------------------------------------------------------------------------
li      t0,-4096     | li    t0,-4096      | addi  sp,sp,-2048 | addi sp,sp,-2048
addi    t0,t0,2016   | addi  t0,t0,2032    | add   sp,sp,-16   | addi sp,sp,-32
li      a4,4096      | add   sp,sp,t0      | add   a5,sp,a0    | add  a1,sp,16
add     sp,sp,t0     | addi  a5,sp,-2032   | sb    zero,0(a5)  | add  a0,a0,a1
li      a5,-4096     | add   a0,a5,a0      | addi  sp,sp,2032  | sb   zero,0(a0)
addi    a4,a4,-2032  | li    t0, 4096      | addi  sp,sp,32    | addi sp,sp,2032
add     a4,a4,a5     | sb    zero,2032(a0) | ret               | addi sp,sp,48
addi    a5,sp,16     | addi  t0,t0,-2032   |                   | ret
add     a5,a4,a5     | add   sp,sp,t0      |
add     a0,a5,a0     | ret                 |
li      t0,4096      |
sd      a5,8(sp)     |
sb      zero,2032(a0)|
addi    t0,t0,-2016  |
add     sp,sp,t0     |
ret                  |

gcc/ChangeLog:
	PR target/105733
	* config/riscv/riscv.h: New macros for with aligned offsets.
	* config/riscv/riscv.cc (riscv_split_sum_of_two_s12): New
	function to split a sum of two s12 values into constituents.
	(riscv_expand_prologue): Handle offset being sum of two S12.
	(riscv_expand_epilogue): Ditto.
	* config/riscv/riscv-protos.h (riscv_split_sum_of_two_s12): New.

gcc/testsuite/ChangeLog:
	* gcc.target/riscv/pr105733.c: New Test.
	* gcc.target/riscv/rvv/autovec/vls/spill-1.c: Adjust to not
	expect LUI 4096.
	* gcc.target/riscv/rvv/autovec/vls/spill-2.c: Ditto.
	* gcc.target/riscv/rvv/autovec/vls/spill-3.c: Ditto.
	* gcc.target/riscv/rvv/autovec/vls/spill-4.c: Ditto.
	* gcc.target/riscv/rvv/autovec/vls/spill-5.c: Ditto.
	* gcc.target/riscv/rvv/autovec/vls/spill-6.c: Ditto.
	* gcc.target/riscv/rvv/autovec/vls/spill-7.c: Ditto.

Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
---
 gcc/config/riscv/riscv-protos.h               |  2 +
 gcc/config/riscv/riscv.cc                     | 74 +++++++++++++++++--
 gcc/config/riscv/riscv.h                      |  7 ++
 gcc/testsuite/gcc.target/riscv/pr105733.c     | 15 ++++
 .../riscv/rvv/autovec/vls/spill-1.c           |  4 +-
 .../riscv/rvv/autovec/vls/spill-2.c           |  4 +-
 .../riscv/rvv/autovec/vls/spill-3.c           |  4 +-
 .../riscv/rvv/autovec/vls/spill-4.c           |  4 +-
 .../riscv/rvv/autovec/vls/spill-5.c           |  4 +-
 .../riscv/rvv/autovec/vls/spill-6.c           |  4 +-
 .../riscv/rvv/autovec/vls/spill-7.c           |  4 +-
 11 files changed, 105 insertions(+), 21 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/pr105733.c

Comments

Jeff Law May 13, 2024, 8:28 p.m. UTC | #1
On 5/13/24 12:49 PM, Vineet Gupta wrote:
> If the constant used for stack offset can be expressed as sum of two S12
> values, the constant need not be materialized (in a reg) and instead the
> two S12 bits can be added to instructions involved with frame pointer.
> This avoids burning a register and more importantly can often get down
> to be 2 insn vs. 3.
> 
> The prev patches to generally avoid LUI based const materialization didn't
> fix this PR and need this directed fix in funcion prologue/epilogue
> expansion.
> 
> This fix doesn't move the neddle for SPEC, at all, but it is still a
> win considering gcc generates one insn fewer than llvm for the test ;-)
> 
>     gcc-13.1 release   |      gcc 230823     |                   |
>                        |    g6619b3d4c15c    |   This patch      |  clang/llvm
> ---------------------------------------------------------------------------------
> li      t0,-4096     | li    t0,-4096      | addi  sp,sp,-2048 | addi sp,sp,-2048
> addi    t0,t0,2016   | addi  t0,t0,2032    | add   sp,sp,-16   | addi sp,sp,-32
> li      a4,4096      | add   sp,sp,t0      | add   a5,sp,a0    | add  a1,sp,16
> add     sp,sp,t0     | addi  a5,sp,-2032   | sb    zero,0(a5)  | add  a0,a0,a1
> li      a5,-4096     | add   a0,a5,a0      | addi  sp,sp,2032  | sb   zero,0(a0)
> addi    a4,a4,-2032  | li    t0, 4096      | addi  sp,sp,32    | addi sp,sp,2032
> add     a4,a4,a5     | sb    zero,2032(a0) | ret               | addi sp,sp,48
> addi    a5,sp,16     | addi  t0,t0,-2032   |                   | ret
> add     a5,a4,a5     | add   sp,sp,t0      |
> add     a0,a5,a0     | ret                 |
> li      t0,4096      |
> sd      a5,8(sp)     |
> sb      zero,2032(a0)|
> addi    t0,t0,-2016  |
> add     sp,sp,t0     |
> ret                  |
> 
> gcc/ChangeLog:
> 	PR target/105733
> 	* config/riscv/riscv.h: New macros for with aligned offsets.
> 	* config/riscv/riscv.cc (riscv_split_sum_of_two_s12): New
> 	function to split a sum of two s12 values into constituents.
> 	(riscv_expand_prologue): Handle offset being sum of two S12.
> 	(riscv_expand_epilogue): Ditto.
> 	* config/riscv/riscv-protos.h (riscv_split_sum_of_two_s12): New.
> 
> gcc/testsuite/ChangeLog:
> 	* gcc.target/riscv/pr105733.c: New Test.
> 	* gcc.target/riscv/rvv/autovec/vls/spill-1.c: Adjust to not
> 	expect LUI 4096.
> 	* gcc.target/riscv/rvv/autovec/vls/spill-2.c: Ditto.
> 	* gcc.target/riscv/rvv/autovec/vls/spill-3.c: Ditto.
> 	* gcc.target/riscv/rvv/autovec/vls/spill-4.c: Ditto.
> 	* gcc.target/riscv/rvv/autovec/vls/spill-5.c: Ditto.
> 	* gcc.target/riscv/rvv/autovec/vls/spill-6.c: Ditto.
> 	* gcc.target/riscv/rvv/autovec/vls/spill-7.c: Ditto.
> 


> @@ -8074,14 +8111,26 @@ riscv_expand_epilogue (int style)
>   	}
>         else
>   	{
> -	  if (!SMALL_OPERAND (adjust_offset.to_constant ()))
> +	  HOST_WIDE_INT adj_off_value = adjust_offset.to_constant ();
> +	  if (SMALL_OPERAND (adj_off_value))
> +	    {
> +	      adjust = GEN_INT (adj_off_value);
> +	    }
> +	  else if (SUM_OF_TWO_S12_ALGN (adj_off_value))
> +	    {
> +	      HOST_WIDE_INT base, off;
> +	      riscv_split_sum_of_two_s12 (adj_off_value, &base, &off);
> +	      insn = gen_add3_insn (stack_pointer_rtx, hard_frame_pointer_rtx,
> +					GEN_INT (base));
> +	      RTX_FRAME_RELATED_P (insn) = 1;
> +	      adjust = GEN_INT (off);
> +	    }
So this was the hunk that we identified internally as causing problems 
with libgomp's testsuite.  We never fully chased it down as this hunk 
didn't seem terribly important performance wise -- we just set it aside. 
  The thing is it looked basically correct to me.  So the failure was 
certainly unexpected, but it was consistent.

So I think the question is whether or not the CI system runs the libgomp 
testsuite, particularly in the rv64 linux configuration.  If it does, 
and it passes, then we're good.  I'm still finding my way around the 
configuration, so I don't know if the CI system Edwin & Patrick have 
built tests libgomp or not.

If it isn't run, then we'll need to do a run to test that.  I'm set up 
here to do that if needed.   I can just drop this version into our 
internal tree, trigger an internal CI run and see if it complains :-)

If it does complain, then we know where to start investigations.




Jeff
Patrick O'Neill May 14, 2024, 12:54 a.m. UTC | #2
On 5/13/24 13:28, Jeff Law wrote:
>
>
> On 5/13/24 12:49 PM, Vineet Gupta wrote:
>> If the constant used for stack offset can be expressed as sum of two S12
>> values, the constant need not be materialized (in a reg) and instead the
>> two S12 bits can be added to instructions involved with frame pointer.
>> This avoids burning a register and more importantly can often get down
>> to be 2 insn vs. 3.
>>
>> The prev patches to generally avoid LUI based const materialization 
>> didn't
>> fix this PR and need this directed fix in funcion prologue/epilogue
>> expansion.
>>
>> This fix doesn't move the neddle for SPEC, at all, but it is still a
>> win considering gcc generates one insn fewer than llvm for the test ;-)
>>
>>     gcc-13.1 release   |      gcc 230823     | |
>>                        |    g6619b3d4c15c    |   This patch |  
>> clang/llvm
>> --------------------------------------------------------------------------------- 
>>
>> li      t0,-4096     | li    t0,-4096      | addi  sp,sp,-2048 | addi 
>> sp,sp,-2048
>> addi    t0,t0,2016   | addi  t0,t0,2032    | add   sp,sp,-16   | addi 
>> sp,sp,-32
>> li      a4,4096      | add   sp,sp,t0      | add   a5,sp,a0    | add  
>> a1,sp,16
>> add     sp,sp,t0     | addi  a5,sp,-2032   | sb    zero,0(a5)  | add  
>> a0,a0,a1
>> li      a5,-4096     | add   a0,a5,a0      | addi  sp,sp,2032  | sb   
>> zero,0(a0)
>> addi    a4,a4,-2032  | li    t0, 4096      | addi  sp,sp,32    | addi 
>> sp,sp,2032
>> add     a4,a4,a5     | sb    zero,2032(a0) | ret               | addi 
>> sp,sp,48
>> addi    a5,sp,16     | addi  t0,t0,-2032   |                   | ret
>> add     a5,a4,a5     | add   sp,sp,t0      |
>> add     a0,a5,a0     | ret                 |
>> li      t0,4096      |
>> sd      a5,8(sp)     |
>> sb      zero,2032(a0)|
>> addi    t0,t0,-2016  |
>> add     sp,sp,t0     |
>> ret                  |
>>
>> gcc/ChangeLog:
>>     PR target/105733
>>     * config/riscv/riscv.h: New macros for with aligned offsets.
>>     * config/riscv/riscv.cc (riscv_split_sum_of_two_s12): New
>>     function to split a sum of two s12 values into constituents.
>>     (riscv_expand_prologue): Handle offset being sum of two S12.
>>     (riscv_expand_epilogue): Ditto.
>>     * config/riscv/riscv-protos.h (riscv_split_sum_of_two_s12): New.
>>
>> gcc/testsuite/ChangeLog:
>>     * gcc.target/riscv/pr105733.c: New Test.
>>     * gcc.target/riscv/rvv/autovec/vls/spill-1.c: Adjust to not
>>     expect LUI 4096.
>>     * gcc.target/riscv/rvv/autovec/vls/spill-2.c: Ditto.
>>     * gcc.target/riscv/rvv/autovec/vls/spill-3.c: Ditto.
>>     * gcc.target/riscv/rvv/autovec/vls/spill-4.c: Ditto.
>>     * gcc.target/riscv/rvv/autovec/vls/spill-5.c: Ditto.
>>     * gcc.target/riscv/rvv/autovec/vls/spill-6.c: Ditto.
>>     * gcc.target/riscv/rvv/autovec/vls/spill-7.c: Ditto.
>>
>
>
>> @@ -8074,14 +8111,26 @@ riscv_expand_epilogue (int style)
>>       }
>>         else
>>       {
>> -      if (!SMALL_OPERAND (adjust_offset.to_constant ()))
>> +      HOST_WIDE_INT adj_off_value = adjust_offset.to_constant ();
>> +      if (SMALL_OPERAND (adj_off_value))
>> +        {
>> +          adjust = GEN_INT (adj_off_value);
>> +        }
>> +      else if (SUM_OF_TWO_S12_ALGN (adj_off_value))
>> +        {
>> +          HOST_WIDE_INT base, off;
>> +          riscv_split_sum_of_two_s12 (adj_off_value, &base, &off);
>> +          insn = gen_add3_insn (stack_pointer_rtx, 
>> hard_frame_pointer_rtx,
>> +                    GEN_INT (base));
>> +          RTX_FRAME_RELATED_P (insn) = 1;
>> +          adjust = GEN_INT (off);
>> +        }
> So this was the hunk that we identified internally as causing problems 
> with libgomp's testsuite.  We never fully chased it down as this hunk 
> didn't seem terribly important performance wise -- we just set it 
> aside.  The thing is it looked basically correct to me.  So the 
> failure was certainly unexpected, but it was consistent.
>
> So I think the question is whether or not the CI system runs the 
> libgomp testsuite, particularly in the rv64 linux configuration. If it 
> does, and it passes, then we're good.  I'm still finding my way around 
> the configuration, so I don't know if the CI system Edwin & Patrick 
> have built tests libgomp or not.

I poked around the .sum files in pre/postcommit and we do run tests like:

PASS: c-c++-common/gomp/affinity-2.c  (test for errors, line 45)

I'm not familar with libgomp so I don't know if that's the same libgomp 
tests you're referring to.

Patrick

>
> If it isn't run, then we'll need to do a run to test that.  I'm set up 
> here to do that if needed.   I can just drop this version into our 
> internal tree, trigger an internal CI run and see if it complains :-)
>
> If it does complain, then we know where to start investigations.
>
>
>
>
> Jeff
>
Jeff Law May 14, 2024, 3:36 a.m. UTC | #3
On 5/13/24 6:54 PM, Patrick O'Neill wrote:
> 
> On 5/13/24 13:28, Jeff Law wrote:
>>
>>
>> On 5/13/24 12:49 PM, Vineet Gupta wrote:
>>> If the constant used for stack offset can be expressed as sum of two S12
>>> values, the constant need not be materialized (in a reg) and instead the
>>> two S12 bits can be added to instructions involved with frame pointer.
>>> This avoids burning a register and more importantly can often get down
>>> to be 2 insn vs. 3.
>>>
>>> The prev patches to generally avoid LUI based const materialization 
>>> didn't
>>> fix this PR and need this directed fix in funcion prologue/epilogue
>>> expansion.
>>>
>>> This fix doesn't move the neddle for SPEC, at all, but it is still a
>>> win considering gcc generates one insn fewer than llvm for the test ;-)
>>>
>>>     gcc-13.1 release   |      gcc 230823     | |
>>>                        |    g6619b3d4c15c    |   This patch | clang/llvm
>>> ---------------------------------------------------------------------------------
>>> li      t0,-4096     | li    t0,-4096      | addi  sp,sp,-2048 | addi 
>>> sp,sp,-2048
>>> addi    t0,t0,2016   | addi  t0,t0,2032    | add   sp,sp,-16   | addi 
>>> sp,sp,-32
>>> li      a4,4096      | add   sp,sp,t0      | add   a5,sp,a0    | add 
>>> a1,sp,16
>>> add     sp,sp,t0     | addi  a5,sp,-2032   | sb    zero,0(a5)  | add 
>>> a0,a0,a1
>>> li      a5,-4096     | add   a0,a5,a0      | addi  sp,sp,2032  | sb 
>>> zero,0(a0)
>>> addi    a4,a4,-2032  | li    t0, 4096      | addi  sp,sp,32    | addi 
>>> sp,sp,2032
>>> add     a4,a4,a5     | sb    zero,2032(a0) | ret               | addi 
>>> sp,sp,48
>>> addi    a5,sp,16     | addi  t0,t0,-2032   |                   | ret
>>> add     a5,a4,a5     | add   sp,sp,t0      |
>>> add     a0,a5,a0     | ret                 |
>>> li      t0,4096      |
>>> sd      a5,8(sp)     |
>>> sb      zero,2032(a0)|
>>> addi    t0,t0,-2016  |
>>> add     sp,sp,t0     |
>>> ret                  |
>>>
>>> gcc/ChangeLog:
>>>     PR target/105733
>>>     * config/riscv/riscv.h: New macros for with aligned offsets.
>>>     * config/riscv/riscv.cc (riscv_split_sum_of_two_s12): New
>>>     function to split a sum of two s12 values into constituents.
>>>     (riscv_expand_prologue): Handle offset being sum of two S12.
>>>     (riscv_expand_epilogue): Ditto.
>>>     * config/riscv/riscv-protos.h (riscv_split_sum_of_two_s12): New.
>>>
>>> gcc/testsuite/ChangeLog:
>>>     * gcc.target/riscv/pr105733.c: New Test.
>>>     * gcc.target/riscv/rvv/autovec/vls/spill-1.c: Adjust to not
>>>     expect LUI 4096.
>>>     * gcc.target/riscv/rvv/autovec/vls/spill-2.c: Ditto.
>>>     * gcc.target/riscv/rvv/autovec/vls/spill-3.c: Ditto.
>>>     * gcc.target/riscv/rvv/autovec/vls/spill-4.c: Ditto.
>>>     * gcc.target/riscv/rvv/autovec/vls/spill-5.c: Ditto.
>>>     * gcc.target/riscv/rvv/autovec/vls/spill-6.c: Ditto.
>>>     * gcc.target/riscv/rvv/autovec/vls/spill-7.c: Ditto.
>>>
>>
>>
>>> @@ -8074,14 +8111,26 @@ riscv_expand_epilogue (int style)
>>>       }
>>>         else
>>>       {
>>> -      if (!SMALL_OPERAND (adjust_offset.to_constant ()))
>>> +      HOST_WIDE_INT adj_off_value = adjust_offset.to_constant ();
>>> +      if (SMALL_OPERAND (adj_off_value))
>>> +        {
>>> +          adjust = GEN_INT (adj_off_value);
>>> +        }
>>> +      else if (SUM_OF_TWO_S12_ALGN (adj_off_value))
>>> +        {
>>> +          HOST_WIDE_INT base, off;
>>> +          riscv_split_sum_of_two_s12 (adj_off_value, &base, &off);
>>> +          insn = gen_add3_insn (stack_pointer_rtx, 
>>> hard_frame_pointer_rtx,
>>> +                    GEN_INT (base));
>>> +          RTX_FRAME_RELATED_P (insn) = 1;
>>> +          adjust = GEN_INT (off);
>>> +        }
>> So this was the hunk that we identified internally as causing problems 
>> with libgomp's testsuite.  We never fully chased it down as this hunk 
>> didn't seem terribly important performance wise -- we just set it 
>> aside.  The thing is it looked basically correct to me.  So the 
>> failure was certainly unexpected, but it was consistent.
>>
>> So I think the question is whether or not the CI system runs the 
>> libgomp testsuite, particularly in the rv64 linux configuration. If it 
>> does, and it passes, then we're good.  I'm still finding my way around 
>> the configuration, so I don't know if the CI system Edwin & Patrick 
>> have built tests libgomp or not.
> 
> I poked around the .sum files in pre/postcommit and we do run tests like:
> 
> PASS: c-c++-common/gomp/affinity-2.c  (test for errors, line 45)
I was able to find the summary info:

> Tests that now fail, but worked before (15 tests):
> libgomp: libgomp.fortran/simd7.f90   -O0  execution test
> libgomp: libgomp.fortran/task2.f90   -O0  execution test
> libgomp: libgomp.fortran/vla2.f90   -O0  execution test
> libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  execution test
> libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
> libgomp: libgomp.fortran/vla4.f90   -O1  execution test
> libgomp: libgomp.fortran/vla4.f90   -O2  execution test
> libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  execution test
> libgomp: libgomp.fortran/vla4.f90   -O3 -g  execution test
> libgomp: libgomp.fortran/vla4.f90   -Os  execution test
> libgomp: libgomp.fortran/vla5.f90   -O1  execution test
> libgomp: libgomp.fortran/vla5.f90   -O2  execution test
> libgomp: libgomp.fortran/vla5.f90   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  execution test
> libgomp: libgomp.fortran/vla5.f90   -O3 -g  execution test
> libgomp: libgomp.fortran/vla5.f90   -Os  execution test

So if you could check on those, it'd be appreciated.

Jeff
Patrick O'Neill May 14, 2024, 2:51 p.m. UTC | #4
On 5/13/24 20:36, Jeff Law wrote:
>
>
> On 5/13/24 6:54 PM, Patrick O'Neill wrote:
>>
>> On 5/13/24 13:28, Jeff Law wrote:
>>>
>>>
>>> On 5/13/24 12:49 PM, Vineet Gupta wrote:
>>>> If the constant used for stack offset can be expressed as sum of 
>>>> two S12
>>>> values, the constant need not be materialized (in a reg) and 
>>>> instead the
>>>> two S12 bits can be added to instructions involved with frame pointer.
>>>> This avoids burning a register and more importantly can often get down
>>>> to be 2 insn vs. 3.
>>>>
>>>> The prev patches to generally avoid LUI based const materialization 
>>>> didn't
>>>> fix this PR and need this directed fix in funcion prologue/epilogue
>>>> expansion.
>>>>
>>>> This fix doesn't move the neddle for SPEC, at all, but it is still a
>>>> win considering gcc generates one insn fewer than llvm for the test 
>>>> ;-)
>>>>
>>>>     gcc-13.1 release   |      gcc 230823     | |
>>>>                        |    g6619b3d4c15c    |   This patch | 
>>>> clang/llvm
>>>> --------------------------------------------------------------------------------- 
>>>>
>>>> li      t0,-4096     | li    t0,-4096      | addi sp,sp,-2048 | 
>>>> addi sp,sp,-2048
>>>> addi    t0,t0,2016   | addi  t0,t0,2032    | add sp,sp,-16   | addi 
>>>> sp,sp,-32
>>>> li      a4,4096      | add   sp,sp,t0      | add a5,sp,a0    | add 
>>>> a1,sp,16
>>>> add     sp,sp,t0     | addi  a5,sp,-2032   | sb zero,0(a5)  | add 
>>>> a0,a0,a1
>>>> li      a5,-4096     | add   a0,a5,a0      | addi sp,sp,2032  | sb 
>>>> zero,0(a0)
>>>> addi    a4,a4,-2032  | li    t0, 4096      | addi sp,sp,32    | 
>>>> addi sp,sp,2032
>>>> add     a4,a4,a5     | sb    zero,2032(a0) | ret               | 
>>>> addi sp,sp,48
>>>> addi    a5,sp,16     | addi  t0,t0,-2032 |                   | ret
>>>> add     a5,a4,a5     | add   sp,sp,t0      |
>>>> add     a0,a5,a0     | ret                 |
>>>> li      t0,4096      |
>>>> sd      a5,8(sp)     |
>>>> sb      zero,2032(a0)|
>>>> addi    t0,t0,-2016  |
>>>> add     sp,sp,t0     |
>>>> ret                  |
>>>>
>>>> gcc/ChangeLog:
>>>>     PR target/105733
>>>>     * config/riscv/riscv.h: New macros for with aligned offsets.
>>>>     * config/riscv/riscv.cc (riscv_split_sum_of_two_s12): New
>>>>     function to split a sum of two s12 values into constituents.
>>>>     (riscv_expand_prologue): Handle offset being sum of two S12.
>>>>     (riscv_expand_epilogue): Ditto.
>>>>     * config/riscv/riscv-protos.h (riscv_split_sum_of_two_s12): New.
>>>>
>>>> gcc/testsuite/ChangeLog:
>>>>     * gcc.target/riscv/pr105733.c: New Test.
>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-1.c: Adjust to not
>>>>     expect LUI 4096.
>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-2.c: Ditto.
>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-3.c: Ditto.
>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-4.c: Ditto.
>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-5.c: Ditto.
>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-6.c: Ditto.
>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-7.c: Ditto.
>>>>
>>>
>>>
>>>> @@ -8074,14 +8111,26 @@ riscv_expand_epilogue (int style)
>>>>       }
>>>>         else
>>>>       {
>>>> -      if (!SMALL_OPERAND (adjust_offset.to_constant ()))
>>>> +      HOST_WIDE_INT adj_off_value = adjust_offset.to_constant ();
>>>> +      if (SMALL_OPERAND (adj_off_value))
>>>> +        {
>>>> +          adjust = GEN_INT (adj_off_value);
>>>> +        }
>>>> +      else if (SUM_OF_TWO_S12_ALGN (adj_off_value))
>>>> +        {
>>>> +          HOST_WIDE_INT base, off;
>>>> +          riscv_split_sum_of_two_s12 (adj_off_value, &base, &off);
>>>> +          insn = gen_add3_insn (stack_pointer_rtx, 
>>>> hard_frame_pointer_rtx,
>>>> +                    GEN_INT (base));
>>>> +          RTX_FRAME_RELATED_P (insn) = 1;
>>>> +          adjust = GEN_INT (off);
>>>> +        }
>>> So this was the hunk that we identified internally as causing 
>>> problems with libgomp's testsuite.  We never fully chased it down as 
>>> this hunk didn't seem terribly important performance wise -- we just 
>>> set it aside.  The thing is it looked basically correct to me.  So 
>>> the failure was certainly unexpected, but it was consistent.
>>>
>>> So I think the question is whether or not the CI system runs the 
>>> libgomp testsuite, particularly in the rv64 linux configuration. If 
>>> it does, and it passes, then we're good. I'm still finding my way 
>>> around the configuration, so I don't know if the CI system Edwin & 
>>> Patrick have built tests libgomp or not.
>>
>> I poked around the .sum files in pre/postcommit and we do run tests 
>> like:
>>
>> PASS: c-c++-common/gomp/affinity-2.c  (test for errors, line 45)
> I was able to find the summary info:
>
>> Tests that now fail, but worked before (15 tests):
>> libgomp: libgomp.fortran/simd7.f90   -O0  execution test
>> libgomp: libgomp.fortran/task2.f90   -O0  execution test
>> libgomp: libgomp.fortran/vla2.f90   -O0  execution test
>> libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer 
>> -funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>> libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
>> libgomp: libgomp.fortran/vla4.f90   -O1  execution test
>> libgomp: libgomp.fortran/vla4.f90   -O2  execution test
>> libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer 
>> -funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>> libgomp: libgomp.fortran/vla4.f90   -O3 -g  execution test
>> libgomp: libgomp.fortran/vla4.f90   -Os  execution test
>> libgomp: libgomp.fortran/vla5.f90   -O1  execution test
>> libgomp: libgomp.fortran/vla5.f90   -O2  execution test
>> libgomp: libgomp.fortran/vla5.f90   -O3 -fomit-frame-pointer 
>> -funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>> libgomp: libgomp.fortran/vla5.f90   -O3 -g  execution test
>> libgomp: libgomp.fortran/vla5.f90   -Os  execution test
>
> So if you could check on those, it'd be appreciated.

I checked rv64gcv linux and those do not currently run in CI.

Patrick
Jeff Law May 14, 2024, 2:54 p.m. UTC | #5
On 5/14/24 8:51 AM, Patrick O'Neill wrote:
> 
> On 5/13/24 20:36, Jeff Law wrote:
>>
>>
>> On 5/13/24 6:54 PM, Patrick O'Neill wrote:
>>>
>>> On 5/13/24 13:28, Jeff Law wrote:
>>>>
>>>>
>>>> On 5/13/24 12:49 PM, Vineet Gupta wrote:
>>>>> If the constant used for stack offset can be expressed as sum of 
>>>>> two S12
>>>>> values, the constant need not be materialized (in a reg) and 
>>>>> instead the
>>>>> two S12 bits can be added to instructions involved with frame pointer.
>>>>> This avoids burning a register and more importantly can often get down
>>>>> to be 2 insn vs. 3.
>>>>>
>>>>> The prev patches to generally avoid LUI based const materialization 
>>>>> didn't
>>>>> fix this PR and need this directed fix in funcion prologue/epilogue
>>>>> expansion.
>>>>>
>>>>> This fix doesn't move the neddle for SPEC, at all, but it is still a
>>>>> win considering gcc generates one insn fewer than llvm for the 
>>>>> test ;-)
>>>>>
>>>>>     gcc-13.1 release   |      gcc 230823     | |
>>>>>                        |    g6619b3d4c15c    |   This patch | 
>>>>> clang/llvm
>>>>> ---------------------------------------------------------------------------------
>>>>> li      t0,-4096     | li    t0,-4096      | addi sp,sp,-2048 | 
>>>>> addi sp,sp,-2048
>>>>> addi    t0,t0,2016   | addi  t0,t0,2032    | add sp,sp,-16   | addi 
>>>>> sp,sp,-32
>>>>> li      a4,4096      | add   sp,sp,t0      | add a5,sp,a0    | add 
>>>>> a1,sp,16
>>>>> add     sp,sp,t0     | addi  a5,sp,-2032   | sb zero,0(a5)  | add 
>>>>> a0,a0,a1
>>>>> li      a5,-4096     | add   a0,a5,a0      | addi sp,sp,2032  | sb 
>>>>> zero,0(a0)
>>>>> addi    a4,a4,-2032  | li    t0, 4096      | addi sp,sp,32    | 
>>>>> addi sp,sp,2032
>>>>> add     a4,a4,a5     | sb    zero,2032(a0) | ret               | 
>>>>> addi sp,sp,48
>>>>> addi    a5,sp,16     | addi  t0,t0,-2032 |                   | ret
>>>>> add     a5,a4,a5     | add   sp,sp,t0      |
>>>>> add     a0,a5,a0     | ret                 |
>>>>> li      t0,4096      |
>>>>> sd      a5,8(sp)     |
>>>>> sb      zero,2032(a0)|
>>>>> addi    t0,t0,-2016  |
>>>>> add     sp,sp,t0     |
>>>>> ret                  |
>>>>>
>>>>> gcc/ChangeLog:
>>>>>     PR target/105733
>>>>>     * config/riscv/riscv.h: New macros for with aligned offsets.
>>>>>     * config/riscv/riscv.cc (riscv_split_sum_of_two_s12): New
>>>>>     function to split a sum of two s12 values into constituents.
>>>>>     (riscv_expand_prologue): Handle offset being sum of two S12.
>>>>>     (riscv_expand_epilogue): Ditto.
>>>>>     * config/riscv/riscv-protos.h (riscv_split_sum_of_two_s12): New.
>>>>>
>>>>> gcc/testsuite/ChangeLog:
>>>>>     * gcc.target/riscv/pr105733.c: New Test.
>>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-1.c: Adjust to not
>>>>>     expect LUI 4096.
>>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-2.c: Ditto.
>>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-3.c: Ditto.
>>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-4.c: Ditto.
>>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-5.c: Ditto.
>>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-6.c: Ditto.
>>>>>     * gcc.target/riscv/rvv/autovec/vls/spill-7.c: Ditto.
>>>>>
>>>>
>>>>
>>>>> @@ -8074,14 +8111,26 @@ riscv_expand_epilogue (int style)
>>>>>       }
>>>>>         else
>>>>>       {
>>>>> -      if (!SMALL_OPERAND (adjust_offset.to_constant ()))
>>>>> +      HOST_WIDE_INT adj_off_value = adjust_offset.to_constant ();
>>>>> +      if (SMALL_OPERAND (adj_off_value))
>>>>> +        {
>>>>> +          adjust = GEN_INT (adj_off_value);
>>>>> +        }
>>>>> +      else if (SUM_OF_TWO_S12_ALGN (adj_off_value))
>>>>> +        {
>>>>> +          HOST_WIDE_INT base, off;
>>>>> +          riscv_split_sum_of_two_s12 (adj_off_value, &base, &off);
>>>>> +          insn = gen_add3_insn (stack_pointer_rtx, 
>>>>> hard_frame_pointer_rtx,
>>>>> +                    GEN_INT (base));
>>>>> +          RTX_FRAME_RELATED_P (insn) = 1;
>>>>> +          adjust = GEN_INT (off);
>>>>> +        }
>>>> So this was the hunk that we identified internally as causing 
>>>> problems with libgomp's testsuite.  We never fully chased it down as 
>>>> this hunk didn't seem terribly important performance wise -- we just 
>>>> set it aside.  The thing is it looked basically correct to me.  So 
>>>> the failure was certainly unexpected, but it was consistent.
>>>>
>>>> So I think the question is whether or not the CI system runs the 
>>>> libgomp testsuite, particularly in the rv64 linux configuration. If 
>>>> it does, and it passes, then we're good. I'm still finding my way 
>>>> around the configuration, so I don't know if the CI system Edwin & 
>>>> Patrick have built tests libgomp or not.
>>>
>>> I poked around the .sum files in pre/postcommit and we do run tests 
>>> like:
>>>
>>> PASS: c-c++-common/gomp/affinity-2.c  (test for errors, line 45)
>> I was able to find the summary info:
>>
>>> Tests that now fail, but worked before (15 tests):
>>> libgomp: libgomp.fortran/simd7.f90   -O0  execution test
>>> libgomp: libgomp.fortran/task2.f90   -O0  execution test
>>> libgomp: libgomp.fortran/vla2.f90   -O0  execution test
>>> libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer - 
>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>> libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
>>> libgomp: libgomp.fortran/vla4.f90   -O1  execution test
>>> libgomp: libgomp.fortran/vla4.f90   -O2  execution test
>>> libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer - 
>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>> libgomp: libgomp.fortran/vla4.f90   -O3 -g  execution test
>>> libgomp: libgomp.fortran/vla4.f90   -Os  execution test
>>> libgomp: libgomp.fortran/vla5.f90   -O1  execution test
>>> libgomp: libgomp.fortran/vla5.f90   -O2  execution test
>>> libgomp: libgomp.fortran/vla5.f90   -O3 -fomit-frame-pointer - 
>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>> libgomp: libgomp.fortran/vla5.f90   -O3 -g  execution test
>>> libgomp: libgomp.fortran/vla5.f90   -Os  execution test
>>
>> So if you could check on those, it'd be appreciated.
> 
> I checked rv64gcv linux and those do not currently run in CI.
OK.  I'll Vineet's patch to an internal branch and force our CI to run 
on it.   Stay tuned for results.

jeff
Jeff Law May 14, 2024, 3:44 p.m. UTC | #6
On 5/14/24 8:51 AM, Patrick O'Neill wrote:
> 

>> I was able to find the summary info:
>>
>>> Tests that now fail, but worked before (15 tests):
>>> libgomp: libgomp.fortran/simd7.f90   -O0  execution test
>>> libgomp: libgomp.fortran/task2.f90   -O0  execution test
>>> libgomp: libgomp.fortran/vla2.f90   -O0  execution test
>>> libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer - 
>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>> libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
>>> libgomp: libgomp.fortran/vla4.f90   -O1  execution test
>>> libgomp: libgomp.fortran/vla4.f90   -O2  execution test
>>> libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer - 
>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>> libgomp: libgomp.fortran/vla4.f90   -O3 -g  execution test
>>> libgomp: libgomp.fortran/vla4.f90   -Os  execution test
>>> libgomp: libgomp.fortran/vla5.f90   -O1  execution test
>>> libgomp: libgomp.fortran/vla5.f90   -O2  execution test
>>> libgomp: libgomp.fortran/vla5.f90   -O3 -fomit-frame-pointer - 
>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>> libgomp: libgomp.fortran/vla5.f90   -O3 -g  execution test
>>> libgomp: libgomp.fortran/vla5.f90   -Os  execution test
>>
>> So if you could check on those, it'd be appreciated.
> 
> I checked rv64gcv linux and those do not currently run in CI.
So just ran with Vineet's patch in our CI system.  His patch is still 
triggering those regressions.  So we need to get that resolved before 
that second patch can go in.

jeff
Vineet Gupta May 14, 2024, 4:36 p.m. UTC | #7
On 5/14/24 08:44, Jeff Law wrote:
> On 5/14/24 8:51 AM, Patrick O'Neill wrote:
>>> I was able to find the summary info:
>>>
>>>> Tests that now fail, but worked before (15 tests):
>>>> libgomp: libgomp.fortran/simd7.f90   -O0  execution test
>>>> libgomp: libgomp.fortran/task2.f90   -O0  execution test
>>>> libgomp: libgomp.fortran/vla2.f90   -O0  execution test
>>>> libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer - 
>>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>>> libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
>>>> libgomp: libgomp.fortran/vla4.f90   -O1  execution test
>>>> libgomp: libgomp.fortran/vla4.f90   -O2  execution test
>>>> libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer - 
>>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>>> libgomp: libgomp.fortran/vla4.f90   -O3 -g  execution test
>>>> libgomp: libgomp.fortran/vla4.f90   -Os  execution test
>>>> libgomp: libgomp.fortran/vla5.f90   -O1  execution test
>>>> libgomp: libgomp.fortran/vla5.f90   -O2  execution test
>>>> libgomp: libgomp.fortran/vla5.f90   -O3 -fomit-frame-pointer - 
>>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>>> libgomp: libgomp.fortran/vla5.f90   -O3 -g  execution test
>>>> libgomp: libgomp.fortran/vla5.f90   -Os  execution test
>>> So if you could check on those, it'd be appreciated.
>> I checked rv64gcv linux and those do not currently run in CI.
> So just ran with Vineet's patch in our CI system.  His patch is still 
> triggering those regressions.  So we need to get that resolved before 
> that second patch can go in.

And just for reproducibility what exact --with-arch build is this from ?

-Vineet
Jeff Law May 14, 2024, 5:06 p.m. UTC | #8
On 5/14/24 10:36 AM, Vineet Gupta wrote:
> 
> 
> On 5/14/24 08:44, Jeff Law wrote:
>> On 5/14/24 8:51 AM, Patrick O'Neill wrote:
>>>> I was able to find the summary info:
>>>>
>>>>> Tests that now fail, but worked before (15 tests):
>>>>> libgomp: libgomp.fortran/simd7.f90   -O0  execution test
>>>>> libgomp: libgomp.fortran/task2.f90   -O0  execution test
>>>>> libgomp: libgomp.fortran/vla2.f90   -O0  execution test
>>>>> libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer -
>>>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>>>> libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
>>>>> libgomp: libgomp.fortran/vla4.f90   -O1  execution test
>>>>> libgomp: libgomp.fortran/vla4.f90   -O2  execution test
>>>>> libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer -
>>>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>>>> libgomp: libgomp.fortran/vla4.f90   -O3 -g  execution test
>>>>> libgomp: libgomp.fortran/vla4.f90   -Os  execution test
>>>>> libgomp: libgomp.fortran/vla5.f90   -O1  execution test
>>>>> libgomp: libgomp.fortran/vla5.f90   -O2  execution test
>>>>> libgomp: libgomp.fortran/vla5.f90   -O3 -fomit-frame-pointer -
>>>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>>>> libgomp: libgomp.fortran/vla5.f90   -O3 -g  execution test
>>>>> libgomp: libgomp.fortran/vla5.f90   -Os  execution test
>>>> So if you could check on those, it'd be appreciated.
>>> I checked rv64gcv linux and those do not currently run in CI.
>> So just ran with Vineet's patch in our CI system.  His patch is still
>> triggering those regressions.  So we need to get that resolved before
>> that second patch can go in.
> 
> And just for reproducibility what exact --with-arch build is this from ?
This run was with "--with-arch=rv64gc_zba_zbb_zbc_zbkb_zbs_zfa_zicond"

I think we likely saw it without zbkb & zfa when we first looked at this 
a few months back.

jeff
Vineet Gupta May 20, 2024, 4:44 p.m. UTC | #9
On 5/14/24 08:44, Jeff Law wrote:
>>> I was able to find the summary info:
>>>
>>>> Tests that now fail, but worked before (15 tests):
>>>> libgomp: libgomp.fortran/simd7.f90   -O0  execution test
>>>> libgomp: libgomp.fortran/task2.f90   -O0  execution test
>>>> libgomp: libgomp.fortran/vla2.f90   -O0  execution test
>>>> libgomp: libgomp.fortran/vla3.f90   -O3 -fomit-frame-pointer - 
>>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>>> libgomp: libgomp.fortran/vla3.f90   -O3 -g  execution test
>>>> libgomp: libgomp.fortran/vla4.f90   -O1  execution test
>>>> libgomp: libgomp.fortran/vla4.f90   -O2  execution test
>>>> libgomp: libgomp.fortran/vla4.f90   -O3 -fomit-frame-pointer - 
>>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>>> libgomp: libgomp.fortran/vla4.f90   -O3 -g  execution test
>>>> libgomp: libgomp.fortran/vla4.f90   -Os  execution test
>>>> libgomp: libgomp.fortran/vla5.f90   -O1  execution test
>>>> libgomp: libgomp.fortran/vla5.f90   -O2  execution test
>>>> libgomp: libgomp.fortran/vla5.f90   -O3 -fomit-frame-pointer - 
>>>> funroll-loops -fpeel-loops -ftracer -finline-functions execution test
>>>> libgomp: libgomp.fortran/vla5.f90   -O3 -g  execution test
>>>> libgomp: libgomp.fortran/vla5.f90   -Os  execution test
>>> So if you could check on those, it'd be appreciated.
>> I checked rv64gcv linux and those do not currently run in CI.
> So just ran with Vineet's patch in our CI system.  His patch is still 
> triggering those regressions.  So we need to get that resolved before 
> that second patch can go in.

So CI pointed to a lone Fortran execute failure which is very likely
also causing above.

    FAIL: gfortran.dg/PR93963.f90 -O0 execution test

Turns out the alloca codepath in epilogue expansion is simply busted -
I'm surprised that we only see 1 failure in the entire testsuite run
(libgomp run aside).

> -      if (!SMALL_OPERAND (adjust_offset.to_constant ()))
> +      HOST_WIDE_INT adj_off_value = adjust_offset.to_constant ();
> +      if (SMALL_OPERAND (adj_off_value))
> +        {
> +          adjust = GEN_INT (adj_off_value);
> +        }
> +      else if (SUM_OF_TWO_S12_ALGN (adj_off_value))
> +        {
> +          HOST_WIDE_INT base, off;
> +          riscv_split_sum_of_two_s12 (adj_off_value, &base, &off);
> +          insn = gen_add3_insn (stack_pointer_rtx,
hard_frame_pointer_rtx,
> +                    GEN_INT (base));

1. Missing gen_insn( ) here causes the insn to be overwritten by the
subsequent emit_insn below.

2. In sum of two s12 logic first insn is sp = xx + 2048, with the
additional insn expected to be of form
    sp = sp + off corresponding to
       stack_pointer_rtx+ stack_pointer_rtx+ off
    which the following emit_insn () below is not.
...

>        insn = emit_insn ( gen_add3_insn (stack_pointer_rtx,
hard_frame_pointer_rtx,
>                                                                adjust));

3. Yet another issue had to do with which insn should the dwarf be
attached -and the adj needed adjusting :-)
So In v3 I've split this patch into the main prologue/epilogue change
and one from the alloca one - which depending on reviewer guidance
(aesthetics, ugliness, trying to keep uniformity etc ? can be decided upon.

Thx,
-Vineet
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 706dc204e643..6da6ae4d041f 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -166,6 +166,8 @@  extern void riscv_subword_address (rtx, rtx *, rtx *, rtx *, rtx *);
 extern void riscv_lshift_subword (machine_mode, rtx, rtx, rtx *);
 extern enum memmodel riscv_union_memmodels (enum memmodel, enum memmodel);
 extern bool riscv_reg_frame_related (rtx);
+extern void riscv_split_sum_of_two_s12 (HOST_WIDE_INT, HOST_WIDE_INT *,
+					HOST_WIDE_INT *);
 
 /* Routines implemented in riscv-c.cc.  */
 void riscv_cpu_cpp_builtins (cpp_reader *);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 4067505270e1..4b742489b272 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -4063,6 +4063,32 @@  riscv_split_doubleword_move (rtx dest, rtx src)
        riscv_emit_move (riscv_subword (dest, true), riscv_subword (src, true));
      }
 }
+
+/* Constant VAL is known to be sum of two S12 constants.  Break it into
+   comprising BASE and OFF.
+   Numerically S12 is -2048 to 2047, however it uses the more conservative
+   range -2048 to 2032 as offsets pertain to stack related registers.  */
+
+void
+riscv_split_sum_of_two_s12 (HOST_WIDE_INT val, HOST_WIDE_INT *base,
+			    HOST_WIDE_INT *off)
+{
+  if (SUM_OF_TWO_S12_N (val))
+    {
+      *base = -2048;
+      *off = val - (-2048);
+    }
+  else if (SUM_OF_TWO_S12_P_ALGN (val))
+    {
+      *base = 2032;
+      *off = val - 2032;
+    }
+  else
+    {
+      gcc_unreachable ();
+    }
+}
+
 
 /* Return the appropriate instructions to move SRC into DEST.  Assume
    that SRC is operand 1 and DEST is operand 0.  */
@@ -7852,6 +7878,17 @@  riscv_expand_prologue (void)
 				GEN_INT (-constant_frame));
 	  RTX_FRAME_RELATED_P (emit_insn (insn)) = 1;
 	}
+      else if (SUM_OF_TWO_S12_ALGN (-constant_frame))
+	{
+	  HOST_WIDE_INT base, off;
+	  riscv_split_sum_of_two_s12 (-constant_frame, &base, &off);
+	  insn = gen_add3_insn (stack_pointer_rtx, stack_pointer_rtx,
+				GEN_INT (base));
+	  RTX_FRAME_RELATED_P (emit_insn (insn)) = 1;
+	  insn = gen_add3_insn (stack_pointer_rtx, stack_pointer_rtx,
+				GEN_INT (off));
+	  RTX_FRAME_RELATED_P (emit_insn (insn)) = 1;
+	}
       else
 	{
 	  riscv_emit_move (RISCV_PROLOGUE_TEMP (Pmode), GEN_INT (-constant_frame));
@@ -8074,14 +8111,26 @@  riscv_expand_epilogue (int style)
 	}
       else
 	{
-	  if (!SMALL_OPERAND (adjust_offset.to_constant ()))
+	  HOST_WIDE_INT adj_off_value = adjust_offset.to_constant ();
+	  if (SMALL_OPERAND (adj_off_value))
+	    {
+	      adjust = GEN_INT (adj_off_value);
+	    }
+	  else if (SUM_OF_TWO_S12_ALGN (adj_off_value))
+	    {
+	      HOST_WIDE_INT base, off;
+	      riscv_split_sum_of_two_s12 (adj_off_value, &base, &off);
+	      insn = gen_add3_insn (stack_pointer_rtx, hard_frame_pointer_rtx,
+					GEN_INT (base));
+	      RTX_FRAME_RELATED_P (insn) = 1;
+	      adjust = GEN_INT (off);
+	    }
+	  else
 	    {
 	      riscv_emit_move (RISCV_PROLOGUE_TEMP (Pmode),
-			       GEN_INT (adjust_offset.to_constant ()));
+			       GEN_INT (adj_off_value));
 	      adjust = RISCV_PROLOGUE_TEMP (Pmode);
 	    }
-	  else
-	    adjust = GEN_INT (adjust_offset.to_constant ());
 	}
 
       insn = emit_insn (
@@ -8148,10 +8197,21 @@  riscv_expand_epilogue (int style)
 
       /* Get an rtx for STEP1 that we can add to BASE.
 	 Skip if adjust equal to zero.  */
-      if (step1.to_constant () != 0)
+      HOST_WIDE_INT step1_value = step1.to_constant ();
+      if (step1_value != 0)
 	{
-	  rtx adjust = GEN_INT (step1.to_constant ());
-	  if (!SMALL_OPERAND (step1.to_constant ()))
+	  rtx adjust = GEN_INT (step1_value);
+	  if (SUM_OF_TWO_S12_ALGN (step1_value))
+	    {
+	      HOST_WIDE_INT base, off;
+	      riscv_split_sum_of_two_s12 (step1_value, &base, &off);
+	      insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
+						stack_pointer_rtx,
+						GEN_INT (base)));
+	      RTX_FRAME_RELATED_P (insn) = 1;
+	      adjust = GEN_INT (off);
+	    }
+	  else if (!SMALL_OPERAND (step1_value))
 	    {
 	      riscv_emit_move (RISCV_PROLOGUE_TEMP (Pmode), adjust);
 	      adjust = RISCV_PROLOGUE_TEMP (Pmode);
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index 0d27c0d378df..d6b14c4d6205 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -641,6 +641,13 @@  enum reg_class
 #define SUM_OF_TWO_S12(VALUE)						\
   (SUM_OF_TWO_S12_N (VALUE) || SUM_OF_TWO_S12_P (VALUE))
 
+/* Variant with first value 8 byte aligned if involving stack regs.  */
+#define SUM_OF_TWO_S12_P_ALGN(VALUE)				\
+  (((VALUE) >= (2032 + 1)) && ((VALUE) <= (2032 * 2)))
+
+#define SUM_OF_TWO_S12_ALGN(VALUE)				\
+  (SUM_OF_TWO_S12_N (VALUE) || SUM_OF_TWO_S12_P_ALGN (VALUE))
+
 /* If this is a single bit mask, then we can load it with bseti.  Special
    handling of SImode 0x80000000 on RV64 is done in riscv_build_integer_1. */
 #define SINGLE_BIT_MASK_OPERAND(VALUE)					\
diff --git a/gcc/testsuite/gcc.target/riscv/pr105733.c b/gcc/testsuite/gcc.target/riscv/pr105733.c
new file mode 100644
index 000000000000..6156c36dc7ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr105733.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options { -march=rv64gcv -mabi=lp64d } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-Os" "-Oz" } } */
+
+#define BUF_SIZE 2064
+
+void
+foo(unsigned long i)
+{
+    volatile char buf[BUF_SIZE];
+
+    buf[i] = 0;
+}
+
+/* { dg-final { scan-assembler-not {li\t[a-x0-9]+,4096} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-1.c
index b64c73f34f13..6afcf1db593b 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-1.c
@@ -129,5 +129,5 @@  spill_12 (int8_t *in, int8_t *out)
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-256} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-512} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-1024} 1 } } */
-/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 1 } } */
-/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,-4096\s+add\tsp,sp,[a-x0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 3 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,2032} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-2.c
index 8fcdca705384..544e8628a27b 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-2.c
@@ -120,5 +120,5 @@  spill_11 (int16_t *in, int16_t *out)
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-256} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-512} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-1024} 1 } } */
-/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 1 } } */
-/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,-4096\s+add\tsp,sp,[a-x0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 3 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,2032} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-3.c
index ca296ce02d66..4bfeb07e9aca 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-3.c
@@ -111,5 +111,5 @@  spill_10 (int32_t *in, int32_t *out)
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-256} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-512} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-1024} 1 } } */
-/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 1 } } */
-/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,-4096\s+add\tsp,sp,[a-x0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 3 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,2032} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-4.c
index ef61d9a2c0c3..1faf31ffd8e0 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-4.c
@@ -102,5 +102,5 @@  spill_9 (int64_t *in, int64_t *out)
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-256} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-512} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-1024} 1 } } */
-/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 1 } } */
-/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,-4096\s+add\tsp,sp,[a-x0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 3 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,2032} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-5.c
index 150135a91103..0c8dccc518e3 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-5.c
@@ -120,5 +120,5 @@  spill_11 (_Float16 *in, _Float16 *out)
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-256} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-512} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-1024} 1 } } */
-/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 1 } } */
-/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,-4096\s+add\tsp,sp,[a-x0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 3 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,2032} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-6.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-6.c
index c5d2d0194348..8bf53b84d1cd 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-6.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-6.c
@@ -111,5 +111,5 @@  spill_10 (float *in, float *out)
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-256} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-512} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-1024} 1 } } */
-/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 1 } } */
-/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,-4096\s+add\tsp,sp,[a-x0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 3 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,2032} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-7.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-7.c
index 70ca683908db..e3980a295406 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-7.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/spill-7.c
@@ -102,5 +102,5 @@  spill_9 (int64_t *in, int64_t *out)
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-256} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-512} 1 } } */
 /* { dg-final { scan-assembler-times {addi\tsp,sp,-1024} 1 } } */
-/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 1 } } */
-/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,-4096\s+add\tsp,sp,[a-x0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,-2048} 3 } } */
+/* { dg-final { scan-assembler-times {addi\tsp,sp,2032} 1 } } */