diff mbox series

LoongArch: Replace -mexplicit-relocs=auto simple-used address peephole2 with combine

Message ID 20231212064754.6623-1-xry111@xry111.site
State New
Headers show
Series LoongArch: Replace -mexplicit-relocs=auto simple-used address peephole2 with combine | expand

Commit Message

Xi Ruoyao Dec. 12, 2023, 6:47 a.m. UTC
The problem with peephole2 is it uses a naive sliding-window algorithm
and misses many cases.  For example:

    float a[10000];
    float t() { return a[0] + a[8000]; }

is compiled to:

    la.local    $r13,a
    la.local    $r12,a+32768
    fld.s       $f1,$r13,0
    fld.s       $f0,$r12,-768
    fadd.s      $f0,$f1,$f0

by trunk.  But as we've explained in r14-4851, the following would be
better with -mexplicit-relocs=auto:

    pcalau12i   $r13,%pc_hi20(a)
    pcalau12i   $r12,%pc_hi20(a+32000)
    fld.s       $f1,$r13,%pc_lo12(a)
    fld.s       $f0,$r12,%pc_lo12(a+32000)
    fadd.s      $f0,$f1,$f0

However, the sliding-window algorithm just won't detect the pcalau12i/fld
pair to be optimized.  Using a define_insn_and_split in the combine pass
works around the issue.

gcc/ChangeLog:

	* config/loongarch/loongarch.md:
	(simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>): New
	define_insn_and_split.
	(simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>): Likewise.
	(simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>): Likewise.
	(simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>):
	Likewise.
	(simple_store<ST_ANY:mode><P:mode>): Likewise.
	(simple_store_off<ST_ANY:mode><P:mode>): Likewise.
	(define_peephole2): Remove la.local/[f]ld peepholes.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c:
	New test.
---

Bootstrapped & regtested on loongarch64-linux-gnu.  Ok for trunk?

 gcc/config/loongarch/loongarch.md             | 165 +++++++++---------
 ...explicit-relocs-auto-single-load-store-2.c |  11 ++
 2 files changed, 98 insertions(+), 78 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c

Comments

Xi Ruoyao Dec. 21, 2023, 11:56 a.m. UTC | #1
Ping :).

On Tue, 2023-12-12 at 14:47 +0800, Xi Ruoyao wrote:
> The problem with peephole2 is it uses a naive sliding-window algorithm
> and misses many cases.  For example:
> 
>     float a[10000];
>     float t() { return a[0] + a[8000]; }
> 
> is compiled to:
> 
>     la.local    $r13,a
>     la.local    $r12,a+32768
>     fld.s       $f1,$r13,0
>     fld.s       $f0,$r12,-768
>     fadd.s      $f0,$f1,$f0
> 
> by trunk.  But as we've explained in r14-4851, the following would be
> better with -mexplicit-relocs=auto:
> 
>     pcalau12i   $r13,%pc_hi20(a)
>     pcalau12i   $r12,%pc_hi20(a+32000)
>     fld.s       $f1,$r13,%pc_lo12(a)
>     fld.s       $f0,$r12,%pc_lo12(a+32000)
>     fadd.s      $f0,$f1,$f0
> 
> However the sliding-window algorithm just won't detect the pcalau12i/fld
> pair to be optimized.  Use a define_insn_and_split in combine pass will
> work around the issue.
> 
> gcc/ChangeLog:
> 
> 	* config/loongarch/loongarch.md:
> 	(simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>): New
> 	define_insn_and_split.
> 	(simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>): Likewise.
> 	(simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>): Likewise.
> 	(simple_load_off<su>ext<P:mode><SUBDI:mode><GPR:mode>):
> 	Likewise.
> 	(simple_store<ST_ANY:mode><P:mode>): Likewise.
> 	(simple_store_off<ST_ANY:mode><P:mode>): Likewise.
> 	(define_peephole2): Remove la.local/[f]ld peepholes.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c:
> 	New test.
> ---
> 
> Bootstrapped & regtested on loongarch64-linux-gnu.  Ok for trunk?
> 
>  gcc/config/loongarch/loongarch.md             | 165 +++++++++---------
>  ...explicit-relocs-auto-single-load-store-2.c |  11 ++
>  2 files changed, 98 insertions(+), 78 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
> 
> diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
> index 7b26d15aa4e..4009de408fb 100644
> --- a/gcc/config/loongarch/loongarch.md
> +++ b/gcc/config/loongarch/loongarch.md
> @@ -4033,101 +4033,110 @@ (define_insn "loongarch_crcc_w_<size>_w"
>  ;;
>  ;; And if the pseudo op cannot be relaxed, we'll get a worse result (with
>  ;; 3 instructions).
> -(define_peephole2
> -  [(set (match_operand:P 0 "register_operand")
> -	(match_operand:P 1 "symbolic_pcrel_operand"))
> -   (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
> -	(mem:LD_AT_LEAST_32_BIT (match_dup 0)))]
> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> -   && (peep2_reg_dead_p (2, operands[0]) \
> -       || REGNO (operands[0]) == REGNO (operands[2]))"
> -  [(set (match_dup 2)
> -	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
> +(define_insn_and_split "simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>"
> +  [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
> +	(mem:LD_AT_LEAST_32_BIT
> +	  (match_operand:P 1 "symbolic_pcrel_operand" "")))]
> +  "loongarch_pre_reload_split () \
> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> +  "#"
> +  ""
> +  [(set (match_dup 0)
> +	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
>    {
> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> +    operands[2] = gen_reg_rtx (Pmode);
> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>    })
>  
> -(define_peephole2
> -  [(set (match_operand:P 0 "register_operand")
> -	(match_operand:P 1 "symbolic_pcrel_operand"))
> -   (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
> -	(mem:LD_AT_LEAST_32_BIT (plus (match_dup 0)
> -				(match_operand 3 "const_int_operand"))))]
> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> -   && (peep2_reg_dead_p (2, operands[0]) \
> -       || REGNO (operands[0]) == REGNO (operands[2]))"
> -  [(set (match_dup 2)
> -	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
> +(define_insn_and_split "simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>"
> +  [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
> +	(mem:LD_AT_LEAST_32_BIT
> +	  (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
> +		(match_operand 2 "const_int_operand" ""))))]
> +  "loongarch_pre_reload_split () \
> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> +  "#"
> +  ""
> +  [(set (match_dup 0)
> +	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
>    {
> -    operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> +    HOST_WIDE_INT offset = INTVAL (operands[2]);
> +    operands[2] = gen_reg_rtx (Pmode);
> +    operands[1] = plus_constant (Pmode, operands[1], offset);
> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>    })
>  
> -(define_peephole2
> -  [(set (match_operand:P 0 "register_operand")
> -	(match_operand:P 1 "symbolic_pcrel_operand"))
> -   (set (match_operand:GPR 2 "register_operand")
> -	(any_extend:GPR (mem:SUBDI (match_dup 0))))]
> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> -   && (peep2_reg_dead_p (2, operands[0]) \
> -       || REGNO (operands[0]) == REGNO (operands[2]))"
> -  [(set (match_dup 2)
> -	(any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
> -					     (match_dup 1)))))]
> +(define_insn_and_split "simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
> +  [(set (match_operand:GPR 0 "register_operand" "=r")
> +	(any_extend:GPR
> +	  (mem:SUBDI (match_operand:P 1 "symbolic_pcrel_operand" ""))))]
> +  "loongarch_pre_reload_split () \
> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> +  "#"
> +  ""
> +  [(set (match_dup 0)
> +	(any_extend:GPR
> +	  (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
>    {
> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> +    operands[2] = gen_reg_rtx (Pmode);
> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>    })
>  
> -(define_peephole2
> -  [(set (match_operand:P 0 "register_operand")
> -	(match_operand:P 1 "symbolic_pcrel_operand"))
> -   (set (match_operand:GPR 2 "register_operand")
> +(define_insn_and_split
> +  "simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
> +  [(set (match_operand:GPR 0 "register_operand" "=r")
> +	(any_extend:GPR
> +	  (mem:SUBDI
> +	    (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
> +		  (match_operand 2 "const_int_operand" "")))))]
> +  "loongarch_pre_reload_split () \
> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> +  "#"
> +  ""
> +  [(set (match_dup 0)
>  	(any_extend:GPR
> -	  (mem:SUBDI (plus (match_dup 0)
> -			   (match_operand 3 "const_int_operand")))))]
> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> -   && (peep2_reg_dead_p (2, operands[0]) \
> -       || REGNO (operands[0]) == REGNO (operands[2]))"
> -  [(set (match_dup 2)
> -	(any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
> -					     (match_dup 1)))))]
> +	  (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
>    {
> -    operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> +    HOST_WIDE_INT offset = INTVAL (operands[2]);
> +    operands[2] = gen_reg_rtx (Pmode);
> +    operands[1] = plus_constant (Pmode, operands[1], offset);
> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>    })
>  
> -(define_peephole2
> -  [(set (match_operand:P 0 "register_operand")
> -	(match_operand:P 1 "symbolic_pcrel_operand"))
> -   (set (mem:ST_ANY (match_dup 0))
> -	(match_operand:ST_ANY 2 "register_operand"))]
> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> -   && (peep2_reg_dead_p (2, operands[0])) \
> -   && REGNO (operands[0]) != REGNO (operands[2])"
> -  [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
> +(define_insn_and_split "simple_store<ST_ANY:mode><P:mode>"
> +  [(set (mem:ST_ANY (match_operand:P 0 "symbolic_pcrel_operand"))
> +	(match_operand:ST_ANY 1 "register_operand" "r,f"))]
> +  "loongarch_pre_reload_split () \
> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> +  "#"
> +  ""
> +  [(set (mem:ST_ANY (lo_sum:P (match_dup 2) (match_dup 0))) (match_dup 1))]
>    {
> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> +    operands[2] = gen_reg_rtx (Pmode);
> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[0]));
>    })
>  
> -(define_peephole2
> -  [(set (match_operand:P 0 "register_operand")
> -	(match_operand:P 1 "symbolic_pcrel_operand"))
> -   (set (mem:ST_ANY (plus (match_dup 0)
> -			  (match_operand 3 "const_int_operand")))
> -	(match_operand:ST_ANY 2 "register_operand"))]
> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> -   && (peep2_reg_dead_p (2, operands[0])) \
> -   && REGNO (operands[0]) != REGNO (operands[2])"
> -  [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
> +(define_insn_and_split "simple_store_off<ST_ANY:mode><P:mode>"
> +  [(set (mem:ST_ANY
> +	  (plus (match_operand:P 0 "symbolic_pcrel_operand" "")
> +		(match_operand 1 "const_int_operand" "")))
> +	(match_operand:ST_ANY 2 "register_operand" "r,f"))]
> +  "loongarch_pre_reload_split () \
> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> +  "#"
> +  ""
> +  [(set (mem:ST_ANY (lo_sum:P (match_dup 1) (match_dup 0))) (match_dup 2))]
>    {
> -    operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> +    HOST_WIDE_INT offset = INTVAL (operands[1]);
> +    operands[1] = gen_reg_rtx (Pmode);
> +    operands[0] = plus_constant (Pmode, operands[0], offset);
> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[1], operands[0]));
>    })
>  
>  ;; Synchronization instructions.
> diff --git a/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
> new file mode 100644
> index 00000000000..42cb966d1e0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=loongarch64 -mabi=lp64d -mexplicit-relocs=auto" } */
> +
> +float a[8001];
> +float
> +t (void)
> +{
> +  return a[0] + a[8000];
> +}
> +
> +/* { dg-final { scan-assembler-not "la.local" } } */
Lulu Cheng Dec. 21, 2023, noon UTC | #2
Sorry, I've been busy with something else these past two days. I don't think
there's anything wrong with the code, but I still need to run the SPEC
tests. :-)

在 2023/12/21 下午7:56, Xi Ruoyao 写道:
> Ping :).
>
> On Tue, 2023-12-12 at 14:47 +0800, Xi Ruoyao wrote:
>> The problem with peephole2 is it uses a naive sliding-window algorithm
>> and misses many cases.  For example:
>>
>>      float a[10000];
>>      float t() { return a[0] + a[8000]; }
>>
>> is compiled to:
>>
>>      la.local    $r13,a
>>      la.local    $r12,a+32768
>>      fld.s       $f1,$r13,0
>>      fld.s       $f0,$r12,-768
>>      fadd.s      $f0,$f1,$f0
>>
>> by trunk.  But as we've explained in r14-4851, the following would be
>> better with -mexplicit-relocs=auto:
>>
>>      pcalau12i   $r13,%pc_hi20(a)
>>      pcalau12i   $r12,%pc_hi20(a+32000)
>>      fld.s       $f1,$r13,%pc_lo12(a)
>>      fld.s       $f0,$r12,%pc_lo12(a+32000)
>>      fadd.s      $f0,$f1,$f0
>>
>> However the sliding-window algorithm just won't detect the pcalau12i/fld
>> pair to be optimized.  Use a define_insn_and_split in combine pass will
>> work around the issue.
>>
>> gcc/ChangeLog:
>>
>> 	* config/loongarch/loongarch.md:
>> 	(simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>): New
>> 	define_insn_and_split.
>> 	(simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>): Likewise.
>> 	(simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>): Likewise.
>> 	(simple_load_off<su>ext<P:mode><SUBDI:mode><GPR:mode>):
>> 	Likewise.
>> 	(simple_store<ST_ANY:mode><P:mode>): Likewise.
>> 	(simple_store_off<ST_ANY:mode><P:mode>): Likewise.
>> 	(define_peephole2): Remove la.local/[f]ld peepholes.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	* gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c:
>> 	New test.
>> ---
>>
>> Bootstrapped & regtested on loongarch64-linux-gnu.  Ok for trunk?
>>
>>   gcc/config/loongarch/loongarch.md             | 165 +++++++++---------
>>   ...explicit-relocs-auto-single-load-store-2.c |  11 ++
>>   2 files changed, 98 insertions(+), 78 deletions(-)
>>   create mode 100644 gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
>>
>> diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
>> index 7b26d15aa4e..4009de408fb 100644
>> --- a/gcc/config/loongarch/loongarch.md
>> +++ b/gcc/config/loongarch/loongarch.md
>> @@ -4033,101 +4033,110 @@ (define_insn "loongarch_crcc_w_<size>_w"
>>   ;;
>>   ;; And if the pseudo op cannot be relaxed, we'll get a worse result (with
>>   ;; 3 instructions).
>> -(define_peephole2
>> -  [(set (match_operand:P 0 "register_operand")
>> -	(match_operand:P 1 "symbolic_pcrel_operand"))
>> -   (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
>> -	(mem:LD_AT_LEAST_32_BIT (match_dup 0)))]
>> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> -   && (peep2_reg_dead_p (2, operands[0]) \
>> -       || REGNO (operands[0]) == REGNO (operands[2]))"
>> -  [(set (match_dup 2)
>> -	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
>> +(define_insn_and_split "simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>"
>> +  [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
>> +	(mem:LD_AT_LEAST_32_BIT
>> +	  (match_operand:P 1 "symbolic_pcrel_operand" "")))]
>> +  "loongarch_pre_reload_split () \
>> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> +  "#"
>> +  ""
>> +  [(set (match_dup 0)
>> +	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
>>     {
>> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> +    operands[2] = gen_reg_rtx (Pmode);
>> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>>     })
>>   
>> -(define_peephole2
>> -  [(set (match_operand:P 0 "register_operand")
>> -	(match_operand:P 1 "symbolic_pcrel_operand"))
>> -   (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
>> -	(mem:LD_AT_LEAST_32_BIT (plus (match_dup 0)
>> -				(match_operand 3 "const_int_operand"))))]
>> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> -   && (peep2_reg_dead_p (2, operands[0]) \
>> -       || REGNO (operands[0]) == REGNO (operands[2]))"
>> -  [(set (match_dup 2)
>> -	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
>> +(define_insn_and_split "simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>"
>> +  [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
>> +	(mem:LD_AT_LEAST_32_BIT
>> +	  (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
>> +		(match_operand 2 "const_int_operand" ""))))]
>> +  "loongarch_pre_reload_split () \
>> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> +  "#"
>> +  ""
>> +  [(set (match_dup 0)
>> +	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
>>     {
>> -    operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
>> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> +    HOST_WIDE_INT offset = INTVAL (operands[2]);
>> +    operands[2] = gen_reg_rtx (Pmode);
>> +    operands[1] = plus_constant (Pmode, operands[1], offset);
>> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>>     })
>>   
>> -(define_peephole2
>> -  [(set (match_operand:P 0 "register_operand")
>> -	(match_operand:P 1 "symbolic_pcrel_operand"))
>> -   (set (match_operand:GPR 2 "register_operand")
>> -	(any_extend:GPR (mem:SUBDI (match_dup 0))))]
>> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> -   && (peep2_reg_dead_p (2, operands[0]) \
>> -       || REGNO (operands[0]) == REGNO (operands[2]))"
>> -  [(set (match_dup 2)
>> -	(any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
>> -					     (match_dup 1)))))]
>> +(define_insn_and_split "simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
>> +  [(set (match_operand:GPR 0 "register_operand" "=r")
>> +	(any_extend:GPR
>> +	  (mem:SUBDI (match_operand:P 1 "symbolic_pcrel_operand" ""))))]
>> +  "loongarch_pre_reload_split () \
>> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> +  "#"
>> +  ""
>> +  [(set (match_dup 0)
>> +	(any_extend:GPR
>> +	  (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
>>     {
>> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> +    operands[2] = gen_reg_rtx (Pmode);
>> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>>     })
>>   
>> -(define_peephole2
>> -  [(set (match_operand:P 0 "register_operand")
>> -	(match_operand:P 1 "symbolic_pcrel_operand"))
>> -   (set (match_operand:GPR 2 "register_operand")
>> +(define_insn_and_split
>> +  "simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
>> +  [(set (match_operand:GPR 0 "register_operand" "=r")
>> +	(any_extend:GPR
>> +	  (mem:SUBDI
>> +	    (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
>> +		  (match_operand 2 "const_int_operand" "")))))]
>> +  "loongarch_pre_reload_split () \
>> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> +  "#"
>> +  ""
>> +  [(set (match_dup 0)
>>   	(any_extend:GPR
>> -	  (mem:SUBDI (plus (match_dup 0)
>> -			   (match_operand 3 "const_int_operand")))))]
>> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> -   && (peep2_reg_dead_p (2, operands[0]) \
>> -       || REGNO (operands[0]) == REGNO (operands[2]))"
>> -  [(set (match_dup 2)
>> -	(any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
>> -					     (match_dup 1)))))]
>> +	  (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
>>     {
>> -    operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
>> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> +    HOST_WIDE_INT offset = INTVAL (operands[2]);
>> +    operands[2] = gen_reg_rtx (Pmode);
>> +    operands[1] = plus_constant (Pmode, operands[1], offset);
>> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>>     })
>>   
>> -(define_peephole2
>> -  [(set (match_operand:P 0 "register_operand")
>> -	(match_operand:P 1 "symbolic_pcrel_operand"))
>> -   (set (mem:ST_ANY (match_dup 0))
>> -	(match_operand:ST_ANY 2 "register_operand"))]
>> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> -   && (peep2_reg_dead_p (2, operands[0])) \
>> -   && REGNO (operands[0]) != REGNO (operands[2])"
>> -  [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
>> +(define_insn_and_split "simple_store<ST_ANY:mode><P:mode>"
>> +  [(set (mem:ST_ANY (match_operand:P 0 "symbolic_pcrel_operand"))
>> +	(match_operand:ST_ANY 1 "register_operand" "r,f"))]
>> +  "loongarch_pre_reload_split () \
>> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> +  "#"
>> +  ""
>> +  [(set (mem:ST_ANY (lo_sum:P (match_dup 2) (match_dup 0))) (match_dup 1))]
>>     {
>> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> +    operands[2] = gen_reg_rtx (Pmode);
>> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[0]));
>>     })
>>   
>> -(define_peephole2
>> -  [(set (match_operand:P 0 "register_operand")
>> -	(match_operand:P 1 "symbolic_pcrel_operand"))
>> -   (set (mem:ST_ANY (plus (match_dup 0)
>> -			  (match_operand 3 "const_int_operand")))
>> -	(match_operand:ST_ANY 2 "register_operand"))]
>> -  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> -   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> -   && (peep2_reg_dead_p (2, operands[0])) \
>> -   && REGNO (operands[0]) != REGNO (operands[2])"
>> -  [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
>> +(define_insn_and_split "simple_store_off<ST_ANY:mode><P:mode>"
>> +  [(set (mem:ST_ANY
>> +	  (plus (match_operand:P 0 "symbolic_pcrel_operand" "")
>> +		(match_operand 1 "const_int_operand" "")))
>> +	(match_operand:ST_ANY 2 "register_operand" "r,f"))]
>> +  "loongarch_pre_reload_split () \
>> +   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> +   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> +  "#"
>> +  ""
>> +  [(set (mem:ST_ANY (lo_sum:P (match_dup 1) (match_dup 0))) (match_dup 2))]
>>     {
>> -    operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
>> -    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> +    HOST_WIDE_INT offset = INTVAL (operands[1]);
>> +    operands[1] = gen_reg_rtx (Pmode);
>> +    operands[0] = plus_constant (Pmode, operands[0], offset);
>> +    emit_insn (gen_pcalau12i_gr<P:mode> (operands[1], operands[0]));
>>     })
>>   
>>   ;; Synchronization instructions.
>> diff --git a/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
>> new file mode 100644
>> index 00000000000..42cb966d1e0
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
>> @@ -0,0 +1,11 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -march=loongarch64 -mabi=lp64d -mexplicit-relocs=auto" } */
>> +
>> +float a[8001];
>> +float
>> +t (void)
>> +{
>> +  return a[0] + a[8000];
>> +}
>> +
>> +/* { dg-final { scan-assembler-not "la.local" } } */
Lulu Cheng Dec. 22, 2023, 3:44 a.m. UTC | #3
在 2023/12/21 下午8:00, chenglulu 写道:
> Sorry, I've been busy with something else these two days. I don't 
> think there's anything wrong with the code,
>
> but I need to test the spec.:-)

Hi, Ruoyao:

With this patch applied, spec2006 464.h264 ref shows a 6.4% performance
drop, so I'm going to retest it.

>
> 在 2023/12/21 下午7:56, Xi Ruoyao 写道:
>> Ping :).
>>
Xi Ruoyao Dec. 22, 2023, 7:09 a.m. UTC | #4
On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote:
> 
> 在 2023/12/21 下午8:00, chenglulu 写道:
> > Sorry, I've been busy with something else these two days. I don't 
> > think there's anything wrong with the code,
> > 
> > but I need to test the spec.:-)
> 
> Hi, Ruoyao:
> 
> After applying this patch, spec2006 464.h264 ref will have a 6.4% 
> performance drop. So I'm going to retest it.

I think 6.4% is large enough not to be a random error.

Is there an example showing the code regression?

And I'm wondering if keeping the peephole besides the new
define_insn_and_split produces a better result instead of solely relying
on define_insn_and_split?
Lulu Cheng Dec. 22, 2023, 7:21 a.m. UTC | #5
在 2023/12/22 下午3:09, Xi Ruoyao 写道:
> On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote:
>> 在 2023/12/21 下午8:00, chenglulu 写道:
>>> Sorry, I've been busy with something else these two days. I don't
>>> think there's anything wrong with the code,
>>>
>>> but I need to test the spec.:-)
>> Hi, Ruoyao:
>>
>> After applying this patch, spec2006 464.h264 ref will have a 6.4%
>> performance drop. So I'm going to retest it.
> I think 6.4% is large enough not to be a random error.
>
> Is there an example showing the code regression?
>
> And I'm wondering if keeping the peephole besides the new
> define_insn_and_split produces a better result instead of solely relying
> on define_insn_and_split?
>
I haven't debugged this yet; I'm retesting. If there is still such a big
performance gap, I will need to look into the reason.
Lulu Cheng Dec. 23, 2023, 2:26 a.m. UTC | #6
在 2023/12/22 下午3:21, chenglulu 写道:
>
> 在 2023/12/22 下午3:09, Xi Ruoyao 写道:
>> On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote:
>>> 在 2023/12/21 下午8:00, chenglulu 写道:
>>>> Sorry, I've been busy with something else these two days. I don't
>>>> think there's anything wrong with the code,
>>>>
>>>> but I need to test the spec.:-)
>>> Hi, Ruoyao:
>>>
>>> After applying this patch, spec2006 464.h264 ref will have a 6.4%
>>> performance drop. So I'm going to retest it.
>> I think 6.4% is large enough not to be a random error.
>>
>> Is there an example showing the code regression?
>>
>> And I'm wondering if keeping the peephole besides the new
>> define_insn_and_split produces a better result instead of solely relying
>> on define_insn_and_split?
>>
> I haven't debugged this yet, I'm retesting, if there is still such a 
> big performance gap,
>
> I think I need to see the reason.
>
The performance drop has nothing to do with this patch. I found that the
h264 code compiled by r14-6787 is 6.4% slower than the code compiled by
r14-6421.
Lulu Cheng Dec. 23, 2023, 2:29 a.m. UTC | #7
在 2023/12/23 上午10:26, chenglulu 写道:
>
> 在 2023/12/22 下午3:21, chenglulu 写道:
>>
>> 在 2023/12/22 下午3:09, Xi Ruoyao 写道:
>>> On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote:
>>>> 在 2023/12/21 下午8:00, chenglulu 写道:
>>>>> Sorry, I've been busy with something else these two days. I don't
>>>>> think there's anything wrong with the code,
>>>>>
>>>>> but I need to test the spec.:-)
>>>> Hi, Ruoyao:
>>>>
>>>> After applying this patch, spec2006 464.h264 ref will have a 6.4%
>>>> performance drop. So I'm going to retest it.
>>> I think 6.4% is large enough not to be a random error.
>>>
>>> Is there an example showing the code regression?
>>>
>>> And I'm wondering if keeping the peephole besides the new
>>> define_insn_and_split produces a better result instead of solely 
>>> relying
>>> on define_insn_and_split?
>>>
>> I haven't debugged this yet, I'm retesting, if there is still such a 
>> big performance gap,
>>
>> I think I need to see the reason.
>>
> The performance drop has nothing to do with this patch. I found that 
> the h264 performance compiled
>
> by r14-6787 compared to r14-6421 dropped by 6.4%.
>
>
But there is a problem: my regression test run (based on r14-6787) has the
following two failures.

+FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
+FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
Xi Ruoyao Dec. 23, 2023, 10:44 a.m. UTC | #8
On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
> > The performance drop has nothing to do with this patch. I found that the h264 performance compiled 
> > by r14-6787 compared to r14-6421 dropped by 6.4%. 

Then I guess we should create a bug report...

>  But there is a problem. My regression test has the following two fail items.(based on r14-6787)

> +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
> +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6

Strange.  I didn't see them on r14-6650 (with or without the patch).
Xi Ruoyao Dec. 23, 2023, 10:47 a.m. UTC | #9
On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote:
> On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
> > > The performance drop has nothing to do with this patch. I found that the h264 performance compiled 
> > > by r14-6787 compared to r14-6421 dropped by 6.4%. 
> 
> Then I guess we should create a bug report...
> 
> >  But there is a problem. My regression test has the following two fail items.(based on r14-6787)
> 
> > +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)

I guess this is https://gcc.gnu.org/PR28123.

> > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6

I'll take a look at this.  Maybe it will show up with Binutils trunk (I
just realized I tested this patch with Binutils 2.41, and that's not
sufficient to really test the change).

> Strange.  I didn't see them on r14-6650 (with or without the patch).
Xi Ruoyao Dec. 24, 2023, 12:59 p.m. UTC | #10
On Sat, 2023-12-23 at 18:47 +0800, Xi Ruoyao wrote:
> On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote:
> > On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
> > > > The performance drop has nothing to do with this patch. I found that the h264 performance compiled 
> > > > by r14-6787 compared to r14-6421 dropped by 6.4%. 
> > 
> > Then I guess we should create a bug report...
> > 
> > >  But there is a problem. My regression test has the following two fail items.(based on r14-6787)
> > 
> > > +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
> 
> I guess this is https://gcc.gnu.org/PR28123.
> 
> > > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
> 
> I'll take a look on this.  Maybe it will show up with Binutils trunk (I
> just realized I tested this patch with Binutils 2.41, and it's not
> sufficient to really test the change).

I cannot reproduce the issue on a Gentoo dev machine with Binutils
2.41.50.20231218 and the patch on top of r14-6819.  And in my manual
testing (for ruling out the difference caused by default PIE and SSP)
the test also passes:

xry111@nanmen2 ~/git-repos/gcc-build $ /home/xry111/git-repos/gcc-
build/gcc/xgcc -B/home/xry111/git-repos/gcc-build/gcc/ /home/xry111/git-
repos/gcc/gcc/testsuite/gcc.dg/pr86617.c -fdiagnostics-plain-output -Os
-fdump-rtl-final -ffat-lto-objects -S -o pr86617.s -fno-stack-protector
-fno-pie && grep -c mem/v pr86617.c.348r.final 
6

Could you recheck with latest GCC master?
Lulu Cheng Dec. 25, 2023, 2:08 a.m. UTC | #11
在 2023/12/24 下午8:59, Xi Ruoyao 写道:
> On Sat, 2023-12-23 at 18:47 +0800, Xi Ruoyao wrote:
>> On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote:
>>> On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
>>>>> The performance drop has nothing to do with this patch. I found that the h264 performance compiled
>>>>> by r14-6787 compared to r14-6421 dropped by 6.4%.
>>> Then I guess we should create a bug report...
>>>
>>>>   But there is a problem. My regression test has the following two fail items.(based on r14-6787)
>>>> +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
>> I guess this is https://gcc.gnu.org/PR28123.
>>
>>>> +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
>> I'll take a look on this.  Maybe it will show up with Binutils trunk (I
>> just realized I tested this patch with Binutils 2.41, and it's not
>> sufficient to really test the change).
> I cannot reproduce the issue on a Gentoo dev machine with Binutils
> 2.41.50.20231218 and the patch on top of r14-6819.  And in my manual
> testing (for ruling out the difference caused by default PIE and SSP)
> the test also passes:
>
> xry111@nanmen2 ~/git-repos/gcc-build $ /home/xry111/git-repos/gcc-
> build/gcc/xgcc -B/home/xry111/git-repos/gcc-build/gcc/ /home/xry111/git-
> repos/gcc/gcc/testsuite/gcc.dg/pr86617.c -fdiagnostics-plain-output -Os
> -fdump-rtl-final -ffat-lto-objects -S -o pr86617.s -fno-stack-protector
> -fno-pie && grep -c mem/v pr86617.c.348r.final
> 6
>
> Could you recheck with latest GCC master?
Ok, I'll test again with the latest code.
>
Xi Ruoyao Dec. 25, 2023, 10:38 a.m. UTC | #12
On Mon, 2023-12-25 at 10:08 +0800, chenglulu wrote:
> 
> 在 2023/12/24 下午8:59, Xi Ruoyao 写道:
> > On Sat, 2023-12-23 at 18:47 +0800, Xi Ruoyao wrote:
> > > On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote:
> > > > On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
> > > > > > The performance drop has nothing to do with this patch. I
> > > > > > found that the h264 performance compiled
> > > > > > by r14-6787 compared to r14-6421 dropped by 6.4%.
> > > > Then I guess we should create a bug report...
> > > > 
> > > > >   But there is a problem. My regression test has the following
> > > > > two fail items.(based on r14-6787)
> > > > > +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
> > > I guess this is https://gcc.gnu.org/PR28123.
> > > 
> > > > > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
> > > I'll take a look on this.  Maybe it will show up with Binutils
> > > trunk (I
> > > just realized I tested this patch with Binutils 2.41, and it's not
> > > sufficient to really test the change).
> > I cannot reproduce the issue on a Gentoo dev machine with Binutils
> > 2.41.50.20231218 and the patch on top of r14-6819.  And in my manual
> > testing (for ruling out the difference caused by default PIE and
> > SSP)
> > the test also passes:
> > 
> > xry111@nanmen2 ~/git-repos/gcc-build $ /home/xry111/git-repos/gcc-
> > build/gcc/xgcc -B/home/xry111/git-repos/gcc-build/gcc/
> > /home/xry111/git-
> > repos/gcc/gcc/testsuite/gcc.dg/pr86617.c -fdiagnostics-plain-output
> > -Os
> > -fdump-rtl-final -ffat-lto-objects -S -o pr86617.s -fno-stack-
> > protector
> > -fno-pie && grep -c mem/v pr86617.c.348r.final
> > 6
> > 
> > Could you recheck with latest GCC master?
> Ok, I'll test again with the latest code.

Per https://gcc.gnu.org/pipermail/gcc-patches/2023-December/641407.html
I need to add "&& true" to the split condition.  I'll test it and send
V2.
Lulu Cheng Dec. 27, 2023, 3:59 a.m. UTC | #13
在 2023/12/23 下午6:44, Xi Ruoyao 写道:
> On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
>>> The performance drop has nothing to do with this patch. I found that the h264 performance compiled
>>> by r14-6787 compared to r14-6421 dropped by 6.4%.
> Then I guess we should create a bug report...
The h264 score of code compiled with r14-6818 is the same as with r14-6421.
>
>>   But there is a problem. My regression test has the following two fail items.(based on r14-6787)
>> +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
>> +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
> Strange.  I didn't see them on r14-6650 (with or without the patch).
>

+FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6

The issue persists in r14-6818. I traced through the code and found that the problem can be reproduced like this:
   volatile unsigned char u8;

   void test (void)
   {
     u8 = u8 + u8;
     u8 = u8 - u8;
   }

$./gcc/cc1 test.c -o test.s -fdump-rtl-all-all -fdiagnostics-plain-output  -Os -fdump-rtl-final -ffat-lto-objects

test.c.301r.outof_cfglayout

  (insn 7 6 9 2 (set (reg:DI 80 [ u8.0_1 ])
         (zero_extend:DI (mem/v/c:QI (symbol_ref:DI ("*.LANCHOR0") [flags 0x182]) [0 u8D.2193+0 S1 A8]))) "volatile.c":5:11 459 {simple_load_uextdiqidi}
      (nil))

test.c.302r.split1

(insn 27 6 28 2 (set (reg:DI 98)
         (unspec:DI [
                 (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])
             ] UNSPEC_PCALAU12I_GR)) "volatile.c":5:11 -1
      (nil))
(insn 28 27 9 2 (set (reg:DI 80 [ u8.0_1 ])
         (zero_extend:DI (mem:QI (lo_sum:DI (reg:DI 98)
                     (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])) [0  S1 A8]))) "volatile.c":5:11 -1
      (nil))

The volatile property of the mem here is gone, so the test fails.
Xi Ruoyao Dec. 27, 2023, 11:21 a.m. UTC | #14
On Wed, 2023-12-27 at 11:59 +0800, chenglulu wrote:

> +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
> 
> In r14-6818 the issue persists. I kind of chased the code and found that the problem is like this:
>   volatile unsigned char u8;
> 
>   void test (void)
>   {
>     u8 = u8 + u8;
>     u8 = u8 - u8;
>   }
> 
> $./gcc/cc1 test.c -o test.s -fdump-rtl-all-all -fdiagnostics-plain-output  -Os -fdump-rtl-final -ffat-lto-objects
> 
> test.c.301r.outof_cfglayout
> 
>  (insn 7 6 9 2 (set (reg:DI 80 [ u8.0_1 ])
>         (zero_extend:DI (mem/v/c:QI (symbol_ref:DI ("*.LANCHOR0") [flags 0x182]) [0 u8D.2193+0 S1 A8]))) "volatile.c":5:11 459 {simple_load_uextdiqidi}
>      (nil))
> 
> test.c.302r.split1
> 
> (insn 27 6 28 2 (set (reg:DI 98)
>         (unspec:DI [
>                 (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])
>             ] UNSPEC_PCALAU12I_GR)) "volatile.c":5:11 -1
>      (nil))
> (insn 28 27 9 2 (set (reg:DI 80 [ u8.0_1 ])
>         (zero_extend:DI (mem:QI (lo_sum:DI (reg:DI 98)
>                     (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])) [0  S1 A8]))) "volatile.c":5:11 -1
>      (nil))
> 
> The volatile property of the mem here is gone, so the test fails.

Phew.  I guess I couldn't reproduce it because I have Jeff's ext-dce
patch in my local repo, which removed the zero_extend...

I'll rework this patch.
diff mbox series

Patch

diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 7b26d15aa4e..4009de408fb 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -4033,101 +4033,110 @@  (define_insn "loongarch_crcc_w_<size>_w"
 ;;
 ;; And if the pseudo op cannot be relaxed, we'll get a worse result (with
 ;; 3 instructions).
-(define_peephole2
-  [(set (match_operand:P 0 "register_operand")
-	(match_operand:P 1 "symbolic_pcrel_operand"))
-   (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
-	(mem:LD_AT_LEAST_32_BIT (match_dup 0)))]
-  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
-   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
-   && (peep2_reg_dead_p (2, operands[0]) \
-       || REGNO (operands[0]) == REGNO (operands[2]))"
-  [(set (match_dup 2)
-	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
+(define_insn_and_split "simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>"
+  [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
+	(mem:LD_AT_LEAST_32_BIT
+	  (match_operand:P 1 "symbolic_pcrel_operand" "")))]
+  "loongarch_pre_reload_split () \
+   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+  "#"
+  ""
+  [(set (match_dup 0)
+	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
   {
-    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+    operands[2] = gen_reg_rtx (Pmode);
+    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
   })
 
-(define_peephole2
-  [(set (match_operand:P 0 "register_operand")
-	(match_operand:P 1 "symbolic_pcrel_operand"))
-   (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
-	(mem:LD_AT_LEAST_32_BIT (plus (match_dup 0)
-				(match_operand 3 "const_int_operand"))))]
-  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
-   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
-   && (peep2_reg_dead_p (2, operands[0]) \
-       || REGNO (operands[0]) == REGNO (operands[2]))"
-  [(set (match_dup 2)
-	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
+(define_insn_and_split "simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>"
+  [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
+	(mem:LD_AT_LEAST_32_BIT
+	  (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
+		(match_operand 2 "const_int_operand" ""))))]
+  "loongarch_pre_reload_split () \
+   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+  "#"
+  ""
+  [(set (match_dup 0)
+	(mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
   {
-    operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
-    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+    HOST_WIDE_INT offset = INTVAL (operands[2]);
+    operands[2] = gen_reg_rtx (Pmode);
+    operands[1] = plus_constant (Pmode, operands[1], offset);
+    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
   })
 
-(define_peephole2
-  [(set (match_operand:P 0 "register_operand")
-	(match_operand:P 1 "symbolic_pcrel_operand"))
-   (set (match_operand:GPR 2 "register_operand")
-	(any_extend:GPR (mem:SUBDI (match_dup 0))))]
-  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
-   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
-   && (peep2_reg_dead_p (2, operands[0]) \
-       || REGNO (operands[0]) == REGNO (operands[2]))"
-  [(set (match_dup 2)
-	(any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
-					     (match_dup 1)))))]
+(define_insn_and_split "simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
+  [(set (match_operand:GPR 0 "register_operand" "=r")
+	(any_extend:GPR
+	  (mem:SUBDI (match_operand:P 1 "symbolic_pcrel_operand" ""))))]
+  "loongarch_pre_reload_split () \
+   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+  "#"
+  ""
+  [(set (match_dup 0)
+	(any_extend:GPR
+	  (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
   {
-    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+    operands[2] = gen_reg_rtx (Pmode);
+    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
   })
 
-(define_peephole2
-  [(set (match_operand:P 0 "register_operand")
-	(match_operand:P 1 "symbolic_pcrel_operand"))
-   (set (match_operand:GPR 2 "register_operand")
+(define_insn_and_split
+  "simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
+  [(set (match_operand:GPR 0 "register_operand" "=r")
+	(any_extend:GPR
+	  (mem:SUBDI
+	    (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
+		  (match_operand 2 "const_int_operand" "")))))]
+  "loongarch_pre_reload_split () \
+   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+  "#"
+  ""
+  [(set (match_dup 0)
 	(any_extend:GPR
-	  (mem:SUBDI (plus (match_dup 0)
-			   (match_operand 3 "const_int_operand")))))]
-  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
-   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
-   && (peep2_reg_dead_p (2, operands[0]) \
-       || REGNO (operands[0]) == REGNO (operands[2]))"
-  [(set (match_dup 2)
-	(any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
-					     (match_dup 1)))))]
+	  (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
   {
-    operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
-    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+    HOST_WIDE_INT offset = INTVAL (operands[2]);
+    operands[2] = gen_reg_rtx (Pmode);
+    operands[1] = plus_constant (Pmode, operands[1], offset);
+    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
   })
 
-(define_peephole2
-  [(set (match_operand:P 0 "register_operand")
-	(match_operand:P 1 "symbolic_pcrel_operand"))
-   (set (mem:ST_ANY (match_dup 0))
-	(match_operand:ST_ANY 2 "register_operand"))]
-  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
-   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
-   && (peep2_reg_dead_p (2, operands[0])) \
-   && REGNO (operands[0]) != REGNO (operands[2])"
-  [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
+(define_insn_and_split "simple_store<ST_ANY:mode><P:mode>"
+  [(set (mem:ST_ANY (match_operand:P 0 "symbolic_pcrel_operand"))
+	(match_operand:ST_ANY 1 "register_operand" "r,f"))]
+  "loongarch_pre_reload_split () \
+   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+  "#"
+  ""
+  [(set (mem:ST_ANY (lo_sum:P (match_dup 2) (match_dup 0))) (match_dup 1))]
   {
-    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+    operands[2] = gen_reg_rtx (Pmode);
+    emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[0]));
   })
 
-(define_peephole2
-  [(set (match_operand:P 0 "register_operand")
-	(match_operand:P 1 "symbolic_pcrel_operand"))
-   (set (mem:ST_ANY (plus (match_dup 0)
-			  (match_operand 3 "const_int_operand")))
-	(match_operand:ST_ANY 2 "register_operand"))]
-  "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
-   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
-   && (peep2_reg_dead_p (2, operands[0])) \
-   && REGNO (operands[0]) != REGNO (operands[2])"
-  [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
+(define_insn_and_split "simple_store_off<ST_ANY:mode><P:mode>"
+  [(set (mem:ST_ANY
+	  (plus (match_operand:P 0 "symbolic_pcrel_operand" "")
+		(match_operand 1 "const_int_operand" "")))
+	(match_operand:ST_ANY 2 "register_operand" "r,f"))]
+  "loongarch_pre_reload_split () \
+   && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+   && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+  "#"
+  ""
+  [(set (mem:ST_ANY (lo_sum:P (match_dup 1) (match_dup 0))) (match_dup 2))]
   {
-    operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
-    emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+    HOST_WIDE_INT offset = INTVAL (operands[1]);
+    operands[1] = gen_reg_rtx (Pmode);
+    operands[0] = plus_constant (Pmode, operands[0], offset);
+    emit_insn (gen_pcalau12i_gr<P:mode> (operands[1], operands[0]));
   })
 
 ;; Synchronization instructions.
diff --git a/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
new file mode 100644
index 00000000000..42cb966d1e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=loongarch64 -mabi=lp64d -mexplicit-relocs=auto" } */
+
+float a[8001];
+float
+t (void)
+{
+  return a[0] + a[8000];
+}
+
+/* { dg-final { scan-assembler-not "la.local" } } */