diff mbox series

AArch64: Improve GOT addressing

Message ID VE1PR08MB559917AA1989855BAC5E8B3B83599@VE1PR08MB5599.eurprd08.prod.outlook.com
State New
Headers show
Series AArch64: Improve GOT addressing | expand

Commit Message

Wilco Dijkstra May 5, 2021, 1:17 p.m. UTC
Improve GOT addressing by emitting the instructions as a pair.  This reduces
register pressure and improves code quality. With -fPIC codesize improves by
0.65% and SPECINT2017 improves by 0.25%.

Passes bootstrap and regress. OK for commit?

ChangeLog:
2021-05-05  Wilco Dijkstra  <wdijkstr@arm.com>

        * config/aarch64/aarch64.md (ldr_got_small_<mode>): Emit ADRP+LDR GOT sequence.
        (ldr_got_small_sidi): Likewise.
        * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Remove tmp_reg.
        (aarch64_print_operand): Correctly print got_lo12 in L specifier.

---

Comments

Richard Sandiford May 10, 2021, 12:05 p.m. UTC | #1
Wilco Dijkstra via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Improve GOT addressing by emitting the instructions as a pair.  This reduces
> register pressure and improves code quality. With -fPIC codesize improves by
> 0.65% and SPECINT2017 improves by 0.25%.
>
> Passes bootstrap and regress. OK for commit?

Normally we should only put two instructions in the same define_insn
if there's a specific ABI or architectural reason for not separating
them.  Doing it purely for optimisation reasons is going against the
general direction of travel.  So I think the first question is: why
don't we simply delay the split until after reload instead, since
that's the more normal way of handling this kind of thing?

Also, the patch means that we use RTL of the form:

  (set (reg:PTR R)
       (unspec:PTR [(mem:PTR (symbol_ref:PTR S))]
                   UNSPEC_GOTSMALLPIC))

to represent the move of S into R.  This should just be represented as:

  (set (reg:PTR R) (symbol_ref:PTR S))

and go through the normal move patterns.

Thanks,
Richard

> ChangeLog:
> 2021-05-05  Wilco Dijkstra  <wdijkstr@arm.com>
>
>         * config/aarch64/aarch64.md (ldr_got_small_<mode>): Emit ADRP+LDR GOT sequence.
>         (ldr_got_small_sidi): Likewise.
>         * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Remove tmp_reg.
>         (aarch64_print_operand): Correctly print got_lo12 in L specifier.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 641c83b479e76cbcc75b299eb7ae5f634d9db7cd..32c5c76d3c001a79d2a69b7f8243f1f1f605f901 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -3625,27 +3625,21 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
>  
>  	rtx insn;
>  	rtx mem;
> -	rtx tmp_reg = dest;
>  	machine_mode mode = GET_MODE (dest);
>  
> -	if (can_create_pseudo_p ())
> -	  tmp_reg = gen_reg_rtx (mode);
> -
> -	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
>  	if (mode == ptr_mode)
>  	  {
>  	    if (mode == DImode)
> -	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
> +	      insn = gen_ldr_got_small_di (dest, imm);
>  	    else
> -	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
> +	      insn = gen_ldr_got_small_si (dest, imm);
>  
>  	    mem = XVECEXP (SET_SRC (insn), 0, 0);
>  	  }
>  	else
>  	  {
>  	    gcc_assert (mode == Pmode);
> -
> -	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
> +	    insn = gen_ldr_got_small_sidi (dest, imm);
>  	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
>  	  }
>  
> @@ -11019,7 +11013,7 @@ aarch64_print_operand (FILE *f, rtx x, int code)
>        switch (aarch64_classify_symbolic_expression (x))
>  	{
>  	case SYMBOL_SMALL_GOT_4G:
> -	  asm_fprintf (asm_out_file, ":lo12:");
> +	  asm_fprintf (asm_out_file, ":got_lo12:");
>  	  break;
>  
>  	case SYMBOL_SMALL_TLSGD:
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index abfd84526745d029ad4953eabad6dd17b159a218..36c5c054f86e9cdd1f0945cdbc1beb47aa7ad80a 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -6705,25 +6705,23 @@ (define_insn "add_losym_<mode>"
>  
>  (define_insn "ldr_got_small_<mode>"
>    [(set (match_operand:PTR 0 "register_operand" "=r")
> -	(unspec:PTR [(mem:PTR (lo_sum:PTR
> -			      (match_operand:PTR 1 "register_operand" "r")
> -			      (match_operand:PTR 2 "aarch64_valid_symref" "S")))]
> +	(unspec:PTR [(mem:PTR (match_operand:PTR 1 "aarch64_valid_symref" "S"))]
>  		    UNSPEC_GOTSMALLPIC))]
>    ""
> -  "ldr\\t%<w>0, [%1, #:got_lo12:%c2]"
> -  [(set_attr "type" "load_<ldst_sz>")]
> +  "adrp\\t%0, %A1\;ldr\\t%<w>0, [%0, %L1]"
> +  [(set_attr "type" "load_<ldst_sz>")
> +   (set_attr "length" "8")]
>  )
>  
>  (define_insn "ldr_got_small_sidi"
>    [(set (match_operand:DI 0 "register_operand" "=r")
>  	(zero_extend:DI
> -	 (unspec:SI [(mem:SI (lo_sum:DI
> -			     (match_operand:DI 1 "register_operand" "r")
> -			     (match_operand:DI 2 "aarch64_valid_symref" "S")))]
> +	 (unspec:SI [(mem:SI (match_operand:DI 1 "aarch64_valid_symref" "S"))]
>  		    UNSPEC_GOTSMALLPIC)))]
>    "TARGET_ILP32"
> -  "ldr\\t%w0, [%1, #:got_lo12:%c2]"
> -  [(set_attr "type" "load_4")]
> +  "adrp\\t%0, %A1\;ldr\\t%w0, [%0, %L1]"
> +  [(set_attr "type" "load_4")
> +   (set_attr "length" "8")]
>  )
>  
>  (define_insn "ldr_got_small_28k_<mode>"
Wilco Dijkstra May 10, 2021, 2:04 p.m. UTC | #2
Hi Richard,

> Normally we should only put two instructions in the same define_insn
> if there's a specific ABI or architectural reason for not separating
> them.  Doing it purely for optimisation reasons is going against the
> general direction of travel.  So I think the first question is: why
> don't we simply delay the split until after reload instead, since
> that's the more normal way of handling this kind of thing?

Well there are no optimizations that benefit from them being split, and there
is no gain from scheduling them independently. Keeping them together
means the linker could perform relaxations on the pair without adding new
relocations. So if we split after reload we'd still want to keep them together.

> This should just be represented as:
>
>  (set (reg:PTR R) (symbol_ref:PTR S))
>
> and go through the normal move patterns.

Yes that should be feasible. Note the UNSPEC is unnecessary even in the
original pattern - given it uses the standard HIGH operator for GOT accesses,
we can also use LO_SUM for the 2nd part since the indirection is hidden.

Cheers,
Wilco
Richard Sandiford May 10, 2021, 2:54 p.m. UTC | #3
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>> Normally we should only put two instructions in the same define_insn
>> if there's a specific ABI or architectural reason for not separating
>> them.  Doing it purely for optimisation reasons is going against the
>> general direction of travel.  So I think the first question is: why
>> don't we simply delay the split until after reload instead, since
>> that's the more normal way of handling this kind of thing?
>
> Well there are no optimizations that benefit from them being split, and there
> is no gain from scheduling them independently. Keeping them together
> means the linker could perform relaxations on the pair without adding new
> relocations. So if we split after reload we'd still want to keep them together.

The burden of proof is the other way though: there has to be a specific
reason for keeping the instructions together, rather than a specific
reason for splitting them.  How we optimise things after RA changes
with time.

Thanks,
Richard
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 641c83b479e76cbcc75b299eb7ae5f634d9db7cd..32c5c76d3c001a79d2a69b7f8243f1f1f605f901 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3625,27 +3625,21 @@  aarch64_load_symref_appropriately (rtx dest, rtx imm,
 
 	rtx insn;
 	rtx mem;
-	rtx tmp_reg = dest;
 	machine_mode mode = GET_MODE (dest);
 
-	if (can_create_pseudo_p ())
-	  tmp_reg = gen_reg_rtx (mode);
-
-	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 	if (mode == ptr_mode)
 	  {
 	    if (mode == DImode)
-	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
+	      insn = gen_ldr_got_small_di (dest, imm);
 	    else
-	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
+	      insn = gen_ldr_got_small_si (dest, imm);
 
 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
 	  }
 	else
 	  {
 	    gcc_assert (mode == Pmode);
-
-	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
+	    insn = gen_ldr_got_small_sidi (dest, imm);
 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
 	  }
 
@@ -11019,7 +11013,7 @@  aarch64_print_operand (FILE *f, rtx x, int code)
       switch (aarch64_classify_symbolic_expression (x))
 	{
 	case SYMBOL_SMALL_GOT_4G:
-	  asm_fprintf (asm_out_file, ":lo12:");
+	  asm_fprintf (asm_out_file, ":got_lo12:");
 	  break;
 
 	case SYMBOL_SMALL_TLSGD:
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index abfd84526745d029ad4953eabad6dd17b159a218..36c5c054f86e9cdd1f0945cdbc1beb47aa7ad80a 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6705,25 +6705,23 @@  (define_insn "add_losym_<mode>"
 
 (define_insn "ldr_got_small_<mode>"
   [(set (match_operand:PTR 0 "register_operand" "=r")
-	(unspec:PTR [(mem:PTR (lo_sum:PTR
-			      (match_operand:PTR 1 "register_operand" "r")
-			      (match_operand:PTR 2 "aarch64_valid_symref" "S")))]
+	(unspec:PTR [(mem:PTR (match_operand:PTR 1 "aarch64_valid_symref" "S"))]
 		    UNSPEC_GOTSMALLPIC))]
   ""
-  "ldr\\t%<w>0, [%1, #:got_lo12:%c2]"
-  [(set_attr "type" "load_<ldst_sz>")]
+  "adrp\\t%0, %A1\;ldr\\t%<w>0, [%0, %L1]"
+  [(set_attr "type" "load_<ldst_sz>")
+   (set_attr "length" "8")]
 )
 
 (define_insn "ldr_got_small_sidi"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
-	 (unspec:SI [(mem:SI (lo_sum:DI
-			     (match_operand:DI 1 "register_operand" "r")
-			     (match_operand:DI 2 "aarch64_valid_symref" "S")))]
+	 (unspec:SI [(mem:SI (match_operand:DI 1 "aarch64_valid_symref" "S"))]
 		    UNSPEC_GOTSMALLPIC)))]
   "TARGET_ILP32"
-  "ldr\\t%w0, [%1, #:got_lo12:%c2]"
-  [(set_attr "type" "load_4")]
+  "adrp\\t%0, %A1\;ldr\\t%w0, [%0, %L1]"
+  [(set_attr "type" "load_4")
+   (set_attr "length" "8")]
 )
 
 (define_insn "ldr_got_small_28k_<mode>"