diff mbox

[PING,AArch64] Peepholes to generate ldp and stp instructions

Message ID 7aad32511cb04e4f9707b3626e1116a2@SN2PR07MB029.namprd07.prod.outlook.com
State New
Headers show

Commit Message

Hurugalawadi, Naveen Oct. 29, 2013, 9:22 a.m. UTC
Hi,

Please consider this as a reminder to review the ldp and stp peephole
implementation for AArch64 target.

The patch was originally posted at:-
http://gcc.gnu.org/ml/gcc-patches/2013-03/msg01051.html

Please review the same and let me know if its okay.
 
Build and tested on aarch64-thunder-elf (using Cavium's internal
simulator). No new regressions.

2013-10-29   Naveen H.S  <Naveen.Hurugalawadi@caviumnetworks.com>

gcc/
	* config/aarch64/aarch64.md (peephole2s to generate ldp
	instruction for 2 consecutive loads from memory): New.
	(peephole2s to generate stp instruction for 2 consecutive
	stores to memory in integer mode): New.
	(peephole2s to generate ldp instruction for 2 consecutive
	loads from memory in floating point mode): New.
	(peephole2s to generate stp instruction for 2 consecutive
	stores to memory in floating point mode): New.

gcc/testsuite
	* gcc.target/aarch64/ldp-stp.c: New testcase
    
Thanks,
Naveen

Comments

Ramana Radhakrishnan Oct. 29, 2013, 9:52 a.m. UTC | #1
You are better off CCing the maintainers for such reviews. Let me do 
that for you. I cannot approve or reject this patch but I have a few 
comments as below.

On 10/29/13 09:22, Hurugalawadi, Naveen wrote:
>
> diff -uprN '-x*.orig' mainline-orig/gcc/config/aarch64/aarch64.md gcc-4.8.0/gcc/config/aarch64/aarch64.md
> --- mainline-orig/gcc/config/aarch64/aarch64.md	2013-10-28 17:15:52.363975264 +0530
> +++ gcc-4.8.0/gcc/config/aarch64/aarch64.md	2013-10-29 10:11:09.656082201 +0530
> @@ -1068,6 +1068,27 @@
>      (set_attr "mode" "<MODE>")]
>   )
>
> +(define_peephole2
> +  [(set (match_operand:GPI 0 "register_operand")
> +	(match_operand:GPI 1 "aarch64_mem_pair_operand"))
> +   (set (match_operand:GPI 2 "register_operand")
> +	(match_operand:GPI 3 "memory_operand"))]
> +  "GET_CODE (operands[1]) == MEM
> +   && GET_CODE (XEXP (operands[1], 0)) == PLUS
> +   && GET_CODE (XEXP (XEXP (operands[1], 0), 0)) == REG

Use the predicates REG_P , CONST_INT_P and so on. Don't do this in this 
form.

> +   && GET_CODE (XEXP (XEXP (operands[1], 0), 1)) == CONST_INT
> +   && GET_MODE (operands[0]) == GET_MODE (XEXP (XEXP (operands[1], 0), 0))
> +   && REGNO (operands[0]) != REGNO (operands[2])
> +   && REGNO_REG_CLASS (REGNO (operands[0]))
> +      == REGNO_REG_CLASS (REGNO (operands[2]))
> +   && rtx_equal_p (XEXP (operands[3], 0),
> +		   plus_constant (Pmode, XEXP (operands[1], 0),
> +				  GET_MODE_SIZE (<MODE>mode)))
> +   && optimize_size"

Why use optimize_size here ? I would have thought it was always good to 
generate ldp and stp instructions. If you want to make it a core 
specific decision make it a tuning decision but don't disable just based 
on size.

Additionally if this check is common between all the patterns it would 
be better to factor this out into a predicate or a function in its own 
right.

> +  [(parallel [(set (match_dup 0) (match_dup 1))
> +	      (set (match_dup 2) (match_dup 3))])]
> +)
> +
>   ;; Operands 0 and 2 are tied together by the final condition; so we allow
>   ;; fairly lax checking on the second memory operation.
>   (define_insn "store_pair<mode>"
> @@ -1085,6 +1106,27 @@
>      (set_attr "mode" "<MODE>")]
>   )
>
> +(define_peephole2
> +  [(set (match_operand:GPI 0 "aarch64_mem_pair_operand")
> +	(match_operand:GPI 1 "register_operand"))
> +   (set (match_operand:GPI 2 "memory_operand")
> +	(match_operand:GPI 3 "register_operand"))]
> +  "GET_CODE (operands[0]) == MEM
> +   && GET_CODE (XEXP (operands[0], 0)) == PLUS
> +   && GET_CODE (XEXP (XEXP (operands[0], 0), 0)) == REG
> +   && GET_CODE (XEXP (XEXP (operands[0], 0), 1)) == CONST_INT
> +   && GET_MODE (operands[1]) == GET_MODE (XEXP (XEXP (operands[0], 0), 0))
> +   && REGNO (operands[1]) != REGNO (operands[3])
> +   && REGNO_REG_CLASS (REGNO (operands[1]))
> +      == REGNO_REG_CLASS (REGNO (operands[3]))
> +   && rtx_equal_p (XEXP (operands[2], 0),
> +		   plus_constant (Pmode, XEXP (operands[0], 0),
> +				  GET_MODE_SIZE (<MODE>mode)))
> +   && optimize_size"
> +  [(parallel [(set (match_dup 0) (match_dup 1))
> +	      (set (match_dup 2) (match_dup 3))])]
> +)
> +
>   ;; Operands 1 and 3 are tied together by the final condition; so we allow
>   ;; fairly lax checking on the second memory operation.
>   (define_insn "load_pair<mode>"
> @@ -1102,6 +1144,28 @@
>      (set_attr "mode" "<MODE>")]
>   )
>
> +(define_peephole2
> +  [(set (match_operand:GPF 0 "register_operand")
> +	(match_operand:GPF 1 "aarch64_mem_pair_operand"))
> +   (set (match_operand:GPF 2 "register_operand")
> +	(match_operand:GPF 3 "memory_operand"))]
> +  "GET_CODE (operands[1]) == MEM
> +   && GET_CODE (XEXP (operands[1], 0)) == PLUS
> +   && GET_CODE (XEXP (XEXP (operands[1], 0), 0)) == REG
> +   && GET_CODE (XEXP (XEXP (operands[1], 0), 1)) == CONST_INT
> +   && GET_MODE (operands[0]) == GET_MODE (XEXP (XEXP (operands[1], 0), 0))
> +   && REGNO (operands[0]) != REGNO (operands[2])
> +   && REGNO (operands[0]) >= 32 && REGNO (operands[2]) >= 32

I'm not sure what this means here but this check doesn't look right - 32 
is V0_REGNUM - why are you checking that here ? If you really need to do 
that check then use V0_REGNUM or it may be better to see if this really 
is a SIMD regnum ? Can you not use something like FP_REGNUM_P here 
instead ?


> +   && REGNO_REG_CLASS (REGNO (operands[0]))
> +      == REGNO_REG_CLASS (REGNO (operands[2]))
> +   && rtx_equal_p (XEXP (operands[3], 0),
> +		   plus_constant (Pmode, XEXP (operands[1], 0),
> +				  GET_MODE_SIZE (<MODE>mode)))
> +   && optimize_size"
> +  [(parallel [(set (match_dup 0) (match_dup 1))
> +	      (set (match_dup 2) (match_dup 3))])]
> +)
> +
>   ;; Operands 0 and 2 are tied together by the final condition; so we allow
>   ;; fairly lax checking on the second memory operation.
>   (define_insn "store_pair<mode>"
> @@ -1119,6 +1183,28 @@
>      (set_attr "mode" "<MODE>")]
>   )
>
> +(define_peephole2
> +  [(set (match_operand:GPF 0 "aarch64_mem_pair_operand")
> +	(match_operand:GPF 1 "register_operand"))
> +   (set (match_operand:GPF 2 "memory_operand")
> +	(match_operand:GPF 3 "register_operand"))]
> +  "GET_CODE (operands[0]) == MEM
> +   && GET_CODE (XEXP (operands[0], 0)) == PLUS
> +   && GET_CODE (XEXP (XEXP (operands[0], 0), 0)) == REG
> +   && GET_CODE (XEXP (XEXP (operands[0], 0), 1)) == CONST_INT
> +   && GET_MODE (operands[1]) == GET_MODE (XEXP (XEXP (operands[0], 0), 0))
> +   && REGNO (operands[1]) != REGNO (operands[3])
> +   && REGNO (operands[1]) >= 32 && REGNO (operands[3]) >= 32
> +   && REGNO_REG_CLASS (REGNO (operands[1]))
> +      == REGNO_REG_CLASS (REGNO (operands[3]))
> +   && rtx_equal_p (XEXP (operands[2], 0),
> +		   plus_constant (Pmode, XEXP (operands[0], 0),
> +				  GET_MODE_SIZE (<MODE>mode)))
> +   && optimize_size"
> +  [(parallel [(set (match_dup 0) (match_dup 1))
> +	      (set (match_dup 2) (match_dup 3))])]
> +)
> +
>   ;; Load pair with writeback.  This is primarily used in function epilogues
>   ;; when restoring [fp,lr]
>   (define_insn "loadwb_pair<GPI:mode>_<P:mode>"
> diff -uprN '-x*.orig' mainline-orig/gcc/testsuite/gcc.target/aarch64/ldp-stp.c gcc-4.8.0/gcc/testsuite/gcc.target/aarch64/ldp-stp.c
> --- mainline-orig/gcc/testsuite/gcc.target/aarch64/ldp-stp.c	1970-01-01 05:30:00.000000000 +0530
> +++ gcc-4.8.0/gcc/testsuite/gcc.target/aarch64/ldp-stp.c	2013-10-28 19:01:11.695986357 +0530
> @@ -0,0 +1,33 @@
> +/* { dg-options "-Os" } */
> +
> +extern void abort (void);
> +
> +typedef struct
> +{
> +  long int x, y;
> +} ldst;
> +
> +void
> +f (ldst p0, ldst p1, ldst p2, ldst p3, ldst p4, ldst p5)
> +{
> +  if (p2.x != 1 || p2.y != -1
> +      || p3.x != -1 || p3.y != 1 || p4.x != 0 || p4.y != -1)
> +    abort ();
> +}
> +
> +void
> +foo ()
> +{
> +  ldst p0, p1, p2, p3, p4, p5;
> +
> +  p4.x = 0;
> +  p4.y = -1;
> +
> +  p5.x = 1;
> +  p5.y = 0;
> +
> +  f (p0, p1, p2, p3, p4, p5);
> +}
> +
> +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 3 } } */
> +/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\]" 3 } } */
diff mbox

Patch

diff -uprN '-x*.orig' mainline-orig/gcc/config/aarch64/aarch64.md gcc-4.8.0/gcc/config/aarch64/aarch64.md
--- mainline-orig/gcc/config/aarch64/aarch64.md	2013-10-28 17:15:52.363975264 +0530
+++ gcc-4.8.0/gcc/config/aarch64/aarch64.md	2013-10-29 10:11:09.656082201 +0530
@@ -1068,6 +1068,27 @@ 
    (set_attr "mode" "<MODE>")]
 )
 
+(define_peephole2
+  [(set (match_operand:GPI 0 "register_operand")
+	(match_operand:GPI 1 "aarch64_mem_pair_operand"))
+   (set (match_operand:GPI 2 "register_operand")
+	(match_operand:GPI 3 "memory_operand"))]
+  "GET_CODE (operands[1]) == MEM
+   && GET_CODE (XEXP (operands[1], 0)) == PLUS
+   && GET_CODE (XEXP (XEXP (operands[1], 0), 0)) == REG
+   && GET_CODE (XEXP (XEXP (operands[1], 0), 1)) == CONST_INT
+   && GET_MODE (operands[0]) == GET_MODE (XEXP (XEXP (operands[1], 0), 0))
+   && REGNO (operands[0]) != REGNO (operands[2])
+   && REGNO_REG_CLASS (REGNO (operands[0]))
+      == REGNO_REG_CLASS (REGNO (operands[2]))
+   && rtx_equal_p (XEXP (operands[3], 0),
+		   plus_constant (Pmode, XEXP (operands[1], 0),
+				  GET_MODE_SIZE (<MODE>mode)))
+   && optimize_size"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+	      (set (match_dup 2) (match_dup 3))])]
+)
+
 ;; Operands 0 and 2 are tied together by the final condition; so we allow
 ;; fairly lax checking on the second memory operation.
 (define_insn "store_pair<mode>"
@@ -1085,6 +1106,27 @@ 
    (set_attr "mode" "<MODE>")]
 )
 
+(define_peephole2
+  [(set (match_operand:GPI 0 "aarch64_mem_pair_operand")
+	(match_operand:GPI 1 "register_operand"))
+   (set (match_operand:GPI 2 "memory_operand")
+	(match_operand:GPI 3 "register_operand"))]
+  "GET_CODE (operands[0]) == MEM
+   && GET_CODE (XEXP (operands[0], 0)) == PLUS
+   && GET_CODE (XEXP (XEXP (operands[0], 0), 0)) == REG
+   && GET_CODE (XEXP (XEXP (operands[0], 0), 1)) == CONST_INT
+   && GET_MODE (operands[1]) == GET_MODE (XEXP (XEXP (operands[0], 0), 0))
+   && REGNO (operands[1]) != REGNO (operands[3])
+   && REGNO_REG_CLASS (REGNO (operands[1]))
+      == REGNO_REG_CLASS (REGNO (operands[3]))
+   && rtx_equal_p (XEXP (operands[2], 0),
+		   plus_constant (Pmode, XEXP (operands[0], 0),
+				  GET_MODE_SIZE (<MODE>mode)))
+   && optimize_size"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+	      (set (match_dup 2) (match_dup 3))])]
+)
+
 ;; Operands 1 and 3 are tied together by the final condition; so we allow
 ;; fairly lax checking on the second memory operation.
 (define_insn "load_pair<mode>"
@@ -1102,6 +1144,28 @@ 
    (set_attr "mode" "<MODE>")]
 )
 
+(define_peephole2
+  [(set (match_operand:GPF 0 "register_operand")
+	(match_operand:GPF 1 "aarch64_mem_pair_operand"))
+   (set (match_operand:GPF 2 "register_operand")
+	(match_operand:GPF 3 "memory_operand"))]
+  "GET_CODE (operands[1]) == MEM
+   && GET_CODE (XEXP (operands[1], 0)) == PLUS
+   && GET_CODE (XEXP (XEXP (operands[1], 0), 0)) == REG
+   && GET_CODE (XEXP (XEXP (operands[1], 0), 1)) == CONST_INT
+   && GET_MODE (operands[0]) == GET_MODE (XEXP (XEXP (operands[1], 0), 0))
+   && REGNO (operands[0]) != REGNO (operands[2])
+   && REGNO (operands[0]) >= 32 && REGNO (operands[2]) >= 32
+   && REGNO_REG_CLASS (REGNO (operands[0]))
+      == REGNO_REG_CLASS (REGNO (operands[2]))
+   && rtx_equal_p (XEXP (operands[3], 0),
+		   plus_constant (Pmode, XEXP (operands[1], 0),
+				  GET_MODE_SIZE (<MODE>mode)))
+   && optimize_size"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+	      (set (match_dup 2) (match_dup 3))])]
+)
+
 ;; Operands 0 and 2 are tied together by the final condition; so we allow
 ;; fairly lax checking on the second memory operation.
 (define_insn "store_pair<mode>"
@@ -1119,6 +1183,28 @@ 
    (set_attr "mode" "<MODE>")]
 )
 
+(define_peephole2
+  [(set (match_operand:GPF 0 "aarch64_mem_pair_operand")
+	(match_operand:GPF 1 "register_operand"))
+   (set (match_operand:GPF 2 "memory_operand")
+	(match_operand:GPF 3 "register_operand"))]
+  "GET_CODE (operands[0]) == MEM
+   && GET_CODE (XEXP (operands[0], 0)) == PLUS
+   && GET_CODE (XEXP (XEXP (operands[0], 0), 0)) == REG
+   && GET_CODE (XEXP (XEXP (operands[0], 0), 1)) == CONST_INT
+   && GET_MODE (operands[1]) == GET_MODE (XEXP (XEXP (operands[0], 0), 0))
+   && REGNO (operands[1]) != REGNO (operands[3])
+   && REGNO (operands[1]) >= 32 && REGNO (operands[3]) >= 32
+   && REGNO_REG_CLASS (REGNO (operands[1]))
+      == REGNO_REG_CLASS (REGNO (operands[3]))
+   && rtx_equal_p (XEXP (operands[2], 0),
+		   plus_constant (Pmode, XEXP (operands[0], 0),
+				  GET_MODE_SIZE (<MODE>mode)))
+   && optimize_size"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+	      (set (match_dup 2) (match_dup 3))])]
+)
+
 ;; Load pair with writeback.  This is primarily used in function epilogues
 ;; when restoring [fp,lr]
 (define_insn "loadwb_pair<GPI:mode>_<P:mode>"
diff -uprN '-x*.orig' mainline-orig/gcc/testsuite/gcc.target/aarch64/ldp-stp.c gcc-4.8.0/gcc/testsuite/gcc.target/aarch64/ldp-stp.c
--- mainline-orig/gcc/testsuite/gcc.target/aarch64/ldp-stp.c	1970-01-01 05:30:00.000000000 +0530
+++ gcc-4.8.0/gcc/testsuite/gcc.target/aarch64/ldp-stp.c	2013-10-28 19:01:11.695986357 +0530
@@ -0,0 +1,33 @@ 
+/* { dg-options "-Os" } */
+
+extern void abort (void);
+
+typedef struct
+{
+  long int x, y;
+} ldst;
+
+void
+f (ldst p0, ldst p1, ldst p2, ldst p3, ldst p4, ldst p5)
+{
+  if (p2.x != 1 || p2.y != -1
+      || p3.x != -1 || p3.y != 1 || p4.x != 0 || p4.y != -1)
+    abort ();
+}
+
+void
+foo ()
+{
+  ldst p0, p1, p2, p3, p4, p5;
+
+  p4.x = 0;
+  p4.y = -1;
+
+  p5.x = 1;
+  p5.y = 0;
+
+  f (p0, p1, p2, p3, p4, p5);
+}
+
+/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 3 } } */
+/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\]" 3 } } */