[committed,AArch64] Use SVE ADR to optimise shift-add sequences
diff mbox series

Message ID mptimr0nmv7.fsf@arm.com
State New
Headers show
Series
  • [committed,AArch64] Use SVE ADR to optimise shift-add sequences
Related show

Commit Message

Richard Sandiford Aug. 14, 2019, 8:59 a.m. UTC
This patch uses SVE ADR to optimise shift-and-add and uxtw-and-add
sequences.

Tested on aarch64-linux-gnu (with and without SVE) and aarch64_be-elf.
Applied as r274436.

Richard


2019-08-14  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* config/aarch64/predicates.md (const_1_to_3_operand)
	(aarch64_sve_uxtw_immediate): New predicates.
	* config/aarch64/aarch64-sve.md (*aarch64_adr_uxtw)
	(*aarch64_adr<mode>_shift, *aarch64_adr_shift_uxtw): New patterns.

gcc/testsuite/
	* gcc.target/aarch64/sve/adr_1.c: New test.
	* gcc.target/aarch64/sve/adr_1_run.c: Likewise.
	* gcc.target/aarch64/sve/adr_2.c: Likewise.
	* gcc.target/aarch64/sve/adr_2_run.c: Likewise.
	* gcc.target/aarch64/sve/adr_3.c: Likewise.
	* gcc.target/aarch64/sve/adr_3_run.c: Likewise.
	* gcc.target/aarch64/sve/adr_4.c: Likewise.
	* gcc.target/aarch64/sve/adr_4_run.c: Likewise.
	* gcc.target/aarch64/sve/adr_5.c: Likewise.
	* gcc.target/aarch64/sve/adr_5_run.c: Likewise.
------------------------------------------------------------------------------

Patch
diff mbox series

Index: gcc/config/aarch64/predicates.md
===================================================================
--- gcc/config/aarch64/predicates.md	2019-08-14 09:15:57.617827961 +0100
+++ gcc/config/aarch64/predicates.md	2019-08-14 09:56:55.323680943 +0100
@@ -39,6 +39,13 @@  (define_predicate "const0_operand"
   (and (match_code "const_int")
        (match_test "op == CONST0_RTX (mode)")))
 
+(define_predicate "const_1_to_3_operand"	;; An integer constant 1, 2 or 3, given as a scalar or a vector constant.
+  (match_code "const_int,const_vector")	;; The C test below narrows this to CONST_INTs in the range [1, 3].
+{
+  op = unwrap_const_vec_duplicate (op);
+  return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
+})	;; NOTE(review): unwrap_const_vec_duplicate presumably yields the repeated element of a duplicated vector -- confirm.
+
 (define_special_predicate "subreg_lowpart_operator"
   (and (match_code "subreg")
        (match_test "subreg_lowpart_p (op)")))
@@ -595,6 +602,11 @@  (define_predicate "aarch64_sve_inc_dec_i
   (and (match_code "const,const_vector")
        (match_test "aarch64_sve_inc_dec_immediate_p (op)")))
 
+(define_predicate "aarch64_sve_uxtw_immediate"	;; Vector constant used as the AND mask in the uxtw ADR patterns.
+  (and (match_code "const_vector")
+       (match_test "GET_MODE_UNIT_BITSIZE (GET_MODE (op)) > 32")	;; Elements must be wider than 32 bits.
+       (match_test "aarch64_const_vec_all_same_int_p (op, 0xffffffff)")))	;; Presumably: every element equals 0xffffffff -- confirm helper.
+
 (define_predicate "aarch64_sve_logical_immediate"
   (and (match_code "const,const_vector")
        (match_test "aarch64_sve_bitmask_immediate_p (op)")))
Index: gcc/config/aarch64/aarch64-sve.md
===================================================================
--- gcc/config/aarch64/aarch64-sve.md	2019-08-14 09:54:30.808741952 +0100
+++ gcc/config/aarch64/aarch64-sve.md	2019-08-14 09:56:55.323680943 +0100
@@ -61,6 +61,7 @@ 
 ;; ---- [INT] General binary arithmetic corresponding to rtx codes
 ;; ---- [INT] Addition
 ;; ---- [INT] Subtraction
+;; ---- [INT] Take address
 ;; ---- [INT] Absolute difference
 ;; ---- [INT] Multiplication
 ;; ---- [INT] Highpart multiplication
@@ -1672,6 +1673,65 @@  (define_insn "sub<mode>3"
 ;; Merging forms are handled through SVE_INT_BINARY.
 
 ;; -------------------------------------------------------------------------
+;; ---- [INT] Take address
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - ADR
+;; -------------------------------------------------------------------------
+
+;; Unshifted ADR, with the offset being zero-extended from the low 32 bits.
+(define_insn "*aarch64_adr_uxtw"	;; Matches base + (offset AND uxtw-mask) on VNx2DI and emits one ADR.
+  [(set (match_operand:VNx2DI 0 "register_operand" "=w")	;; Operand 0: result.
+	(plus:VNx2DI
+	  (and:VNx2DI
+	    (match_operand:VNx2DI 2 "register_operand" "w")	;; Operand 2: offset, masked by operand 3.
+	    (match_operand:VNx2DI 3 "aarch64_sve_uxtw_immediate"))	;; Operand 3: the mask; it is not printed in the template.
+	  (match_operand:VNx2DI 1 "register_operand" "w")))]	;; Operand 1: base.
+  "TARGET_SVE"
+  "adr\t%0.d, [%1.d, %2.d, uxtw]"
+)
+
+;; ADR with a nonzero shift.
+(define_insn_and_rewrite "*aarch64_adr<mode>_shift"	;; Matches base + (index << 1..3) over the SVE_SDI modes.
+  [(set (match_operand:SVE_SDI 0 "register_operand" "=w")	;; Operand 0: result.
+	(plus:SVE_SDI
+	  (unspec:SVE_SDI
+	    [(match_operand 4)	;; Operand 4: first UNSPEC_PRED_X input (presumably the predicate -- confirm); unused by the template.
+	     (ashift:SVE_SDI
+	       (match_operand:SVE_SDI 2 "register_operand" "w")	;; Operand 2: index being shifted.
+	       (match_operand:SVE_SDI 3 "const_1_to_3_operand"))]	;; Operand 3: shift amount, printed after "lsl".
+	    UNSPEC_PRED_X)
+	  (match_operand:SVE_SDI 1 "register_operand" "w")))]	;; Operand 1: base.
+  "TARGET_SVE"
+  "adr\t%0.<Vetype>, [%1.<Vetype>, %2.<Vetype>, lsl %3]"
+  "&& !CONSTANT_P (operands[4])"	;; Rewrite operand 4 to CONSTM1_RTX (<VPRED>mode) unless it is already constant.
+  {
+    operands[4] = CONSTM1_RTX (<VPRED>mode);
+  }
+)
+
+;; Same, but with the index being zero-extended from the low 32 bits.
+(define_insn_and_rewrite "*aarch64_adr_shift_uxtw"	;; Matches base + ((index AND uxtw-mask) << 1..3) on VNx2DI.
+  [(set (match_operand:VNx2DI 0 "register_operand" "=w")	;; Operand 0: result.
+	(plus:VNx2DI
+	  (unspec:VNx2DI
+	    [(match_operand 5)	;; Operand 5: first UNSPEC_PRED_X input (presumably the predicate -- confirm); unused by the template.
+	     (ashift:VNx2DI
+	       (and:VNx2DI
+		 (match_operand:VNx2DI 2 "register_operand" "w")	;; Operand 2: index, masked by operand 4.
+		 (match_operand:VNx2DI 4 "aarch64_sve_uxtw_immediate"))	;; Operand 4: the mask; it is not printed in the template.
+	       (match_operand:VNx2DI 3 "const_1_to_3_operand"))]	;; Operand 3: shift amount, printed after "uxtw".
+	    UNSPEC_PRED_X)
+	  (match_operand:VNx2DI 1 "register_operand" "w")))]	;; Operand 1: base.
+  "TARGET_SVE"
+  "adr\t%0.d, [%1.d, %2.d, uxtw %3]"
+  "&& !CONSTANT_P (operands[5])"	;; Rewrite operand 5 to CONSTM1_RTX (VNx2BImode) unless it is already constant.
+  {
+    operands[5] = CONSTM1_RTX (VNx2BImode);
+  }
+)
+
+;; -------------------------------------------------------------------------
 ;; ---- [INT] Absolute difference
 ;; -------------------------------------------------------------------------
 ;; Includes:
Index: gcc/testsuite/gcc.target/aarch64/sve/adr_1.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/adr_1.c	2019-08-14 09:56:55.323680943 +0100
@@ -0,0 +1,46 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#ifndef FACTOR	/* Files that include this one may predefine FACTOR; 2 is the default.  */
+#define FACTOR 2
+#endif
+
+#define LOOP(TYPE)	/* Defines test_TYPE: dst[i] += src[i] * FACTOR for 0 <= i < count.  */	\
+  __attribute__ ((noipa))					\
+  void								\
+  test_##TYPE (TYPE *restrict dst, TYPE *restrict src,		\
+	       int count)					\
+  {								\
+    for (int i = 0; i < count; ++i)				\
+      dst[i] += src[i] * FACTOR;	/* Scale-and-add; the dg-final checks expect ADR for .s and .d only.  */	\
+  }
+
+#define TEST_ALL(T) /* Applies T to each signed and unsigned 8-, 16-, 32- and 64-bit integer type.  */ \
+  T (int8_t) \
+  T (int16_t) \
+  T (int32_t) \
+  T (int64_t) \
+  T (uint8_t) \
+  T (uint16_t) \
+  T (uint32_t) \
+  T (uint64_t)
+
+TEST_ALL (LOOP)
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b,} 2 } } */
+/* { dg-final { scan-assembler-not {\tadr\tz[0-9]+\.b,} } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-not {\tadr\tz[0-9]+\.h,} } } */
+
+/* { dg-final { scan-assembler-not {\tadd\tz[0-9]+\.s,} } } */
+/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]+\.s,} } } */
+/* { dg-final { scan-assembler-times {\tadr\tz[0-9]+\.s, \[z[0-9]+\.s, z[0-9]+\.s, lsl 1\]} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tadd\tz[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-times {\tadr\tz[0-9]+\.d, \[z[0-9]+\.d, z[0-9]+\.d, lsl 1\]} 2 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/adr_1_run.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/adr_1_run.c	2019-08-14 09:56:55.323680943 +0100
@@ -0,0 +1,31 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "adr_1.c"
+
+#define N 131	/* Number of elements in each test array.  */
+
+#define TEST_LOOP(TYPE)	/* Runs test_TYPE on N elements and aborts on any wrong result.  */	\
+  {								\
+    TYPE a[N], b[N];						\
+    for (int i = 0; i < N; ++i)	/* Fill both arrays with index-derived values.  */	\
+      {								\
+	a[i] = (TYPE) i * i + i % 5;				\
+	b[i] = (TYPE) i * 3 + i % 7;				\
+	asm volatile ("" ::: "memory");	/* NOTE(review): presumably a barrier that keeps this setup loop from being optimised -- confirm.  */	\
+      }								\
+    test_##TYPE (a, b, N);					\
+    for (int i = 0; i < N; ++i)	/* Recompute each expected value and compare it with a[i].  */	\
+      {								\
+	TYPE expected = ((TYPE) (i * i + i % 5)			\
+			 + ((TYPE) i * 3 + i % 7) * FACTOR);	\
+	if (a[i] != expected)					\
+	  __builtin_abort ();					\
+      }								\
+  }
+
+int __attribute__ ((optimize (1)))	/* NOTE(review): presumably keeps the checking code at a lower optimisation level than the functions under test -- confirm.  */
+main (void)
+{
+  TEST_ALL (TEST_LOOP)	/* Expands TEST_LOOP once for every type listed in TEST_ALL.  */
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/adr_2.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/adr_2.c	2019-08-14 09:56:55.323680943 +0100
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define FACTOR 4
+#include "adr_1.c"
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b,} 2 } } */
+/* { dg-final { scan-assembler-not {\tadr\tz[0-9]+\.b,} } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-not {\tadr\tz[0-9]+\.h,} } } */
+
+/* { dg-final { scan-assembler-not {\tadd\tz[0-9]+\.s,} } } */
+/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]+\.s,} } } */
+/* { dg-final { scan-assembler-times {\tadr\tz[0-9]+\.s, \[z[0-9]+\.s, z[0-9]+\.s, lsl 2\]} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tadd\tz[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-times {\tadr\tz[0-9]+\.d, \[z[0-9]+\.d, z[0-9]+\.d, lsl 2\]} 2 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/adr_2_run.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/adr_2_run.c	2019-08-14 09:56:55.323680943 +0100
@@ -0,0 +1,5 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define FACTOR 4
+#include "adr_1_run.c"
Index: gcc/testsuite/gcc.target/aarch64/sve/adr_3.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/adr_3.c	2019-08-14 09:56:55.323680943 +0100
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define FACTOR 8
+#include "adr_1.c"
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b,} 2 } } */
+/* { dg-final { scan-assembler-not {\tadr\tz[0-9]+\.b,} } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-not {\tadr\tz[0-9]+\.h,} } } */
+
+/* { dg-final { scan-assembler-not {\tadd\tz[0-9]+\.s,} } } */
+/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]+\.s,} } } */
+/* { dg-final { scan-assembler-times {\tadr\tz[0-9]+\.s, \[z[0-9]+\.s, z[0-9]+\.s, lsl 3\]} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tadd\tz[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-times {\tadr\tz[0-9]+\.d, \[z[0-9]+\.d, z[0-9]+\.d, lsl 3\]} 2 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/adr_3_run.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/adr_3_run.c	2019-08-14 09:56:55.323680943 +0100
@@ -0,0 +1,5 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define FACTOR 8
+#include "adr_1_run.c"
Index: gcc/testsuite/gcc.target/aarch64/sve/adr_4.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/adr_4.c	2019-08-14 09:56:55.323680943 +0100
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define FACTOR 16
+#include "adr_1.c"
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.[bhsd],} 8 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.[bhsd],} 8 } } */
+/* { dg-final { scan-assembler-not {\tadr\tz[0-9]+\.[bhsd],} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/adr_4_run.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/adr_4_run.c	2019-08-14 09:56:55.323680943 +0100
@@ -0,0 +1,5 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define FACTOR 16
+#include "adr_1_run.c"
Index: gcc/testsuite/gcc.target/aarch64/sve/adr_5.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/adr_5.c	2019-08-14 09:56:55.323680943 +0100
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define LOOP(FACTOR)	/* Defines test_FACTOR: dst[i] += (low 32 bits of src[i]) * FACTOR for 0 <= i < count.  */	\
+  __attribute__ ((noipa))					\
+  void								\
+  test_##FACTOR (uint64_t *restrict dst,			\
+		 uint64_t *restrict src, int count)		\
+  {								\
+    for (int i = 0; i < count; ++i)				\
+      dst[i] += (src[i] & 0xffffffff) * FACTOR;	/* Mask, scale, add; the dg-final checks expect an ADR with uxtw.  */	\
+  }
+
+#define TEST_ALL(T) T (1) T (2) T (4) T (8)	/* Applies T to the factors 1, 2, 4 and 8.  */
+
+TEST_ALL (LOOP)
+
+/* { dg-final { scan-assembler-not {\tadd\tz[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-not {\tuxtw\tz[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-times {\tadr\tz[0-9]+\.d, \[z[0-9]+\.d, z[0-9]+\.d, uxtw\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tadr\tz[0-9]+\.d, \[z[0-9]+\.d, z[0-9]+\.d, uxtw 1\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tadr\tz[0-9]+\.d, \[z[0-9]+\.d, z[0-9]+\.d, uxtw 2\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tadr\tz[0-9]+\.d, \[z[0-9]+\.d, z[0-9]+\.d, uxtw 3\]} 1 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/adr_5_run.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/adr_5_run.c	2019-08-14 09:56:55.323680943 +0100
@@ -0,0 +1,32 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "adr_5.c"
+
+#define N 131	/* Number of elements in each test array.  */
+
+#define TEST_LOOP(FACTOR)	/* Runs test_FACTOR on N elements and aborts on any wrong result.  */	\
+  {									\
+    uint64_t a[N], b[N];						\
+    for (int i = 0; i < N; ++i)	/* Fill both arrays with index-derived values.  */	\
+      {									\
+	a[i] = (uint64_t) i * i + i % 5;				\
+	b[i] = (uint64_t) (i * 3) << ((i & 7) * 8);	/* Shift by 0, 8, ..., 56 bits, so some inputs have bits above bit 31.  */	\
+	asm volatile ("" ::: "memory");	/* NOTE(review): presumably a barrier that keeps this setup loop from being optimised -- confirm.  */	\
+      }									\
+    test_##FACTOR (a, b, N);						\
+    for (int i = 0; i < N; ++i)	/* Recompute each expected value and compare it with a[i].  */	\
+      {									\
+	uint64_t expected = ((uint64_t) (i * i + i % 5)			\
+			     + (((uint64_t) (i * 3) << ((i & 7) * 8))	\
+				& 0xffffffff) * FACTOR);		\
+	if (a[i] != expected)						\
+	  __builtin_abort ();						\
+      }									\
+  }
+
+int __attribute__ ((optimize (1)))	/* NOTE(review): presumably keeps the checking code at a lower optimisation level than the functions under test -- confirm.  */
+main (void)
+{
+  TEST_ALL (TEST_LOOP)	/* Expands TEST_LOOP once for every factor listed in TEST_ALL.  */
+}