diff mbox

[AArch64] PR/64134: Make aarch64_expand_vector_init use 'ins' more often

Message ID 55312B31.5@arm.com
State New
Headers show

Commit Message

Alan Lawrence April 17, 2015, 3:48 p.m. UTC
From https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64134, testcase

#define vector __attribute__((vector_size(16)))

float a; float b;
vector float fb(void) { return (vector float){ 0,0,b,a};}

currently produces (correct, but suboptimal):

fb:
         fmov    s0, wzr
         adrp    x1, b
         adrp    x0, a
         sub     sp, sp, #16
         ldr     w1, [x1, #:lo12:b]
         ldr     w0, [x0, #:lo12:a]
         stp     s0, s0, [sp]
         stp     w1, w0, [sp, 8]
         ldr     q0, [sp]
         add     sp, sp, 16
         ret

with this patch:

fb:
         adrp    x1, b
         movi    v0.4s, 0
         adrp    x0, a
         ldr     s2, [x1, #:lo12:b]
         ldr     s1, [x0, #:lo12:a]
         ins     v0.s[2], v2.s[0]
         ins     v0.s[3], v1.s[0]
         ret

The reason is that aarch64_expand_vector_init presently loads a constant and 
then overwrites with 'ins' only if exactly one element of the vector is 
variable; otherwise, it dumps the entire vector out to the stack (later changed 
to STP) and then loads the whole vector in. This patch changes behaviour to load 
constants and then 'ins' if at most half the elements are variable rather than 
only one.

AFAICT this code path is only used for initialization of GCC vector extension 
vectors, and there is already a special case for all elements being the same 
(e.g. the _dup_ intrinsics). So it doesn't feel worth introducing a 'cost 
model'-type approach for this one use case (such would probably have to be based 
on an assumption about success of STP pattern later anyway). Instead this is a 
(relatively) simple heuristic improvement.

There is a possibility of using ld1 rather than ldr+ins, which *may* generate 
further improvement (probably requiring adrp+add due to limited addressing modes 
of ld1, however); this patch does not tackle that.

Tested on aarch64-none-elf.

gcc/ChangeLog:

	PR target/64134
	* config/aarch64/aarch64.c (aarch64_expand_vector_init): Load constant
	and overwrite variable parts if <= 1/2 the elements are variable.

gcc/testsuite/ChangeLog:

	PR target/64134
	* gcc.target/aarch64/vec_init_1.c: New test.

Comments

Richard Earnshaw April 18, 2015, 1:30 p.m. UTC | #1
On 17/04/15 16:48, Alan Lawrence wrote:
> From https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64134, testcase
> 
> #define vector __attribute__((vector_size(16)))
> 
> float a; float b;
> vector float fb(void) { return (vector float){ 0,0,b,a};}
> 
> currently produces (correct, but suboptimal):
> 
> fb:
>         fmov    s0, wzr
>         adrp    x1, b
>         adrp    x0, a
>         sub     sp, sp, #16
>         ldr     w1, [x1, #:lo12:b]
>         ldr     w0, [x0, #:lo12:a]
>         stp     s0, s0, [sp]
>         stp     w1, w0, [sp, 8]
>         ldr     q0, [sp]
>         add     sp, sp, 16
>         ret
> 
> with this patch:
> 
> fb:
>         adrp    x1, b
>         movi    v0.4s, 0
>         adrp    x0, a
>         ldr     s2, [x1, #:lo12:b]
>         ldr     s1, [x0, #:lo12:a]
>         ins     v0.s[2], v2.s[0]
>         ins     v0.s[3], v1.s[0]
>         ret
> 
> The reason is that aarch64_expand_vector_init presently loads a constant
> and then overwrites with 'ins' only if exactly one element of the vector
> is variable; otherwise, it dumps the entire vector out to the stack
> (later changed to STP) and then loads the whole vector in. This patch
> changes behaviour to load constants and then 'ins' if at most half the
> elements are variable rather than only one.
> 
> AFAICT this code path is only used for initialization of GCC vector
> extension vectors, and there is already a special case for all elements
> being the same (e.g. the _dup_ intrinsics). So it doesn't feel worth
> introducing a 'cost model'-type approach for this one use case (such
> would probably have to be based on an assumption about success of STP
> pattern later anyway). Instead this is a (relatively) simple heuristic
> improvement.
> 
> There is a possibility of using ld1 rather than ldr+ins, which *may*
> generate further improvement (probably requiring adrp+add due to limited
> addressing modes of ld1, however); this patch does not tackle that.
> 
> Tested on aarch64-none-elf.
> 
> gcc/ChangeLog:
> 
>     PR target/64134
>     * config/aarch64/aarch64.c (aarch64_expand_vector_init): Load constant
>     and overwrite variable parts if <= 1/2 the elements are variable.
> 
> gcc/testsuite/ChangeLog:
> 
>     PR target/64134
>     * gcc.target/aarch64/vec_init_1.c: New test.

OK.

R.
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index cba3c1a4d42c7d543e0ed96a7b41fcd9c925f245..767b986e907fbede46b117a271d179b58406afca 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -8769,22 +8769,19 @@  aarch64_expand_vector_init (rtx target, rtx vals)
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
   int n_elts = GET_MODE_NUNITS (mode);
-  int n_var = 0, one_var = -1;
+  int n_var = 0;
+  rtx any_const = NULL_RTX;
   bool all_same = true;
-  rtx x, mem;
-  int i;
 
-  x = XVECEXP (vals, 0, 0);
-  if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
-    n_var = 1, one_var = 0;
-  
-  for (i = 1; i < n_elts; ++i)
+  for (int i = 0; i < n_elts; ++i)
     {
-      x = XVECEXP (vals, 0, i);
+      rtx x = XVECEXP (vals, 0, i);
       if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
-	++n_var, one_var = i;
+	++n_var;
+      else
+	any_const = x;
 
-      if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
+      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
 	all_same = false;
     }
 
@@ -8801,36 +8798,60 @@  aarch64_expand_vector_init (rtx target, rtx vals)
   /* Splat a single non-constant element if we can.  */
   if (all_same)
     {
-      x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
       return;
     }
 
-  /* One field is non-constant.  Load constant then overwrite varying
-     field.  This is more efficient than using the stack.  */
-  if (n_var == 1)
+  /* Half the fields (or less) are non-constant.  Load constant then overwrite
+     varying fields.  Hope that this is more efficient than using the stack.  */
+  if (n_var <= n_elts/2)
     {
       rtx copy = copy_rtx (vals);
-      rtx index = GEN_INT (one_var);
-      enum insn_code icode;
 
-      /* Load constant part of vector, substitute neighboring value for
-	 varying element.  */
-      XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
+      /* Load constant part of vector.  We really don't care what goes into the
+	 parts we will overwrite, but we're more likely to be able to load the
+	 constant efficiently if it has fewer, larger, repeating parts
+	 (see aarch64_simd_valid_immediate).  */
+      for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+	    continue;
+	  rtx subst = any_const;
+	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
+	    {
+	      /* Look in the copied vector, as more elements are const.  */
+	      rtx test = XVECEXP (copy, 0, i ^ bit);
+	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
+		{
+		  subst = test;
+		  break;
+		}
+	    }
+	  XVECEXP (copy, 0, i) = subst;
+	}
       aarch64_expand_vector_init (target, copy);
 
-      /* Insert variable.  */
-      x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
-      icode = optab_handler (vec_set_optab, mode);
+      /* Insert variables.  */
+      enum insn_code icode = optab_handler (vec_set_optab, mode);
       gcc_assert (icode != CODE_FOR_nothing);
-      emit_insn (GEN_FCN (icode) (target, x, index));
+
+      for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+	    continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
       return;
     }
 
   /* Construct the vector in memory one field at a time
      and load the whole vector.  */
-  mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
-  for (i = 0; i < n_elts; i++)
+  rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
+  for (int i = 0; i < n_elts; i++)
     emit_move_insn (adjust_address_nv (mem, inner_mode,
 				    i * GET_MODE_SIZE (inner_mode)),
 		    XVECEXP (vals, 0, i));
diff --git a/gcc/testsuite/gcc.target/aarch64/vec_init_1.c b/gcc/testsuite/gcc.target/aarch64/vec_init_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..64eaff2dadab9de94e5c80174b24e9b216478a54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec_init_1.c
@@ -0,0 +1,34 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -fomit-frame-pointer --save-temps -fno-inline" } */
+
+extern void abort (void);
+
+typedef float float16x4_t __attribute__ ((vector_size ((16))));
+
+float a;
+float b;
+
+float16x4_t
+make_vector ()
+{
+  return (float16x4_t) { 0, 0, a, b };
+}
+
+int
+main (int argc, char **argv)
+{
+  a = 4.0;
+  b = 3.0;
+  float16x4_t vec = make_vector ();
+  if (vec[0] != 0 || vec[1] != 0 || vec[2] != a || vec[3] != b)
+    abort ();
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "ins\\t" 2 } } */
+/* What we want to check, is that make_vector does not stp the whole vector
+   to the stack.  Unfortunately here we scan the body of main() too, which may
+   be a bit fragile - the test is currently passing only because of the option
+   -fomit-frame-pointer which avoids use of stp in the prologue to main().  */
+/* { dg-final { scan-assembler-not "stp\\t" } } */
+/* { dg-final { cleanup-saved-temps } } */