diff mbox

[AArch64] PR target/71663 Improve Vector Initializtion

Message ID CO2PR07MB26944CDE12E84FD22A41F68583870@CO2PR07MB2694.namprd07.prod.outlook.com
State New
Headers show

Commit Message

Hurugalawadi, Naveen Dec. 9, 2016, 3:28 a.m. UTC
Hi,

The AArch64 vector initialization sequence can be optimized to generate
better code. The attached patch handles for the case where the vector
contains only variables. It checks for the common elements in the vector
and inserts the values in optimized way.

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if its okay?

2016-12-09  Andrew PInski  <apinski@cavium.com>

gcc
	* config/aarch64/aarch64.c (aarch64_expand_vector_init):
	Improve vector initialization code gen.
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e87831f..da5b6fa 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11609,11 +11609,54 @@  aarch64_expand_vector_init (rtx target, rtx vals)
       aarch64_expand_vector_init (target, copy);
     }
 
-  /* Insert the variable lanes directly.  */
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
+  /* If there is only varables, try to optimize
+     the inseration using dup for the most common element
+     followed by insertations. */
+  if (n_var == n_elts && n_elts <= 16)
+    {
+      int matches[16][2];
+      int nummatches = 0;
+      memset (matches, 0, sizeof(matches));
+      for(int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	    {
+	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  if (i != j)
+		    nummatches++;
+		  break;
+		}
+	    }
+	}
+      int maxelement = 0;
+      int maxv = 0;
+      for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  maxelement = i, maxv = matches[i][1];
+
+      /* Create a duplicate of the most common element. */
+      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+      aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+      /* Insert the rest. */
+      for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	    continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+      return;
+    }
+
+  /* Insert the variable lanes directly.  */
+
   for (int i = 0; i < n_elts; i++)
     {
       rtx x = XVECEXP (vals, 0, i);