diff mbox

[AArch64] PR target/71663 Improve Vector Initializtion

Message ID CO2PR07MB26941DBF2C22CDA616A2967983110@CO2PR07MB2694.namprd07.prod.outlook.com
State New
Headers show

Commit Message

Hurugalawadi, Naveen April 26, 2017, 6:34 a.m. UTC
Hi Kyrill,

Thanks for the review and your comments.

>> It would be useful if you expanded a bit on the approach used to
>> generate the improved codegen

The patch creates a duplicate of most common element and tries to optimize
the insertion using dup for the element followed by insertions.

Current code:
============================================
        movi    v2.4s, 0
        ins     v2.s[0], v0.s[0]
        ins     v2.s[1], v1.s[0]
        ins     v2.s[2], v0.s[0]
        orr     v0.16b, v2.16b, v2.16b
        ins     v0.s[3], v3.s[0]
        ret
============================================

Code after the patch:
============================================
        dup     v0.4s, v0.s[0]
        ins     v0.s[1], v1.s[0]
        ins     v0.s[3], v3.s[0]
        ret
============================================

>> Some typos

Modified as required

>> worth adding a testcase where one of the vector elements appears more than
>> the others?

Modified the testcase as required using common element.

Please review the patch and let us know if its okay?
Bootstrapped and Regression tested on aarch64-thunder-linux.

Thanks,
Naveen

Comments

Hurugalawadi, Naveen May 11, 2017, 4:54 a.m. UTC | #1
Hi,  

Please consider this as a personal reminder to review the patch
at following link and let me know your comments on the same.  

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01260.html

Thanks,
Naveen
Hurugalawadi, Naveen May 26, 2017, 6:25 a.m. UTC | #2
Hi,  

Please consider this as a personal reminder to review the patch
at following link and let me know your comments on the same.  

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01260.html

Thanks,
Naveen
James Greenhalgh June 9, 2017, 2:16 p.m. UTC | #3
On Wed, Apr 26, 2017 at 06:34:36AM +0000, Hurugalawadi, Naveen wrote:
> Hi Kyrill,
> 
> Thanks for the review and your comments.
> 
> >> It would be useful if you expanded a bit on the approach used to
> >> generate the improved codegen
> 
> The patch creates a duplicate of most common element and tries to optimize
> the insertion using dup for the element followed by insertions.
> 
> Current code:
> ============================================
>         movi    v2.4s, 0
>         ins     v2.s[0], v0.s[0]
>         ins     v2.s[1], v1.s[0]
>         ins     v2.s[2], v0.s[0]
>         orr     v0.16b, v2.16b, v2.16b
>         ins     v0.s[3], v3.s[0]
>         ret
> ============================================
> 
> Code after the patch:
> ============================================
>         dup     v0.4s, v0.s[0]
>         ins     v0.s[1], v1.s[0]
>         ins     v0.s[3], v3.s[0]
>         ret
> ============================================
> 
> >> Some typos
> 
> Modified as required
> 
> >> worth adding a testcase where one of the vector elements appears more than
> >> the others?
> 
> Modified the testcase as required using common element.
> 
> Please review the patch and let us know if its okay?
> Bootstrapped and Regression tested on aarch64-thunder-linux.

This patch fell through the cracks as it shows up in a reply chain with a
few others. If you could try to keep one reply chain for each patch series
you're submitting, that would make tracking the submissions much
easier :-).


> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 2e385c4..8747a23 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -11671,11 +11671,54 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>        aarch64_expand_vector_init (target, copy);
>      }
>  
> -  /* Insert the variable lanes directly.  */
> -
>    enum insn_code icode = optab_handler (vec_set_optab, mode);
>    gcc_assert (icode != CODE_FOR_nothing);
>  
> +  /* If there are only variable elements, try to optimize
> +     the insertion using dup for the most common element
> +     followed by insertions.  */
> +  if (n_var == n_elts && n_elts <= 16)
> +    {
> +      int matches[16][2];
> +      int nummatches = 0;
> +      memset (matches, 0, sizeof(matches));

Very minor, but what is wrong with:

  int matches[16][2] = {0};

Rather than the explicit memset?

> +      for(int i = 0; i < n_elts; i++)
> +	{
> +	  for (int j = 0; j <= i; j++)
> +	    {
> +	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
> +		{
> +		  matches[i][0] = j;
> +		  matches[j][1]++;
> +		  if (i != j)
> +		    nummatches++;

nummatches is unused.

This search algorithm is tough to follow. A comment explaining that you will
fill matches[*][0] with the earliest matching element, and matches[X][1]
with the count of duplicate elements (if X is the earliest element which has
duplicates). Would be useful to understand exactly what you're aiming for.
Certainly it took me a while to understand that for:

  { a, b, c, b, b, d, c, e }

matches would look like:

  { { 0, 1 },
    { 1, 3 },
    { 2. 2 },
    { 1, 0 },
    { 1, 0 },
    { 5, 1 },
    { 2, 0 },
    { 7, 1 } }

> +		}
> +	    }
> +	}
> +      int maxelement = 0;
> +      int maxv = 0;
> +      for (int i = 0; i < n_elts; i++)
> +	if (matches[i][1] > maxv)
> +	  maxelement = i, maxv = matches[i][1];

Put braces round this and write it as two statements, or eliminate the use of
maxv and use matches[i][1] > matches[maxelement][1], but don't use the comma
operator like this please.

> +      /* Create a duplicate of the most common element.  */
> +      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
> +      aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
> +
> +      /* Insert the rest.  */
> +      for (int i = 0; i < n_elts; i++)
> +	{
> +	  rtx x = XVECEXP (vals, 0, i);
> +	  if (matches[i][0] == maxelement)
> +	    continue;
> +	  x = copy_to_mode_reg (inner_mode, x);
> +	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
> +	}
> +      return;
> +    }
> +
> +  /* Insert the variable lanes directly.  */

This code would read better if you rearranged the cases. As it stands your
code is added in the middle of a logical operation (deal with constant lanes,
then deal with variable lanes). Move your new code above the part-variable
case.

>    for (int i = 0; i < n_elts; i++)
>      {
>        rtx x = XVECEXP (vals, 0, i);
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c b/gcc/testsuite/gcc.target/aarch64/pr71663.c
> new file mode 100644
> index 0000000..a043a21
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#define vector __attribute__((vector_size(16)))
> +
> +vector float combine (float a, float b, float c, float d)

c is unused.

> +{
> +  return (vector float) { a, b, a, d };
> +}
> +
> +/* { dg-final { scan-assembler-not "movi\t" } } */
> +/* { dg-final { scan-assembler-not "orr\t" } } */
> +/* { dg-final { scan-assembler-times "ins\t" 2 } } */
> +/* { dg-final { scan-assembler-times "dup\t" 1 } } */
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..8747a23 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11671,11 +11671,54 @@  aarch64_expand_vector_init (rtx target, rtx vals)
       aarch64_expand_vector_init (target, copy);
     }
 
-  /* Insert the variable lanes directly.  */
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
+  /* If there are only variable elements, try to optimize
+     the insertion using dup for the most common element
+     followed by insertions.  */
+  if (n_var == n_elts && n_elts <= 16)
+    {
+      int matches[16][2];
+      int nummatches = 0;
+      memset (matches, 0, sizeof(matches));
+      for(int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	    {
+	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  if (i != j)
+		    nummatches++;
+		  break;
+		}
+	    }
+	}
+      int maxelement = 0;
+      int maxv = 0;
+      for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  maxelement = i, maxv = matches[i][1];
+
+      /* Create a duplicate of the most common element.  */
+      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+      aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+      /* Insert the rest.  */
+      for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	    continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+      return;
+    }
+
+  /* Insert the variable lanes directly.  */
   for (int i = 0; i < n_elts; i++)
     {
       rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c b/gcc/testsuite/gcc.target/aarch64/pr71663.c
new file mode 100644
index 0000000..a043a21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float c, float d)
+{
+  return (vector float) { a, b, a, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
+/* { dg-final { scan-assembler-times "ins\t" 2 } } */
+/* { dg-final { scan-assembler-times "dup\t" 1 } } */