
Fold more shuffle builtins to VEC_PERM_EXPR.

Message ID 20210826045751.40630-1-hongtao.liu@intel.com
State New
Series Fold more shuffle builtins to VEC_PERM_EXPR.

Commit Message

liuhongt Aug. 26, 2021, 4:57 a.m. UTC
This patch is a follow-up to [1]; it folds all shufps/shufpd builtins into gimple.
Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html
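
As a rough illustration (my sketch, not actual compiler output), with this
fold a call such as

    #include <immintrin.h>

    __m128
    foo (__m128 a, __m128 b)
    {
      /* imm 0x1b: dest = { a[3], a[2], b[1], b[0] }.  */
      return _mm_shuffle_ps (a, b, 0x1b);
    }

is represented in gimple as something like

    _1 = VEC_PERM_EXPR <a, b, { 3, 2, 5, 4 }>;

instead of an opaque builtin call, so later passes can combine or simplify
the permutation.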

gcc/
	PR target/98167
	PR target/43147
	* config/i386/i386.c (ix86_gimple_fold_builtin): Fold
	IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512,
	IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS,
	IX86_BUILTIN_SHUFPS256.
	(ix86_masked_all_ones): New function.

gcc/testsuite/
	* gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase.
	* gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase.
	* gcc.target/i386/pr43147.c: New test.
---
 gcc/config/i386/i386.c                        | 90 ++++++++++++++-----
 .../gcc.target/i386/avx512f-vshufpd-1.c       |  3 +-
 .../gcc.target/i386/avx512f-vshufps-1.c       |  3 +-
 gcc/testsuite/gcc.target/i386/pr43147.c       | 15 ++++
 4 files changed, 87 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr43147.c

Comments

Hongtao Liu Aug. 26, 2021, 5:05 a.m. UTC | #1
On Thu, Aug 26, 2021 at 12:57 PM liuhongt <hongtao.liu@intel.com> wrote:
>
>   This patch is a follow-up to [1]; it folds all shufps/shufpd builtins into gimple.
Of course this only applies to the non-masked and mask-all-ones versions.
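
For example (illustration only, not part of the patch), with -mavx512f
something like

    #include <immintrin.h>

    __m512d
    foo (__m512d src, __m512d a, __m512d b)
    {
      /* The mask covers all 8 elements, so the pass-through operand src
         is irrelevant and the call can still become a plain VEC_PERM_EXPR.  */
      return _mm512_mask_shuffle_pd (src, (__mmask8) -1, a, b, 56);
    }

is folded, while the same call with e.g. mask 2 is left as a builtin call,
because the merge with src cannot be expressed by VEC_PERM_EXPR alone.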

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ebec8668758..f3eed9f2426 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -17541,6 +17541,20 @@  ix86_vector_shift_count (tree arg1)
   return NULL_TREE;
 }
 
+/* Return true if ARG_MASK is constant and its low ELEMS bits are ones.  */
+static bool
+ix86_masked_all_ones (unsigned HOST_WIDE_INT elems, tree arg_mask)
+{
+  if (TREE_CODE (arg_mask) != INTEGER_CST)
+    return false;
+
+  unsigned HOST_WIDE_INT mask = TREE_INT_CST_LOW (arg_mask);
+  if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
+    return false;
+
+  return true;
+}
+
 static tree
 ix86_fold_builtin (tree fndecl, int n_args,
 		   tree *args, bool ignore ATTRIBUTE_UNUSED)
@@ -18026,6 +18040,7 @@  ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
   enum tree_code tcode;
   unsigned HOST_WIDE_INT count;
   bool is_vshift;
+  unsigned HOST_WIDE_INT elems;
 
   switch (fn_code)
     {
@@ -18349,17 +18364,11 @@  ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
       gcc_assert (n_args >= 2);
       arg0 = gimple_call_arg (stmt, 0);
       arg1 = gimple_call_arg (stmt, 1);
-      if (n_args > 2)
-	{
-	  /* This is masked shift.  Only optimize if the mask is all ones.  */
-	  tree argl = gimple_call_arg (stmt, n_args - 1);
-	  if (!tree_fits_uhwi_p (argl))
-	    break;
-	  unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
-	  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
-	  if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
-	    break;
-	}
+      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+      /* For masked shift, only optimize if the mask is all ones.  */
+      if (n_args > 2
+	  && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, n_args - 1)))
+	break;
       if (is_vshift)
 	{
 	  if (TREE_CODE (arg1) != VECTOR_CST)
@@ -18408,25 +18417,62 @@  ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 	}
       break;
 
+    case IX86_BUILTIN_SHUFPD512:
+    case IX86_BUILTIN_SHUFPS512:
     case IX86_BUILTIN_SHUFPD:
+    case IX86_BUILTIN_SHUFPD256:
+    case IX86_BUILTIN_SHUFPS:
+    case IX86_BUILTIN_SHUFPS256:
+      arg0 = gimple_call_arg (stmt, 0);
+      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+      /* This is a masked shuffle.  Only optimize if the mask is all ones.  */
+      if (n_args > 3
+	  && !ix86_masked_all_ones (elems,
+				    gimple_call_arg (stmt, n_args - 1)))
+	break;
       arg2 = gimple_call_arg (stmt, 2);
       if (TREE_CODE (arg2) == INTEGER_CST)
 	{
+	  unsigned HOST_WIDE_INT shuffle_mask = TREE_INT_CST_LOW (arg2);
+	  /* Check for valid imm, refer to gcc.target/i386/testimm-10.c.  */
+	  if (shuffle_mask > 255)
+	    return false;
+
+	  machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0)));
 	  location_t loc = gimple_location (stmt);
-	  unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
-	  arg0 = gimple_call_arg (stmt, 0);
+	  tree itype = (imode == E_DFmode
+			? long_long_integer_type_node : integer_type_node);
+	  tree vtype = build_vector_type (itype, elems);
+	  tree_vector_builder elts (vtype, elems, 1);
+
+
+	  /* Transform the integer shuffle_mask into a vector perm_mask for
+	     vec_perm_expr; refer to shufp[sd]256/512 in sse.md.  */
+	  for (unsigned i = 0; i != elems; i++)
+	    {
+	      unsigned sel_idx;
+	      /* Imm[1:0] (and Imm[3:2], Imm[5:4], Imm[7:6] when VL > 128)
+		 provide one 1-bit select control for each element of
+		 the destination.  */
+	      if (imode == E_DFmode)
+		sel_idx = (i & 1) * elems + (i & ~1)
+			  + ((shuffle_mask >> i) & 1);
+	      else
+		{
+		  /* Imm[7:0] (reused for every 128-bit lane when VL > 128)
+		     provides one 2-bit select control per element.  */
+		  unsigned j = i % 4;
+		  sel_idx = ((i >> 1) & 1) * elems + (i & ~3)
+			    + ((shuffle_mask >> 2 * j) & 3);
+		}
+	      elts.quick_push (build_int_cst (itype, sel_idx));
+	    }
+
+	  tree perm_mask = elts.build ();
 	  arg1 = gimple_call_arg (stmt, 1);
-	  tree itype = long_long_integer_type_node;
-	  tree vtype = build_vector_type (itype, 2); /* V2DI */
-	  tree_vector_builder elts (vtype, 2, 1);
-	  /* Ignore bits other than the lowest 2.  */
-	  elts.quick_push (build_int_cst (itype, imask & 1));
-	  imask >>= 1;
-	  elts.quick_push (build_int_cst (itype, 2 + (imask & 1)));
-	  tree omask = elts.build ();
 	  gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
 					   VEC_PERM_EXPR,
-					   arg0, arg1, omask);
+					   arg0, arg1, perm_mask);
 	  gimple_set_location (g, loc);
 	  gsi_replace (gsi, g, false);
 	  return true;
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
index d1ac01e1c88..8df5b9d4441 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
@@ -7,11 +7,12 @@ 
 #include <immintrin.h>
 
 __m512d x;
+__m512d y;
 
 void extern
 avx512f_test (void)
 {
-  x = _mm512_shuffle_pd (x, x, 56);
+  x = _mm512_shuffle_pd (x, y, 56);
   x = _mm512_mask_shuffle_pd (x, 2, x, x, 56);
   x = _mm512_maskz_shuffle_pd (2, x, x, 56);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
index 07a63fca3ff..378ae4b7101 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
@@ -7,11 +7,12 @@ 
 #include <immintrin.h>
 
 __m512 x;
+__m512 y;
 
 void extern
 avx512f_test (void)
 {
-  x = _mm512_shuffle_ps (x, x, 56);
+  x = _mm512_shuffle_ps (x, y, 56);
   x = _mm512_mask_shuffle_ps (x, 2, x, x, 56);
   x = _mm512_maskz_shuffle_ps (2, x, x, 56);
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr43147.c b/gcc/testsuite/gcc.target/i386/pr43147.c
new file mode 100644
index 00000000000..3c30f917c06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr43147.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler "movaps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+#include <x86intrin.h>
+
+__m128
+foo (void)
+{
+  __m128 m = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
+  m = _mm_shuffle_ps(m, m, 0xC9);
+  m = _mm_shuffle_ps(m, m, 0x2D);
+  return m;
+}
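
For reference, a hand-written sketch (not compiler output) of what the new
fold produces for the test above, before constant folding collapses it:

    /* imm 0xC9 -> perm { 1, 2, 4, 7 }, imm 0x2D -> perm { 1, 3, 6, 4 }.  */
    m_2 = VEC_PERM_EXPR <m_1, m_1, { 1, 2, 4, 7 }>;
    m_3 = VEC_PERM_EXPR <m_2, m_2, { 1, 3, 6, 4 }>;

Since m_1 is a constant vector, both permutations fold away at compile time
and the function should reduce to a single constant load, which is what the
movaps / no-shufps scan-assembler directives verify.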