Support AVX512F masked gather loads (PR tree-optimization/88464)

Message ID 20181212224310.GP12380@tucnak
State New
Series Support AVX512F masked gather loads (PR tree-optimization/88464)

Commit Message

Jakub Jelinek Dec. 12, 2018, 10:43 p.m. UTC
Hi!

We support the AVX2 gather loads (128-bit or 256-bit, using masks in
vector registers) both conditional and unconditional, but the AVX512F
gather loads only unconditional.  The problem was that tree-vect-stmts.c
had no code to deal with the integral masktype argument those builtins
have, where we need to compute the mask as a vector boolean first and
then VIEW_CONVERT_EXPR it to the integral type.

Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
trunk?
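
For reference, a hand-written intrinsics sketch (editorial illustration,
not part of the patch) of what the vectorized loop in f1 from the new
testcase below conceptually maps to.  The point is that the comparison
yields an integral __mmask8, i.e. the integer masktype the vectorizer
used to reject, rather than a mask held in a vector register as with
the AVX2 gathers:

#include <immintrin.h>

/* Rough analogue of the vectorized f1 loop body; the scalar tail loop
   is omitted for brevity.  */
void
f1_sketch (double *__restrict a, const double *__restrict b,
	   const int *__restrict c, int n)
{
  for (int i = 0; i + 8 <= n; i += 8)
    {
      __m512d va = _mm512_loadu_pd (&a[i]);
      /* The compare produces an integral mask, not a vector mask.  */
      __mmask8 m = _mm512_cmp_pd_mask (va, _mm512_set1_pd (10.0),
				       _CMP_GT_OQ);
      __m256i vidx = _mm256_loadu_si256 ((const __m256i *) &c[i]);
      /* Masked gather: inactive lanes keep the old a[i] values.  */
      __m512d vb = _mm512_mask_i32gather_pd (va, m, vidx, b, 8);
      _mm512_mask_storeu_pd (&a[i], m, vb);
    }
}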

2018-12-12  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/88464
	* tree-vect-stmts.c (vect_build_gather_load_calls): Handle INTEGER_TYPE
	masktype if mask is non-NULL.
	(vectorizable_load): Don't reject masked gather loads if masktype
	in the decl is INTEGER_TYPE.

	* gcc.target/i386/avx512f-pr88462-1.c: New test.
	* gcc.target/i386/avx512f-pr88462-2.c: New test.


	Jakub

Comments

Richard Biener Dec. 13, 2018, 4:09 p.m. UTC | #1
On December 12, 2018 11:43:10 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote:
>Hi!
>
>We support the AVX2 gather loads (128-bit or 256-bit, using masks in
>vector registers) both conditional and unconditional, but the AVX512F
>gather loads only unconditional.  The problem was that tree-vect-stmts.c
>had no code to deal with the integral masktype argument those builtins
>have, where we need to compute the mask as a vector boolean first and
>then VIEW_CONVERT_EXPR it to the integral type.
>
>Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux,
>ok for trunk?

OK. 

Is there any chance you could look at replacing the i386 code to work with the new IFN style used by AArch64?

Jakub Jelinek Dec. 13, 2018, 4:16 p.m. UTC | #2
On Thu, Dec 13, 2018 at 05:09:17PM +0100, Richard Biener wrote:
> Is there any chance you could look at replacing the i386 code to work with
> the new IFN style used by AArch64?

Maybe for GCC 10; I'm afraid it is a lot of work, mainly due to the
weirdness of the x86 mixed index vs. data sized types, which would be
non-fun to represent in the optabs.
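
To illustrate that (an editorial sketch, not from the thread): a single
AVX512F gather can pair a 512-bit index vector with a 256-bit data
vector, so the operation is not described by one vector mode:

#include <immintrin.h>

/* Eight 64-bit indices (512-bit register) gathering eight 32-bit
   floats (256-bit register): index and data use different modes.  */
__m256
gather_mixed (const float *base, __m512i idx, __mmask8 m)
{
  return _mm512_mask_i64gather_ps (_mm256_setzero_ps (), m, idx, base, 4);
}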

	Jakub

Patch

--- gcc/tree-vect-stmts.c.jj	2018-11-20 21:39:06.983478940 +0100
+++ gcc/tree-vect-stmts.c	2018-12-12 20:05:42.980019685 +0100
@@ -2647,8 +2647,13 @@  vect_build_gather_load_calls (stmt_vec_i
   tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
   tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
   tree scaletype = TREE_VALUE (arglist);
+  tree real_masktype = masktype;
   gcc_checking_assert (types_compatible_p (srctype, rettype)
-		       && (!mask || types_compatible_p (srctype, masktype)));
+		       && (!mask
+			   || TREE_CODE (masktype) == INTEGER_TYPE
+			   || types_compatible_p (srctype, masktype)));
+  if (mask && TREE_CODE (masktype) == INTEGER_TYPE)
+    masktype = build_same_sized_truth_vector_type (srctype);
 
   tree perm_mask = NULL_TREE;
   tree mask_perm_mask = NULL_TREE;
@@ -2763,9 +2768,9 @@  vect_build_gather_load_calls (stmt_vec_i
 	      mask_op = vec_mask;
 	      if (!useless_type_conversion_p (masktype, TREE_TYPE (vec_mask)))
 		{
-		  gcc_assert
-		    (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op)),
-			       TYPE_VECTOR_SUBPARTS (masktype)));
+		  poly_uint64 sub1 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op));
+		  poly_uint64 sub2 = TYPE_VECTOR_SUBPARTS (masktype);
+		  gcc_assert (known_eq (sub1, sub2));
 		  var = vect_get_new_ssa_name (masktype, vect_simple_var);
 		  mask_op = build1 (VIEW_CONVERT_EXPR, masktype, mask_op);
 		  gassign *new_stmt
@@ -2777,8 +2782,33 @@  vect_build_gather_load_calls (stmt_vec_i
 	  src_op = mask_op;
 	}
 
+      tree mask_arg = mask_op;
+      if (masktype != real_masktype)
+	{
+	  tree utype;
+	  if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype))
+	    utype = real_masktype;
+	  else
+	    utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1);
+	  var = vect_get_new_ssa_name (utype, vect_scalar_var);
+	  mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op);
+	  gassign *new_stmt
+	    = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
+	  vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+	  mask_arg = var;
+	  if (!useless_type_conversion_p (real_masktype, utype))
+	    {
+	      gcc_assert (TYPE_PRECISION (utype)
+			  <= TYPE_PRECISION (real_masktype));
+	      var = vect_get_new_ssa_name (real_masktype, vect_scalar_var);
+	      new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
+	      vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+	      mask_arg = var;
+	    }
+	  src_op = build_zero_cst (srctype);
+	}
       gcall *new_call = gimple_build_call (gs_info->decl, 5, src_op, ptr, op,
-					   mask_op, scale);
+					   mask_arg, scale);
 
       stmt_vec_info new_stmt_info;
       if (!useless_type_conversion_p (vectype, rettype))
@@ -7555,20 +7585,6 @@  vectorizable_load (stmt_vec_info stmt_in
 					     TYPE_MODE (mask_vectype), true))
 	    return false;
 	}
-      else if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl)
-	{
-	  tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
-	  tree masktype
-	    = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
-	  if (TREE_CODE (masktype) == INTEGER_TYPE)
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "masked gather with integer mask not"
-				 " supported.");
-	      return false;
-	    }
-	}
       else if (memory_access_type != VMAT_LOAD_STORE_LANES
 	       && memory_access_type != VMAT_GATHER_SCATTER)
 	{
--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj	2018-12-12 20:39:23.207028399 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c	2018-12-12 20:44:22.195148359 +0100
@@ -0,0 +1,35 @@ 
+/* PR tree-optimization/88462 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 "vect" } } */
+
+__attribute__((noipa)) void
+f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (a[i] > 10.0)
+      a[i] = b[c[i]];
+}
+
+__attribute__((noipa)) void
+f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (a[i] > 10.0)
+      a[i] = b[c[i]];
+}
+
+__attribute__((noipa)) void
+f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (a[i] > 10.0f)
+      a[i] = b[c[i]];
+}
--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj	2018-12-12 20:58:36.540211660 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c	2018-12-12 21:00:26.182423054 +0100
@@ -0,0 +1,51 @@ 
+/* PR tree-optimization/88462 */
+/* { dg-do run { target { avx512f } } } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-pr88462-1.c"
+
+static void
+avx512f_test (void)
+{
+  double a[1024], b[1024];
+  float c[1024], f[1024];
+  int d[1024];
+  long e[1024];
+  int i;
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      a[i] = (i % 3) != 0 ? 15.0 : -5.0;
+      b[i] = 2 * i;
+      d[i] = (i % 3) ? 1023 - i : __INT_MAX__;
+    }
+  f1 (a, b, d, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
+	abort ();
+      a[i] = (i % 3) != 1 ? 15.0 : -5.0;
+      b[i] = 3 * i;
+      e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
+    }
+  f2 (a, b, e, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0))
+	abort ();
+      c[i] = (i % 3) != 2 ? 15.0f : -5.0f;
+      d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
+      f[i] = 4 * i;
+    }
+  f3 (c, f, d, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f))
+	abort ();
+    }
+}