diff mbox

Fix PR78007

Message ID alpine.LSU.2.11.1611090910190.5294@t29.fhfr.qr
State New
Headers show

Commit Message

Richard Biener Nov. 9, 2016, 8:14 a.m. UTC
The following implements vectorization of bswap via VEC_PERM_EXPR
on the corresponding QImode vector.

ARM already has backend handling via the builtin_vectorized_function
target hook and thus there were already testcases available.  The new
generic path doesn't end up working for vect-bswap16.c because we have a
promoted argument to __builtin_bswap16, which confuses vectorization.

Eventually the testcases should also succeed on vect_perm_byte
targets, but I have no way to verify that.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2016-11-09  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/78007
	* tree-vect-stmts.c (vectorizable_bswap): New function.
	(vectorizable_call): Call vectorizable_bswap for
	BUILT_IN_BSWAP{16,32,64} if arguments are not promoted.

	* gcc.dg/vect/vect-bswap32.c: Adjust.
	* gcc.dg/vect/vect-bswap64.c: Likewise.
diff mbox

Patch

Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 241959)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -2432,6 +2432,116 @@  vectorizable_mask_load_store (gimple *st
   return true;
 }
 
+/* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}.  */
+
+static bool
+vectorizable_bswap (gimple *stmt, gimple_stmt_iterator *gsi,
+		    gimple **vec_stmt, slp_tree slp_node,
+		    tree vectype_in, enum vect_def_type *dt)
+{
+  tree op, vectype;
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  unsigned ncopies, nunits;
+
+  op = gimple_call_arg (stmt, 0);
+  vectype = STMT_VINFO_VECTYPE (stmt_info);
+  nunits = TYPE_VECTOR_SUBPARTS (vectype);
+
+  /* Multiple types in SLP are handled by creating the appropriate number of
+     vectorized stmts for each SLP node.  Hence, NCOPIES is always 1 in
+     case of SLP.  */
+  if (slp_node)
+    ncopies = 1;
+  else
+    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
+
+  gcc_assert (ncopies >= 1);
+
+  tree char_vectype = get_same_sized_vectype (char_type_node, vectype_in);
+  if (! char_vectype)
+    return false;
+
+  unsigned char *elts
+    = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (char_vectype));
+  unsigned char *elt = elts;
+  unsigned word_bytes = TYPE_VECTOR_SUBPARTS (char_vectype) / nunits;
+  for (unsigned i = 0; i < nunits; ++i)
+    for (unsigned j = 0; j < word_bytes; ++j)
+      *elt++ = (i + 1) * word_bytes - j - 1;
+
+  if (! can_vec_perm_p (TYPE_MODE (char_vectype), false, elts))
+    return false;
+
+  if (! vec_stmt)
+    {
+      STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location, "=== vectorizable_bswap ==="
+                         "\n");
+      if (! PURE_SLP_STMT (stmt_info))
+	{
+	  add_stmt_cost (stmt_info->vinfo->target_cost_data,
+			 1, vector_stmt, stmt_info, 0, vect_prologue);
+	  add_stmt_cost (stmt_info->vinfo->target_cost_data,
+			 ncopies, vec_perm, stmt_info, 0, vect_body);
+	}
+      return true;
+    }
+
+  tree *telts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (char_vectype));
+  for (unsigned i = 0; i < TYPE_VECTOR_SUBPARTS (char_vectype); ++i)
+    telts[i] = build_int_cst (char_type_node, elts[i]);
+  tree bswap_vconst = build_vector (char_vectype, telts);
+
+  /* Transform.  */
+  vec<tree> vec_oprnds = vNULL;
+  gimple *new_stmt = NULL;
+  stmt_vec_info prev_stmt_info = NULL;
+  for (unsigned j = 0; j < ncopies; j++)
+    {
+      /* Handle uses.  */
+      if (j == 0)
+        vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node, -1);
+      else
+        vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds, NULL);
+
+      /* Arguments are ready.  Create the new vector stmt.  */
+      unsigned i;
+      tree vop;
+      FOR_EACH_VEC_ELT (vec_oprnds, i, vop)
+       {
+	 tree tem = make_ssa_name (char_vectype);
+	 new_stmt = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR,
+						      char_vectype, vop));
+	 vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	 tree tem2 = make_ssa_name (char_vectype);
+	 new_stmt = gimple_build_assign (tem2, VEC_PERM_EXPR,
+					 tem, tem, bswap_vconst);
+	 vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	 tem = make_ssa_name (vectype);
+	 new_stmt = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR,
+						      vectype, tem2));
+	 vect_finish_stmt_generation (stmt, new_stmt, gsi);
+         if (slp_node)
+           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+       }
+
+      if (slp_node)
+        continue;
+
+      if (j == 0)
+        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+      else
+        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
+      prev_stmt_info = vinfo_for_stmt (new_stmt);
+    }
+
+  vec_oprnds.release ();
+  return true;
+}
+
 /* Return true if vector types VECTYPE_IN and VECTYPE_OUT have
    integer elements and if we can narrow VECTYPE_IN to VECTYPE_OUT
    in a single step.  On success, store the binary pack code in
@@ -2658,6 +2768,12 @@  vectorizable_call (gimple *gs, gimple_st
 	     { 0, 1, 2, ... vf - 1 } vector.  */
 	  gcc_assert (nargs == 0);
 	}
+      else if (modifier == NONE
+	       && (gimple_call_builtin_p (stmt, BUILT_IN_BSWAP16)
+		   || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP32)
+		   || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP64)))
+	return vectorizable_bswap (stmt, gsi, vec_stmt, slp_node,
+				   vectype_in, dt);
       else
 	{
 	  if (dump_enabled_p ())
Index: gcc/testsuite/gcc.dg/vect/vect-bswap32.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/vect-bswap32.c	(revision 241959)
+++ gcc/testsuite/gcc.dg/vect/vect-bswap32.c	(working copy)
@@ -1,4 +1,4 @@ 
-/* { dg-require-effective-target vect_bswap } */
+/* { dg-additional-options "-msse4" { target sse4_runtime } } */
 
 #include "tree-vect.h"
 
@@ -42,4 +42,4 @@  main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_bswap || sse4_runtime } } } } */
Index: gcc/testsuite/gcc.dg/vect/vect-bswap64.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/vect-bswap64.c	(revision 241959)
+++ gcc/testsuite/gcc.dg/vect/vect-bswap64.c	(working copy)
@@ -1,4 +1,4 @@ 
-/* { dg-require-effective-target vect_bswap } */
+/* { dg-additional-options "-msse4" { target sse4_runtime } } */
 
 #include "tree-vect.h"
 
@@ -42,4 +42,4 @@  main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_bswap || sse4_runtime } } } } */