[5/8] vect: Use inbranch simdclones in masked loops

Message ID d96af71e-d8e8-0bff-d502-5e54768ac774@arm.com
State New
Series [1/8] parloops: Copy target and optimizations when creating a function clone

Commit Message

Andre Vieira (lists) Aug. 30, 2023, 9:13 a.m. UTC
This patch enables the compiler to use inbranch simdclones when 
generating masked loops in autovectorization.

gcc/ChangeLog:

	* omp-simd-clone.cc (simd_clone_adjust_argument_types): Make function
	compatible with mask parameters in clone.
	* tree-vect-stmts.cc (vect_convert): New helper function.
	(vect_build_all_ones_mask): Allow vector boolean typed masks.
	(vectorizable_simd_clone_call): Enable the use of masked clones in
	fully masked loops.
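
As an illustration (a hypothetical kernel, not taken from the patch or its
testsuite), a loop like the one below, when vectorized with partial vectors
(e.g. a fully masked SVE-style loop), can now call the inbranch clone of foo
and forward the loop mask as the clone's mask argument; in a non-masked loop
an all-ones mask is constructed instead, at a small cost penalty during
clone selection:

#pragma omp declare simd inbranch
extern int foo (int x);

void
bar (int *restrict out, int *restrict in, int n)
{
  /* In a fully masked vector loop the loop mask becomes the clone's
     mask parameter; otherwise an all-ones mask is built.  */
  for (int i = 0; i < n; i++)
    out[i] = foo (in[i]);
}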

Comments

Andre Vieira (lists) Oct. 18, 2023, 2:41 p.m. UTC | #1
Rebased, needs review.

On 30/08/2023 10:13, Andre Vieira (lists) via Gcc-patches wrote:
> This patch enables the compiler to use inbranch simdclones when 
> generating masked loops in autovectorization.
> 
> gcc/ChangeLog:
> 
>      * omp-simd-clone.cc (simd_clone_adjust_argument_types): Make function
>      compatible with mask parameters in clone.
>      * tree-vect-stmts.cc (vect_convert): New helper function.
>      (vect_build_all_ones_mask): Allow vector boolean typed masks.
>      (vectorizable_simd_clone_call): Enable the use of masked clones in
>      fully masked loops.
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index a42643400ddcf10961633448b49d4caafb999f12..ef0b9b48c7212900023bc0eaebca5e1f9389db77 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -807,8 +807,14 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
     {
       ipa_adjusted_param adj;
       memset (&adj, 0, sizeof (adj));
-      tree parm = args[i];
-      tree parm_type = node->definition ? TREE_TYPE (parm) : parm;
+      tree parm = NULL_TREE;
+      tree parm_type = NULL_TREE;
+      if(i < args.length())
+	{
+	  parm = args[i];
+	  parm_type = node->definition ? TREE_TYPE (parm) : parm;
+	}
+
       adj.base_index = i;
       adj.prev_clone_index = i;
 
@@ -1547,7 +1553,7 @@ simd_clone_adjust (struct cgraph_node *node)
 	  mask = gimple_assign_lhs (g);
 	  g = gimple_build_assign (make_ssa_name (TREE_TYPE (mask)),
 				   BIT_AND_EXPR, mask,
-				   build_int_cst (TREE_TYPE (mask), 1));
+				   build_one_cst (TREE_TYPE (mask)));
 	  gsi_insert_after (&gsi, g, GSI_CONTINUE_LINKING);
 	  mask = gimple_assign_lhs (g);
 	}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 731acc76350cae39c899a866584068cff247183a..6e2c70c1d3970af652c1e50e41b144162884bf24 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1594,6 +1594,20 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
     }
 }
 
+/* Return SSA name of the result of the conversion of OPERAND into type TYPE.
+   The conversion statement is inserted at GSI.  */
+
+static tree
+vect_convert (vec_info *vinfo, stmt_vec_info stmt_info, tree type, tree operand,
+	      gimple_stmt_iterator *gsi)
+{
+  operand = build1 (VIEW_CONVERT_EXPR, type, operand);
+  gassign *new_stmt = gimple_build_assign (make_ssa_name (type),
+					   operand);
+  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+  return gimple_get_lhs (new_stmt);
+}
+
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
    form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
    that needs to be applied to all loads and stores in a vectorized loop.
@@ -2547,7 +2561,8 @@ vect_build_all_ones_mask (vec_info *vinfo,
 {
   if (TREE_CODE (masktype) == INTEGER_TYPE)
     return build_int_cst (masktype, -1);
-  else if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+  else if (VECTOR_BOOLEAN_TYPE_P (masktype)
+	   || TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
     {
       tree mask = build_int_cst (TREE_TYPE (masktype), -1);
       mask = build_vector_from_val (masktype, mask);
@@ -4156,7 +4171,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
   size_t i, nargs;
   tree lhs, rtype, ratype;
   vec<constructor_elt, va_gc> *ret_ctor_elts = NULL;
-  int arg_offset = 0;
+  int masked_call_offset = 0;
 
   /* Is STMT a vectorizable call?   */
   gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
@@ -4171,7 +4186,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
       fndecl = TREE_OPERAND (fndecl, 0);
       gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
-      arg_offset = 1;
+      masked_call_offset = 1;
     }
   if (fndecl == NULL_TREE)
     return false;
@@ -4199,7 +4214,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
     return false;
 
   /* Process function arguments.  */
-  nargs = gimple_call_num_args (stmt) - arg_offset;
+  nargs = gimple_call_num_args (stmt) - masked_call_offset;
 
   /* Bail out if the function has zero arguments.  */
   if (nargs == 0)
@@ -4221,7 +4236,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       thisarginfo.op = NULL_TREE;
       thisarginfo.simd_lane_linear = false;
 
-      int op_no = i + arg_offset;
+      int op_no = i + masked_call_offset;
       if (slp_node)
 	op_no = vect_slp_child_index_for_operand (stmt, op_no);
       if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
@@ -4303,16 +4318,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       arginfo.quick_push (thisarginfo);
     }
 
-  if (loop_vinfo
-      && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "not considering SIMD clones; not yet supported"
-			 " for variable-width vectors.\n");
-      return false;
-    }
-
   poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
   unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 1;
   unsigned int badness = 0;
@@ -4325,9 +4330,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       {
 	unsigned int this_badness = 0;
 	unsigned int num_calls;
-	if (!constant_multiple_p (vf * group_size,
-				  n->simdclone->simdlen, &num_calls)
-	    || n->simdclone->nargs != nargs)
+	if (!constant_multiple_p (vf * group_size, n->simdclone->simdlen,
+				  &num_calls)
+	    || (!n->simdclone->inbranch && (masked_call_offset > 0))
+	    || nargs != n->simdclone->nargs)
 	  continue;
 	if (num_calls != 1)
 	  this_badness += exact_log2 (num_calls) * 4096;
@@ -4344,7 +4350,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	      case SIMD_CLONE_ARG_TYPE_VECTOR:
 		if (!useless_type_conversion_p
 			(n->simdclone->args[i].orig_type,
-			 TREE_TYPE (gimple_call_arg (stmt, i + arg_offset))))
+			 TREE_TYPE (gimple_call_arg (stmt,
+						     i + masked_call_offset))))
 		  i = -1;
 		else if (arginfo[i].dt == vect_constant_def
 			 || arginfo[i].dt == vect_external_def
@@ -4392,6 +4399,17 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	  }
 	if (i == (size_t) -1)
 	  continue;
+	if (masked_call_offset == 0
+	    && n->simdclone->inbranch
+	    && n->simdclone->nargs > nargs)
+	  {
+	    gcc_assert (n->simdclone->args[n->simdclone->nargs - 1].arg_type ==
+			SIMD_CLONE_ARG_TYPE_MASK);
+	    /* Penalize using a masked SIMD clone in a non-masked loop, that is
+	       not in a branch, as we'd have to construct an all-true mask.  */
+	    if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+	      this_badness += 64;
+	  }
 	if (bestn == NULL || this_badness < badness)
 	  {
 	    bestn = n;
@@ -4414,7 +4432,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	   || arginfo[i].dt == vect_external_def)
 	  && bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
 	{
-	  tree arg_type = TREE_TYPE (gimple_call_arg (stmt, i + arg_offset));
+	  tree arg_type = TREE_TYPE (gimple_call_arg (stmt,
+						      i + masked_call_offset));
 	  arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type,
 							    slp_node);
 	  if (arginfo[i].vectype == NULL
@@ -4523,22 +4542,37 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       if (gimple_vuse (stmt) && slp_node)
 	vinfo->any_known_not_updated_vssa = true;
       simd_clone_info.safe_push (bestn->decl);
-      for (i = 0; i < nargs; i++)
-	if ((bestn->simdclone->args[i].arg_type
-	     == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
-	    || (bestn->simdclone->args[i].arg_type
-		== SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP))
-	  {
-	    simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
-	    simd_clone_info.safe_push (arginfo[i].op);
-	    tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
-		       ? size_type_node : TREE_TYPE (arginfo[i].op);
-	    tree ls = build_int_cst (lst, arginfo[i].linear_step);
-	    simd_clone_info.safe_push (ls);
-	    tree sll = arginfo[i].simd_lane_linear
-		       ? boolean_true_node : boolean_false_node;
-	    simd_clone_info.safe_push (sll);
-	  }
+      for (i = 0; i < bestn->simdclone->nargs; i++)
+	{
+	  switch (bestn->simdclone->args[i].arg_type)
+	    {
+	    default:
+	      continue;
+	    case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
+	    case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
+	      {
+		auto &clone_info = STMT_VINFO_SIMD_CLONE_INFO (stmt_info);
+		clone_info.safe_grow_cleared (i * 3 + 1, true);
+		clone_info.safe_push (arginfo[i].op);
+		tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
+			   ? size_type_node : TREE_TYPE (arginfo[i].op);
+		tree ls = build_int_cst (lst, arginfo[i].linear_step);
+		clone_info.safe_push (ls);
+		tree sll = arginfo[i].simd_lane_linear
+			   ? boolean_true_node : boolean_false_node;
+		clone_info.safe_push (sll);
+	      }
+	      break;
+	    case SIMD_CLONE_ARG_TYPE_MASK:
+	      if (loop_vinfo
+		  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo),
+				       ncopies, vectype, op);
+
+	      break;
+	    }
+	}
 
       if (!bestn->simdclone->inbranch && loop_vinfo)
 	{
@@ -4590,6 +4624,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
     vec_oprnds.safe_grow_cleared (nargs, true);
   for (j = 0; j < ncopies; ++j)
     {
+      poly_uint64 callee_nelements;
+      poly_uint64 caller_nelements;
       /* Build argument list for the vectorized call.  */
       if (j == 0)
 	vargs.create (nargs);
@@ -4600,8 +4636,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	{
 	  unsigned int k, l, m, o;
 	  tree atype;
-	  poly_uint64 callee_nelements, caller_nelements;
-	  op = gimple_call_arg (stmt, i + arg_offset);
+	  op = gimple_call_arg (stmt, i + masked_call_offset);
 	  switch (bestn->simdclone->args[i].arg_type)
 	    {
 	    case SIMD_CLONE_ARG_TYPE_VECTOR:
@@ -4680,16 +4715,9 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 		      if (k == 1)
 			if (!useless_type_conversion_p (TREE_TYPE (vec_oprnd0),
 						       atype))
-			  {
-			    vec_oprnd0
-			      = build1 (VIEW_CONVERT_EXPR, atype, vec_oprnd0);
-			    gassign *new_stmt
-			      = gimple_build_assign (make_ssa_name (atype),
-						     vec_oprnd0);
-			    vect_finish_stmt_generation (vinfo, stmt_info,
-							 new_stmt, gsi);
-			    vargs.safe_push (gimple_assign_lhs (new_stmt));
-			  }
+			  vargs.safe_push (vect_convert (vinfo, stmt_info,
+							 atype, vec_oprnd0,
+							 gsi));
 			else
 			  vargs.safe_push (vec_oprnd0);
 		      else
@@ -4738,6 +4766,24 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 			      vec_oprnds_i[i] = 0;
 			    }
 			  vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
+			  if (loop_vinfo
+			      && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+			    {
+			      vec_loop_masks *loop_masks
+				= &LOOP_VINFO_MASKS (loop_vinfo);
+			      tree loop_mask
+				= vect_get_loop_mask (loop_vinfo, gsi,
+						      loop_masks, ncopies,
+						      vectype, j);
+			      vec_oprnd0
+				= prepare_vec_mask (loop_vinfo,
+						    TREE_TYPE (loop_mask),
+						    loop_mask, vec_oprnd0,
+						    gsi);
+			      loop_vinfo->vec_cond_masked_set.add ({ vec_oprnd0,
+								     loop_mask });
+
+			    }
 			  vec_oprnd0
 			    = build3 (VEC_COND_EXPR, atype, vec_oprnd0,
 				      build_vector_from_val (atype, one),
@@ -4901,6 +4947,64 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	    }
 	}
 
+      if (masked_call_offset == 0
+	  && bestn->simdclone->inbranch
+	  && bestn->simdclone->nargs > nargs)
+	{
+	  unsigned long m, o;
+	  size_t mask_i = bestn->simdclone->nargs - 1;
+	  tree mask;
+	  gcc_assert (bestn->simdclone->args[mask_i].arg_type ==
+		      SIMD_CLONE_ARG_TYPE_MASK);
+
+	  tree masktype = bestn->simdclone->args[mask_i].vector_type;
+	  callee_nelements = TYPE_VECTOR_SUBPARTS (masktype);
+	  o = vector_unroll_factor (nunits, callee_nelements);
+	  for (m = j * o; m < (j + 1) * o; m++)
+	    {
+	      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+		{
+		  vec_loop_masks *loop_masks = &LOOP_VINFO_MASKS (loop_vinfo);
+		  mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+					     ncopies, vectype, j);
+		}
+	      else
+		mask = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
+
+	      if (!useless_type_conversion_p (TREE_TYPE (mask), masktype))
+		{
+		  gassign *new_stmt;
+		  if (bestn->simdclone->mask_mode != VOIDmode)
+		    {
+		      /* This means we are dealing with integer mask modes.
+			 First convert to an integer type with the same size as
+			 the current vector type.  */
+		      unsigned HOST_WIDE_INT intermediate_size
+			= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (mask)));
+		      tree mid_int_type =
+			build_nonstandard_integer_type (intermediate_size, 1);
+		      mask = build1 (VIEW_CONVERT_EXPR, mid_int_type, mask);
+		      new_stmt
+			= gimple_build_assign (make_ssa_name (mid_int_type),
+					       mask);
+		      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+		      /* Then zero-extend to the mask mode.  */
+		      mask = fold_build1 (NOP_EXPR, masktype,
+					  gimple_get_lhs (new_stmt));
+		    }
+		  else
+		    mask = build1 (VIEW_CONVERT_EXPR, masktype, mask);
+
+		  new_stmt = gimple_build_assign (make_ssa_name (masktype),
+						  mask);
+		  vect_finish_stmt_generation (vinfo, stmt_info,
+					       new_stmt, gsi);
+		  mask = gimple_assign_lhs (new_stmt);
+		}
+	      vargs.safe_push (mask);
+	    }
+	}
+
       gcall *new_call = gimple_build_call_vec (fndecl, vargs);
       if (vec_dest)
 	{
Richard Biener Oct. 19, 2023, 12:17 p.m. UTC | #2
On Wed, 18 Oct 2023, Andre Vieira (lists) wrote:

> Rebased, needs review.

+      tree parm_type = NULL_TREE;
+      if(i < args.length())
+       {

space before (
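
In GNU style that would presumably read:

  if (i < args.length ())
    {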

+/* Return SSA name of the result of the conversion of OPERAND into type TYPE.
+   The conversion statement is inserted at GSI.  */
+
+static tree
+vect_convert (vec_info *vinfo, stmt_vec_info stmt_info, tree type, tree operand,
+	      gimple_stmt_iterator *gsi)
+{
+  operand = build1 (VIEW_CONVERT_EXPR, type, operand);
+  gassign *new_stmt = gimple_build_assign (make_ssa_name (type),
+					   operand);

I don't like this much, it's got one use in your patch only.  Please
leave this abstraction out.
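
(That is, presumably keeping the open-coded sequence the patch currently
folds into vect_convert, roughly:

  vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, atype, vec_oprnd0);
  gassign *new_stmt
    = gimple_build_assign (make_ssa_name (atype), vec_oprnd0);
  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
  vargs.safe_push (gimple_assign_lhs (new_stmt));

at its single use site.)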

OK with the above two changes.

Thanks,
Richard.

> On 30/08/2023 10:13, Andre Vieira (lists) via Gcc-patches wrote:
> > This patch enables the compiler to use inbranch simdclones when generating
> > masked loops in autovectorization.
> > 
> > gcc/ChangeLog:
> > 
> >      * omp-simd-clone.cc (simd_clone_adjust_argument_types): Make function
> >      compatible with mask parameters in clone.
> >      * tree-vect-stmts.cc (vect_convert): New helper function.
> >      (vect_build_all_ones_mask): Allow vector boolean typed masks.
> >      (vectorizable_simd_clone_call): Enable the use of masked clones in
> >      fully masked loops.
>

Patch

diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index a42643400ddcf10961633448b49d4caafb999f12..ef0b9b48c7212900023bc0eaebca5e1f9389db77 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -807,8 +807,14 @@  simd_clone_adjust_argument_types (struct cgraph_node *node)
     {
       ipa_adjusted_param adj;
       memset (&adj, 0, sizeof (adj));
-      tree parm = args[i];
-      tree parm_type = node->definition ? TREE_TYPE (parm) : parm;
+      tree parm = NULL_TREE;
+      tree parm_type = NULL_TREE;
+      if(i < args.length())
+	{
+	  parm = args[i];
+	  parm_type = node->definition ? TREE_TYPE (parm) : parm;
+	}
+
       adj.base_index = i;
       adj.prev_clone_index = i;
 
@@ -1547,7 +1553,7 @@  simd_clone_adjust (struct cgraph_node *node)
 	  mask = gimple_assign_lhs (g);
 	  g = gimple_build_assign (make_ssa_name (TREE_TYPE (mask)),
 				   BIT_AND_EXPR, mask,
-				   build_int_cst (TREE_TYPE (mask), 1));
+				   build_one_cst (TREE_TYPE (mask)));
 	  gsi_insert_after (&gsi, g, GSI_CONTINUE_LINKING);
 	  mask = gimple_assign_lhs (g);
 	}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 664c3b5f7ca48fdb49383fb8a97f407465574479..7217f36a250d549b955c874d7c7644d94982b0b5 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1723,6 +1723,20 @@  check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
     }
 }
 
+/* Return SSA name of the result of the conversion of OPERAND into type TYPE.
+   The conversion statement is inserted at GSI.  */
+
+static tree
+vect_convert (vec_info *vinfo, stmt_vec_info stmt_info, tree type, tree operand,
+	      gimple_stmt_iterator *gsi)
+{
+  operand = build1 (VIEW_CONVERT_EXPR, type, operand);
+  gassign *new_stmt = gimple_build_assign (make_ssa_name (type),
+					   operand);
+  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+  return gimple_get_lhs (new_stmt);
+}
+
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
    form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
    that needs to be applied to all loads and stores in a vectorized loop.
@@ -2666,7 +2680,8 @@  vect_build_all_ones_mask (vec_info *vinfo,
 {
   if (TREE_CODE (masktype) == INTEGER_TYPE)
     return build_int_cst (masktype, -1);
-  else if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+  else if (VECTOR_BOOLEAN_TYPE_P (masktype)
+	   || TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
     {
       tree mask = build_int_cst (TREE_TYPE (masktype), -1);
       mask = build_vector_from_val (masktype, mask);
@@ -4018,7 +4033,7 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
   size_t i, nargs;
   tree lhs, rtype, ratype;
   vec<constructor_elt, va_gc> *ret_ctor_elts = NULL;
-  int arg_offset = 0;
+  int masked_call_offset = 0;
 
   /* Is STMT a vectorizable call?   */
   gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
@@ -4033,7 +4048,7 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
       fndecl = TREE_OPERAND (fndecl, 0);
       gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
-      arg_offset = 1;
+      masked_call_offset = 1;
     }
   if (fndecl == NULL_TREE)
     return false;
@@ -4065,7 +4080,7 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
     return false;
 
   /* Process function arguments.  */
-  nargs = gimple_call_num_args (stmt) - arg_offset;
+  nargs = gimple_call_num_args (stmt) - masked_call_offset;
 
   /* Bail out if the function has zero arguments.  */
   if (nargs == 0)
@@ -4083,7 +4098,7 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       thisarginfo.op = NULL_TREE;
       thisarginfo.simd_lane_linear = false;
 
-      op = gimple_call_arg (stmt, i + arg_offset);
+      op = gimple_call_arg (stmt, i + masked_call_offset);
       if (!vect_is_simple_use (op, vinfo, &thisarginfo.dt,
 			       &thisarginfo.vectype)
 	  || thisarginfo.dt == vect_uninitialized_def)
@@ -4161,14 +4176,6 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
     }
 
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  if (!vf.is_constant ())
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "not considering SIMD clones; not yet supported"
-			 " for variable-width vectors.\n");
-      return false;
-    }
 
   unsigned int badness = 0;
   struct cgraph_node *bestn = NULL;
@@ -4181,7 +4188,8 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	unsigned int this_badness = 0;
 	unsigned int num_calls;
 	if (!constant_multiple_p (vf, n->simdclone->simdlen, &num_calls)
-	    || n->simdclone->nargs != nargs)
+	    || (!n->simdclone->inbranch && (masked_call_offset > 0))
+	    || nargs != n->simdclone->nargs)
 	  continue;
 	if (num_calls != 1)
 	  this_badness += exact_log2 (num_calls) * 4096;
@@ -4198,7 +4206,8 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	      case SIMD_CLONE_ARG_TYPE_VECTOR:
 		if (!useless_type_conversion_p
 			(n->simdclone->args[i].orig_type,
-			 TREE_TYPE (gimple_call_arg (stmt, i + arg_offset))))
+			 TREE_TYPE (gimple_call_arg (stmt,
+						     i + masked_call_offset))))
 		  i = -1;
 		else if (arginfo[i].dt == vect_constant_def
 			 || arginfo[i].dt == vect_external_def
@@ -4243,6 +4252,17 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	  }
 	if (i == (size_t) -1)
 	  continue;
+	if (masked_call_offset == 0
+	    && n->simdclone->inbranch
+	    && n->simdclone->nargs > nargs)
+	  {
+	    gcc_assert (n->simdclone->args[n->simdclone->nargs - 1].arg_type ==
+			SIMD_CLONE_ARG_TYPE_MASK);
+	    /* Penalize using a masked SIMD clone in a non-masked loop, that is
+	       not in a branch, as we'd have to construct an all-true mask.  */
+	    if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+	      this_badness += 64;
+	  }
 	if (bestn == NULL || this_badness < badness)
 	  {
 	    bestn = n;
@@ -4259,7 +4279,8 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	   || arginfo[i].dt == vect_external_def)
 	  && bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
 	{
-	  tree arg_type = TREE_TYPE (gimple_call_arg (stmt, i + arg_offset));
+	  tree arg_type = TREE_TYPE (gimple_call_arg (stmt,
+						      i + masked_call_offset));
 	  arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type,
 							    slp_node);
 	  if (arginfo[i].vectype == NULL
@@ -4331,24 +4352,38 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	  && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
 	vinfo->any_known_not_updated_vssa = true;
       STMT_VINFO_SIMD_CLONE_INFO (stmt_info).safe_push (bestn->decl);
-      for (i = 0; i < nargs; i++)
-	if ((bestn->simdclone->args[i].arg_type
-	     == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
-	    || (bestn->simdclone->args[i].arg_type
-		== SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP))
-	  {
-	    STMT_VINFO_SIMD_CLONE_INFO (stmt_info).safe_grow_cleared (i * 3
-									+ 1,
-								      true);
-	    STMT_VINFO_SIMD_CLONE_INFO (stmt_info).safe_push (arginfo[i].op);
-	    tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
-		       ? size_type_node : TREE_TYPE (arginfo[i].op);
-	    tree ls = build_int_cst (lst, arginfo[i].linear_step);
-	    STMT_VINFO_SIMD_CLONE_INFO (stmt_info).safe_push (ls);
-	    tree sll = arginfo[i].simd_lane_linear
-		       ? boolean_true_node : boolean_false_node;
-	    STMT_VINFO_SIMD_CLONE_INFO (stmt_info).safe_push (sll);
-	  }
+
+      for (i = 0; i < bestn->simdclone->nargs; i++)
+	{
+	  switch (bestn->simdclone->args[i].arg_type)
+	    {
+	    default:
+	      continue;
+	    case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
+	    case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
+	      {
+		auto &clone_info = STMT_VINFO_SIMD_CLONE_INFO (stmt_info);
+		clone_info.safe_grow_cleared (i * 3 + 1, true);
+		clone_info.safe_push (arginfo[i].op);
+		tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
+			   ? size_type_node : TREE_TYPE (arginfo[i].op);
+		tree ls = build_int_cst (lst, arginfo[i].linear_step);
+		clone_info.safe_push (ls);
+		tree sll = arginfo[i].simd_lane_linear
+			   ? boolean_true_node : boolean_false_node;
+		clone_info.safe_push (sll);
+	      }
+	      break;
+	    case SIMD_CLONE_ARG_TYPE_MASK:
+	      if (loop_vinfo
+		  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo),
+				       ncopies, vectype, op);
+
+	      break;
+	    }
+	}
 
       if (!bestn->simdclone->inbranch)
 	{
@@ -4394,6 +4429,8 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
   vec_oprnds_i.safe_grow_cleared (nargs, true);
   for (j = 0; j < ncopies; ++j)
     {
+      poly_uint64 callee_nelements;
+      poly_uint64 caller_nelements;
       /* Build argument list for the vectorized call.  */
       if (j == 0)
 	vargs.create (nargs);
@@ -4404,8 +4441,7 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	{
 	  unsigned int k, l, m, o;
 	  tree atype;
-	  poly_uint64 callee_nelements, caller_nelements;
-	  op = gimple_call_arg (stmt, i + arg_offset);
+	  op = gimple_call_arg (stmt, i + masked_call_offset);
 	  switch (bestn->simdclone->args[i].arg_type)
 	    {
 	    case SIMD_CLONE_ARG_TYPE_VECTOR:
@@ -4482,16 +4518,9 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 		      if (k == 1)
 			if (!useless_type_conversion_p (TREE_TYPE (vec_oprnd0),
 						       atype))
-			  {
-			    vec_oprnd0
-			      = build1 (VIEW_CONVERT_EXPR, atype, vec_oprnd0);
-			    gassign *new_stmt
-			      = gimple_build_assign (make_ssa_name (atype),
-						     vec_oprnd0);
-			    vect_finish_stmt_generation (vinfo, stmt_info,
-							 new_stmt, gsi);
-			    vargs.safe_push (gimple_assign_lhs (new_stmt));
-			  }
+			  vargs.safe_push (vect_convert (vinfo, stmt_info,
+							 atype, vec_oprnd0,
+							 gsi));
 			else
 			  vargs.safe_push (vec_oprnd0);
 		      else
@@ -4544,6 +4573,24 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 			      vec_oprnds_i[i] = 0;
 			    }
 			  vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
+			  if (loop_vinfo
+			      && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+			    {
+			      vec_loop_masks *loop_masks
+				= &LOOP_VINFO_MASKS (loop_vinfo);
+			      tree loop_mask
+				= vect_get_loop_mask (loop_vinfo, gsi,
+						      loop_masks, ncopies,
+						      vectype, j);
+			      vec_oprnd0
+				= prepare_vec_mask (loop_vinfo,
+						    TREE_TYPE (loop_mask),
+						    loop_mask, vec_oprnd0,
+						    gsi);
+			      loop_vinfo->vec_cond_masked_set.add ({ vec_oprnd0,
+								     loop_mask });
+
+			    }
 			  vec_oprnd0
 			    = build3 (VEC_COND_EXPR, atype, vec_oprnd0,
 				      build_vector_from_val (atype, one),
@@ -4641,6 +4688,64 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	    }
 	}
 
+      if (masked_call_offset == 0
+	  && bestn->simdclone->inbranch
+	  && bestn->simdclone->nargs > nargs)
+	{
+	  unsigned long m, o;
+	  size_t mask_i = bestn->simdclone->nargs - 1;
+	  tree mask;
+	  gcc_assert (bestn->simdclone->args[mask_i].arg_type ==
+		      SIMD_CLONE_ARG_TYPE_MASK);
+
+	  tree masktype = bestn->simdclone->args[mask_i].vector_type;
+	  callee_nelements = TYPE_VECTOR_SUBPARTS (masktype);
+	  o = vector_unroll_factor (nunits, callee_nelements);
+	  for (m = j * o; m < (j + 1) * o; m++)
+	    {
+	      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+		{
+		  vec_loop_masks *loop_masks = &LOOP_VINFO_MASKS (loop_vinfo);
+		  mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+					     ncopies, vectype, j);
+		}
+	      else
+		mask = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
+
+	      if (!useless_type_conversion_p (TREE_TYPE (mask), masktype))
+		{
+		  gassign *new_stmt;
+		  if (bestn->simdclone->mask_mode != VOIDmode)
+		    {
+		      /* This means we are dealing with integer mask modes.
+			 First convert to an integer type with the same size as
+			 the current vector type.  */
+		      unsigned HOST_WIDE_INT intermediate_size
+			= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (mask)));
+		      tree mid_int_type =
+			build_nonstandard_integer_type (intermediate_size, 1);
+		      mask = build1 (VIEW_CONVERT_EXPR, mid_int_type, mask);
+		      new_stmt
+			= gimple_build_assign (make_ssa_name (mid_int_type),
+					       mask);
+		      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+		      /* Then zero-extend to the mask mode.  */
+		      mask = fold_build1 (NOP_EXPR, masktype,
+					  gimple_get_lhs (new_stmt));
+		    }
+		  else
+		    mask = build1 (VIEW_CONVERT_EXPR, masktype, mask);
+
+		  new_stmt = gimple_build_assign (make_ssa_name (masktype),
+						  mask);
+		  vect_finish_stmt_generation (vinfo, stmt_info,
+					       new_stmt, gsi);
+		  mask = gimple_assign_lhs (new_stmt);
+		}
+	      vargs.safe_push (mask);
+	    }
+	}
+
       gcall *new_call = gimple_build_call_vec (fndecl, vargs);
       if (vec_dest)
 	{
@@ -4659,13 +4764,13 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
       if (vec_dest)
 	{
-	  if (!multiple_p (TYPE_VECTOR_SUBPARTS (vectype), nunits))
+	  caller_nelements = TYPE_VECTOR_SUBPARTS (vectype);
+	  if (!multiple_p (caller_nelements, nunits))
 	    {
 	      unsigned int k, l;
 	      poly_uint64 prec = GET_MODE_BITSIZE (TYPE_MODE (vectype));
 	      poly_uint64 bytes = GET_MODE_SIZE (TYPE_MODE (vectype));
-	      k = vector_unroll_factor (nunits,
-					TYPE_VECTOR_SUBPARTS (vectype));
+	      k = vector_unroll_factor (nunits, caller_nelements);
 	      gcc_assert ((k & (k - 1)) == 0);
 	      for (l = 0; l < k; l++)
 		{
@@ -4691,11 +4796,11 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 		vect_clobber_variable (vinfo, stmt_info, gsi, new_temp);
 	      continue;
 	    }
-	  else if (!multiple_p (nunits, TYPE_VECTOR_SUBPARTS (vectype)))
+	  else if (!multiple_p (nunits, caller_nelements))
 	    {
 	      unsigned int k;
-	      if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (rtype),
-					TYPE_VECTOR_SUBPARTS (vectype), &k))
+	      if (!constant_multiple_p (caller_nelements,
+					TYPE_VECTOR_SUBPARTS (rtype), &k))
 		gcc_unreachable ();
 	      gcc_assert ((k & (k - 1)) == 0);
 	      if ((j & (k - 1)) == 0)