
[V4] VECT: Apply COND_LEN_* into vectorizable_operation

Message ID 20230712131739.270277-1-juzhe.zhong@rivai.ai

Commit Message

juzhe.zhong@rivai.ai July 12, 2023, 1:17 p.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi Richard and Richi,
As we discussed before, the COND_LEN_* patterns were added to handle multiple situations.
This patch applies COND_LEN_* to the following situation:

It adds support for the case in "vectorizable_operation" where:
  /* If operating on inactive elements could generate spurious traps,
     we need to restrict the operation to active lanes.  Note that this
     specifically doesn't apply to unhoisted invariants, since they
     operate on the same value for every lane.

     Similarly, if this operation is part of a reduction, a fully-masked
     loop should only change the active lanes of the reduction chain,
     keeping the inactive lanes as-is.  */
  bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
			    || reduc_idx >= 0);

That is, it handles the case where mask_out_inactive is true and the loop is controlled by a length rather than a mask.
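
To see why this matters, consider a hand-written sketch (illustration
only, not compiler output) of a loop whose operation can trap:

  #include <stdint.h>

  void vdiv (int32_t *dst, int32_t *a, int32_t *b, int n)
  {
    /* Suppose the hardware vector holds 4 ints and n == 6.  The second
       iteration covers lanes 4..7, but only lanes 4 and 5 are active.
       An unpredicated vector division would also execute lanes 6 and 7
       on whatever the loads return there and could trap on a zero
       divisor, so the division must be restricted to the active lanes,
       which is exactly what COND_LEN_DIV expresses.  */
    for (int i = 0; i < n; i++)
      dst[i] = a[i] / b[i];
  }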

So we can now vectorize the following two cases:

1. Integer division:

   #define TEST_TYPE(TYPE) 				\
   __attribute__((noipa))				\
   void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
   {							\
     for (int i = 0; i < n; i++)				\
       dst[i] = a[i] % b[i];				\
   }
   #define TEST_ALL()	\
   TEST_TYPE(int8_t)
   TEST_ALL()

With this patch:
  
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
  ivtmp_45 = _61 * 4;
  vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
  vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
  vect__8.12_53 = .COND_LEN_MOD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
  .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);

2. Floating-point arithmetic **WITHOUT** -ffast-math
  
   #define TEST_TYPE(TYPE) 				\
   __attribute__((noipa))				\
   void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
   {							\
     for (int i = 0; i < n; i++)				\
       dst[i] = a[i] + b[i];				\
   }
   #define TEST_ALL()	\
   TEST_TYPE(float)
   TEST_ALL()

With this patch:
   
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
  ivtmp_45 = _61 * 4;
  vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
  vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
  vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
  .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
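
For reference, the .COND_LEN_ADD call above has the following scalar
equivalent (a sketch following the semantics documented by the new
comment in internal-fn.cc; NUNITS stands for the number of vector lanes):

  /* LHS = .COND_LEN_ADD (COND, A, B, ELSE, LEN, BIAS).  */
  for (int i = 0; i < NUNITS; i++)
    {
      if (i < LEN + BIAS && COND[i])
        LHS[i] = A[i] + B[i];
      else
        LHS[i] = ELSE[i];
    }

Here COND is the all-ones dummy mask, ELSE reuses the first operand,
LEN comes from .SELECT_VL and BIAS is 0, so only the first LEN lanes
perform the addition and the remaining lanes keep the else value.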

With this patch, we can ensure that such operations do not trap on the inactive elements that "mask_out_inactive" refers to.
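
On the vectorizer side, the analysis code now tries the masked form
first and falls back to the length form (a condensed sketch of the
tree-vect-stmts.cc change below, not a verbatim quote):

  internal_fn cond_fn = get_conditional_internal_fn (code);
  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
  if (cond_fn != IFN_LAST
      && direct_internal_fn_supported_p (cond_fn, vectype,
					 OPTIMIZE_FOR_SPEED))
    /* Record a loop mask; the transform emits IFN_COND_<OP>.  */
    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
			   vectype, NULL);
  else if (cond_len_fn != IFN_LAST
	   && direct_internal_fn_supported_p (cond_len_fn, vectype,
					      OPTIMIZE_FOR_SPEED))
    /* Record a loop length; the transform emits IFN_COND_LEN_<OP>.  */
    vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
			  vectype, 1);
  else
    /* Neither form is available: give up on partial vectors.  */
    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;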

gcc/ChangeLog:

        * internal-fn.cc (FOR_EACH_CODE_MAPPING): Adapt for COND_LEN_* support.
        (CASE): Ditto.
        (get_conditional_len_internal_fn): New function.
        * internal-fn.h (get_conditional_len_internal_fn): Ditto.
        * tree-vect-stmts.cc (vectorizable_operation): Adapt for COND_LEN_* support.

---
 gcc/internal-fn.cc     | 71 +++++++++++++++++++++++++++++++-----------
 gcc/internal-fn.h      |  1 +
 gcc/tree-vect-stmts.cc | 48 +++++++++++++++++++++-------
 3 files changed, 91 insertions(+), 29 deletions(-)

Patch

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index f9aaf66cf2a..c11123a1173 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4276,23 +4276,24 @@  static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = {
   0
 };
 
-/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
-   tree code CODE.  */
+/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
+   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
+   for each such IFN_COND_##SUFFIX.  */
 #define FOR_EACH_CODE_MAPPING(T) \
-  T (PLUS_EXPR, IFN_COND_ADD) \
-  T (MINUS_EXPR, IFN_COND_SUB) \
-  T (MULT_EXPR, IFN_COND_MUL) \
-  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
-  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
-  T (RDIV_EXPR, IFN_COND_RDIV) \
-  T (MIN_EXPR, IFN_COND_MIN) \
-  T (MAX_EXPR, IFN_COND_MAX) \
-  T (BIT_AND_EXPR, IFN_COND_AND) \
-  T (BIT_IOR_EXPR, IFN_COND_IOR) \
-  T (BIT_XOR_EXPR, IFN_COND_XOR) \
-  T (LSHIFT_EXPR, IFN_COND_SHL) \
-  T (RSHIFT_EXPR, IFN_COND_SHR) \
-  T (NEGATE_EXPR, IFN_COND_NEG)
+  T (PLUS_EXPR, ADD) \
+  T (MINUS_EXPR, SUB) \
+  T (MULT_EXPR, MUL) \
+  T (TRUNC_DIV_EXPR, DIV) \
+  T (TRUNC_MOD_EXPR, MOD) \
+  T (RDIV_EXPR, RDIV) \
+  T (MIN_EXPR, MIN) \
+  T (MAX_EXPR, MAX) \
+  T (BIT_AND_EXPR, AND) \
+  T (BIT_IOR_EXPR, IOR) \
+  T (BIT_XOR_EXPR, XOR) \
+  T (LSHIFT_EXPR, SHL) \
+  T (RSHIFT_EXPR, SHR) \
+  T (NEGATE_EXPR, NEG)
 
 /* Return a function that only performs CODE when a certain condition is met
    and that uses a given fallback value otherwise.  For example, if CODE is
@@ -4313,7 +4314,7 @@  get_conditional_internal_fn (tree_code code)
 {
   switch (code)
     {
-#define CASE(CODE, IFN) case CODE: return IFN;
+#define CASE(CODE, IFN) case CODE: return IFN_COND_##IFN;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
     default:
@@ -4329,7 +4330,7 @@  conditional_internal_fn_code (internal_fn ifn)
 {
   switch (ifn)
     {
-#define CASE(CODE, IFN) case IFN: return CODE;
+#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
     default:
@@ -4337,6 +4338,40 @@  conditional_internal_fn_code (internal_fn ifn)
     }
 }
 
+/* Like get_conditional_internal_fn, but return a function that
+   additionally restricts the operation to the leading elements
+   of a vector.  The number of elements to process is given by a length
+   and bias pair, as for IFN_LEN_LOAD.  The values of the remaining
+   elements are taken from the fallback ("else") argument.
+
+   For example, if CODE is a binary operation associated with FN:
+
+     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
+
+   is equivalent to the C code:
+
+     for (int i = 0; i < NUNITS; i++)
+      {
+	if (i < LEN + BIAS && COND[i])
+	  LHS[i] = A[i] CODE B[i];
+	else
+	  LHS[i] = ELSE[i];
+      }
+*/
+
+internal_fn
+get_conditional_len_internal_fn (tree_code code)
+{
+  switch (code)
+    {
+#define CASE(CODE, IFN) case CODE: return IFN_COND_LEN_##IFN;
+      FOR_EACH_CODE_MAPPING(CASE)
+#undef CASE
+    default:
+      return IFN_LAST;
+    }
+}
+
 /* Invoke T(IFN) for each internal function IFN that also has an
    IFN_COND_* form.  */
 #define FOR_EACH_COND_FN_PAIR(T) \
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 4234bbfed87..dd1bab0bddf 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -224,6 +224,7 @@  extern bool set_edom_supported_p (void);
 
 extern internal_fn get_conditional_internal_fn (tree_code);
 extern internal_fn get_conditional_internal_fn (internal_fn);
+extern internal_fn get_conditional_len_internal_fn (tree_code);
 extern tree_code conditional_internal_fn_code (internal_fn);
 extern internal_fn get_unconditional_internal_fn (internal_fn);
 extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 10e71178ce7..dd24f017235 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6711,7 +6711,9 @@  vectorizable_operation (vec_info *vinfo,
 
   int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
   vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
+  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
   internal_fn cond_fn = get_conditional_internal_fn (code);
+  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
 
   /* If operating on inactive elements could generate spurious traps,
      we need to restrict the operation to active lanes.  Note that this
@@ -6730,9 +6732,17 @@  vectorizable_operation (vec_info *vinfo,
 	  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
 	  && mask_out_inactive)
 	{
-	  if (cond_fn == IFN_LAST
-	      || !direct_internal_fn_supported_p (cond_fn, vectype,
-						  OPTIMIZE_FOR_SPEED))
+	  if (cond_fn != IFN_LAST
+	      && direct_internal_fn_supported_p (cond_fn, vectype,
+						 OPTIMIZE_FOR_SPEED))
+	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
+				   vectype, NULL);
+	  else if (cond_len_fn != IFN_LAST
+		   && direct_internal_fn_supported_p (cond_len_fn, vectype,
+						      OPTIMIZE_FOR_SPEED))
+	    vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
+				  1);
+	  else
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6740,9 +6750,6 @@  vectorizable_operation (vec_info *vinfo,
 				 " conditional operation is available.\n");
 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 	    }
-	  else
-	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
-				   vectype, NULL);
 	}
 
       /* Put types on constant and invariant SLP children.  */
@@ -6805,6 +6812,7 @@  vectorizable_operation (vec_info *vinfo,
                      "transform binary/unary operation.\n");
 
   bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
      vectors with unsigned elements, but the result is signed.  So, we
@@ -6971,11 +6979,16 @@  vectorizable_operation (vec_info *vinfo,
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
 	}
-      else if (masked_loop_p && mask_out_inactive)
+      else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
 	{
-	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-					  vec_num * ncopies, vectype, i);
-	  auto_vec<tree> vops (5);
+	  tree mask;
+	  if (masked_loop_p)
+	    mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
+				       vec_num * ncopies, vectype, i);
+	  else
+	    /* Dummy mask.  */
+	    mask = build_minus_one_cst (truth_type_for (vectype));
+	  auto_vec<tree> vops (6);
 	  vops.quick_push (mask);
 	  vops.quick_push (vop0);
 	  if (vop1)
@@ -6995,7 +7008,20 @@  vectorizable_operation (vec_info *vinfo,
 		(cond_fn, vectype, vops.length () - 1, &vops[1]);
 	      vops.quick_push (else_value);
 	    }
-	  gcall *call = gimple_build_call_internal_vec (cond_fn, vops);
+	  if (len_loop_p)
+	    {
+	      tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
+					    vec_num * ncopies, vectype, i, 1);
+	      signed char biasval
+		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	      tree bias = build_int_cst (intQI_type_node, biasval);
+	      vops.quick_push (len);
+	      vops.quick_push (bias);
+	    }
+	  gcall *call
+	    = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
+							    : cond_len_fn,
+					      vops);
 	  new_temp = make_ssa_name (vec_dest, call);
 	  gimple_call_set_lhs (call, new_temp);
 	  gimple_call_set_nothrow (call, true);