diff mbox series

[v1,1/3] Vect: Support loop len in vectorizable early exit

Message ID 20240513145428.148553-1-pan2.li@intel.com
State New
Headers show
Series [v1,1/3] Vect: Support loop len in vectorizable early exit | expand

Commit Message

Li, Pan2 May 13, 2024, 2:54 p.m. UTC
From: Pan Li <pan2.li@intel.com>

This patch adds early break auto-vectorization support for targets which
use length on partial vectorization.  Consider the following example:

unsigned vect_a[802];
unsigned vect_b[802];

void test (unsigned x, int n)
{
  for (int i = 0; i < n; i++)
  {
    vect_b[i] = x + i;

    if (vect_a[i] > x)
      break;

    vect_a[i] = x;
  }
}

We use VCOND_MASK_LEN to generate the effective mask (mask && i < len + bias).
And then the IR of RVV looks like below:

  ...
  _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
  _55 = (int) _87;
  ...
  mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
  vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
    {0, ... }, _87, 0);
  if (vec_len_mask_72 != { 0, ... })
    goto <bb 6>; [5.50%]
  else
    goto <bb 7>; [94.50%]

The below tests are passed for this patch:
1. The riscv fully regression tests.
2. The aarch64 fully regression tests.
3. The x86 bootstrap tests.
4. The x86 fully regression tests.

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_early_exit): Add loop len
	handling for one or multiple stmts.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/tree-vect-stmts.cc | 47 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

Comments

Tamar Christina May 13, 2024, 3:14 p.m. UTC | #1
> -----Original Message-----
> From: pan2.li@intel.com <pan2.li@intel.com>
> Sent: Monday, May 13, 2024 3:54 PM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> Tamar Christina <Tamar.Christina@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>; Pan Li <pan2.li@intel.com>
> Subject: [PATCH v1 1/3] Vect: Support loop len in vectorizable early exit
> 
> From: Pan Li <pan2.li@intel.com>
> 
> This patch adds early break auto-vectorization support for target which
> use length on partial vectorization.  Consider this following example:
> 
> unsigned vect_a[802];
> unsigned vect_b[802];
> 
> void test (unsigned x, int n)
> {
>   for (int i = 0; i < n; i++)
>   {
>     vect_b[i] = x + i;
> 
>     if (vect_a[i] > x)
>       break;
> 
>     vect_a[i] = x;
>   }
> }
> 
> We use VCOND_MASK_LEN to simulate the generate (mask && i < len + bias).
> And then the IR of RVV looks like below:
> 
>   ...
>   _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
>   _55 = (int) _87;
>   ...
>   mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
>   vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
>     {0, ... }, _87, 0);
>   if (vec_len_mask_72 != { 0, ... })
>     goto <bb 6>; [5.50%]
>   else
>     goto <bb 7>; [94.50%]
> 
> The below tests are passed for this patch:
> 1. The riscv fully regression tests.
> 2. The aarch64 fully regression tests.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-stmts.cc (vectorizable_early_exit): Add loop len
> 	handling for one or multiple stmt.
> 
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/tree-vect-stmts.cc | 47 ++++++++++++++++++++++++++++++++++++++++-
> -
>  1 file changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 21e8fe98e44..bfd9d66568f 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -12896,7 +12896,9 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>      ncopies = vect_get_num_copies (loop_vinfo, vectype);
> 
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> +  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> +  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
> 
>    /* Now build the new conditional.  Pattern gimple_conds get dropped during
>       codegen so we must replace the original insn.  */
> @@ -12960,12 +12962,11 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>  	{
>  	  if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
>  					      OPTIMIZE_FOR_SPEED))
> -	    return false;
> +	    vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
>  	  else
>  	    vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
>  	}
> 
> -
>        return true;
>      }
> 
> @@ -13018,6 +13019,25 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>  				  stmts[i], &cond_gsi);
>  	    workset.quick_push (stmt_mask);
>  	  }
> +      else if (len_loop_p)
> +	for (unsigned i = 0; i < stmts.length (); i++)
> +	  {
> +	    tree all_ones_mask = build_all_ones_cst (vectype);
> +	    tree all_zero_mask = build_zero_cst (vectype);
> +	    tree len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies,
> +					  vectype, i, 1);
> +	    signed char cst = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
> (loop_vinfo);
> +	    tree bias = build_int_cst (intQI_type_node, cst);
> +	    tree len_mask
> +	      = make_temp_ssa_name (TREE_TYPE (stmts[i]), NULL,
> "vec_len_mask");
> +	    gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5,
> +						      stmts[i], all_ones_mask,
> +						      all_zero_mask, len, bias);
> +	    gimple_call_set_lhs (call, len_mask);
> +	    gsi_insert_before (&cond_gsi, call, GSI_SAME_STMT);
> +
> +	    workset.quick_push (len_mask);
> +	  }
>        else
>  	workset.splice (stmts);
> 
> @@ -13042,6 +13062,29 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>  	  new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
>  				       new_temp, &cond_gsi);
>  	}
> +      else if (len_loop_p)
> +	{
> +	  /* len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
> +
> +	     which is equivalent to:
> +
> +	     len_mask = compare_mask mask && i < len ? 1 : 0
> +	  */
> +	  tree all_ones_mask = build_all_ones_cst (vectype);
> +	  tree all_zero_mask = build_zero_cst (vectype);
> +	  tree len
> +	    = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies, vectype, 0, 1);
> +	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
> (loop_vinfo);
> +	  tree bias = build_int_cst (intQI_type_node, biasval);
> +	  tree len_mask
> +	    = make_temp_ssa_name (TREE_TYPE (new_temp), NULL,
> "vec_len_mask");
> +	  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5,
> +						    new_temp, all_ones_mask,
> +						    all_zero_mask, len, bias);
> +	  gimple_call_set_lhs (call, len_mask);
> +	  gsi_insert_before (&cond_gsi, call, GSI_SAME_STMT);
> +	  new_temp = len_mask;
> +	}

Could we perhaps factor these out to a helper? Something similar to what we have with
prepare_vec_mask.  It looks like the only difference between these two block of code is the index being passed to
vect_get_loop_len.

It would be nice to have the shape the same way as the masked_loop_p case, i.e. keep 1 call to get the loop_len and one to
build the mask using a helper.

Thanks,
Tamar
>      }
> 
>    gcc_assert (new_temp);
> --
> 2.34.1
diff mbox series

Patch

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 21e8fe98e44..bfd9d66568f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12896,7 +12896,9 @@  vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
     ncopies = vect_get_num_copies (loop_vinfo, vectype);
 
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* Now build the new conditional.  Pattern gimple_conds get dropped during
      codegen so we must replace the original insn.  */
@@ -12960,12 +12962,11 @@  vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 	{
 	  if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
 					      OPTIMIZE_FOR_SPEED))
-	    return false;
+	    vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
 	  else
 	    vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
 	}
 
-
       return true;
     }
 
@@ -13018,6 +13019,25 @@  vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 				  stmts[i], &cond_gsi);
 	    workset.quick_push (stmt_mask);
 	  }
+      else if (len_loop_p)
+	for (unsigned i = 0; i < stmts.length (); i++)
+	  {
+	    tree all_ones_mask = build_all_ones_cst (vectype);
+	    tree all_zero_mask = build_zero_cst (vectype);
+	    tree len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies,
+					  vectype, i, 1);
+	    signed char cst = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	    tree bias = build_int_cst (intQI_type_node, cst);
+	    tree len_mask
+	      = make_temp_ssa_name (TREE_TYPE (stmts[i]), NULL, "vec_len_mask");
+	    gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5,
+						      stmts[i], all_ones_mask,
+						      all_zero_mask, len, bias);
+	    gimple_call_set_lhs (call, len_mask);
+	    gsi_insert_before (&cond_gsi, call, GSI_SAME_STMT);
+
+	    workset.quick_push (len_mask);
+	  }
       else
 	workset.splice (stmts);
 
@@ -13042,6 +13062,29 @@  vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 	  new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
 				       new_temp, &cond_gsi);
 	}
+      else if (len_loop_p)
+	{
+	  /* len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
+
+	     which is equivalent to:
+
+	     len_mask = compare_mask mask && i < len ? 1 : 0
+	  */
+	  tree all_ones_mask = build_all_ones_cst (vectype);
+	  tree all_zero_mask = build_zero_cst (vectype);
+	  tree len
+	    = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies, vectype, 0, 1);
+	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	  tree bias = build_int_cst (intQI_type_node, biasval);
+	  tree len_mask
+	    = make_temp_ssa_name (TREE_TYPE (new_temp), NULL, "vec_len_mask");
+	  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5,
+						    new_temp, all_ones_mask,
+						    all_zero_mask, len, bias);
+	  gimple_call_set_lhs (call, len_mask);
+	  gsi_insert_before (&cond_gsi, call, GSI_SAME_STMT);
+	  new_temp = len_mask;
+	}
     }
 
   gcc_assert (new_temp);