[V3] VECT: Apply COND_LEN_* into vectorizable_operation

Message ID 20230712111608.71951-1-juzhe.zhong@rivai.ai
State New
Series [V3] VECT: Apply COND_LEN_* into vectorizable_operation

Commit Message

juzhe.zhong@rivai.ai July 12, 2023, 11:16 a.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.
As we discussed before, COND_LEN_* patterns were added for multiple situations.
This patch applies COND_LEN_* to the following situation:

Support for the case in "vectorizable_operation" where:
  /* If operating on inactive elements could generate spurious traps,
     we need to restrict the operation to active lanes.  Note that this
     specifically doesn't apply to unhoisted invariants, since they
     operate on the same value for every lane.

     Similarly, if this operation is part of a reduction, a fully-masked
     loop should only change the active lanes of the reduction chain,
     keeping the inactive lanes as-is.  */
  bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
			    || reduc_idx >= 0);

mask_out_inactive is true and the loop is controlled by a length instead of a mask.

So, we can now handle these 2 cases:

1. Integer division:

   #define TEST_TYPE(TYPE) 				\
   __attribute__((noipa))				\
   void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
   {							\
     for (int i = 0; i < n; i++)				\
       dst[i] = a[i] % b[i];				\
   }
   #define TEST_ALL()	\
   TEST_TYPE(int8_t)

   TEST_ALL()

With this patch:
  
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
  ivtmp_45 = _61 * 4;
  vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
  vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
  vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
  .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);

2. Floating-point arithmetic **WITHOUT** -ffast-math
  
   #define TEST_TYPE(TYPE) 				\
   __attribute__((noipa))				\
   void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
   {							\
     for (int i = 0; i < n; i++)				\
       dst[i] = a[i] + b[i];				\
   }
   #define TEST_ALL()	\
   TEST_TYPE(float)

   TEST_ALL()

With this patch:
   
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
  ivtmp_45 = _61 * 4;
  vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
  vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
  vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
  .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);

With this patch, we can make sure the operations won't trap for the inactive elements that "mask_out_inactive" refers to.
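
For reference, the element-wise semantics of a COND_LEN_* call such as the
.COND_LEN_ADD above can be modelled in scalar code roughly as follows (a
minimal sketch; the function and parameter names are illustrative, not GCC
internals):

  /* Scalar model of LHS = .COND_LEN_ADD (COND, A, B, ELSE, LEN, BIAS).
     A lane is active only if it lies below LEN + BIAS and its mask bit
     is set; inactive lanes take the fallback ("else") value and the
     operation is never evaluated for them, so it cannot trap.  BIAS is
     the target's partial load/store bias, 0 on most targets.  */
  void
  cond_len_add (int *lhs, const bool *cond, const int *a, const int *b,
		const int *else_vals, int len, int bias, int nunits)
  {
    for (int i = 0; i < nunits; i++)
      if (i < len + bias && cond[i])
	lhs[i] = a[i] + b[i];
      else
	lhs[i] = else_vals[i];
  }

In the dumps above, the mask is all-ones ({ -1, ... }), the fallback value is
the first operand, and the bias is 0, so only the first _61 lanes are ever
computed.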

gcc/ChangeLog:

        * internal-fn.cc (FOR_EACH_CODE_MAPPING): Adapt for COND_LEN_* support.
        (CASE): Ditto.
        (get_conditional_len_internal_fn): New function.
        * internal-fn.h (get_conditional_len_internal_fn): Ditto.
        * tree-vect-stmts.cc (vectorizable_operation): Adapt for COND_LEN_* support.

---
 gcc/internal-fn.cc     | 73 +++++++++++++++++++++++++++++++-----------
 gcc/internal-fn.h      |  1 +
 gcc/tree-vect-stmts.cc | 48 ++++++++++++++++++++-------
 3 files changed, 93 insertions(+), 29 deletions(-)

Comments

Richard Sandiford July 12, 2023, 12:14 p.m. UTC | #1
juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, Richard and Richi.
> As we discussed before, COND_LEN_* patterns were added for multiple situations.
> This patch applies COND_LEN_* to the following situation:
>
> Support for the case in "vectorizable_operation" where:
>   /* If operating on inactive elements could generate spurious traps,
>      we need to restrict the operation to active lanes.  Note that this
>      specifically doesn't apply to unhoisted invariants, since they
>      operate on the same value for every lane.
>
>      Similarly, if this operation is part of a reduction, a fully-masked
>      loop should only change the active lanes of the reduction chain,
>      keeping the inactive lanes as-is.  */
>   bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
> 			    || reduc_idx >= 0);
>
> mask_out_inactive is true and the loop is controlled by a length instead of a mask.
>
> So, we can now handle these 2 cases:
>
> 1. Integer division:
>
>    #define TEST_TYPE(TYPE) 				\
>    __attribute__((noipa))				\
>    void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
>    {							\
>      for (int i = 0; i < n; i++)				\
>        dst[i] = a[i] % b[i];				\
>    }
>    #define TEST_ALL()	\
>    TEST_TYPE(int8_t)
>
>    TEST_ALL()
>
> With this patch:
>   
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
>   ivtmp_45 = _61 * 4;
>   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
>   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
>   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
>   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
>
> 2. Floating-point arithmetic **WITHOUT** -ffast-math
>   
>    #define TEST_TYPE(TYPE) 				\
>    __attribute__((noipa))				\
>    void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
>    {							\
>      for (int i = 0; i < n; i++)				\
>        dst[i] = a[i] + b[i];				\
>    }
>    #define TEST_ALL()	\
>    TEST_TYPE(float)
>
>    TEST_ALL()
>
> With this patch:
>    
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
>   ivtmp_45 = _61 * 4;
>   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
>   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
>   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
>   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
>
> With this patch, we can make sure the operations won't trap for the inactive elements that "mask_out_inactive" refers to.
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (FOR_EACH_CODE_MAPPING): Adapt for COND_LEN_* support.
>         (CASE): Ditto.
>         (get_conditional_len_internal_fn): New function.
>         * internal-fn.h (get_conditional_len_internal_fn): Ditto.
>         * tree-vect-stmts.cc (vectorizable_operation): Adapt for COND_LEN_* support.
>
> ---
>  gcc/internal-fn.cc     | 73 +++++++++++++++++++++++++++++++-----------
>  gcc/internal-fn.h      |  1 +
>  gcc/tree-vect-stmts.cc | 48 ++++++++++++++++++++-------
>  3 files changed, 93 insertions(+), 29 deletions(-)
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index f9aaf66cf2a..b288ac6fe6b 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4276,23 +4276,24 @@ static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = {
>    0
>  };
>  
> -/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
> -   tree code CODE.  */
> +/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
> +   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
> +   for each such IFN_COND_##SUFFIX.  */
>  #define FOR_EACH_CODE_MAPPING(T) \
> -  T (PLUS_EXPR, IFN_COND_ADD) \
> -  T (MINUS_EXPR, IFN_COND_SUB) \
> -  T (MULT_EXPR, IFN_COND_MUL) \
> -  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
> -  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
> -  T (RDIV_EXPR, IFN_COND_RDIV) \
> -  T (MIN_EXPR, IFN_COND_MIN) \
> -  T (MAX_EXPR, IFN_COND_MAX) \
> -  T (BIT_AND_EXPR, IFN_COND_AND) \
> -  T (BIT_IOR_EXPR, IFN_COND_IOR) \
> -  T (BIT_XOR_EXPR, IFN_COND_XOR) \
> -  T (LSHIFT_EXPR, IFN_COND_SHL) \
> -  T (RSHIFT_EXPR, IFN_COND_SHR) \
> -  T (NEGATE_EXPR, IFN_COND_NEG)
> +  T (PLUS_EXPR, ADD) \
> +  T (MINUS_EXPR, SUB) \
> +  T (MULT_EXPR, MUL) \
> +  T (TRUNC_DIV_EXPR, DIV) \
> +  T (TRUNC_MOD_EXPR, MOD) \
> +  T (RDIV_EXPR, RDIV) \
> +  T (MIN_EXPR, MIN) \
> +  T (MAX_EXPR, MAX) \
> +  T (BIT_AND_EXPR, AND) \
> +  T (BIT_IOR_EXPR, IOR) \
> +  T (BIT_XOR_EXPR, XOR) \
> +  T (LSHIFT_EXPR, SHL) \
> +  T (RSHIFT_EXPR, SHR) \
> +  T (NEGATE_EXPR, NEG)
>  
>  /* Return a function that only performs CODE when a certain condition is met
>     and that uses a given fallback value otherwise.  For example, if CODE is
> @@ -4313,7 +4314,7 @@ get_conditional_internal_fn (tree_code code)
>  {
>    switch (code)
>      {
> -#define CASE(CODE, IFN) case CODE: return IFN;
> +#define CASE(CODE, IFN) case CODE: return IFN_COND_##IFN;
>        FOR_EACH_CODE_MAPPING(CASE)
>  #undef CASE
>      default:
> @@ -4329,7 +4330,7 @@ conditional_internal_fn_code (internal_fn ifn)
>  {
>    switch (ifn)
>      {
> -#define CASE(CODE, IFN) case IFN: return CODE;
> +#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
>        FOR_EACH_CODE_MAPPING(CASE)
>  #undef CASE
>      default:
> @@ -4337,6 +4338,42 @@ conditional_internal_fn_code (internal_fn ifn)
>      }
>  }
>  
> +/* Like get_conditional_internal_fn, but return a function that
> +   additionally restricts the operation to the leading elements
> +   of a vector.  The number of elements to process is given by
> +   a length and bias pair.  The function only performs the CODE
> +   when a certain condition is met as well as the element is located
> +   within LEN + BIAS (i < LEN + BIAS) and that uses a given fallback value
> +   otherwise.

How about:

/* Like get_conditional_internal_fn, but return a function that
   additionally restricts the operation to the leading elements
   of a vector.  The number of elements to process is given by a length
   and bias pair, as for IFN_LOAD_LEN.  The values of the remaining
   elements are taken from the fallback ("else") argument.

I'd like to keep the reference to IFN_LOAD_LEN.  Without it, people
might wonder why there are two length parameters, rather than one.

> +
> +   For example, if CODE is [PLUS, MINUS, ... etc]:

   For example, if CODE is a binary operation associated with FN:

> +
> +     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
> +
> +   is equivalent to the C expression:

s/expression/code/ (sorry for not noticing last time).

> +
> +     for (int i = 0; i < NUNITS; i++)
> +      {
> +	if (COND[i] && i < (LEN + BIAS))

IMO it's more natural to put the condition the other way:

  if (i < LEN + BIAS && COND[i])

OK with those changes, thanks.

Richard

juzhe.zhong@rivai.ai July 12, 2023, 1:18 p.m. UTC | #2
I have fixed the comments as you suggested.

Thanks a lot!
Will merge it soon once the bootstrap && regression testing finishes.



juzhe.zhong@rivai.ai
 
Li, Pan2 via Gcc-patches July 12, 2023, 2:28 p.m. UTC | #3
Committed v4 as it passed both the regression and bootstrap tests. Thanks to both Richards.

Pan


Patch

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index f9aaf66cf2a..b288ac6fe6b 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4276,23 +4276,24 @@  static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = {
   0
 };
 
-/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
-   tree code CODE.  */
+/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
+   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
+   for each such IFN_COND_##SUFFIX.  */
 #define FOR_EACH_CODE_MAPPING(T) \
-  T (PLUS_EXPR, IFN_COND_ADD) \
-  T (MINUS_EXPR, IFN_COND_SUB) \
-  T (MULT_EXPR, IFN_COND_MUL) \
-  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
-  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
-  T (RDIV_EXPR, IFN_COND_RDIV) \
-  T (MIN_EXPR, IFN_COND_MIN) \
-  T (MAX_EXPR, IFN_COND_MAX) \
-  T (BIT_AND_EXPR, IFN_COND_AND) \
-  T (BIT_IOR_EXPR, IFN_COND_IOR) \
-  T (BIT_XOR_EXPR, IFN_COND_XOR) \
-  T (LSHIFT_EXPR, IFN_COND_SHL) \
-  T (RSHIFT_EXPR, IFN_COND_SHR) \
-  T (NEGATE_EXPR, IFN_COND_NEG)
+  T (PLUS_EXPR, ADD) \
+  T (MINUS_EXPR, SUB) \
+  T (MULT_EXPR, MUL) \
+  T (TRUNC_DIV_EXPR, DIV) \
+  T (TRUNC_MOD_EXPR, MOD) \
+  T (RDIV_EXPR, RDIV) \
+  T (MIN_EXPR, MIN) \
+  T (MAX_EXPR, MAX) \
+  T (BIT_AND_EXPR, AND) \
+  T (BIT_IOR_EXPR, IOR) \
+  T (BIT_XOR_EXPR, XOR) \
+  T (LSHIFT_EXPR, SHL) \
+  T (RSHIFT_EXPR, SHR) \
+  T (NEGATE_EXPR, NEG)
 
 /* Return a function that only performs CODE when a certain condition is met
    and that uses a given fallback value otherwise.  For example, if CODE is
@@ -4313,7 +4314,7 @@  get_conditional_internal_fn (tree_code code)
 {
   switch (code)
     {
-#define CASE(CODE, IFN) case CODE: return IFN;
+#define CASE(CODE, IFN) case CODE: return IFN_COND_##IFN;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
     default:
@@ -4329,7 +4330,7 @@  conditional_internal_fn_code (internal_fn ifn)
 {
   switch (ifn)
     {
-#define CASE(CODE, IFN) case IFN: return CODE;
+#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
     default:
@@ -4337,6 +4338,42 @@  conditional_internal_fn_code (internal_fn ifn)
     }
 }
 
+/* Like get_conditional_internal_fn, but return a function that
+   additionally restricts the operation to the leading elements
+   of a vector.  The number of elements to process is given by
+   a length and bias pair.  The function only performs the CODE
+   when a certain condition is met as well as the element is located
+   within LEN + BIAS (i < LEN + BIAS) and that uses a given fallback value
+   otherwise.
+
+   For example, if CODE is [PLUS, MINUS, ... etc]:
+
+     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
+
+   is equivalent to the C expression:
+
+     for (int i = 0; i < NUNITS; i++)
+      {
+	if (COND[i] && i < (LEN + BIAS))
+	  LHS[i] = A[i] CODE B[i];
+	else
+	  LHS[i] = ELSE[i];
+      }
+*/
+
+internal_fn
+get_conditional_len_internal_fn (tree_code code)
+{
+  switch (code)
+    {
+#define CASE(CODE, IFN) case CODE: return IFN_COND_LEN_##IFN;
+      FOR_EACH_CODE_MAPPING(CASE)
+#undef CASE
+    default:
+      return IFN_LAST;
+    }
+}
+
 /* Invoke T(IFN) for each internal function IFN that also has an
    IFN_COND_* form.  */
 #define FOR_EACH_COND_FN_PAIR(T) \
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 4234bbfed87..dd1bab0bddf 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -224,6 +224,7 @@  extern bool set_edom_supported_p (void);
 
 extern internal_fn get_conditional_internal_fn (tree_code);
 extern internal_fn get_conditional_internal_fn (internal_fn);
+extern internal_fn get_conditional_len_internal_fn (tree_code);
 extern tree_code conditional_internal_fn_code (internal_fn);
 extern internal_fn get_unconditional_internal_fn (internal_fn);
 extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 10e71178ce7..dd24f017235 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6711,7 +6711,9 @@  vectorizable_operation (vec_info *vinfo,
 
   int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
   vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
+  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
   internal_fn cond_fn = get_conditional_internal_fn (code);
+  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
 
   /* If operating on inactive elements could generate spurious traps,
      we need to restrict the operation to active lanes.  Note that this
@@ -6730,9 +6732,17 @@  vectorizable_operation (vec_info *vinfo,
 	  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
 	  && mask_out_inactive)
 	{
-	  if (cond_fn == IFN_LAST
-	      || !direct_internal_fn_supported_p (cond_fn, vectype,
-						  OPTIMIZE_FOR_SPEED))
+	  if (cond_fn != IFN_LAST
+	      && direct_internal_fn_supported_p (cond_fn, vectype,
+						 OPTIMIZE_FOR_SPEED))
+	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
+				   vectype, NULL);
+	  else if (cond_len_fn != IFN_LAST
+		   && direct_internal_fn_supported_p (cond_len_fn, vectype,
+						      OPTIMIZE_FOR_SPEED))
+	    vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
+				  1);
+	  else
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6740,9 +6750,6 @@  vectorizable_operation (vec_info *vinfo,
 				 " conditional operation is available.\n");
 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 	    }
-	  else
-	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
-				   vectype, NULL);
 	}
 
       /* Put types on constant and invariant SLP children.  */
@@ -6805,6 +6812,7 @@  vectorizable_operation (vec_info *vinfo,
                      "transform binary/unary operation.\n");
 
   bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
      vectors with unsigned elements, but the result is signed.  So, we
@@ -6971,11 +6979,16 @@  vectorizable_operation (vec_info *vinfo,
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
 	}
-      else if (masked_loop_p && mask_out_inactive)
+      else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
 	{
-	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-					  vec_num * ncopies, vectype, i);
-	  auto_vec<tree> vops (5);
+	  tree mask;
+	  if (masked_loop_p)
+	    mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
+				       vec_num * ncopies, vectype, i);
+	  else
+	    /* Dummy mask.  */
+	    mask = build_minus_one_cst (truth_type_for (vectype));
+	  auto_vec<tree> vops (6);
 	  vops.quick_push (mask);
 	  vops.quick_push (vop0);
 	  if (vop1)
@@ -6995,7 +7008,20 @@  vectorizable_operation (vec_info *vinfo,
 		(cond_fn, vectype, vops.length () - 1, &vops[1]);
 	      vops.quick_push (else_value);
 	    }
-	  gcall *call = gimple_build_call_internal_vec (cond_fn, vops);
+	  if (len_loop_p)
+	    {
+	      tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
+					    vec_num * ncopies, vectype, i, 1);
+	      signed char biasval
+		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	      tree bias = build_int_cst (intQI_type_node, biasval);
+	      vops.quick_push (len);
+	      vops.quick_push (bias);
+	    }
+	  gcall *call
+	    = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
+							    : cond_len_fn,
+					      vops);
 	  new_temp = make_ssa_name (vec_dest, call);
 	  gimple_call_set_lhs (call, new_temp);
 	  gimple_call_set_nothrow (call, true);
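
For illustration only (a sketch, not part of the patch): with this change in
place, a caller can query the length-predicated variant the same way
vectorizable_operation does above, e.g.

  /* Query the length-predicated variant of the remainder operation from
     test case 1; get_conditional_len_internal_fn returns IFN_LAST when no
     COND_LEN_* mapping exists for the tree code.  */
  internal_fn cond_len_fn = get_conditional_len_internal_fn (TRUNC_MOD_EXPR);
  /* cond_len_fn is now IFN_COND_LEN_MOD, per FOR_EACH_CODE_MAPPING.  */

which is exactly the lookup vectorizable_operation now performs before
recording a loop length and emitting the COND_LEN_* call.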