
VECT: Apply MASK_LEN_{LOAD_LANES, STORE_LANES} into vectorizer

Message ID 20230814064513.157363-1-juzhe.zhong@rivai.ai
State New
Series VECT: Apply MASK_LEN_{LOAD_LANES, STORE_LANES} into vectorizer

Commit Message

juzhe.zhong@rivai.ai Aug. 14, 2023, 6:45 a.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.

This patch adds MASK_LEN_{LOAD_LANES,STORE_LANES} support to the vectorizer.

Consider this simple case:

void __attribute__ ((noinline, noclone))
foo (int *__restrict a, int *__restrict b, int *__restrict c,
	  int *__restrict d, int *__restrict e, int *__restrict f,
	  int *__restrict g, int *__restrict h, int *__restrict j, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i] = j[i * 8];
      b[i] = j[i * 8 + 1];
      c[i] = j[i * 8 + 2];
      d[i] = j[i * 8 + 3];
      e[i] = j[i * 8 + 4];
      f[i] = j[i * 8 + 5];
      g[i] = j[i * 8 + 6];
      h[i] = j[i * 8 + 7];
    }
}

RVV Gimple IR:

  _79 = .SELECT_VL (ivtmp_81, POLY_INT_CST [4, 4]);
  ivtmp_125 = _79 * 32;
  vect_array.8 = .MASK_LEN_LOAD_LANES (vectp_j.6_124, 32B, { -1, ... }, _79, 0);
  vect__8.9_122 = vect_array.8[0];
  vect__8.10_121 = vect_array.8[1];
  vect__8.11_120 = vect_array.8[2];
  vect__8.12_119 = vect_array.8[3];
  vect__8.13_118 = vect_array.8[4];
  vect__8.14_117 = vect_array.8[5];
  vect__8.15_116 = vect_array.8[6];
  vect__8.16_115 = vect_array.8[7];
  vect_array.8 ={v} {CLOBBER};
  ivtmp_114 = _79 * 4;
  .MASK_LEN_STORE (vectp_a.17_113, 32B, { -1, ... }, _79, 0, vect__8.9_122);
  .MASK_LEN_STORE (vectp_b.19_109, 32B, { -1, ... }, _79, 0, vect__8.10_121);
  .MASK_LEN_STORE (vectp_c.21_105, 32B, { -1, ... }, _79, 0, vect__8.11_120);
  .MASK_LEN_STORE (vectp_d.23_101, 32B, { -1, ... }, _79, 0, vect__8.12_119);
  .MASK_LEN_STORE (vectp_e.25_97, 32B, { -1, ... }, _79, 0, vect__8.13_118);
  .MASK_LEN_STORE (vectp_f.27_93, 32B, { -1, ... }, _79, 0, vect__8.14_117);
  .MASK_LEN_STORE (vectp_g.29_89, 32B, { -1, ... }, _79, 0, vect__8.15_116);
  .MASK_LEN_STORE (vectp_h.31_85, 32B, { -1, ... }, _79, 0, vect__8.16_115);
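
To spell out the constants above: POLY_INT_CST [4, 4] is the runtime
vectorization factor, _79 (the .SELECT_VL result) is the number of scalar
iterations handled by this vector iteration, so the IV bumps are:

  ivtmp_125 = _79 * 32;   /* j advances 8 ints x 4 bytes per iteration  */
  ivtmp_114 = _79 * 4;    /* a..h each advance 4 bytes per iteration    */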

ASM:

foo:
	lw	t4,8(sp)
	ld	t5,0(sp)
	ble	t4,zero,.L5
.L3:
	vsetvli	t1,t4,e8,mf4,ta,ma
	vlseg8e32.v	v8,(t5)
	slli	t3,t1,2
	slli	t6,t1,5
	vse32.v	v8,0(a0)
	vse32.v	v9,0(a1)
	vse32.v	v10,0(a2)
	vse32.v	v11,0(a3)
	vse32.v	v12,0(a4)
	vse32.v	v13,0(a5)
	vse32.v	v14,0(a6)
	vse32.v	v15,0(a7)
	sub	t4,t4,t1
	add	t5,t5,t6
	add	a0,a0,t3
	add	a1,a1,t3
	add	a2,a2,t3
	add	a3,a3,t3
	add	a4,a4,t3
	add	a5,a5,t3
	add	a6,a6,t3
	add	a7,a7,t3
	bne	t4,zero,.L3
.L5:
	ret

The details of the approach:

Step 1 - Modify the LANES LOAD/STORE support functions (vect_load_lanes_supported/vect_store_lanes_supported):

+/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
+   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */
 
-bool
+internal_fn
 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
 			   bool masked_p)
 {
-  if (masked_p)
-    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
-					 vec_mask_load_lanes_optab,
-					 vectype, count);
+  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
+				    vec_mask_len_load_lanes_optab,
+				    vectype, count))
+    return IFN_MASK_LEN_LOAD_LANES;
+  else if (masked_p)
+    {
+      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
+					vec_mask_load_lanes_optab,
+					vectype, count))
+	return IFN_MASK_LOAD_LANES;
+    }
   else
-    return vect_lanes_optab_supported_p ("vec_load_lanes",
-					 vec_load_lanes_optab,
-					 vectype, count);
+    {
+      if (vect_lanes_optab_supported_p ("vec_load_lanes",
+					vec_load_lanes_optab,
+					vectype, count))
+	return IFN_LOAD_LANES;
+    }
+  return IFN_LAST;
 }
 
Instead of returning TRUE or FALSE to indicate whether the target supports the
LANES LOAD/STORE, the functions now return the internal_fn of the LANES
LOAD/STORE variant the target supports.  If the target doesn't support any
LANES LOAD/STORE optab, they return IFN_LAST.
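
For reference (not part of the patch), callers that only need to know whether
any lanes IFN is available compare the result against IFN_LAST, while the
transform code can dispatch on the specific IFN, e.g.:

  internal_fn lanes_ifn
    = vect_load_lanes_supported (vectype, group_size, masked_p);
  if (lanes_ifn == IFN_LAST)
    /* No load-lanes support at all.  */;
  else if (lanes_ifn == IFN_MASK_LEN_LOAD_LANES)
    /* Needs the extra LEN and BIAS operands, see Step 2.  */;
  else
    /* IFN_MASK_LOAD_LANES or IFN_LOAD_LANES.  */;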

Step 2 - Build MASK_LEN_{LOAD_LANES,STORE_LANES} Gimple IR:

+	  if (vect_store_lanes_supported (vectype, group_size, false)
+	      == IFN_MASK_LEN_STORE_LANES)
+	    {
+	      if (loop_lens)
+		final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+					       ncopies, vectype, j, 1);
+	      else
+		final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+	      signed char biasval
+		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	      bias = build_int_cst (intQI_type_node, biasval);
+	      if (!final_mask)
+		{
+		  mask_vectype = truth_type_for (vectype);
+		  final_mask = build_minus_one_cst (mask_vectype);
+		}
+	    }
+
 	  gcall *call;
-	  if (final_mask)
+	  if (final_len && final_mask)
+	    {
+	      /* Emit:
+		   MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
+					 LEN, BIAS, VEC_ARRAY).  */
+	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
+	      tree alias_ptr = build_int_cst (ref_type, align);
+	      call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
+						 dataref_ptr, alias_ptr,
+						 final_mask, final_len, bias,
+						 vec_array);
+	    }
+	  else if (final_mask)

The LEN and MASK handling is exactly the same as for the other MASK_LEN_* loads/stores.
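
For completeness, the element-wise semantics relied on here are roughly (a
sketch only, assuming the usual MASK_LEN convention that element I is active
iff I < LEN + BIAS and its mask bit is set; NLANES is the group size and
NUNITS the number of elements per vector):

  for (unsigned i = 0; i < nunits; i++)
    if (i < len + bias && mask[i])
      for (unsigned l = 0; l < nlanes; l++)
        vec_array[l][i] = ptr[i * nlanes + l];   /* MASK_LEN_LOAD_LANES  */
  /* MASK_LEN_STORE_LANES does ptr[i * nlanes + l] = vec_array[l][i] for
     the same active elements; inactive elements are left untouched.  */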

This patch passed bootstrap and regression testing on x86.

Fully tested on RISC-V.

Ok for trunk?

gcc/ChangeLog:

        * internal-fn.cc (internal_load_fn_p): Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer.
        (internal_store_fn_p): Ditto.
        (internal_fn_len_index): Ditto.
        (internal_fn_mask_index): Ditto.
        (internal_fn_stored_value_index): Ditto.
        * tree-vect-data-refs.cc (vect_store_lanes_supported): Ditto.
        (vect_load_lanes_supported): Ditto.
        * tree-vect-loop.cc: Ditto.
        * tree-vect-slp.cc (vect_slp_prefer_store_lanes_p): Ditto.
        * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
        (get_group_load_store_type): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (vect_store_lanes_supported): Ditto.
        (vect_load_lanes_supported): Ditto.

---
 gcc/internal-fn.cc         |  7 +++
 gcc/tree-vect-data-refs.cc | 61 +++++++++++++++++--------
 gcc/tree-vect-loop.cc      | 11 +++--
 gcc/tree-vect-slp.cc       |  2 +-
 gcc/tree-vect-stmts.cc     | 93 ++++++++++++++++++++++++++++++++------
 gcc/tree-vectorizer.h      |  4 +-
 6 files changed, 137 insertions(+), 41 deletions(-)

Comments

Richard Biener Aug. 15, 2023, 9:40 a.m. UTC | #1
On Mon, 14 Aug 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> This patch is adding MASK_LEN_{LOAD_LANES,STORE_LANES} support into vectorizer.
> 
> Consider this simple case:
> 
> void __attribute__ ((noinline, noclone))
> foo (int *__restrict a, int *__restrict b, int *__restrict c,
> 	  int *__restrict d, int *__restrict e, int *__restrict f,
> 	  int *__restrict g, int *__restrict h, int *__restrict j, int n)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       a[i] = j[i * 8];
>       b[i] = j[i * 8 + 1];
>       c[i] = j[i * 8 + 2];
>       d[i] = j[i * 8 + 3];
>       e[i] = j[i * 8 + 4];
>       f[i] = j[i * 8 + 5];
>       g[i] = j[i * 8 + 6];
>       h[i] = j[i * 8 + 7];
>     }
> }
> 
> RVV Gimple IR:
> 
>   _79 = .SELECT_VL (ivtmp_81, POLY_INT_CST [4, 4]);
>   ivtmp_125 = _79 * 32;
>   vect_array.8 = .MASK_LEN_LOAD_LANES (vectp_j.6_124, 32B, { -1, ... }, _79, 0);
>   vect__8.9_122 = vect_array.8[0];
>   vect__8.10_121 = vect_array.8[1];
>   vect__8.11_120 = vect_array.8[2];
>   vect__8.12_119 = vect_array.8[3];
>   vect__8.13_118 = vect_array.8[4];
>   vect__8.14_117 = vect_array.8[5];
>   vect__8.15_116 = vect_array.8[6];
>   vect__8.16_115 = vect_array.8[7];
>   vect_array.8 ={v} {CLOBBER};
>   ivtmp_114 = _79 * 4;
>   .MASK_LEN_STORE (vectp_a.17_113, 32B, { -1, ... }, _79, 0, vect__8.9_122);
>   .MASK_LEN_STORE (vectp_b.19_109, 32B, { -1, ... }, _79, 0, vect__8.10_121);
>   .MASK_LEN_STORE (vectp_c.21_105, 32B, { -1, ... }, _79, 0, vect__8.11_120);
>   .MASK_LEN_STORE (vectp_d.23_101, 32B, { -1, ... }, _79, 0, vect__8.12_119);
>   .MASK_LEN_STORE (vectp_e.25_97, 32B, { -1, ... }, _79, 0, vect__8.13_118);
>   .MASK_LEN_STORE (vectp_f.27_93, 32B, { -1, ... }, _79, 0, vect__8.14_117);
>   .MASK_LEN_STORE (vectp_g.29_89, 32B, { -1, ... }, _79, 0, vect__8.15_116);
>   .MASK_LEN_STORE (vectp_h.31_85, 32B, { -1, ... }, _79, 0, vect__8.16_115);
> 
> ASM:
> 
> foo:
> 	lw	t4,8(sp)
> 	ld	t5,0(sp)
> 	ble	t4,zero,.L5
> .L3:
> 	vsetvli	t1,t4,e8,mf4,ta,ma
> 	vlseg8e32.v	v8,(t5)
> 	slli	t3,t1,2
> 	slli	t6,t1,5
> 	vse32.v	v8,0(a0)
> 	vse32.v	v9,0(a1)
> 	vse32.v	v10,0(a2)
> 	vse32.v	v11,0(a3)
> 	vse32.v	v12,0(a4)
> 	vse32.v	v13,0(a5)
> 	vse32.v	v14,0(a6)
> 	vse32.v	v15,0(a7)
> 	sub	t4,t4,t1
> 	add	t5,t5,t6
> 	add	a0,a0,t3
> 	add	a1,a1,t3
> 	add	a2,a2,t3
> 	add	a3,a3,t3
> 	add	a4,a4,t3
> 	add	a5,a5,t3
> 	add	a6,a6,t3
> 	add	a7,a7,t3
> 	bne	t4,zero,.L3
> .L5:
> 	ret
> 
> The details of the approach:
> 
> Step 1 - Modifiy the LANES LOAD/STORE support function (vect_load_lanes_supported/vect_store_lanes_supported):
> 
> +/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
> +   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */
>  
> -bool
> +internal_fn
>  vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
>  			   bool masked_p)
>  {
> -  if (masked_p)
> -    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> -					 vec_mask_load_lanes_optab,
> -					 vectype, count);
> +  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
> +				    vec_mask_len_load_lanes_optab,
> +				    vectype, count))
> +    return IFN_MASK_LEN_LOAD_LANES;
> +  else if (masked_p)
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> +					vec_mask_load_lanes_optab,
> +					vectype, count))
> +	return IFN_MASK_LOAD_LANES;
> +    }
>    else
> -    return vect_lanes_optab_supported_p ("vec_load_lanes",
> -					 vec_load_lanes_optab,
> -					 vectype, count);
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_load_lanes",
> +					vec_load_lanes_optab,
> +					vectype, count))
> +	return IFN_LOAD_LANES;
> +    }
> +  return IFN_LAST;
>  }
>  
> Instead of returning TRUE or FALSE whether target support the LANES LOAD/STORE.
> I change it into return internal_fn of the LANES LOAD/STORE that target support,
> If target didn't support any LANE LOAD/STORE optabs, return IFN_LAST.
> 
> Step 2 - Build MASK_LEN_{LANES_LOAD,LANES_STORE} Gimple IR:
> 
> +	  if (vect_store_lanes_supported (vectype, group_size, false)
> +	      == IFN_MASK_LEN_STORE_LANES)
> +	    {
> +	      if (loop_lens)
> +		final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +					       ncopies, vectype, j, 1);
> +	      else
> +		final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> +	      signed char biasval
> +		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +	      bias = build_int_cst (intQI_type_node, biasval);
> +	      if (!final_mask)
> +		{
> +		  mask_vectype = truth_type_for (vectype);
> +		  final_mask = build_minus_one_cst (mask_vectype);
> +		}
> +	    }
> +
>  	  gcall *call;
> -	  if (final_mask)
> +	  if (final_len && final_mask)
> +	    {
> +	      /* Emit:
> +		   MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> +					 LEN, BIAS, VEC_ARRAY).  */
> +	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> +	      tree alias_ptr = build_int_cst (ref_type, align);
> +	      call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
> +						 dataref_ptr, alias_ptr,
> +						 final_mask, final_len, bias,
> +						 vec_array);
> +	    }
> +	  else if (final_mask)
> 
> The LEN and MASK flow is totally the same as other MASK_LEN_* load/store.
> 
> This patch bootstrap and regrssion on X86 passed.
> 
> Fully tested on RISC-V.
> 
> Ok for trunk ?

I think the patch needs refreshing after r14-3214-ga74d0d36a3f337.
 
> gcc/ChangeLog:
> 
>         * internal-fn.cc (internal_load_fn_p): Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer.
>         (internal_store_fn_p): Ditto.
>         (internal_fn_len_index): Ditto.
>         (internal_fn_mask_index): Ditto.
>         (internal_fn_stored_value_index): Ditto.
>         * tree-vect-data-refs.cc (vect_store_lanes_supported): Ditto.
>         (vect_load_lanes_supported): Ditto.
>         * tree-vect-loop.cc: Ditto.
>         * tree-vect-slp.cc (vect_slp_prefer_store_lanes_p): Ditto.
>         * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
>         (get_group_load_store_type): Ditto.
>         (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (vect_store_lanes_supported): Ditto.
>         (vect_load_lanes_supported): Ditto.
> 
> ---
>  gcc/internal-fn.cc         |  7 +++
>  gcc/tree-vect-data-refs.cc | 61 +++++++++++++++++--------
>  gcc/tree-vect-loop.cc      | 11 +++--
>  gcc/tree-vect-slp.cc       |  2 +-
>  gcc/tree-vect-stmts.cc     | 93 ++++++++++++++++++++++++++++++++------
>  gcc/tree-vectorizer.h      |  4 +-
>  6 files changed, 137 insertions(+), 41 deletions(-)
> 
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 4f2b20a79e5..cc1ede58799 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4578,6 +4578,7 @@ internal_load_fn_p (internal_fn fn)
>      case IFN_MASK_LOAD:
>      case IFN_LOAD_LANES:
>      case IFN_MASK_LOAD_LANES:
> +    case IFN_MASK_LEN_LOAD_LANES:
>      case IFN_GATHER_LOAD:
>      case IFN_MASK_GATHER_LOAD:
>      case IFN_MASK_LEN_GATHER_LOAD:
> @@ -4600,6 +4601,7 @@ internal_store_fn_p (internal_fn fn)
>      case IFN_MASK_STORE:
>      case IFN_STORE_LANES:
>      case IFN_MASK_STORE_LANES:
> +    case IFN_MASK_LEN_STORE_LANES:
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
>      case IFN_MASK_LEN_SCATTER_STORE:
> @@ -4672,6 +4674,8 @@ internal_fn_len_index (internal_fn fn)
>      case IFN_COND_LEN_NEG:
>      case IFN_MASK_LEN_LOAD:
>      case IFN_MASK_LEN_STORE:
> +    case IFN_MASK_LEN_LOAD_LANES:
> +    case IFN_MASK_LEN_STORE_LANES:
>        return 3;
>  
>      default:
> @@ -4689,8 +4693,10 @@ internal_fn_mask_index (internal_fn fn)
>      {
>      case IFN_MASK_LOAD:
>      case IFN_MASK_LOAD_LANES:
> +    case IFN_MASK_LEN_LOAD_LANES:
>      case IFN_MASK_STORE:
>      case IFN_MASK_STORE_LANES:
> +    case IFN_MASK_LEN_STORE_LANES:
>      case IFN_MASK_LEN_LOAD:
>      case IFN_MASK_LEN_STORE:
>        return 2;
> @@ -4726,6 +4732,7 @@ internal_fn_stored_value_index (internal_fn fn)
>        return 4;
>  
>      case IFN_MASK_LEN_STORE:
> +    case IFN_MASK_LEN_STORE_LANES:
>        return 5;
>  
>      default:
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index a3570c45b52..232b91e8ed3 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -5439,24 +5439,34 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
>  }
>  
>  
> -/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
> +/* Return FN if vec_{mask_}store_lanes is available for COUNT vectors of
>     type VECTYPE.  MASKED_P says whether the masked form is needed.  */
>  
> -bool
> +internal_fn
>  vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
>  			    bool masked_p)
>  {
> -  if (masked_p)
> -    return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
> -					 vec_mask_store_lanes_optab,
> -					 vectype, count);
> +  if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
> +				    vec_mask_len_store_lanes_optab,
> +				    vectype, count))
> +    return IFN_MASK_LEN_STORE_LANES;
> +  else if (masked_p)
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
> +					vec_mask_store_lanes_optab,
> +					vectype, count))
> +	return IFN_MASK_STORE_LANES;
> +    }
>    else
> -    return vect_lanes_optab_supported_p ("vec_store_lanes",
> -					 vec_store_lanes_optab,
> -					 vectype, count);
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_store_lanes",
> +					vec_store_lanes_optab,
> +					vectype, count))
> +	return IFN_STORE_LANES;
> +    }
> +  return IFN_LAST;
>  }
>  
> -
>  /* Function vect_permute_store_chain.
>  
>     Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
> @@ -6056,21 +6066,32 @@ vect_grouped_load_supported (tree vectype, bool single_element_p,
>    return false;
>  }
>  
> -/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
> -   type VECTYPE.  MASKED_P says whether the masked form is needed.  */
> +/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
> +   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */
>  
> -bool
> +internal_fn
>  vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
>  			   bool masked_p)
>  {
> -  if (masked_p)
> -    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> -					 vec_mask_load_lanes_optab,
> -					 vectype, count);
> +  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
> +				    vec_mask_len_load_lanes_optab,
> +				    vectype, count))
> +    return IFN_MASK_LEN_LOAD_LANES;
> +  else if (masked_p)
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> +					vec_mask_load_lanes_optab,
> +					vectype, count))
> +	return IFN_MASK_LOAD_LANES;
> +    }
>    else
> -    return vect_lanes_optab_supported_p ("vec_load_lanes",
> -					 vec_load_lanes_optab,
> -					 vectype, count);
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_load_lanes",
> +					vec_load_lanes_optab,
> +					vectype, count))
> +	return IFN_LOAD_LANES;
> +    }
> +  return IFN_LAST;
>  }
>  
>  /* Function vect_permute_load_chain.
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 07f3717ed9d..2a0cfa3e2e8 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -2839,7 +2839,8 @@ start_over:
>  	     instructions record it and move on to the next instance.  */
>  	  if (loads_permuted
>  	      && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
> -	      && vect_store_lanes_supported (vectype, group_size, false))
> +	      && vect_store_lanes_supported (vectype, group_size, false)
> +		   != IFN_LAST)
>  	    {
>  	      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
>  		{
> @@ -2848,9 +2849,9 @@ start_over:
>  		  /* Use SLP for strided accesses (or if we can't
>  		     load-lanes).  */
>  		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
> -		      || ! vect_load_lanes_supported
> +		      || vect_load_lanes_supported
>  			    (STMT_VINFO_VECTYPE (stmt_vinfo),
> -			     DR_GROUP_SIZE (stmt_vinfo), false))
> +			     DR_GROUP_SIZE (stmt_vinfo), false) != IFN_LAST)
>  		    break;
>  		}
>  
> @@ -3153,7 +3154,7 @@ again:
>        vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
>        unsigned int size = DR_GROUP_SIZE (vinfo);
>        tree vectype = STMT_VINFO_VECTYPE (vinfo);
> -      if (! vect_store_lanes_supported (vectype, size, false)
> +      if (vect_store_lanes_supported (vectype, size, false) != IFN_LAST
>  	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
>  	 && ! vect_grouped_store_supported (vectype, size))
>  	return opt_result::failure_at (vinfo->stmt,
> @@ -3165,7 +3166,7 @@ again:
>  	  bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
>  	  size = DR_GROUP_SIZE (vinfo);
>  	  vectype = STMT_VINFO_VECTYPE (vinfo);
> -	  if (! vect_load_lanes_supported (vectype, size, false)
> +	  if (vect_load_lanes_supported (vectype, size, false) != IFN_LAST
>  	      && ! vect_grouped_load_supported (vectype, single_element_p,
>  						size))
>  	    return opt_result::failure_at (vinfo->stmt,
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 41997d5a546..3adb06dfa18 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -3094,7 +3094,7 @@ vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
>    if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
>        || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
>      return false;
> -  return vect_store_lanes_supported (vectype, group_size, false);
> +  return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
>  }
>  
>  /* Analyze an SLP instance starting from a group of grouped stores.  Call
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 89607a98f99..0f21315995e 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1610,9 +1610,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>    bool is_load = (vls_type == VLS_LOAD);
>    if (memory_access_type == VMAT_LOAD_STORE_LANES)
>      {
> -      if (is_load
> -	  ? !vect_load_lanes_supported (vectype, group_size, true)
> -	  : !vect_store_lanes_supported (vectype, group_size, true))
> +      internal_fn ifn
> +	= (is_load ? vect_load_lanes_supported (vectype, group_size, true)
> +		   : vect_store_lanes_supported (vectype, group_size, true));
> +      if (ifn == IFN_MASK_LEN_LOAD_LANES || ifn == IFN_MASK_LEN_STORE_LANES)
> +	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> +      else if (ifn == IFN_MASK_LOAD_LANES || ifn == IFN_MASK_STORE_LANES)
> +	vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> +			       scalar_mask);
> +      else
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -1620,10 +1626,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>  			     " the target doesn't have an appropriate"
>  			     " load/store-lanes instruction.\n");
>  	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> -	  return;
>  	}
> -      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> -			     scalar_mask);
>        return;
>      }
>  
> @@ -2274,9 +2277,11 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>  	  /* Otherwise try using LOAD/STORE_LANES.  */
>  	  else if (vls_type == VLS_LOAD
> -		   ? vect_load_lanes_supported (vectype, group_size, masked_p)
> -		   : vect_store_lanes_supported (vectype, group_size,
> -						 masked_p))
> +		     ? vect_load_lanes_supported (vectype, group_size, masked_p)
> +			 != IFN_LAST
> +		     : vect_store_lanes_supported (vectype, group_size,
> +						   masked_p)
> +			 != IFN_LAST)
>  	    {
>  	      *memory_access_type = VMAT_LOAD_STORE_LANES;
>  	      overrun_p = would_overrun_p;
> @@ -3090,8 +3095,7 @@ vect_get_loop_variant_data_ptr_increment (
>    /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
>       IVs are updated by variable amount but we will support them in the future.
>     */
> -  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> -	      && memory_access_type != VMAT_LOAD_STORE_LANES);
> +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
>  
>    /* When we support SELECT_VL pattern, we dynamic adjust
>       the memory address by .SELECT_VL result.
> @@ -8885,6 +8889,8 @@ vectorizable_store (vec_info *vinfo,
>  	    }
>  
>  	  tree final_mask = NULL;
> +	  tree final_len = NULL;
> +	  tree bias = NULL;
>  	  if (loop_masks)
>  	    final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>  					     ncopies, vectype, j);
> @@ -8892,8 +8898,38 @@ vectorizable_store (vec_info *vinfo,
>  	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>  					   final_mask, vec_mask, gsi);
>  
> +	  if (vect_store_lanes_supported (vectype, group_size, false)
> +	      == IFN_MASK_LEN_STORE_LANES)

can you use the previously computed 'ifn' here please?

Otherwise LGTM.

Thanks,
Richard.

> +	    {
> +	      if (loop_lens)
> +		final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +					       ncopies, vectype, j, 1);
> +	      else
> +		final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> +	      signed char biasval
> +		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +	      bias = build_int_cst (intQI_type_node, biasval);
> +	      if (!final_mask)
> +		{
> +		  mask_vectype = truth_type_for (vectype);
> +		  final_mask = build_minus_one_cst (mask_vectype);
> +		}
> +	    }
> +
>  	  gcall *call;
> -	  if (final_mask)
> +	  if (final_len && final_mask)
> +	    {
> +	      /* Emit:
> +		   MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> +					 LEN, BIAS, VEC_ARRAY).  */
> +	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> +	      tree alias_ptr = build_int_cst (ref_type, align);
> +	      call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
> +						 dataref_ptr, alias_ptr,
> +						 final_mask, final_len, bias,
> +						 vec_array);
> +	    }
> +	  else if (final_mask)
>  	    {
>  	      /* Emit:
>  		   MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> @@ -10445,6 +10481,8 @@ vectorizable_load (vec_info *vinfo,
>  	  vec_array = create_vector_array (vectype, vec_num);
>  
>  	  tree final_mask = NULL_TREE;
> +	  tree final_len = NULL_TREE;
> +	  tree bias = NULL_TREE;
>  	  if (loop_masks)
>  	    final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>  					     ncopies, vectype, j);
> @@ -10452,8 +10490,37 @@ vectorizable_load (vec_info *vinfo,
>  	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>  					   final_mask, vec_mask, gsi);
>  
> +	  if (vect_load_lanes_supported (vectype, group_size, false)
> +	      == IFN_MASK_LEN_LOAD_LANES)
> +	    {
> +	      if (loop_lens)
> +		final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +					       ncopies, vectype, j, 1);
> +	      else
> +		final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> +	      signed char biasval
> +		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +	      bias = build_int_cst (intQI_type_node, biasval);
> +	      if (!final_mask)
> +		{
> +		  mask_vectype = truth_type_for (vectype);
> +		  final_mask = build_minus_one_cst (mask_vectype);
> +		}
> +	    }
> +
>  	  gcall *call;
> -	  if (final_mask)
> +	  if (final_len && final_mask)
> +	    {
> +	      /* Emit:
> +		   VEC_ARRAY = MASK_LEN_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> +						    VEC_MASK, LEN, BIAS).  */
> +	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> +	      tree alias_ptr = build_int_cst (ref_type, align);
> +	      call = gimple_build_call_internal (IFN_MASK_LEN_LOAD_LANES, 5,
> +						 dataref_ptr, alias_ptr,
> +						 final_mask, final_len, bias);
> +	    }
> +	  else if (final_mask)
>  	    {
>  	      /* Emit:
>  		   VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 5987a327332..6a2e55aa1fc 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2296,9 +2296,9 @@ extern tree bump_vector_ptr (vec_info *, tree, gimple *, gimple_stmt_iterator *,
>  extern void vect_copy_ref_info (tree, tree);
>  extern tree vect_create_destination_var (tree, tree);
>  extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT);
> -extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> +extern internal_fn vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
>  extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
> -extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> +extern internal_fn vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
>  extern void vect_permute_store_chain (vec_info *, vec<tree> &,
>  				      unsigned int, stmt_vec_info,
>  				      gimple_stmt_iterator *, vec<tree> *);
>
juzhe.zhong@rivai.ai Aug. 15, 2023, 10:05 a.m. UTC | #2
Hi, Richi.

> +	  if (vect_store_lanes_supported (vectype, group_size, false)
> +	      == IFN_MASK_LEN_STORE_LANES)

>> can you use the previously computed 'ifn' here please?

Do you mean rewriting the code as follows?

internal_fn lanes_ifn = vect_store_lanes_supported (vectype, group_size, false);

if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)

>> I think the patch needs refreshing after r14-3214-ga74d0d36a3f337.

Yeah, working on it and I will test on both X86 and ARM.

Thanks.


juzhe.zhong@rivai.ai
 
Richard Biener Aug. 15, 2023, 11:19 a.m. UTC | #3
On Tue, 15 Aug 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.
> 
> > +	  if (vect_store_lanes_supported (vectype, group_size, false)
> > +	      == IFN_MASK_LEN_STORE_LANES)
> 
> >> can you use the previously computed 'ifn' here please?
> 
> Do you mean rewrite the codes as follows :?
> 
> internal_fn lanes_ifn = vect_store_lanes_supported (vectype, group_size, false);
> 
> if (lanes_ifn == IFN_MASK_LEN_STORE_LANES).

The vect_store_lanes_supported is performed during analysis already
and ideally we'd not re-do such check, so please save it in a
variable at that point.
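
E.g. (a sketch only; how the value is best carried from analysis to the
transform phase is up to you):

  /* During analysis, e.g. in get_group_load_store_type: */
  internal_fn lanes_ifn
    = (vls_type == VLS_LOAD
       ? vect_load_lanes_supported (vectype, group_size, masked_p)
       : vect_store_lanes_supported (vectype, group_size, masked_p));
  ...
  /* At transform time, test the saved value instead of re-querying
     the optabs:  */
  if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
    {
      /* Build final_len / bias as in the patch.  */
    }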
 
> >> I think the patch needs refreshing after r14-3214-ga74d0d36a3f337.
> 
> Yeah, working on it and I will test on both X86 and ARM.
> 
> Thanks.
> >  
> >     Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
> > @@ -6056,21 +6066,32 @@ vect_grouped_load_supported (tree vectype, bool single_element_p,
> >    return false;
> >  }
> >  
> > -/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
> > -   type VECTYPE.  MASKED_P says whether the masked form is needed.  */
> > +/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
> > +   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */
> >  
> > -bool
> > +internal_fn
> >  vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
> >     bool masked_p)
> >  {
> > -  if (masked_p)
> > -    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> > - vec_mask_load_lanes_optab,
> > - vectype, count);
> > +  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
> > +     vec_mask_len_load_lanes_optab,
> > +     vectype, count))
> > +    return IFN_MASK_LEN_LOAD_LANES;
> > +  else if (masked_p)
> > +    {
> > +      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> > + vec_mask_load_lanes_optab,
> > + vectype, count))
> > + return IFN_MASK_LOAD_LANES;
> > +    }
> >    else
> > -    return vect_lanes_optab_supported_p ("vec_load_lanes",
> > - vec_load_lanes_optab,
> > - vectype, count);
> > +    {
> > +      if (vect_lanes_optab_supported_p ("vec_load_lanes",
> > + vec_load_lanes_optab,
> > + vectype, count))
> > + return IFN_LOAD_LANES;
> > +    }
> > +  return IFN_LAST;
> >  }
> >  
> >  /* Function vect_permute_load_chain.
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 07f3717ed9d..2a0cfa3e2e8 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -2839,7 +2839,8 @@ start_over:
> >       instructions record it and move on to the next instance.  */
> >    if (loads_permuted
> >        && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
> > -       && vect_store_lanes_supported (vectype, group_size, false))
> > +       && vect_store_lanes_supported (vectype, group_size, false)
> > +    != IFN_LAST)
> >      {
> >        FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
> >  {
> > @@ -2848,9 +2849,9 @@ start_over:
> >    /* Use SLP for strided accesses (or if we can't
> >       load-lanes).  */
> >    if (STMT_VINFO_STRIDED_P (stmt_vinfo)
> > -       || ! vect_load_lanes_supported
> > +       || vect_load_lanes_supported
> >      (STMT_VINFO_VECTYPE (stmt_vinfo),
> > -      DR_GROUP_SIZE (stmt_vinfo), false))
> > +      DR_GROUP_SIZE (stmt_vinfo), false) != IFN_LAST)
> >      break;
> >  }
> >  
> > @@ -3153,7 +3154,7 @@ again:
> >        vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
> >        unsigned int size = DR_GROUP_SIZE (vinfo);
> >        tree vectype = STMT_VINFO_VECTYPE (vinfo);
> > -      if (! vect_store_lanes_supported (vectype, size, false)
> > +      if (vect_store_lanes_supported (vectype, size, false) != IFN_LAST
> >  && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
> >  && ! vect_grouped_store_supported (vectype, size))
> >  return opt_result::failure_at (vinfo->stmt,
> > @@ -3165,7 +3166,7 @@ again:
> >    bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
> >    size = DR_GROUP_SIZE (vinfo);
> >    vectype = STMT_VINFO_VECTYPE (vinfo);
> > -   if (! vect_load_lanes_supported (vectype, size, false)
> > +   if (vect_load_lanes_supported (vectype, size, false) != IFN_LAST
> >        && ! vect_grouped_load_supported (vectype, single_element_p,
> >  size))
> >      return opt_result::failure_at (vinfo->stmt,
> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > index 41997d5a546..3adb06dfa18 100644
> > --- a/gcc/tree-vect-slp.cc
> > +++ b/gcc/tree-vect-slp.cc
> > @@ -3094,7 +3094,7 @@ vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
> >    if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
> >        || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> >      return false;
> > -  return vect_store_lanes_supported (vectype, group_size, false);
> > +  return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
> >  }
> >  
> >  /* Analyze an SLP instance starting from a group of grouped stores.  Call
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index 89607a98f99..0f21315995e 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -1610,9 +1610,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> >    bool is_load = (vls_type == VLS_LOAD);
> >    if (memory_access_type == VMAT_LOAD_STORE_LANES)
> >      {
> > -      if (is_load
> > -   ? !vect_load_lanes_supported (vectype, group_size, true)
> > -   : !vect_store_lanes_supported (vectype, group_size, true))
> > +      internal_fn ifn
> > + = (is_load ? vect_load_lanes_supported (vectype, group_size, true)
> > +    : vect_store_lanes_supported (vectype, group_size, true));
> > +      if (ifn == IFN_MASK_LEN_LOAD_LANES || ifn == IFN_MASK_LEN_STORE_LANES)
> > + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> > +      else if (ifn == IFN_MASK_LOAD_LANES || ifn == IFN_MASK_STORE_LANES)
> > + vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> > +        scalar_mask);
> > +      else
> >  {
> >    if (dump_enabled_p ())
> >      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -1620,10 +1626,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> >       " the target doesn't have an appropriate"
> >       " load/store-lanes instruction.\n");
> >    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> > -   return;
> >  }
> > -      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> > -      scalar_mask);
> >        return;
> >      }
> >  
> > @@ -2274,9 +2277,11 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> >  
> >    /* Otherwise try using LOAD/STORE_LANES.  */
> >    else if (vls_type == VLS_LOAD
> > -    ? vect_load_lanes_supported (vectype, group_size, masked_p)
> > -    : vect_store_lanes_supported (vectype, group_size,
> > - masked_p))
> > +      ? vect_load_lanes_supported (vectype, group_size, masked_p)
> > + != IFN_LAST
> > +      : vect_store_lanes_supported (vectype, group_size,
> > +    masked_p)
> > + != IFN_LAST)
> >      {
> >        *memory_access_type = VMAT_LOAD_STORE_LANES;
> >        overrun_p = would_overrun_p;
> > @@ -3090,8 +3095,7 @@ vect_get_loop_variant_data_ptr_increment (
> >    /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
> >       IVs are updated by variable amount but we will support them in the future.
> >     */
> > -  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> > -       && memory_access_type != VMAT_LOAD_STORE_LANES);
> > +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
> >  
> >    /* When we support SELECT_VL pattern, we dynamic adjust
> >       the memory address by .SELECT_VL result.
> > @@ -8885,6 +8889,8 @@ vectorizable_store (vec_info *vinfo,
> >      }
> >  
> >    tree final_mask = NULL;
> > +   tree final_len = NULL;
> > +   tree bias = NULL;
> >    if (loop_masks)
> >      final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >       ncopies, vectype, j);
> > @@ -8892,8 +8898,38 @@ vectorizable_store (vec_info *vinfo,
> >      final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >     final_mask, vec_mask, gsi);
> >  
> > +   if (vect_store_lanes_supported (vectype, group_size, false)
> > +       == IFN_MASK_LEN_STORE_LANES)
>  
> can you use the previously computed 'ifn' here please?
>  
> Otherwise LGTM.
>  
> Thanks,
> Richard.
>  
> > +     {
> > +       if (loop_lens)
> > + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > +        ncopies, vectype, j, 1);
> > +       else
> > + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> > +       signed char biasval
> > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > +       bias = build_int_cst (intQI_type_node, biasval);
> > +       if (!final_mask)
> > + {
> > +   mask_vectype = truth_type_for (vectype);
> > +   final_mask = build_minus_one_cst (mask_vectype);
> > + }
> > +     }
> > +
> >    gcall *call;
> > -   if (final_mask)
> > +   if (final_len && final_mask)
> > +     {
> > +       /* Emit:
> > +    MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> > + LEN, BIAS, VEC_ARRAY).  */
> > +       unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> > +       tree alias_ptr = build_int_cst (ref_type, align);
> > +       call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
> > + dataref_ptr, alias_ptr,
> > + final_mask, final_len, bias,
> > + vec_array);
> > +     }
> > +   else if (final_mask)
> >      {
> >        /* Emit:
> >     MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> > @@ -10445,6 +10481,8 @@ vectorizable_load (vec_info *vinfo,
> >    vec_array = create_vector_array (vectype, vec_num);
> >  
> >    tree final_mask = NULL_TREE;
> > +   tree final_len = NULL_TREE;
> > +   tree bias = NULL_TREE;
> >    if (loop_masks)
> >      final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >       ncopies, vectype, j);
> > @@ -10452,8 +10490,37 @@ vectorizable_load (vec_info *vinfo,
> >      final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >     final_mask, vec_mask, gsi);
> >  
> > +   if (vect_load_lanes_supported (vectype, group_size, false)
> > +       == IFN_MASK_LEN_LOAD_LANES)
> > +     {
> > +       if (loop_lens)
> > + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > +        ncopies, vectype, j, 1);
> > +       else
> > + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> > +       signed char biasval
> > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > +       bias = build_int_cst (intQI_type_node, biasval);
> > +       if (!final_mask)
> > + {
> > +   mask_vectype = truth_type_for (vectype);
> > +   final_mask = build_minus_one_cst (mask_vectype);
> > + }
> > +     }
> > +
> >    gcall *call;
> > -   if (final_mask)
> > +   if (final_len && final_mask)
> > +     {
> > +       /* Emit:
> > +    VEC_ARRAY = MASK_LEN_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> > +     VEC_MASK, LEN, BIAS).  */
> > +       unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> > +       tree alias_ptr = build_int_cst (ref_type, align);
> > +       call = gimple_build_call_internal (IFN_MASK_LEN_LOAD_LANES, 5,
> > + dataref_ptr, alias_ptr,
> > + final_mask, final_len, bias);
> > +     }
> > +   else if (final_mask)
> >      {
> >        /* Emit:
> >     VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index 5987a327332..6a2e55aa1fc 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -2296,9 +2296,9 @@ extern tree bump_vector_ptr (vec_info *, tree, gimple *, gimple_stmt_iterator *,
> >  extern void vect_copy_ref_info (tree, tree);
> >  extern tree vect_create_destination_var (tree, tree);
> >  extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT);
> > -extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> > +extern internal_fn vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> >  extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
> > -extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> > +extern internal_fn vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> >  extern void vect_permute_store_chain (vec_info *, vec<tree> &,
> >        unsigned int, stmt_vec_info,
> >        gimple_stmt_iterator *, vec<tree> *);
> > 
>  
>
juzhe.zhong@rivai.ai Aug. 15, 2023, 11:44 a.m. UTC | #4
Hi, Richi.

I realize this code performs the load/store analysis:

+  internal_fn lanes_ifn;
   if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, vls_type,
                            ncopies, &memory_access_type, &poffset,
-                           &alignment_support_scheme, &misalignment, &gs_info))
+                           &alignment_support_scheme, &misalignment, &gs_info,
+                           &lanes_ifn))

This function already computes the gather/scatter info "gs_info".

Using the same approach, I add "&lanes_ifn" here, which records the IFN for the lanes load/store.
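
Concretely, the idea is something like this (a rough sketch, not the final
patch; the "&lanes_ifn" out-parameter is the new piece, the rest is the
existing call in vectorizable_store):

  internal_fn lanes_ifn;
  if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, vls_type,
                            ncopies, &memory_access_type, &poffset,
                            &alignment_support_scheme, &misalignment, &gs_info,
                            &lanes_ifn))
    return false;
  /* ... analysis/transform continues ...  */

  /* Later, at transform time, use the cached value instead of re-querying
     the optab.  */
  if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
    {
      /* Set up final_len/bias as in the vectorizable_store hunk quoted
         earlier in the thread.  */
    }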

Does that sound reasonable?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-15 19:19
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford
Subject: Re: Re: [PATCH] VECT: Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer
On Tue, 15 Aug 2023, juzhe.zhong@rivai.ai wrote:
 
> Hi, Richi.
> 
> > +   if (vect_store_lanes_supported (vectype, group_size, false)
> > +       == IFN_MASK_LEN_STORE_LANES)
> 
> >> can you use the previously computed 'ifn' here please?
> 
> Do you mean rewriting the code as follows?
> 
> internal_fn lanes_ifn = vect_store_lanes_supported (vectype, group_size, false);
> 
> if (lanes_ifn == IFN_MASK_LEN_STORE_LANES).
 
The vect_store_lanes_supported check is already performed during analysis
and ideally we'd not redo it, so please save the result in a variable at
that point.
> >> I think the patch needs refreshing after r14-3214-ga74d0d36a3f337.
> 
> Yeah, working on it and I will test on both X86 and ARM.
> 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-08-15 17:40
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford
> Subject: Re: [PATCH] VECT: Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer
> On Mon, 14 Aug 2023, juzhe.zhong@rivai.ai wrote:
>  
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > 
> > Hi, Richard and Richi.
> > 
> > This patch is adding MASK_LEN_{LOAD_LANES,STORE_LANES} support into vectorizer.
> > 
> > Consider this simple case:
> > 
> > void __attribute__ ((noinline, noclone))
> > foo (int *__restrict a, int *__restrict b, int *__restrict c,
> >   int *__restrict d, int *__restrict e, int *__restrict f,
> >   int *__restrict g, int *__restrict h, int *__restrict j, int n)
> > {
> >   for (int i = 0; i < n; ++i)
> >     {
> >       a[i] = j[i * 8];
> >       b[i] = j[i * 8 + 1];
> >       c[i] = j[i * 8 + 2];
> >       d[i] = j[i * 8 + 3];
> >       e[i] = j[i * 8 + 4];
> >       f[i] = j[i * 8 + 5];
> >       g[i] = j[i * 8 + 6];
> >       h[i] = j[i * 8 + 7];
> >     }
> > }
> > 
> > RVV Gimple IR:
> > 
> >   _79 = .SELECT_VL (ivtmp_81, POLY_INT_CST [4, 4]);
> >   ivtmp_125 = _79 * 32;
> >   vect_array.8 = .MASK_LEN_LOAD_LANES (vectp_j.6_124, 32B, { -1, ... }, _79, 0);
> >   vect__8.9_122 = vect_array.8[0];
> >   vect__8.10_121 = vect_array.8[1];
> >   vect__8.11_120 = vect_array.8[2];
> >   vect__8.12_119 = vect_array.8[3];
> >   vect__8.13_118 = vect_array.8[4];
> >   vect__8.14_117 = vect_array.8[5];
> >   vect__8.15_116 = vect_array.8[6];
> >   vect__8.16_115 = vect_array.8[7];
> >   vect_array.8 ={v} {CLOBBER};
> >   ivtmp_114 = _79 * 4;
> >   .MASK_LEN_STORE (vectp_a.17_113, 32B, { -1, ... }, _79, 0, vect__8.9_122);
> >   .MASK_LEN_STORE (vectp_b.19_109, 32B, { -1, ... }, _79, 0, vect__8.10_121);
> >   .MASK_LEN_STORE (vectp_c.21_105, 32B, { -1, ... }, _79, 0, vect__8.11_120);
> >   .MASK_LEN_STORE (vectp_d.23_101, 32B, { -1, ... }, _79, 0, vect__8.12_119);
> >   .MASK_LEN_STORE (vectp_e.25_97, 32B, { -1, ... }, _79, 0, vect__8.13_118);
> >   .MASK_LEN_STORE (vectp_f.27_93, 32B, { -1, ... }, _79, 0, vect__8.14_117);
> >   .MASK_LEN_STORE (vectp_g.29_89, 32B, { -1, ... }, _79, 0, vect__8.15_116);
> >   .MASK_LEN_STORE (vectp_h.31_85, 32B, { -1, ... }, _79, 0, vect__8.16_115);
> > 
> > ASM:
> > 
> > foo:
> > lw t4,8(sp)
> > ld t5,0(sp)
> > ble t4,zero,.L5
> > .L3:
> > vsetvli t1,t4,e8,mf4,ta,ma
> > vlseg8e32.v v8,(t5)
> > slli t3,t1,2
> > slli t6,t1,5
> > vse32.v v8,0(a0)
> > vse32.v v9,0(a1)
> > vse32.v v10,0(a2)
> > vse32.v v11,0(a3)
> > vse32.v v12,0(a4)
> > vse32.v v13,0(a5)
> > vse32.v v14,0(a6)
> > vse32.v v15,0(a7)
> > sub t4,t4,t1
> > add t5,t5,t6
> > add a0,a0,t3
> > add a1,a1,t3
> > add a2,a2,t3
> > add a3,a3,t3
> > add a4,a4,t3
> > add a5,a5,t3
> > add a6,a6,t3
> > add a7,a7,t3
> > bne t4,zero,.L3
> > .L5:
> > ret
> > 
> > The details of the approach:
> > 
> > Step 1 - Modify the LANES LOAD/STORE support functions (vect_load_lanes_supported/vect_store_lanes_supported):
> > 
> > +/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
> > +   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */
> >  
> > -bool
> > +internal_fn
> >  vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
> >     bool masked_p)
> >  {
> > -  if (masked_p)
> > -    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> > - vec_mask_load_lanes_optab,
> > - vectype, count);
> > +  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
> > +     vec_mask_len_load_lanes_optab,
> > +     vectype, count))
> > +    return IFN_MASK_LEN_LOAD_LANES;
> > +  else if (masked_p)
> > +    {
> > +      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> > + vec_mask_load_lanes_optab,
> > + vectype, count))
> > + return IFN_MASK_LOAD_LANES;
> > +    }
> >    else
> > -    return vect_lanes_optab_supported_p ("vec_load_lanes",
> > - vec_load_lanes_optab,
> > - vectype, count);
> > +    {
> > +      if (vect_lanes_optab_supported_p ("vec_load_lanes",
> > + vec_load_lanes_optab,
> > + vectype, count))
> > + return IFN_LOAD_LANES;
> > +    }
> > +  return IFN_LAST;
> >  }
> >  
> > Instead of returning TRUE or FALSE to indicate whether the target supports LANES LOAD/STORE,
> > I changed these functions to return the internal_fn of the LANES LOAD/STORE variant that the target supports.
> > If the target doesn't support any LANES LOAD/STORE optab, they return IFN_LAST.
> > 
> > Step 2 - Build MASK_LEN_{LOAD_LANES,STORE_LANES} Gimple IR:
> > 
> > +   if (vect_store_lanes_supported (vectype, group_size, false)
> > +       == IFN_MASK_LEN_STORE_LANES)
> > +     {
> > +       if (loop_lens)
> > + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > +        ncopies, vectype, j, 1);
> > +       else
> > + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> > +       signed char biasval
> > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > +       bias = build_int_cst (intQI_type_node, biasval);
> > +       if (!final_mask)
> > + {
> > +   mask_vectype = truth_type_for (vectype);
> > +   final_mask = build_minus_one_cst (mask_vectype);
> > + }
> > +     }
> > +
> >    gcall *call;
> > -   if (final_mask)
> > +   if (final_len && final_mask)
> > +     {
> > +       /* Emit:
> > +    MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> > + LEN, BIAS, VEC_ARRAY).  */
> > +       unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> > +       tree alias_ptr = build_int_cst (ref_type, align);
> > +       call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
> > + dataref_ptr, alias_ptr,
> > + final_mask, final_len, bias,
> > + vec_array);
> > +     }
> > +   else if (final_mask)
> > 
> > The LEN and MASK flow is exactly the same as for the other MASK_LEN_* loads/stores.
> > 
> > Bootstrap and regression testing on X86 passed with this patch.
> > 
> > Fully tested on RISC-V.
> > 
> > Ok for trunk ?
>  
> I think the patch needs refreshing after r14-3214-ga74d0d36a3f337.
> > gcc/ChangeLog:
> > 
> >         * internal-fn.cc (internal_load_fn_p): Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer.
> >         (internal_store_fn_p): Ditto.
> >         (internal_fn_len_index): Ditto.
> >         (internal_fn_mask_index): Ditto.
> >         (internal_fn_stored_value_index): Ditto.
> >         * tree-vect-data-refs.cc (vect_store_lanes_supported): Ditto.
> >         (vect_load_lanes_supported): Ditto.
> >         * tree-vect-loop.cc: Ditto.
> >         * tree-vect-slp.cc (vect_slp_prefer_store_lanes_p): Ditto.
> >         * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
> >         (get_group_load_store_type): Ditto.
> >         (vectorizable_store): Ditto.
> >         (vectorizable_load): Ditto.
> >         * tree-vectorizer.h (vect_store_lanes_supported): Ditto.
> >         (vect_load_lanes_supported): Ditto.
> > 
> > ---
> >  gcc/internal-fn.cc         |  7 +++
> >  gcc/tree-vect-data-refs.cc | 61 +++++++++++++++++--------
> >  gcc/tree-vect-loop.cc      | 11 +++--
> >  gcc/tree-vect-slp.cc       |  2 +-
> >  gcc/tree-vect-stmts.cc     | 93 ++++++++++++++++++++++++++++++++------
> >  gcc/tree-vectorizer.h      |  4 +-
> >  6 files changed, 137 insertions(+), 41 deletions(-)
> > 
> > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > index 4f2b20a79e5..cc1ede58799 100644
> > --- a/gcc/internal-fn.cc
> > +++ b/gcc/internal-fn.cc
> > @@ -4578,6 +4578,7 @@ internal_load_fn_p (internal_fn fn)
> >      case IFN_MASK_LOAD:
> >      case IFN_LOAD_LANES:
> >      case IFN_MASK_LOAD_LANES:
> > +    case IFN_MASK_LEN_LOAD_LANES:
> >      case IFN_GATHER_LOAD:
> >      case IFN_MASK_GATHER_LOAD:
> >      case IFN_MASK_LEN_GATHER_LOAD:
> > @@ -4600,6 +4601,7 @@ internal_store_fn_p (internal_fn fn)
> >      case IFN_MASK_STORE:
> >      case IFN_STORE_LANES:
> >      case IFN_MASK_STORE_LANES:
> > +    case IFN_MASK_LEN_STORE_LANES:
> >      case IFN_SCATTER_STORE:
> >      case IFN_MASK_SCATTER_STORE:
> >      case IFN_MASK_LEN_SCATTER_STORE:
> > @@ -4672,6 +4674,8 @@ internal_fn_len_index (internal_fn fn)
> >      case IFN_COND_LEN_NEG:
> >      case IFN_MASK_LEN_LOAD:
> >      case IFN_MASK_LEN_STORE:
> > +    case IFN_MASK_LEN_LOAD_LANES:
> > +    case IFN_MASK_LEN_STORE_LANES:
> >        return 3;
> >  
> >      default:
> > @@ -4689,8 +4693,10 @@ internal_fn_mask_index (internal_fn fn)
> >      {
> >      case IFN_MASK_LOAD:
> >      case IFN_MASK_LOAD_LANES:
> > +    case IFN_MASK_LEN_LOAD_LANES:
> >      case IFN_MASK_STORE:
> >      case IFN_MASK_STORE_LANES:
> > +    case IFN_MASK_LEN_STORE_LANES:
> >      case IFN_MASK_LEN_LOAD:
> >      case IFN_MASK_LEN_STORE:
> >        return 2;
> > @@ -4726,6 +4732,7 @@ internal_fn_stored_value_index (internal_fn fn)
> >        return 4;
> >  
> >      case IFN_MASK_LEN_STORE:
> > +    case IFN_MASK_LEN_STORE_LANES:
> >        return 5;
> >  
> >      default:
> > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > index a3570c45b52..232b91e8ed3 100644
> > --- a/gcc/tree-vect-data-refs.cc
> > +++ b/gcc/tree-vect-data-refs.cc
> > @@ -5439,24 +5439,34 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
> >  }
> >  
> >  
> > -/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
> > +/* Return FN if vec_{mask_}store_lanes is available for COUNT vectors of
> >     type VECTYPE.  MASKED_P says whether the masked form is needed.  */
> >  
> > -bool
> > +internal_fn
> >  vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
> >      bool masked_p)
> >  {
> > -  if (masked_p)
> > -    return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
> > - vec_mask_store_lanes_optab,
> > - vectype, count);
> > +  if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
> > +     vec_mask_len_store_lanes_optab,
> > +     vectype, count))
> > +    return IFN_MASK_LEN_STORE_LANES;
> > +  else if (masked_p)
> > +    {
> > +      if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
> > + vec_mask_store_lanes_optab,
> > + vectype, count))
> > + return IFN_MASK_STORE_LANES;
> > +    }
> >    else
> > -    return vect_lanes_optab_supported_p ("vec_store_lanes",
> > - vec_store_lanes_optab,
> > - vectype, count);
> > +    {
> > +      if (vect_lanes_optab_supported_p ("vec_store_lanes",
> > + vec_store_lanes_optab,
> > + vectype, count))
> > + return IFN_STORE_LANES;
> > +    }
> > +  return IFN_LAST;
> >  }
> >  
> > -
> >  /* Function vect_permute_store_chain.
> >  
> >     Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
> > @@ -6056,21 +6066,32 @@ vect_grouped_load_supported (tree vectype, bool single_element_p,
> >    return false;
> >  }
> >  
> > -/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
> > -   type VECTYPE.  MASKED_P says whether the masked form is needed.  */
> > +/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
> > +   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */
> >  
> > -bool
> > +internal_fn
> >  vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
> >     bool masked_p)
> >  {
> > -  if (masked_p)
> > -    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> > - vec_mask_load_lanes_optab,
> > - vectype, count);
> > +  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
> > +     vec_mask_len_load_lanes_optab,
> > +     vectype, count))
> > +    return IFN_MASK_LEN_LOAD_LANES;
> > +  else if (masked_p)
> > +    {
> > +      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> > + vec_mask_load_lanes_optab,
> > + vectype, count))
> > + return IFN_MASK_LOAD_LANES;
> > +    }
> >    else
> > -    return vect_lanes_optab_supported_p ("vec_load_lanes",
> > - vec_load_lanes_optab,
> > - vectype, count);
> > +    {
> > +      if (vect_lanes_optab_supported_p ("vec_load_lanes",
> > + vec_load_lanes_optab,
> > + vectype, count))
> > + return IFN_LOAD_LANES;
> > +    }
> > +  return IFN_LAST;
> >  }
> >  
> >  /* Function vect_permute_load_chain.
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 07f3717ed9d..2a0cfa3e2e8 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -2839,7 +2839,8 @@ start_over:
> >       instructions record it and move on to the next instance.  */
> >    if (loads_permuted
> >        && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
> > -       && vect_store_lanes_supported (vectype, group_size, false))
> > +       && vect_store_lanes_supported (vectype, group_size, false)
> > +    != IFN_LAST)
> >      {
> >        FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
> >  {
> > @@ -2848,9 +2849,9 @@ start_over:
> >    /* Use SLP for strided accesses (or if we can't
> >       load-lanes).  */
> >    if (STMT_VINFO_STRIDED_P (stmt_vinfo)
> > -       || ! vect_load_lanes_supported
> > +       || vect_load_lanes_supported
> >      (STMT_VINFO_VECTYPE (stmt_vinfo),
> > -      DR_GROUP_SIZE (stmt_vinfo), false))
> > +      DR_GROUP_SIZE (stmt_vinfo), false) != IFN_LAST)
> >      break;
> >  }
> >  
> > @@ -3153,7 +3154,7 @@ again:
> >        vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
> >        unsigned int size = DR_GROUP_SIZE (vinfo);
> >        tree vectype = STMT_VINFO_VECTYPE (vinfo);
> > -      if (! vect_store_lanes_supported (vectype, size, false)
> > +      if (vect_store_lanes_supported (vectype, size, false) != IFN_LAST
> >  && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
> >  && ! vect_grouped_store_supported (vectype, size))
> >  return opt_result::failure_at (vinfo->stmt,
> > @@ -3165,7 +3166,7 @@ again:
> >    bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
> >    size = DR_GROUP_SIZE (vinfo);
> >    vectype = STMT_VINFO_VECTYPE (vinfo);
> > -   if (! vect_load_lanes_supported (vectype, size, false)
> > +   if (vect_load_lanes_supported (vectype, size, false) != IFN_LAST
> >        && ! vect_grouped_load_supported (vectype, single_element_p,
> >  size))
> >      return opt_result::failure_at (vinfo->stmt,
> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > index 41997d5a546..3adb06dfa18 100644
> > --- a/gcc/tree-vect-slp.cc
> > +++ b/gcc/tree-vect-slp.cc
> > @@ -3094,7 +3094,7 @@ vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
> >    if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
> >        || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> >      return false;
> > -  return vect_store_lanes_supported (vectype, group_size, false);
> > +  return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
> >  }
> >  
> >  /* Analyze an SLP instance starting from a group of grouped stores.  Call
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index 89607a98f99..0f21315995e 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -1610,9 +1610,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> >    bool is_load = (vls_type == VLS_LOAD);
> >    if (memory_access_type == VMAT_LOAD_STORE_LANES)
> >      {
> > -      if (is_load
> > -   ? !vect_load_lanes_supported (vectype, group_size, true)
> > -   : !vect_store_lanes_supported (vectype, group_size, true))
> > +      internal_fn ifn
> > + = (is_load ? vect_load_lanes_supported (vectype, group_size, true)
> > +    : vect_store_lanes_supported (vectype, group_size, true));
> > +      if (ifn == IFN_MASK_LEN_LOAD_LANES || ifn == IFN_MASK_LEN_STORE_LANES)
> > + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> > +      else if (ifn == IFN_MASK_LOAD_LANES || ifn == IFN_MASK_STORE_LANES)
> > + vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> > +        scalar_mask);
> > +      else
> >  {
> >    if (dump_enabled_p ())
> >      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -1620,10 +1626,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> >       " the target doesn't have an appropriate"
> >       " load/store-lanes instruction.\n");
> >    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> > -   return;
> >  }
> > -      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> > -      scalar_mask);
> >        return;
> >      }
> >  
> > @@ -2274,9 +2277,11 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> >  
> >    /* Otherwise try using LOAD/STORE_LANES.  */
> >    else if (vls_type == VLS_LOAD
> > -    ? vect_load_lanes_supported (vectype, group_size, masked_p)
> > -    : vect_store_lanes_supported (vectype, group_size,
> > - masked_p))
> > +      ? vect_load_lanes_supported (vectype, group_size, masked_p)
> > + != IFN_LAST
> > +      : vect_store_lanes_supported (vectype, group_size,
> > +    masked_p)
> > + != IFN_LAST)
> >      {
> >        *memory_access_type = VMAT_LOAD_STORE_LANES;
> >        overrun_p = would_overrun_p;
> > @@ -3090,8 +3095,7 @@ vect_get_loop_variant_data_ptr_increment (
> >    /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
> >       IVs are updated by variable amount but we will support them in the future.
> >     */
> > -  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> > -       && memory_access_type != VMAT_LOAD_STORE_LANES);
> > +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
> >  
> >    /* When we support SELECT_VL pattern, we dynamic adjust
> >       the memory address by .SELECT_VL result.
> > @@ -8885,6 +8889,8 @@ vectorizable_store (vec_info *vinfo,
> >      }
> >  
> >    tree final_mask = NULL;
> > +   tree final_len = NULL;
> > +   tree bias = NULL;
> >    if (loop_masks)
> >      final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >       ncopies, vectype, j);
> > @@ -8892,8 +8898,38 @@ vectorizable_store (vec_info *vinfo,
> >      final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >     final_mask, vec_mask, gsi);
> >  
> > +   if (vect_store_lanes_supported (vectype, group_size, false)
> > +       == IFN_MASK_LEN_STORE_LANES)
>  
> can you use the previously computed 'ifn' here please?
>  
> Otherwise LGTM.
>  
> Thanks,
> Richard.
>  
> > +     {
> > +       if (loop_lens)
> > + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > +        ncopies, vectype, j, 1);
> > +       else
> > + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> > +       signed char biasval
> > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > +       bias = build_int_cst (intQI_type_node, biasval);
> > +       if (!final_mask)
> > + {
> > +   mask_vectype = truth_type_for (vectype);
> > +   final_mask = build_minus_one_cst (mask_vectype);
> > + }
> > +     }
> > +
> >    gcall *call;
> > -   if (final_mask)
> > +   if (final_len && final_mask)
> > +     {
> > +       /* Emit:
> > +    MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> > + LEN, BIAS, VEC_ARRAY).  */
> > +       unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> > +       tree alias_ptr = build_int_cst (ref_type, align);
> > +       call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
> > + dataref_ptr, alias_ptr,
> > + final_mask, final_len, bias,
> > + vec_array);
> > +     }
> > +   else if (final_mask)
> >      {
> >        /* Emit:
> >     MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> > @@ -10445,6 +10481,8 @@ vectorizable_load (vec_info *vinfo,
> >    vec_array = create_vector_array (vectype, vec_num);
> >  
> >    tree final_mask = NULL_TREE;
> > +   tree final_len = NULL_TREE;
> > +   tree bias = NULL_TREE;
> >    if (loop_masks)
> >      final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >       ncopies, vectype, j);
> > @@ -10452,8 +10490,37 @@ vectorizable_load (vec_info *vinfo,
> >      final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >     final_mask, vec_mask, gsi);
> >  
> > +   if (vect_load_lanes_supported (vectype, group_size, false)
> > +       == IFN_MASK_LEN_LOAD_LANES)
> > +     {
> > +       if (loop_lens)
> > + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > +        ncopies, vectype, j, 1);
> > +       else
> > + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> > +       signed char biasval
> > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > +       bias = build_int_cst (intQI_type_node, biasval);
> > +       if (!final_mask)
> > + {
> > +   mask_vectype = truth_type_for (vectype);
> > +   final_mask = build_minus_one_cst (mask_vectype);
> > + }
> > +     }
> > +
> >    gcall *call;
> > -   if (final_mask)
> > +   if (final_len && final_mask)
> > +     {
> > +       /* Emit:
> > +    VEC_ARRAY = MASK_LEN_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> > +     VEC_MASK, LEN, BIAS).  */
> > +       unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> > +       tree alias_ptr = build_int_cst (ref_type, align);
> > +       call = gimple_build_call_internal (IFN_MASK_LEN_LOAD_LANES, 5,
> > + dataref_ptr, alias_ptr,
> > + final_mask, final_len, bias);
> > +     }
> > +   else if (final_mask)
> >      {
> >        /* Emit:
> >     VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index 5987a327332..6a2e55aa1fc 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -2296,9 +2296,9 @@ extern tree bump_vector_ptr (vec_info *, tree, gimple *, gimple_stmt_iterator *,
> >  extern void vect_copy_ref_info (tree, tree);
> >  extern tree vect_create_destination_var (tree, tree);
> >  extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT);
> > -extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> > +extern internal_fn vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> >  extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
> > -extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> > +extern internal_fn vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> >  extern void vect_permute_store_chain (vec_info *, vec<tree> &,
> >        unsigned int, stmt_vec_info,
> >        gimple_stmt_iterator *, vec<tree> *);
> > 
>  
>
Richard Biener Aug. 15, 2023, 11:52 a.m. UTC | #5
On Tue, 15 Aug 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.
> 
> I realize this code performs the load/store analysis:
> 
> +  internal_fn lanes_ifn;
>    if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, vls_type,
>                             ncopies, &memory_access_type, &poffset,
> -                           &alignment_support_scheme, &misalignment, &gs_info))
> +                           &alignment_support_scheme, &misalignment, &gs_info,
> +                           &lanes_ifn))
> 
> This function already computes the gather/scatter info "gs_info".
> 
> Using the same approach, I add "&lanes_ifn" here, which records the IFN for the lanes load/store.
> 
> Does that sound reasonable?

Ah, OK.  I guess re-computing it is OK then (once).

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-08-15 19:19
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford
> Subject: Re: Re: [PATCH] VECT: Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer
> On Tue, 15 Aug 2023, juzhe.zhong@rivai.ai wrote:
>  
> > Hi, Richi.
> > 
> > > +   if (vect_store_lanes_supported (vectype, group_size, false)
> > > +       == IFN_MASK_LEN_STORE_LANES)
> > 
> > >> can you use the previously computed 'ifn' here please?
> > 
> > Do you mean rewriting the code as follows?
> > 
> > internal_fn lanes_ifn = vect_store_lanes_supported (vectype, group_size, false);
> > 
> > if (lanes_ifn == IFN_MASK_LEN_STORE_LANES).
>  
> The vect_store_lanes_supported check is already performed during analysis
> and ideally we'd not redo it, so please save the result in a variable at
> that point.
> > >> I think the patch needs refreshing after r14-3214-ga74d0d36a3f337.
> > 
> > Yeah, working on it and I will test on both X86 and ARM.
> > 
> > Thanks.
> > 
> > 
> > juzhe.zhong@rivai.ai
> >  
> > From: Richard Biener
> > Date: 2023-08-15 17:40
> > To: Ju-Zhe Zhong
> > CC: gcc-patches; richard.sandiford
> > Subject: Re: [PATCH] VECT: Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer
> > On Mon, 14 Aug 2023, juzhe.zhong@rivai.ai wrote:
> >  
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > > 
> > > Hi, Richard and Richi.
> > > 
> > > This patch is adding MASK_LEN_{LOAD_LANES,STORE_LANES} support into vectorizer.
> > > 
> > > Consider this simple case:
> > > 
> > > void __attribute__ ((noinline, noclone))
> > > foo (int *__restrict a, int *__restrict b, int *__restrict c,
> > >   int *__restrict d, int *__restrict e, int *__restrict f,
> > >   int *__restrict g, int *__restrict h, int *__restrict j, int n)
> > > {
> > >   for (int i = 0; i < n; ++i)
> > >     {
> > >       a[i] = j[i * 8];
> > >       b[i] = j[i * 8 + 1];
> > >       c[i] = j[i * 8 + 2];
> > >       d[i] = j[i * 8 + 3];
> > >       e[i] = j[i * 8 + 4];
> > >       f[i] = j[i * 8 + 5];
> > >       g[i] = j[i * 8 + 6];
> > >       h[i] = j[i * 8 + 7];
> > >     }
> > > }
> > > 
> > > RVV Gimple IR:
> > > 
> > >   _79 = .SELECT_VL (ivtmp_81, POLY_INT_CST [4, 4]);
> > >   ivtmp_125 = _79 * 32;
> > >   vect_array.8 = .MASK_LEN_LOAD_LANES (vectp_j.6_124, 32B, { -1, ... }, _79, 0);
> > >   vect__8.9_122 = vect_array.8[0];
> > >   vect__8.10_121 = vect_array.8[1];
> > >   vect__8.11_120 = vect_array.8[2];
> > >   vect__8.12_119 = vect_array.8[3];
> > >   vect__8.13_118 = vect_array.8[4];
> > >   vect__8.14_117 = vect_array.8[5];
> > >   vect__8.15_116 = vect_array.8[6];
> > >   vect__8.16_115 = vect_array.8[7];
> > >   vect_array.8 ={v} {CLOBBER};
> > >   ivtmp_114 = _79 * 4;
> > >   .MASK_LEN_STORE (vectp_a.17_113, 32B, { -1, ... }, _79, 0, vect__8.9_122);
> > >   .MASK_LEN_STORE (vectp_b.19_109, 32B, { -1, ... }, _79, 0, vect__8.10_121);
> > >   .MASK_LEN_STORE (vectp_c.21_105, 32B, { -1, ... }, _79, 0, vect__8.11_120);
> > >   .MASK_LEN_STORE (vectp_d.23_101, 32B, { -1, ... }, _79, 0, vect__8.12_119);
> > >   .MASK_LEN_STORE (vectp_e.25_97, 32B, { -1, ... }, _79, 0, vect__8.13_118);
> > >   .MASK_LEN_STORE (vectp_f.27_93, 32B, { -1, ... }, _79, 0, vect__8.14_117);
> > >   .MASK_LEN_STORE (vectp_g.29_89, 32B, { -1, ... }, _79, 0, vect__8.15_116);
> > >   .MASK_LEN_STORE (vectp_h.31_85, 32B, { -1, ... }, _79, 0, vect__8.16_115);
> > > 
> > > ASM:
> > > 
> > > foo:
> > > lw t4,8(sp)
> > > ld t5,0(sp)
> > > ble t4,zero,.L5
> > > .L3:
> > > vsetvli t1,t4,e8,mf4,ta,ma
> > > vlseg8e32.v v8,(t5)
> > > slli t3,t1,2
> > > slli t6,t1,5
> > > vse32.v v8,0(a0)
> > > vse32.v v9,0(a1)
> > > vse32.v v10,0(a2)
> > > vse32.v v11,0(a3)
> > > vse32.v v12,0(a4)
> > > vse32.v v13,0(a5)
> > > vse32.v v14,0(a6)
> > > vse32.v v15,0(a7)
> > > sub t4,t4,t1
> > > add t5,t5,t6
> > > add a0,a0,t3
> > > add a1,a1,t3
> > > add a2,a2,t3
> > > add a3,a3,t3
> > > add a4,a4,t3
> > > add a5,a5,t3
> > > add a6,a6,t3
> > > add a7,a7,t3
> > > bne t4,zero,.L3
> > > .L5:
> > > ret
> > > 
> > > The details of the approach:
> > > 
> > > Step 1 - Modify the LANES LOAD/STORE support functions (vect_load_lanes_supported/vect_store_lanes_supported):
> > > 
> > > +/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
> > > +   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */
> > >  
> > > -bool
> > > +internal_fn
> > >  vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
> > >     bool masked_p)
> > >  {
> > > -  if (masked_p)
> > > -    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> > > - vec_mask_load_lanes_optab,
> > > - vectype, count);
> > > +  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
> > > +     vec_mask_len_load_lanes_optab,
> > > +     vectype, count))
> > > +    return IFN_MASK_LEN_LOAD_LANES;
> > > +  else if (masked_p)
> > > +    {
> > > +      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> > > + vec_mask_load_lanes_optab,
> > > + vectype, count))
> > > + return IFN_MASK_LOAD_LANES;
> > > +    }
> > >    else
> > > -    return vect_lanes_optab_supported_p ("vec_load_lanes",
> > > - vec_load_lanes_optab,
> > > - vectype, count);
> > > +    {
> > > +      if (vect_lanes_optab_supported_p ("vec_load_lanes",
> > > + vec_load_lanes_optab,
> > > + vectype, count))
> > > + return IFN_LOAD_LANES;
> > > +    }
> > > +  return IFN_LAST;
> > >  }
> > >  
> > > Instead of returning TRUE or FALSE to indicate whether the target supports LANES LOAD/STORE,
> > > I changed these functions to return the internal_fn of the LANES LOAD/STORE variant that the target supports.
> > > If the target doesn't support any LANES LOAD/STORE optab, they return IFN_LAST.
> > > 
> > > Step 2 - Build MASK_LEN_{LOAD_LANES,STORE_LANES} Gimple IR:
> > > 
> > > +   if (vect_store_lanes_supported (vectype, group_size, false)
> > > +       == IFN_MASK_LEN_STORE_LANES)
> > > +     {
> > > +       if (loop_lens)
> > > + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > > +        ncopies, vectype, j, 1);
> > > +       else
> > > + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> > > +       signed char biasval
> > > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > > +       bias = build_int_cst (intQI_type_node, biasval);
> > > +       if (!final_mask)
> > > + {
> > > +   mask_vectype = truth_type_for (vectype);
> > > +   final_mask = build_minus_one_cst (mask_vectype);
> > > + }
> > > +     }
> > > +
> > >    gcall *call;
> > > -   if (final_mask)
> > > +   if (final_len && final_mask)
> > > +     {
> > > +       /* Emit:
> > > +    MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> > > + LEN, BIAS, VEC_ARRAY).  */
> > > +       unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> > > +       tree alias_ptr = build_int_cst (ref_type, align);
> > > +       call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
> > > + dataref_ptr, alias_ptr,
> > > + final_mask, final_len, bias,
> > > + vec_array);
> > > +     }
> > > +   else if (final_mask)
> > > 
> > > The LEN and MASK flow is exactly the same as for the other MASK_LEN_* loads/stores.
> > > 
> > > Bootstrap and regression testing on X86 passed with this patch.
> > > 
> > > Fully tested on RISC-V.
> > > 
> > > Ok for trunk ?
> >  
> > I think the patch needs refreshing after r14-3214-ga74d0d36a3f337.
> > > gcc/ChangeLog:
> > > 
> > >         * internal-fn.cc (internal_load_fn_p): Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer.
> > >         (internal_store_fn_p): Ditto.
> > >         (internal_fn_len_index): Ditto.
> > >         (internal_fn_mask_index): Ditto.
> > >         (internal_fn_stored_value_index): Ditto.
> > >         * tree-vect-data-refs.cc (vect_store_lanes_supported): Ditto.
> > >         (vect_load_lanes_supported): Ditto.
> > >         * tree-vect-loop.cc: Ditto.
> > >         * tree-vect-slp.cc (vect_slp_prefer_store_lanes_p): Ditto.
> > >         * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
> > >         (get_group_load_store_type): Ditto.
> > >         (vectorizable_store): Ditto.
> > >         (vectorizable_load): Ditto.
> > >         * tree-vectorizer.h (vect_store_lanes_supported): Ditto.
> > >         (vect_load_lanes_supported): Ditto.
> > > 
> > > ---
> > >  gcc/internal-fn.cc         |  7 +++
> > >  gcc/tree-vect-data-refs.cc | 61 +++++++++++++++++--------
> > >  gcc/tree-vect-loop.cc      | 11 +++--
> > >  gcc/tree-vect-slp.cc       |  2 +-
> > >  gcc/tree-vect-stmts.cc     | 93 ++++++++++++++++++++++++++++++++------
> > >  gcc/tree-vectorizer.h      |  4 +-
> > >  6 files changed, 137 insertions(+), 41 deletions(-)
> > > 
> > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > index 4f2b20a79e5..cc1ede58799 100644
> > > --- a/gcc/internal-fn.cc
> > > +++ b/gcc/internal-fn.cc
> > > @@ -4578,6 +4578,7 @@ internal_load_fn_p (internal_fn fn)
> > >      case IFN_MASK_LOAD:
> > >      case IFN_LOAD_LANES:
> > >      case IFN_MASK_LOAD_LANES:
> > > +    case IFN_MASK_LEN_LOAD_LANES:
> > >      case IFN_GATHER_LOAD:
> > >      case IFN_MASK_GATHER_LOAD:
> > >      case IFN_MASK_LEN_GATHER_LOAD:
> > > @@ -4600,6 +4601,7 @@ internal_store_fn_p (internal_fn fn)
> > >      case IFN_MASK_STORE:
> > >      case IFN_STORE_LANES:
> > >      case IFN_MASK_STORE_LANES:
> > > +    case IFN_MASK_LEN_STORE_LANES:
> > >      case IFN_SCATTER_STORE:
> > >      case IFN_MASK_SCATTER_STORE:
> > >      case IFN_MASK_LEN_SCATTER_STORE:
> > > @@ -4672,6 +4674,8 @@ internal_fn_len_index (internal_fn fn)
> > >      case IFN_COND_LEN_NEG:
> > >      case IFN_MASK_LEN_LOAD:
> > >      case IFN_MASK_LEN_STORE:
> > > +    case IFN_MASK_LEN_LOAD_LANES:
> > > +    case IFN_MASK_LEN_STORE_LANES:
> > >        return 3;
> > >  
> > >      default:
> > > @@ -4689,8 +4693,10 @@ internal_fn_mask_index (internal_fn fn)
> > >      {
> > >      case IFN_MASK_LOAD:
> > >      case IFN_MASK_LOAD_LANES:
> > > +    case IFN_MASK_LEN_LOAD_LANES:
> > >      case IFN_MASK_STORE:
> > >      case IFN_MASK_STORE_LANES:
> > > +    case IFN_MASK_LEN_STORE_LANES:
> > >      case IFN_MASK_LEN_LOAD:
> > >      case IFN_MASK_LEN_STORE:
> > >        return 2;
> > > @@ -4726,6 +4732,7 @@ internal_fn_stored_value_index (internal_fn fn)
> > >        return 4;
> > >  
> > >      case IFN_MASK_LEN_STORE:
> > > +    case IFN_MASK_LEN_STORE_LANES:
> > >        return 5;
> > >  
> > >      default:
> > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > > index a3570c45b52..232b91e8ed3 100644
> > > --- a/gcc/tree-vect-data-refs.cc
> > > +++ b/gcc/tree-vect-data-refs.cc
> > > @@ -5439,24 +5439,34 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
> > >  }
> > >  
> > >  
> > > -/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
> > > +/* Return FN if vec_{mask_}store_lanes is available for COUNT vectors of
> > >     type VECTYPE.  MASKED_P says whether the masked form is needed.  */
> > >  
> > > -bool
> > > +internal_fn
> > >  vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
> > >      bool masked_p)
> > >  {
> > > -  if (masked_p)
> > > -    return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
> > > - vec_mask_store_lanes_optab,
> > > - vectype, count);
> > > +  if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
> > > +     vec_mask_len_store_lanes_optab,
> > > +     vectype, count))
> > > +    return IFN_MASK_LEN_STORE_LANES;
> > > +  else if (masked_p)
> > > +    {
> > > +      if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
> > > + vec_mask_store_lanes_optab,
> > > + vectype, count))
> > > + return IFN_MASK_STORE_LANES;
> > > +    }
> > >    else
> > > -    return vect_lanes_optab_supported_p ("vec_store_lanes",
> > > - vec_store_lanes_optab,
> > > - vectype, count);
> > > +    {
> > > +      if (vect_lanes_optab_supported_p ("vec_store_lanes",
> > > + vec_store_lanes_optab,
> > > + vectype, count))
> > > + return IFN_STORE_LANES;
> > > +    }
> > > +  return IFN_LAST;
> > >  }
> > >  
> > > -
> > >  /* Function vect_permute_store_chain.
> > >  
> > >     Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
> > > @@ -6056,21 +6066,32 @@ vect_grouped_load_supported (tree vectype, bool single_element_p,
> > >    return false;
> > >  }
> > >  
> > > -/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
> > > -   type VECTYPE.  MASKED_P says whether the masked form is needed.  */
> > > +/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
> > > +   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */
> > >  
> > > -bool
> > > +internal_fn
> > >  vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
> > >     bool masked_p)
> > >  {
> > > -  if (masked_p)
> > > -    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> > > - vec_mask_load_lanes_optab,
> > > - vectype, count);
> > > +  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
> > > +     vec_mask_len_load_lanes_optab,
> > > +     vectype, count))
> > > +    return IFN_MASK_LEN_LOAD_LANES;
> > > +  else if (masked_p)
> > > +    {
> > > +      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> > > + vec_mask_load_lanes_optab,
> > > + vectype, count))
> > > + return IFN_MASK_LOAD_LANES;
> > > +    }
> > >    else
> > > -    return vect_lanes_optab_supported_p ("vec_load_lanes",
> > > - vec_load_lanes_optab,
> > > - vectype, count);
> > > +    {
> > > +      if (vect_lanes_optab_supported_p ("vec_load_lanes",
> > > + vec_load_lanes_optab,
> > > + vectype, count))
> > > + return IFN_LOAD_LANES;
> > > +    }
> > > +  return IFN_LAST;
> > >  }
> > >  
> > >  /* Function vect_permute_load_chain.
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > index 07f3717ed9d..2a0cfa3e2e8 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -2839,7 +2839,8 @@ start_over:
> > >       instructions record it and move on to the next instance.  */
> > >    if (loads_permuted
> > >        && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
> > > -       && vect_store_lanes_supported (vectype, group_size, false))
> > > +       && vect_store_lanes_supported (vectype, group_size, false)
> > > +    != IFN_LAST)
> > >      {
> > >        FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
> > >  {
> > > @@ -2848,9 +2849,9 @@ start_over:
> > >    /* Use SLP for strided accesses (or if we can't
> > >       load-lanes).  */
> > >    if (STMT_VINFO_STRIDED_P (stmt_vinfo)
> > > -       || ! vect_load_lanes_supported
> > > +       || vect_load_lanes_supported
> > >      (STMT_VINFO_VECTYPE (stmt_vinfo),
> > > -      DR_GROUP_SIZE (stmt_vinfo), false))
> > > +      DR_GROUP_SIZE (stmt_vinfo), false) != IFN_LAST)
> > >      break;
> > >  }
> > >  
> > > @@ -3153,7 +3154,7 @@ again:
> > >        vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
> > >        unsigned int size = DR_GROUP_SIZE (vinfo);
> > >        tree vectype = STMT_VINFO_VECTYPE (vinfo);
> > > -      if (! vect_store_lanes_supported (vectype, size, false)
> > > +      if (vect_store_lanes_supported (vectype, size, false) != IFN_LAST
> > >  && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
> > >  && ! vect_grouped_store_supported (vectype, size))
> > >  return opt_result::failure_at (vinfo->stmt,
> > > @@ -3165,7 +3166,7 @@ again:
> > >    bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
> > >    size = DR_GROUP_SIZE (vinfo);
> > >    vectype = STMT_VINFO_VECTYPE (vinfo);
> > > -   if (! vect_load_lanes_supported (vectype, size, false)
> > > +   if (vect_load_lanes_supported (vectype, size, false) != IFN_LAST
> > >        && ! vect_grouped_load_supported (vectype, single_element_p,
> > >  size))
> > >      return opt_result::failure_at (vinfo->stmt,
> > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > > index 41997d5a546..3adb06dfa18 100644
> > > --- a/gcc/tree-vect-slp.cc
> > > +++ b/gcc/tree-vect-slp.cc
> > > @@ -3094,7 +3094,7 @@ vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
> > >    if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
> > >        || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> > >      return false;
> > > -  return vect_store_lanes_supported (vectype, group_size, false);
> > > +  return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
> > >  }
> > >  
> > >  /* Analyze an SLP instance starting from a group of grouped stores.  Call
> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > index 89607a98f99..0f21315995e 100644
> > > --- a/gcc/tree-vect-stmts.cc
> > > +++ b/gcc/tree-vect-stmts.cc
> > > @@ -1610,9 +1610,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> > >    bool is_load = (vls_type == VLS_LOAD);
> > >    if (memory_access_type == VMAT_LOAD_STORE_LANES)
> > >      {
> > > -      if (is_load
> > > -   ? !vect_load_lanes_supported (vectype, group_size, true)
> > > -   : !vect_store_lanes_supported (vectype, group_size, true))
> > > +      internal_fn ifn
> > > + = (is_load ? vect_load_lanes_supported (vectype, group_size, true)
> > > +    : vect_store_lanes_supported (vectype, group_size, true));
> > > +      if (ifn == IFN_MASK_LEN_LOAD_LANES || ifn == IFN_MASK_LEN_STORE_LANES)
> > > + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> > > +      else if (ifn == IFN_MASK_LOAD_LANES || ifn == IFN_MASK_STORE_LANES)
> > > + vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> > > +        scalar_mask);
> > > +      else
> > >  {
> > >    if (dump_enabled_p ())
> > >      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > @@ -1620,10 +1626,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> > >       " the target doesn't have an appropriate"
> > >       " load/store-lanes instruction.\n");
> > >    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> > > -   return;
> > >  }
> > > -      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> > > -      scalar_mask);
> > >        return;
> > >      }
> > >  
> > > @@ -2274,9 +2277,11 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> > >  
> > >    /* Otherwise try using LOAD/STORE_LANES.  */
> > >    else if (vls_type == VLS_LOAD
> > > -    ? vect_load_lanes_supported (vectype, group_size, masked_p)
> > > -    : vect_store_lanes_supported (vectype, group_size,
> > > - masked_p))
> > > +      ? vect_load_lanes_supported (vectype, group_size, masked_p)
> > > + != IFN_LAST
> > > +      : vect_store_lanes_supported (vectype, group_size,
> > > +    masked_p)
> > > + != IFN_LAST)
> > >      {
> > >        *memory_access_type = VMAT_LOAD_STORE_LANES;
> > >        overrun_p = would_overrun_p;
> > > @@ -3090,8 +3095,7 @@ vect_get_loop_variant_data_ptr_increment (
> > >    /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
> > >       IVs are updated by variable amount but we will support them in the future.
> > >     */
> > > -  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> > > -       && memory_access_type != VMAT_LOAD_STORE_LANES);
> > > +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
> > >  
> > >    /* When we support SELECT_VL pattern, we dynamic adjust
> > >       the memory address by .SELECT_VL result.
> > > @@ -8885,6 +8889,8 @@ vectorizable_store (vec_info *vinfo,
> > >      }
> > >  
> > >    tree final_mask = NULL;
> > > +   tree final_len = NULL;
> > > +   tree bias = NULL;
> > >    if (loop_masks)
> > >      final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> > >       ncopies, vectype, j);
> > > @@ -8892,8 +8898,38 @@ vectorizable_store (vec_info *vinfo,
> > >      final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> > >     final_mask, vec_mask, gsi);
> > >  
> > > +   if (vect_store_lanes_supported (vectype, group_size, false)
> > > +       == IFN_MASK_LEN_STORE_LANES)
> >  
> > can you use the previously computed 'ifn' here please?
> >  
> > Otherwise LGTM.
> >  
> > Thanks,
> > Richard.
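
(For reference, a rough sketch of what reusing the already-computed ifn
could look like in vectorizable_store -- the variable name "lanes_ifn"
and its exact placement are illustrative only, not part of this patch:)

  /* Query the lanes optab once, when the VMAT_LOAD_STORE_LANES access
     type is chosen, and keep the result around ...  */
  internal_fn lanes_ifn
    = vect_store_lanes_supported (vectype, group_size, false);

  /* ... so the per-copy code below only tests the cached value instead
     of re-querying the optab for every copy.  */
  if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
    {
      /* Set up final_len and bias as in the hunk above.  */
    }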
> >  
> > > +     {
> > > +       if (loop_lens)
> > > + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > > +        ncopies, vectype, j, 1);
> > > +       else
> > > + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> > > +       signed char biasval
> > > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > > +       bias = build_int_cst (intQI_type_node, biasval);
> > > +       if (!final_mask)
> > > + {
> > > +   mask_vectype = truth_type_for (vectype);
> > > +   final_mask = build_minus_one_cst (mask_vectype);
> > > + }
> > > +     }
> > > +
> > >    gcall *call;
> > > -   if (final_mask)
> > > +   if (final_len && final_mask)
> > > +     {
> > > +       /* Emit:
> > > +    MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> > > + LEN, BIAS, VEC_ARRAY).  */
> > > +       unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> > > +       tree alias_ptr = build_int_cst (ref_type, align);
> > > +       call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
> > > + dataref_ptr, alias_ptr,
> > > + final_mask, final_len, bias,
> > > + vec_array);
> > > +     }
> > > +   else if (final_mask)
> > >      {
> > >        /* Emit:
> > >     MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> > > @@ -10445,6 +10481,8 @@ vectorizable_load (vec_info *vinfo,
> > >    vec_array = create_vector_array (vectype, vec_num);
> > >  
> > >    tree final_mask = NULL_TREE;
> > > +   tree final_len = NULL_TREE;
> > > +   tree bias = NULL_TREE;
> > >    if (loop_masks)
> > >      final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> > >       ncopies, vectype, j);
> > > @@ -10452,8 +10490,37 @@ vectorizable_load (vec_info *vinfo,
> > >      final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> > >     final_mask, vec_mask, gsi);
> > >  
> > > +   if (vect_load_lanes_supported (vectype, group_size, false)
> > > +       == IFN_MASK_LEN_LOAD_LANES)
> > > +     {
> > > +       if (loop_lens)
> > > + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > > +        ncopies, vectype, j, 1);
> > > +       else
> > > + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> > > +       signed char biasval
> > > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > > +       bias = build_int_cst (intQI_type_node, biasval);
> > > +       if (!final_mask)
> > > + {
> > > +   mask_vectype = truth_type_for (vectype);
> > > +   final_mask = build_minus_one_cst (mask_vectype);
> > > + }
> > > +     }
> > > +
> > >    gcall *call;
> > > -   if (final_mask)
> > > +   if (final_len && final_mask)
> > > +     {
> > > +       /* Emit:
> > > +    VEC_ARRAY = MASK_LEN_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> > > +     VEC_MASK, LEN, BIAS).  */
> > > +       unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> > > +       tree alias_ptr = build_int_cst (ref_type, align);
> > > +       call = gimple_build_call_internal (IFN_MASK_LEN_LOAD_LANES, 5,
> > > + dataref_ptr, alias_ptr,
> > > + final_mask, final_len, bias);
> > > +     }
> > > +   else if (final_mask)
> > >      {
> > >        /* Emit:
> > >     VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > index 5987a327332..6a2e55aa1fc 100644
> > > --- a/gcc/tree-vectorizer.h
> > > +++ b/gcc/tree-vectorizer.h
> > > @@ -2296,9 +2296,9 @@ extern tree bump_vector_ptr (vec_info *, tree, gimple *, gimple_stmt_iterator *,
> > >  extern void vect_copy_ref_info (tree, tree);
> > >  extern tree vect_create_destination_var (tree, tree);
> > >  extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT);
> > > -extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> > > +extern internal_fn vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> > >  extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
> > > -extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> > > +extern internal_fn vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> > >  extern void vect_permute_store_chain (vec_info *, vec<tree> &,
> > >        unsigned int, stmt_vec_info,
> > >        gimple_stmt_iterator *, vec<tree> *);
> > > 

Patch

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 4f2b20a79e5..cc1ede58799 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4578,6 +4578,7 @@  internal_load_fn_p (internal_fn fn)
     case IFN_MASK_LOAD:
     case IFN_LOAD_LANES:
     case IFN_MASK_LOAD_LANES:
+    case IFN_MASK_LEN_LOAD_LANES:
     case IFN_GATHER_LOAD:
     case IFN_MASK_GATHER_LOAD:
     case IFN_MASK_LEN_GATHER_LOAD:
@@ -4600,6 +4601,7 @@  internal_store_fn_p (internal_fn fn)
     case IFN_MASK_STORE:
     case IFN_STORE_LANES:
     case IFN_MASK_STORE_LANES:
+    case IFN_MASK_LEN_STORE_LANES:
     case IFN_SCATTER_STORE:
     case IFN_MASK_SCATTER_STORE:
     case IFN_MASK_LEN_SCATTER_STORE:
@@ -4672,6 +4674,8 @@  internal_fn_len_index (internal_fn fn)
     case IFN_COND_LEN_NEG:
     case IFN_MASK_LEN_LOAD:
     case IFN_MASK_LEN_STORE:
+    case IFN_MASK_LEN_LOAD_LANES:
+    case IFN_MASK_LEN_STORE_LANES:
       return 3;
 
     default:
@@ -4689,8 +4693,10 @@  internal_fn_mask_index (internal_fn fn)
     {
     case IFN_MASK_LOAD:
     case IFN_MASK_LOAD_LANES:
+    case IFN_MASK_LEN_LOAD_LANES:
     case IFN_MASK_STORE:
     case IFN_MASK_STORE_LANES:
+    case IFN_MASK_LEN_STORE_LANES:
     case IFN_MASK_LEN_LOAD:
     case IFN_MASK_LEN_STORE:
       return 2;
@@ -4726,6 +4732,7 @@  internal_fn_stored_value_index (internal_fn fn)
       return 4;
 
     case IFN_MASK_LEN_STORE:
+    case IFN_MASK_LEN_STORE_LANES:
       return 5;
 
     default:
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index a3570c45b52..232b91e8ed3 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -5439,24 +5439,34 @@  vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
 }
 
 
-/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
+/* Return FN if vec_{mask_,mask_len_,}store_lanes is available for COUNT vectors of
    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
 
-bool
+internal_fn
 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
 			    bool masked_p)
 {
-  if (masked_p)
-    return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
-					 vec_mask_store_lanes_optab,
-					 vectype, count);
+  if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
+				    vec_mask_len_store_lanes_optab,
+				    vectype, count))
+    return IFN_MASK_LEN_STORE_LANES;
+  else if (masked_p)
+    {
+      if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
+					vec_mask_store_lanes_optab,
+					vectype, count))
+	return IFN_MASK_STORE_LANES;
+    }
   else
-    return vect_lanes_optab_supported_p ("vec_store_lanes",
-					 vec_store_lanes_optab,
-					 vectype, count);
+    {
+      if (vect_lanes_optab_supported_p ("vec_store_lanes",
+					vec_store_lanes_optab,
+					vectype, count))
+	return IFN_STORE_LANES;
+    }
+  return IFN_LAST;
 }
 
-
 /* Function vect_permute_store_chain.
 
    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
@@ -6056,21 +6066,32 @@  vect_grouped_load_supported (tree vectype, bool single_element_p,
   return false;
 }
 
-/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
-   type VECTYPE.  MASKED_P says whether the masked form is needed.  */
+/* Return FN if vec_{mask_,mask_len_,}load_lanes is available for COUNT
+   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */
 
-bool
+internal_fn
 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
 			   bool masked_p)
 {
-  if (masked_p)
-    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
-					 vec_mask_load_lanes_optab,
-					 vectype, count);
+  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
+				    vec_mask_len_load_lanes_optab,
+				    vectype, count))
+    return IFN_MASK_LEN_LOAD_LANES;
+  else if (masked_p)
+    {
+      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
+					vec_mask_load_lanes_optab,
+					vectype, count))
+	return IFN_MASK_LOAD_LANES;
+    }
   else
-    return vect_lanes_optab_supported_p ("vec_load_lanes",
-					 vec_load_lanes_optab,
-					 vectype, count);
+    {
+      if (vect_lanes_optab_supported_p ("vec_load_lanes",
+					vec_load_lanes_optab,
+					vectype, count))
+	return IFN_LOAD_LANES;
+    }
+  return IFN_LAST;
 }
 
 /* Function vect_permute_load_chain.
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 07f3717ed9d..2a0cfa3e2e8 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2839,7 +2839,8 @@  start_over:
 	     instructions record it and move on to the next instance.  */
 	  if (loads_permuted
 	      && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
-	      && vect_store_lanes_supported (vectype, group_size, false))
+	      && vect_store_lanes_supported (vectype, group_size, false)
+		   != IFN_LAST)
 	    {
 	      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
 		{
@@ -2848,9 +2849,9 @@  start_over:
 		  /* Use SLP for strided accesses (or if we can't
 		     load-lanes).  */
 		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
-		      || ! vect_load_lanes_supported
+		      || vect_load_lanes_supported
 			    (STMT_VINFO_VECTYPE (stmt_vinfo),
-			     DR_GROUP_SIZE (stmt_vinfo), false))
+			     DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
 		    break;
 		}
 
@@ -3153,7 +3154,7 @@  again:
       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
       unsigned int size = DR_GROUP_SIZE (vinfo);
       tree vectype = STMT_VINFO_VECTYPE (vinfo);
-      if (! vect_store_lanes_supported (vectype, size, false)
+      if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
 	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
 	 && ! vect_grouped_store_supported (vectype, size))
 	return opt_result::failure_at (vinfo->stmt,
@@ -3165,7 +3166,7 @@  again:
 	  bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
 	  size = DR_GROUP_SIZE (vinfo);
 	  vectype = STMT_VINFO_VECTYPE (vinfo);
-	  if (! vect_load_lanes_supported (vectype, size, false)
+	  if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
 	      && ! vect_grouped_load_supported (vectype, single_element_p,
 						size))
 	    return opt_result::failure_at (vinfo->stmt,
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 41997d5a546..3adb06dfa18 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3094,7 +3094,7 @@  vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
     return false;
-  return vect_store_lanes_supported (vectype, group_size, false);
+  return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
 }
 
 /* Analyze an SLP instance starting from a group of grouped stores.  Call
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 89607a98f99..0f21315995e 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1610,9 +1610,15 @@  check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
   bool is_load = (vls_type == VLS_LOAD);
   if (memory_access_type == VMAT_LOAD_STORE_LANES)
     {
-      if (is_load
-	  ? !vect_load_lanes_supported (vectype, group_size, true)
-	  : !vect_store_lanes_supported (vectype, group_size, true))
+      internal_fn ifn
+	= (is_load ? vect_load_lanes_supported (vectype, group_size, true)
+		   : vect_store_lanes_supported (vectype, group_size, true));
+      if (ifn == IFN_MASK_LEN_LOAD_LANES || ifn == IFN_MASK_LEN_STORE_LANES)
+	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+      else if (ifn == IFN_MASK_LOAD_LANES || ifn == IFN_MASK_STORE_LANES)
+	vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
+			       scalar_mask);
+      else
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1620,10 +1626,7 @@  check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
 			     " the target doesn't have an appropriate"
 			     " load/store-lanes instruction.\n");
 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
-	  return;
 	}
-      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
-			     scalar_mask);
       return;
     }
 
@@ -2274,9 +2277,11 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 
 	  /* Otherwise try using LOAD/STORE_LANES.  */
 	  else if (vls_type == VLS_LOAD
-		   ? vect_load_lanes_supported (vectype, group_size, masked_p)
-		   : vect_store_lanes_supported (vectype, group_size,
-						 masked_p))
+		     ? vect_load_lanes_supported (vectype, group_size, masked_p)
+			 != IFN_LAST
+		     : vect_store_lanes_supported (vectype, group_size,
+						   masked_p)
+			 != IFN_LAST)
 	    {
 	      *memory_access_type = VMAT_LOAD_STORE_LANES;
 	      overrun_p = would_overrun_p;
@@ -3090,8 +3095,7 @@  vect_get_loop_variant_data_ptr_increment (
   /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
      IVs are updated by variable amount but we will support them in the future.
    */
-  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
-	      && memory_access_type != VMAT_LOAD_STORE_LANES);
+  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
 
   /* When we support SELECT_VL pattern, we dynamic adjust
      the memory address by .SELECT_VL result.
@@ -8885,6 +8889,8 @@  vectorizable_store (vec_info *vinfo,
 	    }
 
 	  tree final_mask = NULL;
+	  tree final_len = NULL;
+	  tree bias = NULL;
 	  if (loop_masks)
 	    final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
 					     ncopies, vectype, j);
@@ -8892,8 +8898,38 @@  vectorizable_store (vec_info *vinfo,
 	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
 					   final_mask, vec_mask, gsi);
 
+	  if (vect_store_lanes_supported (vectype, group_size, false)
+	      == IFN_MASK_LEN_STORE_LANES)
+	    {
+	      if (loop_lens)
+		final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+					       ncopies, vectype, j, 1);
+	      else
+		final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+	      signed char biasval
+		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	      bias = build_int_cst (intQI_type_node, biasval);
+	      if (!final_mask)
+		{
+		  mask_vectype = truth_type_for (vectype);
+		  final_mask = build_minus_one_cst (mask_vectype);
+		}
+	    }
+
 	  gcall *call;
-	  if (final_mask)
+	  if (final_len && final_mask)
+	    {
+	      /* Emit:
+		   MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
+					 LEN, BIAS, VEC_ARRAY).  */
+	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
+	      tree alias_ptr = build_int_cst (ref_type, align);
+	      call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
+						 dataref_ptr, alias_ptr,
+						 final_mask, final_len, bias,
+						 vec_array);
+	    }
+	  else if (final_mask)
 	    {
 	      /* Emit:
 		   MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
@@ -10445,6 +10481,8 @@  vectorizable_load (vec_info *vinfo,
 	  vec_array = create_vector_array (vectype, vec_num);
 
 	  tree final_mask = NULL_TREE;
+	  tree final_len = NULL_TREE;
+	  tree bias = NULL_TREE;
 	  if (loop_masks)
 	    final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
 					     ncopies, vectype, j);
@@ -10452,8 +10490,37 @@  vectorizable_load (vec_info *vinfo,
 	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
 					   final_mask, vec_mask, gsi);
 
+	  if (vect_load_lanes_supported (vectype, group_size, false)
+	      == IFN_MASK_LEN_LOAD_LANES)
+	    {
+	      if (loop_lens)
+		final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+					       ncopies, vectype, j, 1);
+	      else
+		final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+	      signed char biasval
+		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	      bias = build_int_cst (intQI_type_node, biasval);
+	      if (!final_mask)
+		{
+		  mask_vectype = truth_type_for (vectype);
+		  final_mask = build_minus_one_cst (mask_vectype);
+		}
+	    }
+
 	  gcall *call;
-	  if (final_mask)
+	  if (final_len && final_mask)
+	    {
+	      /* Emit:
+		   VEC_ARRAY = MASK_LEN_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
+						    VEC_MASK, LEN, BIAS).  */
+	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
+	      tree alias_ptr = build_int_cst (ref_type, align);
+	      call = gimple_build_call_internal (IFN_MASK_LEN_LOAD_LANES, 5,
+						 dataref_ptr, alias_ptr,
+						 final_mask, final_len, bias);
+	    }
+	  else if (final_mask)
 	    {
 	      /* Emit:
 		   VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 5987a327332..6a2e55aa1fc 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2296,9 +2296,9 @@  extern tree bump_vector_ptr (vec_info *, tree, gimple *, gimple_stmt_iterator *,
 extern void vect_copy_ref_info (tree, tree);
 extern tree vect_create_destination_var (tree, tree);
 extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT);
-extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
+extern internal_fn vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
 extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
-extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
+extern internal_fn vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
 extern void vect_permute_store_chain (vec_info *, vec<tree> &,
 				      unsigned int, stmt_vec_info,
 				      gimple_stmt_iterator *, vec<tree> *);