
[V2] VECT: Support LEN_MASK_ LOAD/STORE to support flow control for length loop control

Message ID 20230612041438.272885-1-juzhe.zhong@rivai.ai
State New

Commit Message

juzhe.zhong@rivai.ai June 12, 2023, 4:14 a.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Targets like ARM SVE in GCC have an elegant way to handle both loop
control and flow control simultaneously:

loop_control_mask = WHILE_ULT
flow_control_mask = comparison
control_mask = loop_control_mask & flow_control_mask;
MASK_LOAD (control_mask)
MASK_STORE (control_mask)

However, targets like RVV (RISC-V Vector) cannot use this approach in
auto-vectorization since RVV uses a length for loop control.

This patch adds LEN_MASK_ LOAD/STORE to support flow control for targets
like RISC-V that use a length for loop control.
Loads/stores are normalized into LEN_MASK_ LOAD/STORE as long as either
the length or the mask is valid.  The length is the outcome of SELECT_VL
or MIN_EXPR; the mask is the outcome of a comparison.

LEN_MASK_ LOAD/STORE format is defined as follows:
1). LEN_MASK_LOAD (ptr, align, length, mask).
2). LEN_MASK_STORE (ptr, align, length, mask, vec).

Consider the following 4 cases:

VLA: Variable-length auto-vectorization
VLS: Specific-length auto-vectorization

Case 1 (VLS): -mrvv-vector-bits=128   IR (Does not use LEN_MASK_*):
Code:                                   v1 = MEM (...)
  for (int i = 0; i < 4; i++)           v2 = MEM (...)
    a[i] = b[i] + c[i];                 v3 = v1 + v2 
                                        MEM[...] = v3

Case 2 (VLS): -mrvv-vector-bits=128   IR (LEN_MASK_* with length = VF, mask = comparison):
Code:                                   mask = comparison
  for (int i = 0; i < 4; i++)           v1 = LEN_MASK_LOAD (length = VF, mask)
    if (cond[i])                        v2 = LEN_MASK_LOAD (length = VF, mask) 
      a[i] = b[i] + c[i];               v3 = v1 + v2
                                        LEN_MASK_STORE (length = VF, mask, v3)
           
Case 3 (VLA):
Code:                                   loop_len = SELECT_VL or MIN
  for (int i = 0; i < n; i++)           v1 = LEN_MASK_LOAD (length = loop_len, mask = {-1,-1,...})
      a[i] = b[i] + c[i];               v2 = LEN_MASK_LOAD (length = loop_len, mask = {-1,-1,...})
                                        v3 = v1 + v2                            
                                        LEN_MASK_STORE (length = loop_len, mask = {-1,-1,...}, v3)

Case 4 (VLA):
Code:                                   loop_len = SELECT_VL or MIN
  for (int i = 0; i < n; i++)           mask = comparison
      if (cond[i])                      v1 = LEN_MASK_LOAD (length = loop_len, mask)
      a[i] = b[i] + c[i];               v2 = LEN_MASK_LOAD (length = loop_len, mask)
                                        v3 = v1 + v2                            
                                        LEN_MASK_STORE (length = loop_len, mask, v3)

More features:
1. Support gimple fold simplification for LEN_MASK_ LOAD/STORE:
   LEN_MASK_STORE (length = vf, mask = {-1,-1,...}, v) ===> MEM [...] = V
2. Allow DSE for LEN_MASK_* LOAD/STORE.
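   For example (illustrative pseudo GIMPLE), the first store below is
   dead because the second one overwrites every byte it could write:

     LEN_MASK_STORE (ptr, align, len, mask, v1)   <- removed by DSE
     MEM [ptr] = v2                               <- overwrites all bytes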

Bootstrapped && regression tested on X86 with no surprising differences.

gcc/ChangeLog:

        * doc/md.texi: Add LEN_MASK_ LOAD/STORE.
        * genopinit.cc (main): Ditto.
        (CMP_NAME): Ditto.
        * gimple-fold.cc (arith_overflowed_p): Ditto.
        (gimple_fold_partial_load_store_mem_ref): Ditto.
        (gimple_fold_partial_store): Ditto.
        (gimple_fold_call): Ditto.
        * internal-fn.cc (len_maskload_direct): Ditto.
        (len_maskstore_direct): Ditto.
        (expand_partial_load_optab_fn): Ditto.
        (expand_len_maskload_optab_fn): Ditto.
        (expand_partial_store_optab_fn): Ditto.
        (expand_len_maskstore_optab_fn): Ditto.
        (direct_len_maskload_optab_supported_p): Ditto.
        (direct_len_maskstore_optab_supported_p): Ditto.
        (internal_load_fn_p): Ditto.
        (internal_store_fn_p): Ditto.
        (internal_fn_mask_index): Ditto.
        (internal_fn_stored_value_index): Ditto.
        * internal-fn.def (LEN_MASK_LOAD): Ditto.
        (LEN_MASK_STORE): Ditto.
        * optabs-query.cc (can_vec_len_mask_load_store_p): Ditto.
        * optabs-query.h (can_vec_len_mask_load_store_p): Ditto.
        * optabs.def (OPTAB_CD): Ditto.
        * tree-data-ref.cc (get_references_in_stmt): Ditto.
        * tree-if-conv.cc (ifcvt_can_use_mask_load_store): Ditto.
        * tree-ssa-alias.cc (ref_maybe_used_by_call_p_1): Ditto.
        (call_may_clobber_ref_p_1): Ditto.
        * tree-ssa-dse.cc (initialize_ao_ref_for_dse): Ditto.
        (dse_optimize_stmt): Ditto.
        * tree-ssa-loop-ivopts.cc (get_mem_type_for_internal_fn): Ditto.
        (get_alias_ptr_type_for_ptr_address): Ditto.
        * tree-ssa-sccvn.cc (vn_reference_lookup_3): Ditto.
        * tree-vect-data-refs.cc (can_group_stmts_p): Ditto.
        (vect_find_stmt_data_reference): Ditto.
        (vect_supportable_dr_alignment): Ditto.
        * tree-vect-loop.cc (vect_verify_loop_lens): Ditto.
        (optimize_mask_stores): Ditto.
        * tree-vect-slp.cc (vect_get_operand_map): Ditto.
        (vect_build_slp_tree_2): Ditto.
        * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        (vect_get_vector_types_for_stmt): Ditto.
        * tree-vectorizer.cc (try_vectorize_loop_1): Ditto.

---
 gcc/doc/md.texi             | 32 ++++++++++++
 gcc/genopinit.cc            |  6 ++-
 gcc/gimple-fold.cc          | 28 ++++++++---
 gcc/internal-fn.cc          | 37 +++++++++++++-
 gcc/internal-fn.def         |  4 ++
 gcc/optabs-query.cc         | 39 +++++++++++++++
 gcc/optabs-query.h          |  1 +
 gcc/optabs.def              |  2 +
 gcc/tree-data-ref.cc        |  4 ++
 gcc/tree-if-conv.cc         |  3 ++
 gcc/tree-ssa-alias.cc       |  3 ++
 gcc/tree-ssa-dse.cc         | 12 +++++
 gcc/tree-ssa-loop-ivopts.cc |  8 +++
 gcc/tree-ssa-sccvn.cc       |  6 +++
 gcc/tree-vect-data-refs.cc  | 20 +++++---
 gcc/tree-vect-loop.cc       | 52 +++++++++++--------
 gcc/tree-vect-slp.cc        |  5 ++
 gcc/tree-vect-stmts.cc      | 99 ++++++++++++++++++++++++++++++++++---
 gcc/tree-vectorizer.cc      |  2 +
 19 files changed, 320 insertions(+), 43 deletions(-)

Comments

Richard Biener June 15, 2023, 8:06 a.m. UTC | #1
On Mon, 12 Jun 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Targets like ARM SVE in GCC have an elegant way to handle both loop
> control and flow control simultaneously:
> 
> loop_control_mask = WHILE_ULT
> flow_control_mask = comparison
> control_mask = loop_control_mask & flow_control_mask;
> MASK_LOAD (control_mask)
> MASK_STORE (control_mask)
> 
> However, targets like RVV (RISC-V Vector) cannot use this approach in
> auto-vectorization since RVV uses a length for loop control.
> 
> This patch adds LEN_MASK_ LOAD/STORE to support flow control for targets
> like RISC-V that use a length for loop control.
> Loads/stores are normalized into LEN_MASK_ LOAD/STORE as long as either
> the length or the mask is valid.  The length is the outcome of SELECT_VL
> or MIN_EXPR; the mask is the outcome of a comparison.
> 
> LEN_MASK_ LOAD/STORE format is defined as follows:
> 1). LEN_MASK_LOAD (ptr, align, length, mask).
> 2). LEN_MASK_STORE (ptr, align, length, mask, vec).
> 
> Consider the following 4 cases:
> 
> VLA: Variable-length auto-vectorization
> VLS: Specific-length auto-vectorization
> 
> Case 1 (VLS): -mrvv-vector-bits=128   IR (Does not use LEN_MASK_*):
> Code:                                   v1 = MEM (...)
>   for (int i = 0; i < 4; i++)           v2 = MEM (...)
>     a[i] = b[i] + c[i];                 v3 = v1 + v2 
>                                         MEM[...] = v3
> 
> Case 2 (VLS): -mrvv-vector-bits=128   IR (LEN_MASK_* with length = VF, mask = comparison):
> Code:                                   mask = comparison
>   for (int i = 0; i < 4; i++)           v1 = LEN_MASK_LOAD (length = VF, mask)
>     if (cond[i])                        v2 = LEN_MASK_LOAD (length = VF, mask) 
>       a[i] = b[i] + c[i];               v3 = v1 + v2
>                                         LEN_MASK_STORE (length = VF, mask, v3)
>            
> Case 3 (VLA):
> Code:                                   loop_len = SELECT_VL or MIN
>   for (int i = 0; i < n; i++)           v1 = LEN_MASK_LOAD (length = loop_len, mask = {-1,-1,...})
>       a[i] = b[i] + c[i];               v2 = LEN_MASK_LOAD (length = loop_len, mask = {-1,-1,...})
>                                         v3 = v1 + v2                            
>                                         LEN_MASK_STORE (length = loop_len, mask = {-1,-1,...}, v3)
> 
> Case 4 (VLA):
> Code:                                   loop_len = SELECT_VL or MIN
>   for (int i = 0; i < n; i++)           mask = comparison
>       if (cond[i])                      v1 = LEN_MASK_LOAD (length = loop_len, mask)
>       a[i] = b[i] + c[i];               v2 = LEN_MASK_LOAD (length = loop_len, mask)
>                                         v3 = v1 + v2                            
>                                         LEN_MASK_STORE (length = loop_len, mask, v3)
> 
> More features:
> 1. Support gimple fold simplification for LEN_MASK_ LOAD/STORE:
>    LEN_MASK_STORE (length = vf, mask = {-1,-1,...}, v) ===> MEM [...] = V
> 2. Allow DSE for LEN_MASK_* LOAD/STORE.
> 
> Bootstrapped && regression tested on X86 with no surprising differences.

Can you please split the patch?  Have 1/n add the optab and ifn
plus adjust the generic ifn predicates.  Have 2/n adjust the vectorizer
parts and 3/n optional things such as DSE.

Some comments below.

> gcc/ChangeLog:
> 
>         * doc/md.texi: Add LEN_MASK_ LOAD/STORE.
>         * genopinit.cc (main): Ditto.
>         (CMP_NAME): Ditto.
>         * gimple-fold.cc (arith_overflowed_p): Ditto.
>         (gimple_fold_partial_load_store_mem_ref): Ditto.
>         (gimple_fold_partial_store): Ditto.
>         (gimple_fold_call): Ditto.
>         * internal-fn.cc (len_maskload_direct): Ditto.
>         (len_maskstore_direct): Ditto.
>         (expand_partial_load_optab_fn): Ditto.
>         (expand_len_maskload_optab_fn): Ditto.
>         (expand_partial_store_optab_fn): Ditto.
>         (expand_len_maskstore_optab_fn): Ditto.
>         (direct_len_maskload_optab_supported_p): Ditto.
>         (direct_len_maskstore_optab_supported_p): Ditto.
>         (internal_load_fn_p): Ditto.
>         (internal_store_fn_p): Ditto.
>         (internal_fn_mask_index): Ditto.
>         (internal_fn_stored_value_index): Ditto.
>         * internal-fn.def (LEN_MASK_LOAD): Ditto.
>         (LEN_MASK_STORE): Ditto.
>         * optabs-query.cc (can_vec_len_mask_load_store_p): Ditto.
>         * optabs-query.h (can_vec_len_mask_load_store_p): Ditto.
>         * optabs.def (OPTAB_CD): Ditto.
>         * tree-data-ref.cc (get_references_in_stmt): Ditto.
>         * tree-if-conv.cc (ifcvt_can_use_mask_load_store): Ditto.
>         * tree-ssa-alias.cc (ref_maybe_used_by_call_p_1): Ditto.
>         (call_may_clobber_ref_p_1): Ditto.
>         * tree-ssa-dse.cc (initialize_ao_ref_for_dse): Ditto.
>         (dse_optimize_stmt): Ditto.
>         * tree-ssa-loop-ivopts.cc (get_mem_type_for_internal_fn): Ditto.
>         (get_alias_ptr_type_for_ptr_address): Ditto.
>         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Ditto.
>         * tree-vect-data-refs.cc (can_group_stmts_p): Ditto.
>         (vect_find_stmt_data_reference): Ditto.
>         (vect_supportable_dr_alignment): Ditto.
>         * tree-vect-loop.cc (vect_verify_loop_lens): Ditto.
>         (optimize_mask_stores): Ditto.
>         * tree-vect-slp.cc (vect_get_operand_map): Ditto.
>         (vect_build_slp_tree_2): Ditto.
>         * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
>         (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
>         (vect_get_vector_types_for_stmt): Ditto.
>         * tree-vectorizer.cc (try_vectorize_loop_1): Ditto.
> 
> ---
>  gcc/doc/md.texi             | 32 ++++++++++++
>  gcc/genopinit.cc            |  6 ++-
>  gcc/gimple-fold.cc          | 28 ++++++++---
>  gcc/internal-fn.cc          | 37 +++++++++++++-
>  gcc/internal-fn.def         |  4 ++
>  gcc/optabs-query.cc         | 39 +++++++++++++++
>  gcc/optabs-query.h          |  1 +
>  gcc/optabs.def              |  2 +
>  gcc/tree-data-ref.cc        |  4 ++
>  gcc/tree-if-conv.cc         |  3 ++
>  gcc/tree-ssa-alias.cc       |  3 ++
>  gcc/tree-ssa-dse.cc         | 12 +++++
>  gcc/tree-ssa-loop-ivopts.cc |  8 +++
>  gcc/tree-ssa-sccvn.cc       |  6 +++
>  gcc/tree-vect-data-refs.cc  | 20 +++++---
>  gcc/tree-vect-loop.cc       | 52 +++++++++++--------
>  gcc/tree-vect-slp.cc        |  5 ++
>  gcc/tree-vect-stmts.cc      | 99 ++++++++++++++++++++++++++++++++++---
>  gcc/tree-vectorizer.cc      |  2 +
>  19 files changed, 320 insertions(+), 43 deletions(-)
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 95f7fe1f802..fc99990465d 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5136,6 +5136,38 @@ of @code{QI} elements.
>  
>  This pattern is not allowed to @code{FAIL}.
>  
> +@cindex @code{len_maskload@var{m}@var{n}} instruction pattern
> +@item @samp{len_maskload@var{m}@var{n}}
> +Perform a load of vector which is predicated by length and mask
> +from memory operand 1 of mode @var{m} into register operand 0.
> +Length is provided in operand 2 which has whichever
> +integer mode the target prefers.
> +Mask is provided in register operand 3 of mode @var{n}.
> +
> +operand 2 can be a variable or a constant amount. It can be vectorization
> +factor which is the special constant value represents the maximum length.

Can you try using the same wording for length and mask operands
as for len_load and maskload?  Also len_load has the "bias"
operand which you omit here - IIRC that was added for s390 which
for unknown reason behaves a little different than power.  If
len support for s390 ever extends to other ops or power or s390
gain mask support for conditional code we'd likely have to adjust
each optab you add.  Maybe it's better to add the bias operand
now.

Andreas?  Uli?

> +
> +operand 3 can be a variable or a constant amount. It can be all 1
> +which is the special constant value represents the full mask.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
> +@cindex @code{len_maskstore@var{m}@var{n}} instruction pattern
> +@item @samp{len_maskstore@var{m}@var{n}}
> +Perform a store of vector which is predicated by length and mask
> +from register operand 1 of mode @var{m} into memory operand 0.
> +Length is provided in operand 2 which has whichever
> +integer mode the target prefers.
> +Mask is provided in register operand 3 of mode @var{n}.
> +
> +operand 2 can be a variable or a constant amount. It can be vectorization
> +factor which is the special constant value represents the maximum length.
> +
> +operand 3 can be a variable or a constant amount. It can be all 1
> +which is the special constant value represents the full mask.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
>  @cindex @code{vec_perm@var{m}} instruction pattern
>  @item @samp{vec_perm@var{m}}
>  Output a (variable) vector permutation.  Operand 0 is the destination
> diff --git a/gcc/genopinit.cc b/gcc/genopinit.cc
> index 0c1b6859ca0..9aeebd66724 100644
> --- a/gcc/genopinit.cc
> +++ b/gcc/genopinit.cc
> @@ -376,7 +376,8 @@ main (int argc, const char **argv)
>  
>    fprintf (s_file,
>  	   "/* Returns TRUE if the target supports any of the partial vector\n"
> -	   "   optabs: while_ult_optab, len_load_optab or len_store_optab,\n"
> +	   "   optabs: while_ult_optab, len_load_optab, len_store_optab,\n"
> +	   "   len_maskload_optab or len_maskstore_optab,\n"
>  	   "   for any mode.  */\n"
>  	   "bool\npartial_vectors_supported_p (void)\n{\n");
>    bool any_match = false;
> @@ -386,7 +387,8 @@ main (int argc, const char **argv)
>      {
>  #define CMP_NAME(N) !strncmp (p->name, (N), strlen ((N)))
>        if (CMP_NAME("while_ult") || CMP_NAME ("len_load")
> -	  || CMP_NAME ("len_store"))
> +	  || CMP_NAME ("len_store") || CMP_NAME ("len_maskload")
> +	  || CMP_NAME ("len_maskstore"))
>  	{
>  	  if (first)
>  	    fprintf (s_file, " HAVE_%s", p->name);
> diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
> index 581575b65ec..a2c2ad5bfe7 100644
> --- a/gcc/gimple-fold.cc
> +++ b/gcc/gimple-fold.cc
> @@ -5370,8 +5370,8 @@ arith_overflowed_p (enum tree_code code, const_tree type,
>    return wi::min_precision (wres, sign) > TYPE_PRECISION (type);
>  }
>  
> -/* If IFN_{MASK,LEN}_LOAD/STORE call CALL is unconditional, return a MEM_REF
> -   for the memory it references, otherwise return null.  VECTYPE is the
> +/* If IFN_{MASK,LEN,LEN_MASK}_LOAD/STORE call CALL is unconditional, return a
> +   MEM_REF for the memory it references, otherwise return null.  VECTYPE is the
>     type of the memory vector.  MASK_P indicates it's for MASK if true,
>     otherwise it's for LEN.  */
>  
> @@ -5383,7 +5383,20 @@ gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p)
>    if (!tree_fits_uhwi_p (alias_align))
>      return NULL_TREE;
>  
> -  if (mask_p)
> +  if (gimple_call_internal_fn (call) == IFN_LEN_MASK_LOAD
> +      || gimple_call_internal_fn (call) == IFN_LEN_MASK_STORE)
> +    {

Can you instead add a len_p argument to the function and do

    if (len_p)
      check len is full
    if (mask_p)
      check mask is full

?
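
i.e. something like (untested sketch, using the operand positions from
this patch and internal_fn_mask_index to locate the mask):

  if (len_p)
    {
      tree len = gimple_call_arg (call, 2);
      if (!poly_int_tree_p (len)
          || maybe_ne (tree_to_poly_uint64 (len),
                       TYPE_VECTOR_SUBPARTS (vectype)))
        return NULL_TREE;
    }
  if (mask_p)
    {
      internal_fn ifn = gimple_call_internal_fn (call);
      tree mask = gimple_call_arg (call, internal_fn_mask_index (ifn));
      if (!integer_all_onesp (mask))
        return NULL_TREE;
    }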

> +      tree basic_len = gimple_call_arg (call, 2);
> +      if (!poly_int_tree_p (basic_len))
> +	return NULL_TREE;
> +      if (maybe_ne (tree_to_poly_uint64 (basic_len),
> +		    TYPE_VECTOR_SUBPARTS (vectype)))
> +	return NULL_TREE;
> +      tree mask = gimple_call_arg (call, 3);
> +      if (!integer_all_onesp (mask))
> +	return NULL_TREE;
> +    }
> +  else if (mask_p)
>      {
>        tree mask = gimple_call_arg (call, 2);
>        if (!integer_all_onesp (mask))
> @@ -5409,7 +5422,7 @@ gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p)
>    return fold_build2 (MEM_REF, vectype, ptr, offset);
>  }
>  
> -/* Try to fold IFN_{MASK,LEN}_LOAD call CALL.  Return true on success.
> +/* Try to fold IFN_{MASK,LEN,LEN_MASK}_LOAD call CALL.  Return true on success.
>     MASK_P indicates it's for MASK if true, otherwise it's for LEN.  */
>  
>  static bool
> @@ -5431,14 +5444,15 @@ gimple_fold_partial_load (gimple_stmt_iterator *gsi, gcall *call, bool mask_p)
>    return false;
>  }
>  
> -/* Try to fold IFN_{MASK,LEN}_STORE call CALL.  Return true on success.
> +/* Try to fold IFN_{MASK,LEN,LEN_MASK}_STORE call CALL.  Return true on success.
>     MASK_P indicates it's for MASK if true, otherwise it's for LEN.  */
>  
>  static bool
>  gimple_fold_partial_store (gimple_stmt_iterator *gsi, gcall *call,
>  			   bool mask_p)
>  {
> -  tree rhs = gimple_call_arg (call, 3);
> +  tree rhs = gimple_call_arg (
> +    call, gimple_call_internal_fn (call) == IFN_LEN_MASK_STORE ? 4 : 3);

Use internal_fn_stored_value_index and internal_fn_mask_index,
possibly add internal_fn_len_index?
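
A sketch of the latter (hypothetical helper; the length sits at operand 2
for all the LEN_* variants in this patch):

  /* If FN has a length argument, return the index of that argument,
     otherwise return -1.  */

  int
  internal_fn_len_index (internal_fn fn)
  {
    switch (fn)
      {
      case IFN_LEN_LOAD:
      case IFN_LEN_STORE:
      case IFN_LEN_MASK_LOAD:
      case IFN_LEN_MASK_STORE:
        return 2;

      default:
        return -1;
      }
  }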

>    if (tree lhs
>        = gimple_fold_partial_load_store_mem_ref (call, TREE_TYPE (rhs), mask_p))
>      {
> @@ -5659,9 +5673,11 @@ gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace)
>  	  cplx_result = true;
>  	  break;
>  	case IFN_MASK_LOAD:
> +	case IFN_LEN_MASK_LOAD:
>  	  changed |= gimple_fold_partial_load (gsi, stmt, true);
>  	  break;
>  	case IFN_MASK_STORE:
> +	case IFN_LEN_MASK_STORE:
>  	  changed |= gimple_fold_partial_store (gsi, stmt, true);
>  	  break;
>  	case IFN_LEN_LOAD:
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index da9b944dd5d..4a9fe388eed 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -165,6 +165,7 @@ init_internal_fns ()
>  #define mask_load_lanes_direct { -1, -1, false }
>  #define gather_load_direct { 3, 1, false }
>  #define len_load_direct { -1, -1, false }
> +#define len_maskload_direct { -1, 3, false }
>  #define mask_store_direct { 3, 2, false }
>  #define store_lanes_direct { 0, 0, false }
>  #define mask_store_lanes_direct { 0, 0, false }
> @@ -172,6 +173,7 @@ init_internal_fns ()
>  #define vec_cond_direct { 2, 0, false }
>  #define scatter_store_direct { 3, 1, false }
>  #define len_store_direct { 3, 3, false }
> +#define len_maskstore_direct { 4, 3, false }
>  #define vec_set_direct { 3, 3, false }
>  #define unary_direct { 0, 0, true }
>  #define unary_convert_direct { -1, 0, true }
> @@ -2875,6 +2877,17 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>        create_input_operand (&ops[3], bias, QImode);
>        expand_insn (icode, 4, ops);
>      }
> +  else if (optab == len_maskload_optab)
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> +				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      maskt = gimple_call_arg (stmt, 3);
> +      mask = expand_normal (maskt);
> +      create_input_operand (&ops[3], mask, TYPE_MODE (TREE_TYPE (maskt)));
> +      icode = convert_optab_handler (optab, TYPE_MODE (type),
> +				     TYPE_MODE (TREE_TYPE (maskt)));
> +      expand_insn (icode, 4, ops);
> +    }
>    else
>      {
>        create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> @@ -2888,6 +2901,7 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  #define expand_mask_load_optab_fn expand_partial_load_optab_fn
>  #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn
>  #define expand_len_load_optab_fn expand_partial_load_optab_fn
> +#define expand_len_maskload_optab_fn expand_partial_load_optab_fn
>  
>  /* Expand MASK_STORE{,_LANES} or LEN_STORE call STMT using optab OPTAB.  */
>  
> @@ -2900,7 +2914,7 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> -  rhs = gimple_call_arg (stmt, 3);
> +  rhs = gimple_call_arg (stmt, optab == len_maskstore_optab ? 4 : 3);

see above

>    type = TREE_TYPE (rhs);
>    lhs = expand_call_mem_ref (type, stmt, 0);
>  
> @@ -2927,6 +2941,16 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>        create_input_operand (&ops[3], bias, QImode);
>        expand_insn (icode, 4, ops);
>      }
> +  else if (optab == len_maskstore_optab)
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> +				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      maskt = gimple_call_arg (stmt, 3);
> +      mask = expand_normal (maskt);
> +      create_input_operand (&ops[3], mask, TYPE_MODE (TREE_TYPE (maskt)));
> +      icode = convert_optab_handler (optab, TYPE_MODE (type), GET_MODE (mask));
> +      expand_insn (icode, 4, ops);
> +    }
>    else
>      {
>        create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> @@ -2937,6 +2961,7 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  #define expand_mask_store_optab_fn expand_partial_store_optab_fn
>  #define expand_mask_store_lanes_optab_fn expand_mask_store_optab_fn
>  #define expand_len_store_optab_fn expand_partial_store_optab_fn
> +#define expand_len_maskstore_optab_fn expand_partial_store_optab_fn
>  
>  /* Expand VCOND, VCONDU and VCONDEQ optab internal functions.
>     The expansion of STMT happens based on OPTAB table associated.  */
> @@ -3890,6 +3915,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>  #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
>  #define direct_gather_load_optab_supported_p convert_optab_supported_p
>  #define direct_len_load_optab_supported_p direct_optab_supported_p
> +#define direct_len_maskload_optab_supported_p convert_optab_supported_p
>  #define direct_mask_store_optab_supported_p convert_optab_supported_p
>  #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
>  #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
> @@ -3897,6 +3923,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>  #define direct_vec_cond_optab_supported_p convert_optab_supported_p
>  #define direct_scatter_store_optab_supported_p convert_optab_supported_p
>  #define direct_len_store_optab_supported_p direct_optab_supported_p
> +#define direct_len_maskstore_optab_supported_p convert_optab_supported_p
>  #define direct_while_optab_supported_p convert_optab_supported_p
>  #define direct_fold_extract_optab_supported_p direct_optab_supported_p
>  #define direct_fold_left_optab_supported_p direct_optab_supported_p
> @@ -4361,6 +4388,7 @@ internal_load_fn_p (internal_fn fn)
>      case IFN_GATHER_LOAD:
>      case IFN_MASK_GATHER_LOAD:
>      case IFN_LEN_LOAD:
> +    case IFN_LEN_MASK_LOAD:
>        return true;
>  
>      default:
> @@ -4381,6 +4409,7 @@ internal_store_fn_p (internal_fn fn)
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
>      case IFN_LEN_STORE:
> +    case IFN_LEN_MASK_STORE:
>        return true;
>  
>      default:
> @@ -4420,6 +4449,10 @@ internal_fn_mask_index (internal_fn fn)
>      case IFN_MASK_STORE_LANES:
>        return 2;
>  
> +    case IFN_LEN_MASK_LOAD:
> +    case IFN_LEN_MASK_STORE:
> +      return 3;
> +
>      case IFN_MASK_GATHER_LOAD:
>      case IFN_MASK_SCATTER_STORE:
>        return 4;
> @@ -4444,6 +4477,8 @@ internal_fn_stored_value_index (internal_fn fn)
>      case IFN_MASK_SCATTER_STORE:
>      case IFN_LEN_STORE:
>        return 3;
> +    case IFN_LEN_MASK_STORE:
> +      return 4;
>  
>      default:
>        return -1;
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 5d638de6d06..cf0bcea5ac7 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -50,12 +50,14 @@ along with GCC; see the file COPYING3.  If not see
>     - mask_load_lanes: currently just vec_mask_load_lanes
>     - gather_load: used for {mask_,}gather_load
>     - len_load: currently just len_load
> +   - len_maskload: currently just len_maskload
>  
>     - mask_store: currently just maskstore
>     - store_lanes: currently just vec_store_lanes
>     - mask_store_lanes: currently just vec_mask_store_lanes
>     - scatter_store: used for {mask_,}scatter_store
>     - len_store: currently just len_store
> +   - len_maskstore: currently just len_maskstore
>  
>     - unary: a normal unary optab, such as vec_reverse_<mode>
>     - binary: a normal binary optab, such as vec_interleave_lo_<mode>
> @@ -157,6 +159,7 @@ DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
>  		       mask_gather_load, gather_load)
>  
>  DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load)
> +DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, len_maskload, len_maskload)
>  
>  DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
>  DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
> @@ -175,6 +178,7 @@ DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
>  DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
>  
>  DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> +DEF_INTERNAL_OPTAB_FN (LEN_MASK_STORE, 0, len_maskstore, len_maskstore)
>  
>  DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
>  DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
> diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> index 276f8408dd7..ec765e78088 100644
> --- a/gcc/optabs-query.cc
> +++ b/gcc/optabs-query.cc
> @@ -624,6 +624,45 @@ get_len_load_store_mode (machine_mode mode, bool is_load)
>    return opt_machine_mode ();
>  }
>  
> +/* Return true if target supports vector length && masked load/store for mode.
> +   Length is used on loop control and mask is used on flow control.  */
> +
> +bool
> +can_vec_len_mask_load_store_p (machine_mode mode, bool is_load)

why deviate from can_vec_mask_load_store_p and not pass in the
mask_mode?  In fact I wonder why this function differs from
can_vec_mask_load_store_p besides using other optabs?  Couldn't
we simply add a bool with_len argument to can_vec_mask_load_store_p?
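
E.g. (signature sketch only; the default argument just keeps existing
callers unchanged):

  bool
  can_vec_mask_load_store_p (machine_mode mode, machine_mode mask_mode,
                             bool is_load, bool with_len = false);

with with_len == true querying len_mask{load,store}_optab instead of
mask{load,store}_optab.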

> +{
> +  optab op = is_load ? len_maskload_optab : len_maskstore_optab;
> +  machine_mode vmode;
> +  machine_mode mask_mode;
> +
> +  /* If mode is vector mode, check it directly.  */
> +  if (VECTOR_MODE_P (mode))
> +    return targetm.vectorize.get_mask_mode (mode).exists (&mask_mode)
> +	   && convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing;
> +
> +  scalar_mode smode;
> +  if (is_a<scalar_mode> (mode, &smode))
> +    /* See if there is any chance the mask load or store might be
> +       vectorized.  If not, punt.  */
> +    vmode = targetm.vectorize.preferred_simd_mode (smode);
> +  else
> +    vmode = mode;
> +
> +  if (VECTOR_MODE_P (vmode)
> +      && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
> +      && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
> +    return true;
> +
> +  auto_vector_modes vector_modes;
> +  targetm.vectorize.autovectorize_vector_modes (&vector_modes, true);
> +  for (machine_mode base_mode : vector_modes)
> +    if (related_vector_mode (base_mode, smode).exists (&vmode)
> +	&& targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
> +	&& convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
> +      return true;
> +
> +  return false;
> +}
> +
>  /* Return true if there is a compare_and_swap pattern.  */
>  
>  bool
> diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
> index b266d2fe990..2b9c9b44af2 100644
> --- a/gcc/optabs-query.h
> +++ b/gcc/optabs-query.h
> @@ -189,6 +189,7 @@ enum insn_code find_widening_optab_handler_and_mode (optab, machine_mode,
>  int can_mult_highpart_p (machine_mode, bool);
>  bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool);
>  opt_machine_mode get_len_load_store_mode (machine_mode, bool);
> +bool can_vec_len_mask_load_store_p (machine_mode, bool);
>  bool can_compare_and_swap_p (machine_mode, bool);
>  bool can_atomic_exchange_p (machine_mode, bool);
>  bool can_atomic_load_p (machine_mode);
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index f31b69c5d85..f5401aea364 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -91,6 +91,8 @@ OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b")
>  OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b")
>  OPTAB_CD(maskload_optab, "maskload$a$b")
>  OPTAB_CD(maskstore_optab, "maskstore$a$b")
> +OPTAB_CD(len_maskload_optab, "len_maskload$a$b")
> +OPTAB_CD(len_maskstore_optab, "len_maskstore$a$b")
>  OPTAB_CD(gather_load_optab, "gather_load$a$b")
>  OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b")
>  OPTAB_CD(scatter_store_optab, "scatter_store$a$b")
> diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
> index b576cce6db6..99aca44e6a5 100644
> --- a/gcc/tree-data-ref.cc
> +++ b/gcc/tree-data-ref.cc
> @@ -5816,6 +5816,8 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
>  	    }
>  	  case IFN_MASK_LOAD:
>  	  case IFN_MASK_STORE:
> +	  case IFN_LEN_MASK_LOAD:
> +	  case IFN_LEN_MASK_STORE:
>  	    break;
>  	  default:
>  	    clobbers_memory = true;
> @@ -5861,11 +5863,13 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
>  	switch (gimple_call_internal_fn (stmt))
>  	  {
>  	  case IFN_MASK_LOAD:
> +	  case IFN_LEN_MASK_LOAD:
>  	    if (gimple_call_lhs (stmt) == NULL_TREE)
>  	      break;
>  	    ref.is_read = true;
>  	    /* FALLTHRU */
>  	  case IFN_MASK_STORE:
> +	  case IFN_LEN_MASK_STORE:
>  	    ptr = build_int_cst (TREE_TYPE (gimple_call_arg (stmt, 1)), 0);
>  	    align = tree_to_shwi (gimple_call_arg (stmt, 1));
>  	    if (ref.is_read)
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 1393ce184e3..0f549fa528d 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -960,6 +960,9 @@ ifcvt_can_use_mask_load_store (gimple *stmt)
>    if (can_vec_mask_load_store_p (mode, VOIDmode, is_load))
>      return true;
>  
> +  if (can_vec_len_mask_load_store_p (mode, is_load))
> +    return true;

But if-conversion only needs the conditional masking, not _len.  I
don't think we need to check this at all?  In fact
can_vec_mask_load_store_p should probably return true when
LEN_MASKLOAD is available since we can always use the full vector
len as len argument?  The suggested bool argument with_len
could also become an enum { WITH_LEN, WITHOUT_LEN, EITHER },
but not sure if that's really necessary.

Are you going to provide len_load, maskload and len_maskload
patterns or just len_maskload?  (I hope the last)

> +
>    return false;
>  }
>  
> diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
> index 79ed956e300..100c4b2e7d9 100644
> --- a/gcc/tree-ssa-alias.cc
> +++ b/gcc/tree-ssa-alias.cc
> @@ -2815,11 +2815,13 @@ ref_maybe_used_by_call_p_1 (gcall *call, ao_ref *ref, bool tbaa_p)
>        case IFN_SCATTER_STORE:
>        case IFN_MASK_SCATTER_STORE:
>        case IFN_LEN_STORE:
> +      case IFN_LEN_MASK_STORE:
>  	return false;
>        case IFN_MASK_STORE_LANES:
>  	goto process_args;
>        case IFN_MASK_LOAD:
>        case IFN_LEN_LOAD:
> +      case IFN_LEN_MASK_LOAD:
>        case IFN_MASK_LOAD_LANES:
>  	{
>  	  ao_ref rhs_ref;
> @@ -3065,6 +3067,7 @@ call_may_clobber_ref_p_1 (gcall *call, ao_ref *ref, bool tbaa_p)
>  	return false;
>        case IFN_MASK_STORE:
>        case IFN_LEN_STORE:
> +      case IFN_LEN_MASK_STORE:
>        case IFN_MASK_STORE_LANES:
>  	{
>  	  tree rhs = gimple_call_arg (call,
> diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc
> index eabe8ba4522..acaf844b8ef 100644
> --- a/gcc/tree-ssa-dse.cc
> +++ b/gcc/tree-ssa-dse.cc
> @@ -174,6 +174,17 @@ initialize_ao_ref_for_dse (gimple *stmt, ao_ref *write, bool may_def_ok = false)
>  	      return true;
>  	    }
>  	  break;
> +	case IFN_LEN_MASK_STORE:
> +	  /* We cannot initialize a must-def ao_ref (in all cases) but we
> +	     can provide a may-def variant.  */
> +	  if (may_def_ok)
> +	    {
> +	      ao_ref_init_from_ptr_and_size
> +		  (write, gimple_call_arg (stmt, 0),
> +		   TYPE_SIZE_UNIT (TREE_TYPE (gimple_call_arg (stmt, 4))));

common with IFN_MASK_STORE by using internal_fn_stored_value_index

> +	      return true;
> +	    }
> +	  break;
>  	default:;
>  	}
>      }
> @@ -1483,6 +1494,7 @@ dse_optimize_stmt (function *fun, gimple_stmt_iterator *gsi, sbitmap live_bytes)
>  	{
>  	case IFN_LEN_STORE:
>  	case IFN_MASK_STORE:
> +	case IFN_LEN_MASK_STORE:
>  	  {
>  	    enum dse_store_status store_status;
>  	    store_status = dse_classify_store (&ref, stmt, false, live_bytes);
> diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
> index 6fbd2d59318..e8e9df1ab74 100644
> --- a/gcc/tree-ssa-loop-ivopts.cc
> +++ b/gcc/tree-ssa-loop-ivopts.cc
> @@ -2439,6 +2439,7 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p)
>      case IFN_MASK_LOAD:
>      case IFN_MASK_LOAD_LANES:
>      case IFN_LEN_LOAD:
> +    case IFN_LEN_MASK_LOAD:
>        if (op_p == gimple_call_arg_ptr (call, 0))
>  	return TREE_TYPE (gimple_call_lhs (call));
>        return NULL_TREE;
> @@ -2450,6 +2451,11 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p)
>  	return TREE_TYPE (gimple_call_arg (call, 3));
>        return NULL_TREE;
>  
> +    case IFN_LEN_MASK_STORE:
> +      if (op_p == gimple_call_arg_ptr (call, 0))
> +	return TREE_TYPE (gimple_call_arg (call, 4));

internal_fn_stored_value_index

> +      return NULL_TREE;
> +
>      default:
>        return NULL_TREE;
>      }
> @@ -7555,6 +7561,8 @@ get_alias_ptr_type_for_ptr_address (iv_use *use)
>      case IFN_MASK_STORE_LANES:
>      case IFN_LEN_LOAD:
>      case IFN_LEN_STORE:
> +    case IFN_LEN_MASK_LOAD:
> +    case IFN_LEN_MASK_STORE:
>        /* The second argument contains the correct alias type.  */
>        gcc_assert (use->op_p = gimple_call_arg_ptr (call, 0));
>        return TREE_TYPE (gimple_call_arg (call, 1));
> diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> index 27c84e78fcf..02fbc4a2dfa 100644
> --- a/gcc/tree-ssa-sccvn.cc
> +++ b/gcc/tree-ssa-sccvn.cc
> @@ -3304,6 +3304,12 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>  	  if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
>  	    return (void *)-1;
>  	  break;
> +	case IFN_LEN_MASK_STORE:
> +	  len = gimple_call_arg (call, 2);
> +	  mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> +	  if (!tree_fits_uhwi_p (len) || TREE_CODE (mask) != VECTOR_CST)
> +	    return (void *)-1;
> +	  break;

That's too simple, the code using the info is not prepared for
LEN_MASK since it handles maskload and len_load separately.
I suggest dropping this and picking it up separately.

>  	default:
>  	  return (void *)-1;
>  	}
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index ebe93832b1e..fb83446519a 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -3039,17 +3039,21 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
>        if (!call2 || !gimple_call_internal_p (call2))
>  	return false;
>        internal_fn ifn = gimple_call_internal_fn (call1);
> -      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
> +      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE
> +	  && ifn != IFN_LEN_MASK_LOAD && ifn != IFN_LEN_MASK_STORE)
>  	return false;
>        if (ifn != gimple_call_internal_fn (call2))
>  	return false;
>  
>        /* Check that the masks are the same.  Cope with casts of masks,
>  	 like those created by build_mask_conversion.  */
> -      tree mask1 = gimple_call_arg (call1, 2);
> -      tree mask2 = gimple_call_arg (call2, 2);
> +      unsigned int mask_argno
> +	= ifn == IFN_LEN_MASK_LOAD || ifn == IFN_LEN_MASK_STORE ? 3 : 2;
> +      tree mask1 = gimple_call_arg (call1, mask_argno);
> +      tree mask2 = gimple_call_arg (call2, mask_argno);
>        if (!operand_equal_p (mask1, mask2, 0)
> -          && (ifn == IFN_MASK_STORE || !allow_slp_p))
> +	  && (ifn == IFN_MASK_STORE || ifn == IFN_LEN_MASK_STORE
> +	      || !allow_slp_p))

I think you need to verify that the length operand is the full vector;
note this is for if-conversion, which couldn't care less about _LEN, but
if we insist on using _LEN (I didn't see you changing if-conversion that
way?!) then we need to put in some value even for the "scalar"
placeholder.  I'd suggest simply using IFN_MASK_{LOAD,STORE} in
if-conversion but vectorizing that as LEN_ with full length if plain
MASK_LOAD/STORE isn't available.  Which means these changes are not
necessary at all.

>  	{
>  	  mask1 = strip_conversion (mask1);
>  	  if (!mask1)
> @@ -4292,7 +4296,9 @@ vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
>    if (gcall *call = dyn_cast <gcall *> (stmt))
>      if (!gimple_call_internal_p (call)
>  	|| (gimple_call_internal_fn (call) != IFN_MASK_LOAD
> -	    && gimple_call_internal_fn (call) != IFN_MASK_STORE))
> +	    && gimple_call_internal_fn (call) != IFN_MASK_STORE
> +	    && gimple_call_internal_fn (call) != IFN_LEN_MASK_LOAD
> +	    && gimple_call_internal_fn (call) != IFN_LEN_MASK_STORE))
>        {
>  	free_data_ref (dr);
>  	return opt_result::failure_at (stmt,
> @@ -6731,7 +6737,9 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
>    if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
>      if (gimple_call_internal_p (stmt)
>  	&& (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
> -	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
> +	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE
> +	    || gimple_call_internal_fn (stmt) == IFN_LEN_MASK_LOAD
> +	    || gimple_call_internal_fn (stmt) == IFN_LEN_MASK_STORE))
>        return dr_unaligned_supported;
>  
>    if (loop_vinfo)
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index ace9e759f5b..03de41d4988 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1296,30 +1296,33 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
>    if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
>      return false;
>  
> -  machine_mode len_load_mode = get_len_load_store_mode
> -    (loop_vinfo->vector_mode, true).require ();
> -  machine_mode len_store_mode = get_len_load_store_mode
> -    (loop_vinfo->vector_mode, false).require ();
> +  if (!can_vec_len_mask_load_store_p (loop_vinfo->vector_mode, true)
> +      && !can_vec_len_mask_load_store_p (loop_vinfo->vector_mode, false))
> +    {
> +      machine_mode len_load_mode
> +	= get_len_load_store_mode (loop_vinfo->vector_mode, true).require ();
> +      machine_mode len_store_mode
> +	= get_len_load_store_mode (loop_vinfo->vector_mode, false).require ();

As with can_mask_* we don't really care whether masking is supported or
not, so I suggest amending get_len_load_store_mode to also check the
len_mask{load,store} optabs.  Given that, the function should possibly
return the corresponding IFN as well.
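
E.g. (sketch; the out parameter is hypothetical):

  /* As today, but additionally consider len_mask{load,store}_optab and
     tell the caller which IFN the returned mode belongs to.  */
  opt_machine_mode
  get_len_load_store_mode (machine_mode mode, bool is_load,
                           internal_fn *ifn = NULL);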

> -  signed char partial_load_bias = internal_len_load_store_bias
> -    (IFN_LEN_LOAD, len_load_mode);
> +      signed char partial_load_bias
> +	= internal_len_load_store_bias (IFN_LEN_LOAD, len_load_mode);
>  
> -  signed char partial_store_bias = internal_len_load_store_bias
> -    (IFN_LEN_STORE, len_store_mode);
> +      signed char partial_store_bias
> +	= internal_len_load_store_bias (IFN_LEN_STORE, len_store_mode);
>  
> -  gcc_assert (partial_load_bias == partial_store_bias);
> +      gcc_assert (partial_load_bias == partial_store_bias);
>  
> -  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
> -    return false;
> +      if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
> +	return false;
>  
> -  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> -     len_loads with a length of zero.  In order to avoid that we prohibit
> -     more than one loop length here.  */
> -  if (partial_load_bias == -1
> -      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
> -    return false;
> +      /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> +	 len_loads with a length of zero.  In order to avoid that we prohibit
> +	 more than one loop length here.  */
> +      if (partial_load_bias == -1 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
> +	return false;
>  
> -  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
> +      LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
> +    }
>  
>    unsigned int max_nitems_per_iter = 1;
>    unsigned int i;
> @@ -11317,7 +11320,8 @@ optimize_mask_stores (class loop *loop)
>  	   gsi_next (&gsi))
>  	{
>  	  stmt = gsi_stmt (gsi);
> -	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
> +	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE)
> +	      || gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
>  	    worklist.safe_push (stmt);
>  	}
>      }
> @@ -11340,7 +11344,8 @@ optimize_mask_stores (class loop *loop)
>        tree zero;
>  
>        last = worklist.pop ();
> -      mask = gimple_call_arg (last, 2);
> +      mask = gimple_call_arg (
> +	last, gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE) ? 3 : 2);

use the proper ifn index compute fn
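
i.e. something like

  mask = gimple_call_arg (last,
                          internal_fn_mask_index
                            (gimple_call_internal_fn (last)));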

>        bb = gimple_bb (last);
>        /* Create then_bb and if-then structure in CFG, then_bb belongs to
>  	 the same loop as if_bb.  It could be different to LOOP when two
> @@ -11473,7 +11478,12 @@ optimize_mask_stores (class loop *loop)
>  	    }
>  	  /* Put other masked stores with the same mask to STORE_BB.  */
>  	  if (worklist.is_empty ()
> -	      || gimple_call_arg (worklist.last (), 2) != mask
> +	      || gimple_call_arg (worklist.last (),
> +				  gimple_call_internal_p (worklist.last (),
> +							  IFN_LEN_MASK_STORE)
> +				    ? 3
> +				    : 2)

use the proper ifn index compute fn

> +		   != mask
>  	      || worklist.last () != stmt1)
>  	    break;
>  	  last = worklist.pop ();
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index ab89a82f1b3..937b5295df4 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -489,6 +489,7 @@ static const int cond_expr_maps[3][5] = {
>  };
>  static const int arg1_map[] = { 1, 1 };
>  static const int arg2_map[] = { 1, 2 };
> +static const int arg3_map[] = { 1, 3 };
>  static const int arg1_arg4_map[] = { 2, 1, 4 };
>  static const int op1_op0_map[] = { 2, 1, 0 };
>  
> @@ -524,6 +525,9 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
>  	  case IFN_MASK_LOAD:
>  	    return arg2_map;
>  
> +	  case IFN_LEN_MASK_LOAD:
> +	    return arg3_map;
> +
>  	  case IFN_GATHER_LOAD:
>  	    return arg1_map;
>  
> @@ -1779,6 +1783,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>      {
>        if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
>  	gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
> +		    || gimple_call_internal_p (stmt, IFN_LEN_MASK_LOAD)

It's probably not necessary to handle this if if-conversion emits
just IFN_MASK_LOAD.

>  		    || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
>  		    || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
>        else
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index a7acc032d47..9b797c61c88 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1837,6 +1837,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>        using_partial_vectors_p = true;
>      }
>  
> +  if (can_vec_len_mask_load_store_p (vecmode, is_load))
> +    {
> +      nvectors = group_memory_nvectors (group_size * vf, nunits);
> +      /* Length is used on loop control and mask for flow control.*/
> +      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> +      vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> +      using_partial_vectors_p = true;
> +    }
> +

so this answers my question - you just have len_mask{load,store}?

>    if (!using_partial_vectors_p)
>      {
>        if (dump_enabled_p ())
> @@ -7978,8 +7987,9 @@ vectorizable_store (vec_info *vinfo,
>        if (memory_access_type == VMAT_CONTIGUOUS)
>  	{
>  	  if (!VECTOR_MODE_P (vec_mode)
> -	      || !can_vec_mask_load_store_p (vec_mode,
> -					     TYPE_MODE (mask_vectype), false))
> +	      || (!can_vec_mask_load_store_p (vec_mode,
> +					      TYPE_MODE (mask_vectype), false)
> +		  && !can_vec_len_mask_load_store_p (vec_mode, false)))
>  	    return false;
>  	}
>        else if (memory_access_type != VMAT_LOAD_STORE_LANES
> @@ -8942,7 +8952,38 @@ vectorizable_store (vec_info *vinfo,
>  		}
>  
>  	      /* Arguments are ready.  Create the new vector stmt.  */
> -	      if (final_mask)
> +	      if (can_vec_len_mask_load_store_p (TYPE_MODE (vectype), false)
> +		  && (final_mask || loop_lens))

I think we really want to common this somehow, having
if (loop_lens) do the final_len compute and then afterwards
select the IFN to create, filling required default args of
final_mask and final_len if not computed.
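
I.e. roughly (sketch; build_all_ones_mask stands for the helper function
suggested below):

  tree final_len = NULL_TREE;
  if (loop_lens)
    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
                                   vec_num * ncopies, vectype,
                                   vec_num * j + i, 1);
  if (final_len || final_mask)
    {
      if (!final_len)
        final_len = build_int_cst (iv_type, TYPE_VECTOR_SUBPARTS (vectype));
      if (!final_mask)
        /* Hypothetical helper building an all-ones mask vector.  */
        final_mask = build_all_ones_mask (vectype);
      /* ... then emit IFN_LEN_MASK_STORE with final_len/final_mask.  */
    }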

> +		{
> +		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> +		  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +		  if (!final_mask)
> +		    {
> +		      machine_mode mask_mode
> +			= targetm.vectorize.get_mask_mode (TYPE_MODE (vectype))
> +			    .require ();
> +		      mask_vectype
> +			= build_truth_vector_type_for_mode (nunits, mask_mode);
> +		      tree mask = build_int_cst (TREE_TYPE (mask_vectype), -1);
> +		      final_mask = build_vector_from_val (mask_vectype, mask);
> +		    }
> +		  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +		  tree final_len;
> +		  if (loop_lens)
> +		    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +						   vec_num * ncopies, vectype,
> +						   vec_num * j + i, 1);
> +		  else
> +		    final_len = build_int_cst (iv_type, nunits);
> +		  gcall *call
> +		    = gimple_build_call_internal (IFN_LEN_MASK_STORE, 5,
> +						  dataref_ptr, ptr, final_len,
> +						  final_mask, vec_oprnd);
> +		  gimple_call_set_nothrow (call, true);
> +		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> +		  new_stmt = call;
> +		}
> +	      else if (final_mask)
>  		{
>  		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>  		  gcall *call
> @@ -9407,8 +9448,9 @@ vectorizable_load (vec_info *vinfo,
>  	{
>  	  machine_mode vec_mode = TYPE_MODE (vectype);
>  	  if (!VECTOR_MODE_P (vec_mode)
> -	      || !can_vec_mask_load_store_p (vec_mode,
> -					     TYPE_MODE (mask_vectype), true))
> +	      || (!can_vec_mask_load_store_p (vec_mode,
> +					      TYPE_MODE (mask_vectype), true)
> +		  && !can_vec_len_mask_load_store_p (vec_mode, false)))
>  	    return false;
>  	}
>        else if (memory_access_type != VMAT_LOAD_STORE_LANES
> @@ -10301,7 +10343,47 @@ vectorizable_load (vec_info *vinfo,
>  					      align, misalign);
>  		    align = least_bit_hwi (misalign | align);
>  
> -		    if (final_mask)
> +		    if (can_vec_len_mask_load_store_p (TYPE_MODE (vectype),
> +						       true)
> +			&& (final_mask || loop_lens)
> +			&& memory_access_type != VMAT_INVARIANT)

same

> +		      {
> +			tree ptr
> +			  = build_int_cst (ref_type, align * BITS_PER_UNIT);
> +			poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +			if (!final_mask)
> +			  {
> +			    machine_mode mask_mode
> +			      = targetm.vectorize
> +				  .get_mask_mode (TYPE_MODE (vectype))
> +				  .require ();
> +			    mask_vectype
> +			      = build_truth_vector_type_for_mode (nunits,
> +								  mask_mode);
> +			    tree mask
> +			      = build_int_cst (TREE_TYPE (mask_vectype), -1);
> +			    final_mask
> +			      = build_vector_from_val (mask_vectype, mask);
> +			  }

and split this out to a helper function

> +			tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +			tree final_len;
> +			if (loop_lens)
> +			  final_len
> +			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +						 vec_num * ncopies, vectype,
> +						 vec_num * j + i, 1);
> +			else
> +			  final_len = build_int_cst (iv_type, nunits);
> +
> +			gcall *call
> +			  = gimple_build_call_internal (IFN_LEN_MASK_LOAD, 4,
> +							dataref_ptr, ptr,
> +							final_len, final_mask);
> +			gimple_call_set_nothrow (call, true);
> +			new_stmt = call;
> +			data_ref = NULL_TREE;
> +		      }
> +		    else if (final_mask)
>  		      {
>  			tree ptr = build_int_cst (ref_type,
>  						  align * BITS_PER_UNIT);
> @@ -13027,7 +13109,8 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>    if (gimple_get_lhs (stmt) == NULL_TREE
>        /* MASK_STORE has no lhs, but is ok.  */
> -      && !gimple_call_internal_p (stmt, IFN_MASK_STORE))
> +      && !gimple_call_internal_p (stmt, IFN_MASK_STORE)
> +      && !gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))

See if-conversion comment.  This shouldn't be necessary.

>      {
>        if (is_a <gcall *> (stmt))
>  	{
> @@ -13071,6 +13154,8 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>  	scalar_type = TREE_TYPE (DR_REF (dr));
>        else if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
>  	scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
> +      else if (gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
> +	scalar_type = TREE_TYPE (gimple_call_arg (stmt, 4));
>        else
>  	scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
>  
> diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
> index a048e9d8917..19312404ac4 100644
> --- a/gcc/tree-vectorizer.cc
> +++ b/gcc/tree-vectorizer.cc
> @@ -1101,6 +1101,8 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
>  		{
>  		  internal_fn ifn = gimple_call_internal_fn (call);
>  		  if (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE
> +		      || ifn == IFN_LEN_MASK_LOAD
> +		      || ifn == IFN_LEN_MASK_STORE

Likewise.

>  		      /* Don't keep the if-converted parts when the ifn with
>  			 specifc type is not supported by the backend.  */
>  		      || (direct_internal_fn_p (ifn)
>
juzhe.zhong@rivai.ai June 15, 2023, 8:47 a.m. UTC | #2
Hi Richard,

Thanks for the comments.

>>Can you try using the same wording for length and mask operands
>>as for len_load and maskload?  Also len_load has the "bias"
>>operand which you omit here - IIRC that was added for s390 which
>>for unknown reason behaves a little different than power.  If
>>len support for s390 ever extends to other ops or power or s390
>>gain mask support for conditional code we'd likely have to adjust
>>each optab you add.  Maybe it's better to add the bias operand
>>now.

I don't know BIAS well; it seems to be a Power target-dependent feature.
I think len_mask_* in general should only need the length and mask operands.
Actually, the function arguments are exactly the same as vp_load/vp_store in LLVM.

Could I just keep the current format (without the BIAS argument) and extend it
with BIAS if PowerPC wants to use LEN_MASK_*?


>> Can you please split the patch?  Have 1/n add the optab and ifn
>>plus adjust the generic ifn predicates.  Have 2/n adjust the vectorizer
>>parts and 3/n optional things such as DSE.
Ok.

>>Can you instead add a len_p argument to the function and do

>>    if (len_p)
>>      check len is full
>>    if (mask_p)
>>      check mask is full

>>?


>>Use internal_fn_stored_value_index and internal_fn_mask_index,
>>possibly add internal_fn_len_index?

>>why deviate from can_vec_mask_load_store_p and not pass in the
>>mask_mode?  In fact I wonder why this function differs from
>>can_vec_mask_load_store_p besides using other optabs?  Couldn't
>>we simply add a bool with_len argument to can_vec_mask_load_store_p?

>>But if-conversion only needs the conditional masking, not _len.  I
>>don't think we need to check this at all?  In fact
>>can_vec_mask_load_store_p should probably return true when
>>LEN_MASKLOAD is available since we can always use the full vector
>>len as len argument?  The suggested bool argument with_len
>>could also become an enum { WITH_LEN, WITHOUT_LEN, EITHER },
>>but not sure if that's really necessary.

Ok.


>>Are you going to provide len_load, maskload and len_maskload
>>patterns or just len_maskload?  (I hope the last)

I just want to enable only len_maskload/len_maskstore in the RISC-V port
(no len_load or mask_load).  As I said in the commit log,

for (int i = 0; i < n; i++)
  a[i] = a[i] + b[i];

has a length but no mask ==> len_maskload/len_maskstore (length, mask = {1,1,1,1,...})

for (int i = 0; i < 4; i++)
  if (cond[i])
    a[i] = a[i] + b[i];

has a mask but no partial length ==> len_maskload/len_maskstore (length = vf, mask = comparison)


>> common with IFN_MASK_STORE by using internal_fn_stored_value_index
Ok.

>> internal_fn_stored_value_index
ok

>> That's too simple, the code using the info is not prepared for
>> LEN_MASK since it handles maskload and len_load separately.
>> I suggest dropping this and picking it up separately.
ok

>> I think you need to verify that the length operand is the full vector;
>> note this is for if-conversion, which couldn't care less about _LEN, but
>> if we insist on using _LEN (I didn't see you changing if-conversion that
>> way?!) then we need to put in some value even for the "scalar"
>> placeholder.  I'd suggest simply using IFN_MASK_{LOAD,STORE} in
>> if-conversion but vectorizing that as LEN_ with full length if plain
>> MASK_LOAD/STORE isn't available.  Which means these changes are not
>> necessary at all.

ok

>>As with can_mask_* we don't really care whether masking is supported or
>>not, so I suggest amending get_len_load_store_mode to also check the
>>len_mask{load,store} optabs.  Given that, the function should possibly
>>return the corresponding IFN as well.
ok

>>use the proper ifn index compute fn
ok

>>so this answers my question - you just have len_mask{load,store}?
Yes.


>>I think we really want to common this somehow, having
>>if (loop_lens) do the final_len compute and then afterwards
>>select the IFN to create, filling required default args of
>>final_mask and final_len if not computed.
ok.

>>and split this out to a helper function
ok.




juzhe.zhong@rivai.ai
 
>         * internal-fn.def (LEN_MASK_LOAD): Ditto.
>         (LEN_MASK_STORE): Ditto.
>         * optabs-query.cc (can_vec_len_mask_load_store_p): Ditto.
>         * optabs-query.h (can_vec_len_mask_load_store_p): Ditto.
>         * optabs.def (OPTAB_CD): Ditto.
>         * tree-data-ref.cc (get_references_in_stmt): Ditto.
>         * tree-if-conv.cc (ifcvt_can_use_mask_load_store): Ditto.
>         * tree-ssa-alias.cc (ref_maybe_used_by_call_p_1): Ditto.
>         (call_may_clobber_ref_p_1): Ditto.
>         * tree-ssa-dse.cc (initialize_ao_ref_for_dse): Ditto.
>         (dse_optimize_stmt): Ditto.
>         * tree-ssa-loop-ivopts.cc (get_mem_type_for_internal_fn): Ditto.
>         (get_alias_ptr_type_for_ptr_address): Ditto.
>         * tree-ssa-sccvn.cc (vn_reference_lookup_3): Ditto.
>         * tree-vect-data-refs.cc (can_group_stmts_p): Ditto.
>         (vect_find_stmt_data_reference): Ditto.
>         (vect_supportable_dr_alignment): Ditto.
>         * tree-vect-loop.cc (vect_verify_loop_lens): Ditto.
>         (optimize_mask_stores): Ditto.
>         * tree-vect-slp.cc (vect_get_operand_map): Ditto.
>         (vect_build_slp_tree_2): Ditto.
>         * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
>         (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
>         (vect_get_vector_types_for_stmt): Ditto.
>         * tree-vectorizer.cc (try_vectorize_loop_1): Ditto.
> 
> ---
>  gcc/doc/md.texi             | 32 ++++++++++++
>  gcc/genopinit.cc            |  6 ++-
>  gcc/gimple-fold.cc          | 28 ++++++++---
>  gcc/internal-fn.cc          | 37 +++++++++++++-
>  gcc/internal-fn.def         |  4 ++
>  gcc/optabs-query.cc         | 39 +++++++++++++++
>  gcc/optabs-query.h          |  1 +
>  gcc/optabs.def              |  2 +
>  gcc/tree-data-ref.cc        |  4 ++
>  gcc/tree-if-conv.cc         |  3 ++
>  gcc/tree-ssa-alias.cc       |  3 ++
>  gcc/tree-ssa-dse.cc         | 12 +++++
>  gcc/tree-ssa-loop-ivopts.cc |  8 +++
>  gcc/tree-ssa-sccvn.cc       |  6 +++
>  gcc/tree-vect-data-refs.cc  | 20 +++++---
>  gcc/tree-vect-loop.cc       | 52 +++++++++++--------
>  gcc/tree-vect-slp.cc        |  5 ++
>  gcc/tree-vect-stmts.cc      | 99 ++++++++++++++++++++++++++++++++++---
>  gcc/tree-vectorizer.cc      |  2 +
>  19 files changed, 320 insertions(+), 43 deletions(-)
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 95f7fe1f802..fc99990465d 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5136,6 +5136,38 @@ of @code{QI} elements.
>  
>  This pattern is not allowed to @code{FAIL}.
>  
> +@cindex @code{len_maskload@var{m}@var{n}} instruction pattern
> +@item @samp{len_maskload@var{m}@var{n}}
> +Perform a load of vector which is predicated by length and mask
> +from memory operand 1 of mode @var{m} into register operand 0.
> +Length is provided in operand 2 which has whichever
> +integer mode the target prefers.
> +Mask is provided in register operand 3 of mode @var{n}.
> +
> +operand 2 can be a variable or a constant amount. It can be vectorization
> +factor which is the special constant value represents the maximum length.
 
Can you try using the same wording for length and mask operands
as for len_load and maskload?  Also len_load has the "bias"
operand which you omit here - IIRC that was added for s390 which
for unknown reason behaves a little different than power.  If
len support for s390 ever extends to other ops or power or s390
gain mask support for conditional code we'd likely have to adjust
each optab you add.  Maybe it's better to add the bias operand
now.
 
Andreas?  Uli?
 
> +
> +operand 3 can be a variable or a constant amount. It can be all 1
> +which is the special constant value represents the full mask.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
> +@cindex @code{len_maskstore@var{m}@var{n}} instruction pattern
> +@item @samp{len_maskstore@var{m}@var{n}}
> +Perform a store of vector which is predicated by length and mask
> +from register operand 1 of mode @var{m} into memory operand 0.
> +Length is provided in operand 2 which has whichever
> +integer mode the target prefers.
> +Mask is provided in register operand 3 of mode @var{n}.
> +
> +operand 2 can be a variable or a constant amount. It can be vectorization
> +factor which is the special constant value represents the maximum length.
> +
> +operand 3 can be a variable or a constant amount. It can be all 1
> +which is the special constant value represents the full mask.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
>  @cindex @code{vec_perm@var{m}} instruction pattern
>  @item @samp{vec_perm@var{m}}
>  Output a (variable) vector permutation.  Operand 0 is the destination
> diff --git a/gcc/genopinit.cc b/gcc/genopinit.cc
> index 0c1b6859ca0..9aeebd66724 100644
> --- a/gcc/genopinit.cc
> +++ b/gcc/genopinit.cc
> @@ -376,7 +376,8 @@ main (int argc, const char **argv)
>  
>    fprintf (s_file,
>     "/* Returns TRUE if the target supports any of the partial vector\n"
> -    "   optabs: while_ult_optab, len_load_optab or len_store_optab,\n"
> +    "   optabs: while_ult_optab, len_load_optab, len_store_optab,\n"
> +    "   len_maskload_optab or len_maskstore_optab,\n"
>     "   for any mode.  */\n"
>     "bool\npartial_vectors_supported_p (void)\n{\n");
>    bool any_match = false;
> @@ -386,7 +387,8 @@ main (int argc, const char **argv)
>      {
>  #define CMP_NAME(N) !strncmp (p->name, (N), strlen ((N)))
>        if (CMP_NAME("while_ult") || CMP_NAME ("len_load")
> -   || CMP_NAME ("len_store"))
> +   || CMP_NAME ("len_store") || CMP_NAME ("len_maskload")
> +   || CMP_NAME ("len_maskstore"))
>  {
>    if (first)
>      fprintf (s_file, " HAVE_%s", p->name);
> diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
> index 581575b65ec..a2c2ad5bfe7 100644
> --- a/gcc/gimple-fold.cc
> +++ b/gcc/gimple-fold.cc
> @@ -5370,8 +5370,8 @@ arith_overflowed_p (enum tree_code code, const_tree type,
>    return wi::min_precision (wres, sign) > TYPE_PRECISION (type);
>  }
>  
> -/* If IFN_{MASK,LEN}_LOAD/STORE call CALL is unconditional, return a MEM_REF
> -   for the memory it references, otherwise return null.  VECTYPE is the
> +/* If IFN_{MASK,LEN,LEN_MASK}_LOAD/STORE call CALL is unconditional, return a
> +   MEM_REF for the memory it references, otherwise return null.  VECTYPE is the
>     type of the memory vector.  MASK_P indicates it's for MASK if true,
>     otherwise it's for LEN.  */
>  
> @@ -5383,7 +5383,20 @@ gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p)
>    if (!tree_fits_uhwi_p (alias_align))
>      return NULL_TREE;
>  
> -  if (mask_p)
> +  if (gimple_call_internal_fn (call) == IFN_LEN_MASK_LOAD
> +      || gimple_call_internal_fn (call) == IFN_LEN_MASK_STORE)
> +    {
 
Can you instead add a len_p argument to the function and do
 
    if (len_p)
      check len is full
    if (mask_p)
      check mask is full
 
?
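
Something along these lines (untested sketch; argument positions as this
patch defines them, with the length at index 2 and the mask following it):

      unsigned int argno = 2;
      if (len_p)
        {
          /* The access is unconditional only if the length covers
             the whole vector.  */
          tree len = gimple_call_arg (call, argno++);
          if (!poly_int_tree_p (len)
              || maybe_ne (tree_to_poly_uint64 (len),
                           TYPE_VECTOR_SUBPARTS (vectype)))
            return NULL_TREE;
        }
      if (mask_p)
        {
          /* ... and only if the mask is all ones.  */
          tree mask = gimple_call_arg (call, argno++);
          if (!integer_all_onesp (mask))
            return NULL_TREE;
        }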
 
> +      tree basic_len = gimple_call_arg (call, 2);
> +      if (!poly_int_tree_p (basic_len))
> + return NULL_TREE;
> +      if (maybe_ne (tree_to_poly_uint64 (basic_len),
> +     TYPE_VECTOR_SUBPARTS (vectype)))
> + return NULL_TREE;
> +      tree mask = gimple_call_arg (call, 3);
> +      if (!integer_all_onesp (mask))
> + return NULL_TREE;
> +    }
> +  else if (mask_p)
>      {
>        tree mask = gimple_call_arg (call, 2);
>        if (!integer_all_onesp (mask))
> @@ -5409,7 +5422,7 @@ gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p)
>    return fold_build2 (MEM_REF, vectype, ptr, offset);
>  }
>  
> -/* Try to fold IFN_{MASK,LEN}_LOAD call CALL.  Return true on success.
> +/* Try to fold IFN_{MASK,LEN,LEN_MASK}_LOAD call CALL.  Return true on success.
>     MASK_P indicates it's for MASK if true, otherwise it's for LEN.  */
>  
>  static bool
> @@ -5431,14 +5444,15 @@ gimple_fold_partial_load (gimple_stmt_iterator *gsi, gcall *call, bool mask_p)
>    return false;
>  }
>  
> -/* Try to fold IFN_{MASK,LEN}_STORE call CALL.  Return true on success.
> +/* Try to fold IFN_{MASK,LEN,LEN_MASK}_STORE call CALL.  Return true on success.
>     MASK_P indicates it's for MASK if true, otherwise it's for LEN.  */
>  
>  static bool
>  gimple_fold_partial_store (gimple_stmt_iterator *gsi, gcall *call,
>     bool mask_p)
>  {
> -  tree rhs = gimple_call_arg (call, 3);
> +  tree rhs = gimple_call_arg (
> +    call, gimple_call_internal_fn (call) == IFN_LEN_MASK_STORE ? 4 : 3);
 
Use internal_fn_stored_value_index and internal_fn_mask_index,
possibly add internal_fn_len_index?
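
Such a helper could mirror internal_fn_mask_index, e.g. (untested sketch;
the length is argument 2 for all four length IFNs as defined here):

int
internal_fn_len_index (internal_fn fn)
{
  switch (fn)
    {
    case IFN_LEN_LOAD:
    case IFN_LEN_STORE:
    case IFN_LEN_MASK_LOAD:
    case IFN_LEN_MASK_STORE:
      return 2;

    default:
      return -1;
    }
}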
 
>    if (tree lhs
>        = gimple_fold_partial_load_store_mem_ref (call, TREE_TYPE (rhs), mask_p))
>      {
> @@ -5659,9 +5673,11 @@ gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace)
>    cplx_result = true;
>    break;
>  case IFN_MASK_LOAD:
> + case IFN_LEN_MASK_LOAD:
>    changed |= gimple_fold_partial_load (gsi, stmt, true);
>    break;
>  case IFN_MASK_STORE:
> + case IFN_LEN_MASK_STORE:
>    changed |= gimple_fold_partial_store (gsi, stmt, true);
>    break;
>  case IFN_LEN_LOAD:
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index da9b944dd5d..4a9fe388eed 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -165,6 +165,7 @@ init_internal_fns ()
>  #define mask_load_lanes_direct { -1, -1, false }
>  #define gather_load_direct { 3, 1, false }
>  #define len_load_direct { -1, -1, false }
> +#define len_maskload_direct { -1, 3, false }
>  #define mask_store_direct { 3, 2, false }
>  #define store_lanes_direct { 0, 0, false }
>  #define mask_store_lanes_direct { 0, 0, false }
> @@ -172,6 +173,7 @@ init_internal_fns ()
>  #define vec_cond_direct { 2, 0, false }
>  #define scatter_store_direct { 3, 1, false }
>  #define len_store_direct { 3, 3, false }
> +#define len_maskstore_direct { 4, 3, false }
>  #define vec_set_direct { 3, 3, false }
>  #define unary_direct { 0, 0, true }
>  #define unary_convert_direct { -1, 0, true }
> @@ -2875,6 +2877,17 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>        create_input_operand (&ops[3], bias, QImode);
>        expand_insn (icode, 4, ops);
>      }
> +  else if (optab == len_maskload_optab)
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> +    TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      maskt = gimple_call_arg (stmt, 3);
> +      mask = expand_normal (maskt);
> +      create_input_operand (&ops[3], mask, TYPE_MODE (TREE_TYPE (maskt)));
> +      icode = convert_optab_handler (optab, TYPE_MODE (type),
> +      TYPE_MODE (TREE_TYPE (maskt)));
> +      expand_insn (icode, 4, ops);
> +    }
>    else
>      {
>        create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> @@ -2888,6 +2901,7 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  #define expand_mask_load_optab_fn expand_partial_load_optab_fn
>  #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn
>  #define expand_len_load_optab_fn expand_partial_load_optab_fn
> +#define expand_len_maskload_optab_fn expand_partial_load_optab_fn
>  
>  /* Expand MASK_STORE{,_LANES} or LEN_STORE call STMT using optab OPTAB.  */
>  
> @@ -2900,7 +2914,7 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> -  rhs = gimple_call_arg (stmt, 3);
> +  rhs = gimple_call_arg (stmt, optab == len_maskstore_optab ? 4 : 3);
 
see above
 
>    type = TREE_TYPE (rhs);
>    lhs = expand_call_mem_ref (type, stmt, 0);
>  
> @@ -2927,6 +2941,16 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>        create_input_operand (&ops[3], bias, QImode);
>        expand_insn (icode, 4, ops);
>      }
> +  else if (optab == len_maskstore_optab)
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> +    TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      maskt = gimple_call_arg (stmt, 3);
> +      mask = expand_normal (maskt);
> +      create_input_operand (&ops[3], mask, TYPE_MODE (TREE_TYPE (maskt)));
> +      icode = convert_optab_handler (optab, TYPE_MODE (type), GET_MODE (mask));
> +      expand_insn (icode, 4, ops);
> +    }
>    else
>      {
>        create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> @@ -2937,6 +2961,7 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  #define expand_mask_store_optab_fn expand_partial_store_optab_fn
>  #define expand_mask_store_lanes_optab_fn expand_mask_store_optab_fn
>  #define expand_len_store_optab_fn expand_partial_store_optab_fn
> +#define expand_len_maskstore_optab_fn expand_partial_store_optab_fn
>  
>  /* Expand VCOND, VCONDU and VCONDEQ optab internal functions.
>     The expansion of STMT happens based on OPTAB table associated.  */
> @@ -3890,6 +3915,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>  #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
>  #define direct_gather_load_optab_supported_p convert_optab_supported_p
>  #define direct_len_load_optab_supported_p direct_optab_supported_p
> +#define direct_len_maskload_optab_supported_p convert_optab_supported_p
>  #define direct_mask_store_optab_supported_p convert_optab_supported_p
>  #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
>  #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
> @@ -3897,6 +3923,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>  #define direct_vec_cond_optab_supported_p convert_optab_supported_p
>  #define direct_scatter_store_optab_supported_p convert_optab_supported_p
>  #define direct_len_store_optab_supported_p direct_optab_supported_p
> +#define direct_len_maskstore_optab_supported_p convert_optab_supported_p
>  #define direct_while_optab_supported_p convert_optab_supported_p
>  #define direct_fold_extract_optab_supported_p direct_optab_supported_p
>  #define direct_fold_left_optab_supported_p direct_optab_supported_p
> @@ -4361,6 +4388,7 @@ internal_load_fn_p (internal_fn fn)
>      case IFN_GATHER_LOAD:
>      case IFN_MASK_GATHER_LOAD:
>      case IFN_LEN_LOAD:
> +    case IFN_LEN_MASK_LOAD:
>        return true;
>  
>      default:
> @@ -4381,6 +4409,7 @@ internal_store_fn_p (internal_fn fn)
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
>      case IFN_LEN_STORE:
> +    case IFN_LEN_MASK_STORE:
>        return true;
>  
>      default:
> @@ -4420,6 +4449,10 @@ internal_fn_mask_index (internal_fn fn)
>      case IFN_MASK_STORE_LANES:
>        return 2;
>  
> +    case IFN_LEN_MASK_LOAD:
> +    case IFN_LEN_MASK_STORE:
> +      return 3;
> +
>      case IFN_MASK_GATHER_LOAD:
>      case IFN_MASK_SCATTER_STORE:
>        return 4;
> @@ -4444,6 +4477,8 @@ internal_fn_stored_value_index (internal_fn fn)
>      case IFN_MASK_SCATTER_STORE:
>      case IFN_LEN_STORE:
>        return 3;
> +    case IFN_LEN_MASK_STORE:
> +      return 4;
>  
>      default:
>        return -1;
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 5d638de6d06..cf0bcea5ac7 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -50,12 +50,14 @@ along with GCC; see the file COPYING3.  If not see
>     - mask_load_lanes: currently just vec_mask_load_lanes
>     - gather_load: used for {mask_,}gather_load
>     - len_load: currently just len_load
> +   - len_maskload: currently just len_maskload
>  
>     - mask_store: currently just maskstore
>     - store_lanes: currently just vec_store_lanes
>     - mask_store_lanes: currently just vec_mask_store_lanes
>     - scatter_store: used for {mask_,}scatter_store
>     - len_store: currently just len_store
> +   - len_maskstore: currently just len_maskstore
>  
>     - unary: a normal unary optab, such as vec_reverse_<mode>
>     - binary: a normal binary optab, such as vec_interleave_lo_<mode>
> @@ -157,6 +159,7 @@ DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
>         mask_gather_load, gather_load)
>  
>  DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load)
> +DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, len_maskload, len_maskload)
>  
>  DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
>  DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
> @@ -175,6 +178,7 @@ DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
>  DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
>  
>  DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> +DEF_INTERNAL_OPTAB_FN (LEN_MASK_STORE, 0, len_maskstore, len_maskstore)
>  
>  DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
>  DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
> diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> index 276f8408dd7..ec765e78088 100644
> --- a/gcc/optabs-query.cc
> +++ b/gcc/optabs-query.cc
> @@ -624,6 +624,45 @@ get_len_load_store_mode (machine_mode mode, bool is_load)
>    return opt_machine_mode ();
>  }
>  
> +/* Return true if target supports vector length && masked load/store for mode.
> +   Length is used on loop control and mask is used on flow control.  */
> +
> +bool
> +can_vec_len_mask_load_store_p (machine_mode mode, bool is_load)
 
why deviate from can_vec_mask_load_store_p and not pass in the
mask_mode?  In fact I wonder why this function differs from
can_vec_mask_load_store_p besides using other optabs?  Couldn't
we simply add a bool with_len argument to can_vec_mask_load_store_p?
 
> +{
> +  optab op = is_load ? len_maskload_optab : len_maskstore_optab;
> +  machine_mode vmode;
> +  machine_mode mask_mode;
> +
> +  /* If mode is vector mode, check it directly.  */
> +  if (VECTOR_MODE_P (mode))
> +    return targetm.vectorize.get_mask_mode (mode).exists (&mask_mode)
> +    && convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing;
> +
> +  scalar_mode smode;
> +  if (is_a<scalar_mode> (mode, &smode))
> +    /* See if there is any chance the mask load or store might be
> +       vectorized.  If not, punt.  */
> +    vmode = targetm.vectorize.preferred_simd_mode (smode);
> +  else
> +    vmode = mode;
> +
> +  if (VECTOR_MODE_P (vmode)
> +      && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
> +      && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
> +    return true;
> +
> +  auto_vector_modes vector_modes;
> +  targetm.vectorize.autovectorize_vector_modes (&vector_modes, true);
> +  for (machine_mode base_mode : vector_modes)
> +    if (related_vector_mode (base_mode, smode).exists (&vmode)
> + && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
> + && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
> +      return true;
> +
> +  return false;
> +}
> +
>  /* Return true if there is a compare_and_swap pattern.  */
>  
>  bool
> diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
> index b266d2fe990..2b9c9b44af2 100644
> --- a/gcc/optabs-query.h
> +++ b/gcc/optabs-query.h
> @@ -189,6 +189,7 @@ enum insn_code find_widening_optab_handler_and_mode (optab, machine_mode,
>  int can_mult_highpart_p (machine_mode, bool);
>  bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool);
>  opt_machine_mode get_len_load_store_mode (machine_mode, bool);
> +bool can_vec_len_mask_load_store_p (machine_mode, bool);
>  bool can_compare_and_swap_p (machine_mode, bool);
>  bool can_atomic_exchange_p (machine_mode, bool);
>  bool can_atomic_load_p (machine_mode);
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index f31b69c5d85..f5401aea364 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -91,6 +91,8 @@ OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b")
>  OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b")
>  OPTAB_CD(maskload_optab, "maskload$a$b")
>  OPTAB_CD(maskstore_optab, "maskstore$a$b")
> +OPTAB_CD(len_maskload_optab, "len_maskload$a$b")
> +OPTAB_CD(len_maskstore_optab, "len_maskstore$a$b")
>  OPTAB_CD(gather_load_optab, "gather_load$a$b")
>  OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b")
>  OPTAB_CD(scatter_store_optab, "scatter_store$a$b")
> diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
> index b576cce6db6..99aca44e6a5 100644
> --- a/gcc/tree-data-ref.cc
> +++ b/gcc/tree-data-ref.cc
> @@ -5816,6 +5816,8 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
>      }
>    case IFN_MASK_LOAD:
>    case IFN_MASK_STORE:
> +   case IFN_LEN_MASK_LOAD:
> +   case IFN_LEN_MASK_STORE:
>      break;
>    default:
>      clobbers_memory = true;
> @@ -5861,11 +5863,13 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
>  switch (gimple_call_internal_fn (stmt))
>    {
>    case IFN_MASK_LOAD:
> +   case IFN_LEN_MASK_LOAD:
>      if (gimple_call_lhs (stmt) == NULL_TREE)
>        break;
>      ref.is_read = true;
>      /* FALLTHRU */
>    case IFN_MASK_STORE:
> +   case IFN_LEN_MASK_STORE:
>      ptr = build_int_cst (TREE_TYPE (gimple_call_arg (stmt, 1)), 0);
>      align = tree_to_shwi (gimple_call_arg (stmt, 1));
>      if (ref.is_read)
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 1393ce184e3..0f549fa528d 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -960,6 +960,9 @@ ifcvt_can_use_mask_load_store (gimple *stmt)
>    if (can_vec_mask_load_store_p (mode, VOIDmode, is_load))
>      return true;
>  
> +  if (can_vec_len_mask_load_store_p (mode, is_load))
> +    return true;
 
But if-conversion only needs the conditional masking, not _len.  I
don't think we need to check this at all?  In fact
can_vec_mask_load_store_p should probably return true when
LEN_MASKLOAD is available since we can always use the full vector
len as len argument?  The suggested bool argument with_len
could also become an enum { WITH_LEN, WITHOUT_LEN, EITHER },
but not sure if that's really necessary.
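
The bool variant could look like this (illustrative only, keeping the
existing scalar-mode/autovectorize_vector_modes iteration where the
"..." are):

bool
can_vec_mask_load_store_p (machine_mode mode, machine_mode mask_mode,
                           bool is_load, bool with_len = false)
{
  optab op = is_load ? maskload_optab : maskstore_optab;
  optab len_op = is_load ? len_maskload_optab : len_maskstore_optab;
  ...
  /* A len_mask{load,store} pattern can stand in for a plain masked
     access by passing the full vector length, so it satisfies both
     queries; the plain optab only satisfies !with_len.  */
  if ((!with_len
       && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
      || convert_optab_handler (len_op, vmode, mask_mode)
           != CODE_FOR_nothing)
    return true;
  ...
}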
 
Are you going to provide len_load, maskload and len_maskload
patterns or just len_maskload?  (I hope the last)
 
> +
>    return false;
>  }
>  
> diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
> index 79ed956e300..100c4b2e7d9 100644
> --- a/gcc/tree-ssa-alias.cc
> +++ b/gcc/tree-ssa-alias.cc
> @@ -2815,11 +2815,13 @@ ref_maybe_used_by_call_p_1 (gcall *call, ao_ref *ref, bool tbaa_p)
>        case IFN_SCATTER_STORE:
>        case IFN_MASK_SCATTER_STORE:
>        case IFN_LEN_STORE:
> +      case IFN_LEN_MASK_STORE:
>  return false;
>        case IFN_MASK_STORE_LANES:
>  goto process_args;
>        case IFN_MASK_LOAD:
>        case IFN_LEN_LOAD:
> +      case IFN_LEN_MASK_LOAD:
>        case IFN_MASK_LOAD_LANES:
>  {
>    ao_ref rhs_ref;
> @@ -3065,6 +3067,7 @@ call_may_clobber_ref_p_1 (gcall *call, ao_ref *ref, bool tbaa_p)
>  return false;
>        case IFN_MASK_STORE:
>        case IFN_LEN_STORE:
> +      case IFN_LEN_MASK_STORE:
>        case IFN_MASK_STORE_LANES:
>  {
>    tree rhs = gimple_call_arg (call,
> diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc
> index eabe8ba4522..acaf844b8ef 100644
> --- a/gcc/tree-ssa-dse.cc
> +++ b/gcc/tree-ssa-dse.cc
> @@ -174,6 +174,17 @@ initialize_ao_ref_for_dse (gimple *stmt, ao_ref *write, bool may_def_ok = false)
>        return true;
>      }
>    break;
> + case IFN_LEN_MASK_STORE:
> +   /* We cannot initialize a must-def ao_ref (in all cases) but we
> +      can provide a may-def variant.  */
> +   if (may_def_ok)
> +     {
> +       ao_ref_init_from_ptr_and_size
> +   (write, gimple_call_arg (stmt, 0),
> +    TYPE_SIZE_UNIT (TREE_TYPE (gimple_call_arg (stmt, 4))));
 
common with IFN_MASK_STORE by using internal_fn_stored_value_index
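
I.e. something like (untested):

        case IFN_MASK_STORE:
        case IFN_LEN_MASK_STORE:
          /* We cannot initialize a must-def ao_ref (in all cases) but we
             can provide a may-def variant.  */
          if (may_def_ok)
            {
              int vidx = internal_fn_stored_value_index
                           (gimple_call_internal_fn (stmt));
              ao_ref_init_from_ptr_and_size
                (write, gimple_call_arg (stmt, 0),
                 TYPE_SIZE_UNIT (TREE_TYPE (gimple_call_arg (stmt, vidx))));
              return true;
            }
          break;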
 
> +       return true;
> +     }
> +   break;
>  default:;
>  }
>      }
> @@ -1483,6 +1494,7 @@ dse_optimize_stmt (function *fun, gimple_stmt_iterator *gsi, sbitmap live_bytes)
>  {
>  case IFN_LEN_STORE:
>  case IFN_MASK_STORE:
> + case IFN_LEN_MASK_STORE:
>    {
>      enum dse_store_status store_status;
>      store_status = dse_classify_store (&ref, stmt, false, live_bytes);
> diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
> index 6fbd2d59318..e8e9df1ab74 100644
> --- a/gcc/tree-ssa-loop-ivopts.cc
> +++ b/gcc/tree-ssa-loop-ivopts.cc
> @@ -2439,6 +2439,7 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p)
>      case IFN_MASK_LOAD:
>      case IFN_MASK_LOAD_LANES:
>      case IFN_LEN_LOAD:
> +    case IFN_LEN_MASK_LOAD:
>        if (op_p == gimple_call_arg_ptr (call, 0))
>  return TREE_TYPE (gimple_call_lhs (call));
>        return NULL_TREE;
> @@ -2450,6 +2451,11 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p)
>  return TREE_TYPE (gimple_call_arg (call, 3));
>        return NULL_TREE;
>  
> +    case IFN_LEN_MASK_STORE:
> +      if (op_p == gimple_call_arg_ptr (call, 0))
> + return TREE_TYPE (gimple_call_arg (call, 4));
 
internal_fn_stored_value_index
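
That would let the store cases share one entry, e.g. (untested):

    case IFN_MASK_STORE:
    case IFN_LEN_MASK_STORE:
      if (op_p == gimple_call_arg_ptr (call, 0))
        return TREE_TYPE
          (gimple_call_arg (call,
                            internal_fn_stored_value_index
                              (gimple_call_internal_fn (call))));
      return NULL_TREE;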
 
> +      return NULL_TREE;
> +
>      default:
>        return NULL_TREE;
>      }
> @@ -7555,6 +7561,8 @@ get_alias_ptr_type_for_ptr_address (iv_use *use)
>      case IFN_MASK_STORE_LANES:
>      case IFN_LEN_LOAD:
>      case IFN_LEN_STORE:
> +    case IFN_LEN_MASK_LOAD:
> +    case IFN_LEN_MASK_STORE:
>        /* The second argument contains the correct alias type.  */
>        gcc_assert (use->op_p = gimple_call_arg_ptr (call, 0));
>        return TREE_TYPE (gimple_call_arg (call, 1));
> diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> index 27c84e78fcf..02fbc4a2dfa 100644
> --- a/gcc/tree-ssa-sccvn.cc
> +++ b/gcc/tree-ssa-sccvn.cc
> @@ -3304,6 +3304,12 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
>      return (void *)-1;
>    break;
> + case IFN_LEN_MASK_STORE:
> +   len = gimple_call_arg (call, 2);
> +   mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> +   if (!tree_fits_uhwi_p (len) || TREE_CODE (mask) != VECTOR_CST)
> +     return (void *)-1;
> +   break;
 
That's too simple, the code using the info is not prepared for
LEN_MASK since it handles maskload and len_load separately.
I suggest to drop this, picking it up separately.
 
>  default:
>    return (void *)-1;
>  }
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index ebe93832b1e..fb83446519a 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -3039,17 +3039,21 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
>        if (!call2 || !gimple_call_internal_p (call2))
>  return false;
>        internal_fn ifn = gimple_call_internal_fn (call1);
> -      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
> +      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE
> +   && ifn != IFN_LEN_MASK_LOAD && ifn != IFN_LEN_MASK_STORE)
>  return false;
>        if (ifn != gimple_call_internal_fn (call2))
>  return false;
>  
>        /* Check that the masks are the same.  Cope with casts of masks,
>  like those created by build_mask_conversion.  */
> -      tree mask1 = gimple_call_arg (call1, 2);
> -      tree mask2 = gimple_call_arg (call2, 2);
> +      unsigned int mask_argno
> + = ifn == IFN_LEN_MASK_LOAD || ifn == IFN_LEN_MASK_STORE ? 3 : 2;
> +      tree mask1 = gimple_call_arg (call1, mask_argno);
> +      tree mask2 = gimple_call_arg (call2, mask_argno);
>        if (!operand_equal_p (mask1, mask2, 0)
> -          && (ifn == IFN_MASK_STORE || !allow_slp_p))
> +   && (ifn == IFN_MASK_STORE || ifn == IFN_LEN_MASK_STORE
> +       || !allow_slp_p))
 
I think you need to verify the length operand is the full vector, note
this is for if-conversion which could care less for _LEN, but if we
insist on using _LEN (I didn't see you changing if-conversion that
way?!) then we need to put in some value even for the "scalar"
placeholder.  I'd suggest to simply use IFN_MASK_{LOAD,STORE} in
if-conversion but vectorize that as LEN_ with full length if
plain MASK_LOAD/STORE isn't available.  Which means these changes
are not necessary at all.
 
>  {
>    mask1 = strip_conversion (mask1);
>    if (!mask1)
> @@ -4292,7 +4296,9 @@ vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
>    if (gcall *call = dyn_cast <gcall *> (stmt))
>      if (!gimple_call_internal_p (call)
>  || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
> -     && gimple_call_internal_fn (call) != IFN_MASK_STORE))
> +     && gimple_call_internal_fn (call) != IFN_MASK_STORE
> +     && gimple_call_internal_fn (call) != IFN_LEN_MASK_LOAD
> +     && gimple_call_internal_fn (call) != IFN_LEN_MASK_STORE))
>        {
>  free_data_ref (dr);
>  return opt_result::failure_at (stmt,
> @@ -6731,7 +6737,9 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
>    if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
>      if (gimple_call_internal_p (stmt)
>  && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
> -     || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
> +     || gimple_call_internal_fn (stmt) == IFN_MASK_STORE
> +     || gimple_call_internal_fn (stmt) == IFN_LEN_MASK_LOAD
> +     || gimple_call_internal_fn (stmt) == IFN_LEN_MASK_STORE))
>        return dr_unaligned_supported;
>  
>    if (loop_vinfo)
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index ace9e759f5b..03de41d4988 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1296,30 +1296,33 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
>    if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
>      return false;
>  
> -  machine_mode len_load_mode = get_len_load_store_mode
> -    (loop_vinfo->vector_mode, true).require ();
> -  machine_mode len_store_mode = get_len_load_store_mode
> -    (loop_vinfo->vector_mode, false).require ();
> +  if (!can_vec_len_mask_load_store_p (loop_vinfo->vector_mode, true)
> +      && !can_vec_len_mask_load_store_p (loop_vinfo->vector_mode, false))
> +    {
> +      machine_mode len_load_mode
> + = get_len_load_store_mode (loop_vinfo->vector_mode, true).require ();
> +      machine_mode len_store_mode
> + = get_len_load_store_mode (loop_vinfo->vector_mode, false).require ();
 
as with can_mask_* we don't really care if masking is supported or
not so I suggest to amend get_len_load_store_mode to also check the
len_mask{load,store} optabs?  Given that the function
should possibly return the corresponding IFN as well.
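
E.g. (untested sketch; the existing QI-element fallback loop would stay
as it is today):

opt_machine_mode
get_len_load_store_mode (machine_mode mode, bool is_load,
                         internal_fn *ifn = NULL)
{
  optab op = is_load ? len_load_optab : len_store_optab;
  optab masked_op = is_load ? len_maskload_optab : len_maskstore_optab;
  machine_mode mask_mode;

  if (direct_optab_handler (op, mode) != CODE_FOR_nothing)
    {
      if (ifn)
        *ifn = is_load ? IFN_LEN_LOAD : IFN_LEN_STORE;
      return mode;
    }
  if (targetm.vectorize.get_mask_mode (mode).exists (&mask_mode)
      && convert_optab_handler (masked_op, mode, mask_mode)
           != CODE_FOR_nothing)
    {
      if (ifn)
        *ifn = is_load ? IFN_LEN_MASK_LOAD : IFN_LEN_MASK_STORE;
      return mode;
    }
  ...
  return opt_machine_mode ();
}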
 
> -  signed char partial_load_bias = internal_len_load_store_bias
> -    (IFN_LEN_LOAD, len_load_mode);
> +      signed char partial_load_bias
> + = internal_len_load_store_bias (IFN_LEN_LOAD, len_load_mode);
>  
> -  signed char partial_store_bias = internal_len_load_store_bias
> -    (IFN_LEN_STORE, len_store_mode);
> +      signed char partial_store_bias
> + = internal_len_load_store_bias (IFN_LEN_STORE, len_store_mode);
>  
> -  gcc_assert (partial_load_bias == partial_store_bias);
> +      gcc_assert (partial_load_bias == partial_store_bias);
>  
> -  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
> -    return false;
> +      if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
> + return false;
>  
> -  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> -     len_loads with a length of zero.  In order to avoid that we prohibit
> -     more than one loop length here.  */
> -  if (partial_load_bias == -1
> -      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
> -    return false;
> +      /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> + len_loads with a length of zero.  In order to avoid that we prohibit
> + more than one loop length here.  */
> +      if (partial_load_bias == -1 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
> + return false;
>  
> -  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
> +      LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
> +    }
>  
>    unsigned int max_nitems_per_iter = 1;
>    unsigned int i;
> @@ -11317,7 +11320,8 @@ optimize_mask_stores (class loop *loop)
>     gsi_next (&gsi))
>  {
>    stmt = gsi_stmt (gsi);
> -   if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
> +   if (gimple_call_internal_p (stmt, IFN_MASK_STORE)
> +       || gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
>      worklist.safe_push (stmt);
>  }
>      }
> @@ -11340,7 +11344,8 @@ optimize_mask_stores (class loop *loop)
>        tree zero;
>  
>        last = worklist.pop ();
> -      mask = gimple_call_arg (last, 2);
> +      mask = gimple_call_arg (
> + last, gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE) ? 3 : 2);
 
use the proper ifn index compute fn
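
I.e. (untested):

      internal_fn ifn = gimple_call_internal_fn (last);
      mask = gimple_call_arg (last, internal_fn_mask_index (ifn));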
 
>        bb = gimple_bb (last);
>        /* Create then_bb and if-then structure in CFG, then_bb belongs to
>  the same loop as if_bb.  It could be different to LOOP when two
> @@ -11473,7 +11478,12 @@ optimize_mask_stores (class loop *loop)
>      }
>    /* Put other masked stores with the same mask to STORE_BB.  */
>    if (worklist.is_empty ()
> -       || gimple_call_arg (worklist.last (), 2) != mask
> +       || gimple_call_arg (worklist.last (),
> +   gimple_call_internal_p (worklist.last (),
> +   IFN_LEN_MASK_STORE)
> +     ? 3
> +     : 2)
 
use the proper ifn index compute fn
 
> +    != mask
>        || worklist.last () != stmt1)
>      break;
>    last = worklist.pop ();
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index ab89a82f1b3..937b5295df4 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -489,6 +489,7 @@ static const int cond_expr_maps[3][5] = {
>  };
>  static const int arg1_map[] = { 1, 1 };
>  static const int arg2_map[] = { 1, 2 };
> +static const int arg3_map[] = { 1, 3 };
>  static const int arg1_arg4_map[] = { 2, 1, 4 };
>  static const int op1_op0_map[] = { 2, 1, 0 };
>  
> @@ -524,6 +525,9 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
>    case IFN_MASK_LOAD:
>      return arg2_map;
>  
> +   case IFN_LEN_MASK_LOAD:
> +     return arg3_map;
> +
>    case IFN_GATHER_LOAD:
>      return arg1_map;
>  
> @@ -1779,6 +1783,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>      {
>        if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
>  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
> +     || gimple_call_internal_p (stmt, IFN_LEN_MASK_LOAD)
 
It's probably not necessary to handle this if if-conversion emits
just IFN_MASK_LOAD.
 
>      || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
>      || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
>        else
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index a7acc032d47..9b797c61c88 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1837,6 +1837,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>        using_partial_vectors_p = true;
>      }
>  
> +  if (can_vec_len_mask_load_store_p (vecmode, is_load))
> +    {
> +      nvectors = group_memory_nvectors (group_size * vf, nunits);
> +      /* Length is used on loop control and mask for flow control.*/
> +      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> +      vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> +      using_partial_vectors_p = true;
> +    }
> +
 
so this answers my question - you just have len_mask{load,store}?
 
>    if (!using_partial_vectors_p)
>      {
>        if (dump_enabled_p ())
> @@ -7978,8 +7987,9 @@ vectorizable_store (vec_info *vinfo,
>        if (memory_access_type == VMAT_CONTIGUOUS)
>  {
>    if (!VECTOR_MODE_P (vec_mode)
> -       || !can_vec_mask_load_store_p (vec_mode,
> -      TYPE_MODE (mask_vectype), false))
> +       || (!can_vec_mask_load_store_p (vec_mode,
> +       TYPE_MODE (mask_vectype), false)
> +   && !can_vec_len_mask_load_store_p (vec_mode, false)))
>      return false;
>  }
>        else if (memory_access_type != VMAT_LOAD_STORE_LANES
> @@ -8942,7 +8952,38 @@ vectorizable_store (vec_info *vinfo,
>  }
>  
>        /* Arguments are ready.  Create the new vector stmt.  */
> -       if (final_mask)
> +       if (can_vec_len_mask_load_store_p (TYPE_MODE (vectype), false)
> +   && (final_mask || loop_lens))
 
I think we really want to common this somehow, having
if (loop_lens) do the final_len compute and then afterwards
select the IFN to create, filling required default args of
final_mask and final_len if not computed.
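
Roughly (untested sketch; vect_build_full_mask is a hypothetical helper,
see the sketch further below):

           tree final_len = NULL_TREE;
           if (loop_lens)
             final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
                                            vec_num * ncopies, vectype,
                                            vec_num * j + i, 1);
           if (final_len || final_mask)
             {
               /* A full-vector length and an all-true mask make
                  LEN_MASK_STORE behave like the plain variants.  */
               tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
               if (!final_len)
                 final_len = build_int_cst (iv_type,
                                            TYPE_VECTOR_SUBPARTS (vectype));
               if (!final_mask)
                 final_mask = vect_build_full_mask (vectype);
               gcall *call
                 = gimple_build_call_internal (IFN_LEN_MASK_STORE, 5,
                                               dataref_ptr, ptr, final_len,
                                               final_mask, vec_oprnd);
               gimple_call_set_nothrow (call, true);
               vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
               new_stmt = call;
             }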
 
> + {
> +   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> +   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +   if (!final_mask)
> +     {
> +       machine_mode mask_mode
> + = targetm.vectorize.get_mask_mode (TYPE_MODE (vectype))
> +     .require ();
> +       mask_vectype
> + = build_truth_vector_type_for_mode (nunits, mask_mode);
> +       tree mask = build_int_cst (TREE_TYPE (mask_vectype), -1);
> +       final_mask = build_vector_from_val (mask_vectype, mask);
> +     }
> +   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +   tree final_len;
> +   if (loop_lens)
> +     final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +    vec_num * ncopies, vectype,
> +    vec_num * j + i, 1);
> +   else
> +     final_len = build_int_cst (iv_type, nunits);
> +   gcall *call
> +     = gimple_build_call_internal (IFN_LEN_MASK_STORE, 5,
> +   dataref_ptr, ptr, final_len,
> +   final_mask, vec_oprnd);
> +   gimple_call_set_nothrow (call, true);
> +   vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> +   new_stmt = call;
> + }
> +       else if (final_mask)
>  {
>    tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>    gcall *call
> @@ -9407,8 +9448,9 @@ vectorizable_load (vec_info *vinfo,
>  {
>    machine_mode vec_mode = TYPE_MODE (vectype);
>    if (!VECTOR_MODE_P (vec_mode)
> -       || !can_vec_mask_load_store_p (vec_mode,
> -      TYPE_MODE (mask_vectype), true))
> +       || (!can_vec_mask_load_store_p (vec_mode,
> +       TYPE_MODE (mask_vectype), true)
> +   && !can_vec_len_mask_load_store_p (vec_mode, false)))
>      return false;
>  }
>        else if (memory_access_type != VMAT_LOAD_STORE_LANES
> @@ -10301,7 +10343,47 @@ vectorizable_load (vec_info *vinfo,
>        align, misalign);
>      align = least_bit_hwi (misalign | align);
>  
> -     if (final_mask)
> +     if (can_vec_len_mask_load_store_p (TYPE_MODE (vectype),
> +        true)
> + && (final_mask || loop_lens)
> + && memory_access_type != VMAT_INVARIANT)
 
same
 
> +       {
> + tree ptr
> +   = build_int_cst (ref_type, align * BITS_PER_UNIT);
> + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> + if (!final_mask)
> +   {
> +     machine_mode mask_mode
> +       = targetm.vectorize
> +   .get_mask_mode (TYPE_MODE (vectype))
> +   .require ();
> +     mask_vectype
> +       = build_truth_vector_type_for_mode (nunits,
> +   mask_mode);
> +     tree mask
> +       = build_int_cst (TREE_TYPE (mask_vectype), -1);
> +     final_mask
> +       = build_vector_from_val (mask_vectype, mask);
> +   }
 
and split this out to a helper function
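
E.g. (untested; the name vect_build_full_mask is made up):

static tree
vect_build_full_mask (tree vectype)
{
  /* Build the all-true mask constant for VECTYPE's mask type.  */
  machine_mode mask_mode
    = targetm.vectorize.get_mask_mode (TYPE_MODE (vectype)).require ();
  tree mask_vectype
    = build_truth_vector_type_for_mode (TYPE_VECTOR_SUBPARTS (vectype),
                                        mask_mode);
  return build_vector_from_val (mask_vectype,
                                build_int_cst (TREE_TYPE (mask_vectype), -1));
}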
 
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> + tree final_len;
> + if (loop_lens)
> +   final_len
> +     = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i, 1);
> + else
> +   final_len = build_int_cst (iv_type, nunits);
> +
> + gcall *call
> +   = gimple_build_call_internal (IFN_LEN_MASK_LOAD, 4,
> + dataref_ptr, ptr,
> + final_len, final_mask);
> + gimple_call_set_nothrow (call, true);
> + new_stmt = call;
> + data_ref = NULL_TREE;
> +       }
> +     else if (final_mask)
>        {
>  tree ptr = build_int_cst (ref_type,
>    align * BITS_PER_UNIT);
> @@ -13027,7 +13109,8 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>    if (gimple_get_lhs (stmt) == NULL_TREE
>        /* MASK_STORE has no lhs, but is ok.  */
> -      && !gimple_call_internal_p (stmt, IFN_MASK_STORE))
> +      && !gimple_call_internal_p (stmt, IFN_MASK_STORE)
> +      && !gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
 
See if-conversion comment.  This shouldn't be necessary.
 
>      {
>        if (is_a <gcall *> (stmt))
>  {
> @@ -13071,6 +13154,8 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>  scalar_type = TREE_TYPE (DR_REF (dr));
>        else if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
>  scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
> +      else if (gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
> + scalar_type = TREE_TYPE (gimple_call_arg (stmt, 4));
>        else
>  scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
>  
> diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
> index a048e9d8917..19312404ac4 100644
> --- a/gcc/tree-vectorizer.cc
> +++ b/gcc/tree-vectorizer.cc
> @@ -1101,6 +1101,8 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
>  {
>    internal_fn ifn = gimple_call_internal_fn (call);
>    if (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE
> +       || ifn == IFN_LEN_MASK_LOAD
> +       || ifn == IFN_LEN_MASK_STORE
 
Likewise.
 
>        /* Don't keep the if-converted parts when the ifn with
>  specifc type is not supported by the backend.  */
>        || (direct_internal_fn_p (ifn)
>
Robin Dapp June 15, 2023, 8:58 a.m. UTC | #3
>>> Can you try using the same wording for length and mask operands
>>> as for len_load and maskload?  Also len_load has the "bias"
>>> operand which you omit here - IIRC that was added for s390 which
>>> for unknown reason behaves a little different than power.  If
>>> len support for s390 ever extends to other ops or power or s390
>>> gain mask support for conditional code we'd likely have to adjust
>>> each optab you add.  Maybe it's better to add the bias operand
>>> now.
> 
> I don't know BIAS well and it seems to be a Power target-dependent feature.
> I think len_mask_* in general should only need length and mask operands.
> Actually, the function arguments are exactly the same as vp_load/vp_store in LLVM.
> 
> Could I just keep the current format (without the BIAS argument) and extend it
> with BIAS if PowerPC wants to use LEN_MASK_*?

FYI: The only proper user of bias is s390 because the insns cannot handle a
zero length.  Power doesn't actually need it.  What we do is just subtract the
bias (== 1) from the length in case of n_rgroups == 1 and nothing for
bias == 0 so the actual bias support code needed is small.

Regards
 Robin
juzhe.zhong@rivai.ai June 15, 2023, 9:01 a.m. UTC | #4
OK. I will add the BIAS argument too, for the convenience of a possible s390 user,
even though we (RVV) don't really need it.

Thanks. Will send a small patch V3 soon.


juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-06-15 16:58
To: juzhe.zhong@rivai.ai; rguenther
CC: rdapp.gcc; gcc-patches; richard.sandiford; krebbel; uweigand; linkw
Subject: Re: [PATCH V2] VECT: Support LEN_MASK_ LOAD/STORE to support flow control for length loop control
>>> Can you try using the same wording for length and mask operands
>>> as for len_load and maskload?  Also len_load has the "bias"
>>> operand which you omit here - IIRC that was added for s390 which
>>> for unknown reason behaves a little different than power.  If
>>> len support for s390 ever extends to other ops or power or s390
>>> gain mask support for conditional code we'd likely have to adjust
>>> each optab you add.  Maybe it's better to add the bias operand
>>> now.
> 
> I don't know BIAS well and it seems to be a Power target-dependent feature.
> I think len_mask_* in general should only need length and mask operands.
> Actually, the function arguments are exactly the same as vp_load/vp_store in LLVM.
> 
> Could I just keep the current format (without the BIAS argument) and extend it
> with BIAS if PowerPC wants to use LEN_MASK_*?
 
FYI: The only proper user of bias is s390 because the insns cannot handle a
zero length.  Power doesn't actually need it.  What we do is just subtract the
bias (== 1) from the length in case of n_rgroups == 1 and nothing for
bias == 0 so the actual bias support code needed is small.
 
Regards
Robin
Richard Biener June 15, 2023, 9:15 a.m. UTC | #5
On Thu, 15 Jun 2023, Robin Dapp wrote:

> >>> Can you try using the same wording for length and mask operands
> >>> as for len_load and maskload?  Also len_load has the "bias"
> >>> operand which you omit here - IIRC that was added for s390 which
> >>> for unknown reason behaves a little different than power.  If
> >>> len support for s390 ever extends to other ops or power or s390
> >>> gain mask support for conditional code we'd likely have to adjust
> >>> each optab you add.  Maybe it's better to add the bias operand
> >>> now.
> > 
> > I don't know BIAS well and it seems to be a Power target-dependent feature.
> > I think len_mask_* in general should only need length and mask operands.
> > Actually, the function arguments are exactly the same as vp_load/vp_store in LLVM.
> > 
> > Could I just keep the current format (without the BIAS argument) and extend it
> > with BIAS if PowerPC wants to use LEN_MASK_*?
> 
> FYI: The only proper user of bias is s390 because the insns cannot handle a
> zero length.  Power doesn't actually need it.  What we do is just subtract the
> bias (== 1) from the length in case of n_rgroups == 1 and nothing for
> bias == 0 so the actual bias support code needed is small.

Meh, PoP is now behind a paywall, trying to get through ... I wonder
if there's a nice online html documenting the s390 len_load/store
instructions to better understand the need for the bias.

Richard.
Robin Dapp June 15, 2023, 9:18 a.m. UTC | #6
> Meh, PoP is now behind a paywall, trying to get through ... I wonder
> if there's a nice online html documenting the s390 len_load/store
> instructions to better understand the need for the bias.

https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf

Look for vector load with length (store).  The length operand specifies
the highest byte to load instead of the actual length.
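
I.e. (if I read the PoP right) a length operand of 15 loads bytes 0..15,
so loading N bytes takes an operand of N - 1, and a load of zero bytes is
not representable at all -- hence the -1 bias.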

Regards
 Robin
Robin Dapp June 15, 2023, 9:20 a.m. UTC | #7
On 6/15/23 11:18, Robin Dapp wrote:
>> Meh, PoP is now behind a paywall, trying to get through ... I wonder
>> if there's a nice online html documenting the s390 len_load/store
>> instructions to better understand the need for the bias.

This is z16, but obviously no changes for vll/vstl:
https://publibfp.dhe.ibm.com/epubs/pdf/a227832d.pdf

Regards
 Robin
Richard Biener June 15, 2023, 9:52 a.m. UTC | #8
On Thu, 15 Jun 2023, Robin Dapp wrote:

> > Meh, PoP is now behind a paywall, trying to get through ... I wonder
> > if there's a nice online html documenting the s390 len_load/store
> > instructions to better understand the need for the bias.
> 
> https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> 
> Look for vector load with length (store).  The length operand specifies
> the highest byte to load instead of the actual length.

Hmm.  It indeed cannot represent len == 0, so you are making sure
that never happens?  Because when it is actually zero you are
going to get -1 here?  At least I don't see the bias operand used at
all:

; Implement len_load/len_store optabs with vll/vstl.
(define_expand "len_load_v16qi"
  [(match_operand:V16QI 0 "register_operand")
   (match_operand:V16QI 1 "memory_operand")
   (match_operand:QI 2 "register_operand")
   (match_operand:QI 3 "vll_bias_operand")
  ]
  "TARGET_VX && TARGET_64BIT"
{
  rtx mem = adjust_address (operands[1], BLKmode, 0);

  rtx len = gen_reg_rtx (SImode);
  emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2]));
  emit_insn (gen_vllv16qi (operands[0], len, mem));
  DONE;
})

the docs of len_load say

"
@cindex @code{len_load_@var{m}} instruction pattern
@item @samp{len_load_@var{m}}
Load (operand 2 - operand 3) elements from vector memory operand 1
into vector register operand 0, setting the other elements of
operand 0 to undefined values.  Operands 0 and 1 have mode @var{m},
which must be a vector mode.  Operand 2 has whichever integer mode the
target prefers.  Operand 3 conceptually has mode @code{QI}. 

Operand 2 can be a variable or a constant amount.  Operand 3 specifies a
constant bias: it is either a constant 0 or a constant -1.  The predicate 
on
operand 3 must only accept the bias values that the target actually 
supports.
GCC handles a bias of 0 more efficiently than a bias of -1.

If (operand 2 - operand 3) exceeds the number of elements in mode
@var{m}, the behavior is undefined.

If the target prefers the length to be measured in bytes rather than
elements, it should only implement this pattern for vectors of @code{QI}
elements."

the minus in 'operand 2 - operand 3' should be a plus if the
bias is really zero or -1.  I suppose

'If (operand 2 - operand 3) exceeds the number of elements in mode
@var{m}, the behavior is undefined.'

means that the vectorizer has to make sure the biased element
count never underflows?

That is, for a loop like

void foo (double *x, float *y, int n)
{
  for (int i = 0; i < n; ++i)
    y[i] = x[i];
}

you should get

   x1 = len_load (...);
   x2 = len_load (...);
   y = VEC_PACK_TRUNC_EXPR <x1, x2>
   len_store (..., y);

but then the x2 load can end up with a len of zero and thus
trap (since you will load either a full vector or the first
byte of it).  I see you do

  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
     len_loads with a length of zero.  In order to avoid that we prohibit
     more than one loop length here.  */
  if (partial_load_bias == -1
      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    return false;

that's quite conservative.  I think you can do better when the
loads are aligned, reading an extra byte when ignoring the bias
is OK and you at least know the very first element is used.
For stores you would need to emit compare&jump for all but
the first store of a group though ...

That said, I'm still not seeing where you actually apply the bias.

Richard.
juzhe.zhong@rivai.ai June 15, 2023, 10:08 a.m. UTC | #9
Hi, Richi. I have sent the first split patch (only adding the ifn and optabs) as you suggested:
https://gcc.gnu.org/pipermail/gcc-patches/2023-June/621874.html 
Could you take a look at it?
After this patch is approved, I will send the second patch (supporting them in the vectorizer) next.

Thanks!


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-06-15 17:52
To: Robin Dapp
CC: juzhe.zhong@rivai.ai; gcc-patches; richard.sandiford; krebbel; uweigand; linkw
Subject: Re: [PATCH V2] VECT: Support LEN_MASK_ LOAD/STORE to support flow control for length loop control
On Thu, 15 Jun 2023, Robin Dapp wrote:
 
> > Meh, PoP is now behind a paywall, trying to get through ... I wonder
> > if there's a nice online html documenting the s390 len_load/store
> > instructions to better understand the need for the bias.
> 
> https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> 
> Look for vector load with length (store).  The length operand specifies
> the highest byte to load instead of the actual length.
 
Hmm.  It indeed cannot represent len == 0, so you are making sure
that never happens?  Because when it is actually zero you are
going to get -1 here?  At least I don't see the bias operand used at
all:
 
; Implement len_load/len_store optabs with vll/vstl.
(define_expand "len_load_v16qi"
  [(match_operand:V16QI 0 "register_operand")
   (match_operand:V16QI 1 "memory_operand")
   (match_operand:QI 2 "register_operand")
   (match_operand:QI 3 "vll_bias_operand")
  ]
  "TARGET_VX && TARGET_64BIT"
{
  rtx mem = adjust_address (operands[1], BLKmode, 0);
 
  rtx len = gen_reg_rtx (SImode);
  emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2]));
  emit_insn (gen_vllv16qi (operands[0], len, mem));
  DONE;
})
 
the docs of len_load say
 
"
@cindex @code{len_load_@var{m}} instruction pattern
@item @samp{len_load_@var{m}}
Load (operand 2 - operand 3) elements from vector memory operand 1
into vector register operand 0, setting the other elements of
operand 0 to undefined values.  Operands 0 and 1 have mode @var{m},
which must be a vector mode.  Operand 2 has whichever integer mode the
target prefers.  Operand 3 conceptually has mode @code{QI}. 
 
Operand 2 can be a variable or a constant amount.  Operand 3 specifies a
constant bias: it is either a constant 0 or a constant -1.  The predicate 
on
operand 3 must only accept the bias values that the target actually 
supports.
GCC handles a bias of 0 more efficiently than a bias of -1.
 
If (operand 2 - operand 3) exceeds the number of elements in mode
@var{m}, the behavior is undefined.
 
If the target prefers the length to be measured in bytes rather than
elements, it should only implement this pattern for vectors of @code{QI}
elements."
 
the minus in 'operand 2 - operand 3' should be a plus if the
bias is really zero or -1.  I suppose
 
'If (operand 2 - operand 3) exceeds the number of elements in mode
@var{m}, the behavior is undefined.'
 
means that the vectorizer has to make sure the biased element
count never underflows?
 
That is, for a loop like
 
void foo (double *x, float *y, int n)
{
  for (int i = 0; i < n; ++i)
    y[i] = x[i];
}
 
you should get
 
   x1 = len_load (...);
   x2 = len_load (...);
   y = VEC_PACK_TRUNC_EXPR <x1, x2>
   len_store (..., y);
 
but then the x2 load can end up with a len of zero and thus
trap (since you will load either a full vector or the first
byte of it).  I see you do
 
  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
     len_loads with a length of zero.  In order to avoid that we prohibit
     more than one loop length here.  */
  if (partial_load_bias == -1
      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    return false;
 
that's quite conservative.  I think you can do better when the
loads are aligned, reading an extra byte when ignoring the bias
is OK and you at least know the very first element is used.
For stores you would need to emit compare&jump for all but
the first store of a group though ...
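In a C sketch, such a guard could look like this (a hypothetical
lowering for illustration, not something the patch emits):

#include <string.h>

/* Guard every store of a group except the first, so that a zero
   length -- which the -1 bias would turn into -1 -- never reaches a
   vstl-style instruction.  */
static void
guarded_biased_store (unsigned char *dst, const unsigned char *src, int len)
{
  if (len > 0)				/* the compare&jump */
    memcpy (dst, src, (size_t) len);	/* stands in for vstl, index len - 1 */
}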
 
That said, I'm still not seeing where you actually apply the bias.
 
Richard.
Robin Dapp June 15, 2023, 10:10 a.m. UTC | #10
> the minus in 'operand 2 - operand 3' should be a plus if the
> bias is really zero or -1.  I suppose

Yes, that somehow got lost from when the bias was still +1.  Maybe
Juzhe can fix this in the course of his patch.

> that's quite conservative.  I think you can do better when the
> loads are aligned, reading an extra byte when ignoring the bias
> is OK and you at least know the very first element is used.
> For stores you would need to emit compare&jump for all but
> the first store of a group though ...

The implementation is a first shot, and yes, we could do a bit
better, but limiting to a single rgroup is IMHO the more severe
restriction.  The pattern wasn't hit very often across SPEC
either way.  I think proper masking is overall more important for
fixed-length vectors, while length control might be more useful
for variable-length vectors.  Just my gut feeling though; you're
the expert there.

> That said, I'm still not seeing where you actually apply the bias.

We do

+
+  int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+  if (partial_load_bias != 0)
+    {
+      tree adjusted_len = rgc->bias_adjusted_ctrl;
+      gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
+                                           rgc->controls[0],
+                                           build_int_cst
+                                           (TREE_TYPE (rgc->controls[0]),
+                                            partial_load_bias));
+      gimple_seq_add_stmt (header_seq, minus);
+    }
+

as well as

+         if (use_bias_adjusted_len)
+           {
+             gcc_assert (i == 0);
+             tree adjusted_len =
+               make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
+             SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
+             rgl->bias_adjusted_ctrl = adjusted_len;
+           }

Regards
 Robin
Richard Biener June 15, 2023, 11:12 a.m. UTC | #11
On Thu, 15 Jun 2023, Robin Dapp wrote:

> > the minus in 'operand 2 - operand 3' should be a plus if the
> > bias is really zero or -1.  I suppose
> 
> Yes, that somehow got lost from when the bias was still +1.  Maybe
> Juzhe can fix this in the course of his patch.
> 
> > that's quite conservative.  I think you can do better when the
> > loads are aligned, reading an extra byte when ignoring the bias
> > is OK and you at least know the very first element is used.
> > For stores you would need to emit compare&jump for all but
> > the first store of a group though ...
> 
> The implementation is a first shot and yes we could do a bit
> better but limiting to a single rgroup is IMHO the more severe
> restriction.  The pattern wasn't hit very often across SPEC
> either way.  I think overall proper masking is  more important for
> fixed-length vectors while length control might be more useful
> for variable-length vectors.  Just my gut feeling though, you're
> the expert there.
> 
> > That said, I'm still not seeing where you actually apply the bias.
> 
> We do
> 
> +
> +  int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +  if (partial_load_bias != 0)
> +    {
> +      tree adjusted_len = rgc->bias_adjusted_ctrl;
> +      gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
> +                                           rgc->controls[0],
> +                                           build_int_cst
> +                                           (TREE_TYPE (rgc->controls[0]),
> +                                            partial_load_bias));
> +      gimple_seq_add_stmt (header_seq, minus);
> +    }
> +
> 
> as well as
> 
> +         if (use_bias_adjusted_len)
> +           {
> +             gcc_assert (i == 0);
> +             tree adjusted_len =
> +               make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> +             SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
> +             rgl->bias_adjusted_ctrl = adjusted_len;
> +           }

Ah, OK.  It's a bit odd to have predicates on a define_expand.  The
define_expand pattern is expected to only match either literal 0
or literal -1 (and consistently so for all len_ optabs), and thus
operand 2, the length, needs to be adjusted by the middle-end
to match up with the pattern-supplied operand 3.

Richard.
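Schematically, the contract described here amounts to the following
(a C restatement for clarity, not code from the patch):

/* BIAS is the literal that the target's operand-3 predicate accepted,
   either 0 or -1.  The vectorizer applies it once per rgroup control,
   as in Robin's PLUS_EXPR snippet above; the expander then merely
   matches the literal and never applies the bias again.  */
static int
bias_adjusted_length (int loop_len, int bias)
{
  return loop_len + bias;
}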
diff mbox series

Patch

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 95f7fe1f802..fc99990465d 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5136,6 +5136,38 @@  of @code{QI} elements.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{len_maskload@var{m}@var{n}} instruction pattern
+@item @samp{len_maskload@var{m}@var{n}}
+Perform a vector load predicated by length and mask from memory
+operand 1 of mode @var{m} into register operand 0.
+The length is provided in operand 2, which has whichever
+integer mode the target prefers.
+The mask is provided in register operand 3 of mode @var{n}.
+
+Operand 2 can be a variable or a constant amount.  It can be the
+vectorization factor, a special constant value representing the maximum length.
+
+Operand 3 can be a variable or a constant amount.  It can be all ones,
+a special constant value representing the full mask.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{len_maskstore@var{m}@var{n}} instruction pattern
+@item @samp{len_maskstore@var{m}@var{n}}
+Perform a vector store predicated by length and mask from register
+operand 1 of mode @var{m} into memory operand 0.
+The length is provided in operand 2, which has whichever
+integer mode the target prefers.
+The mask is provided in register operand 3 of mode @var{n}.
+
+Operand 2 can be a variable or a constant amount.  It can be the
+vectorization factor, a special constant value representing the maximum length.
+
+Operand 3 can be a variable or a constant amount.  It can be all ones,
+a special constant value representing the full mask.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{vec_perm@var{m}} instruction pattern
 @item @samp{vec_perm@var{m}}
 Output a (variable) vector permutation.  Operand 0 is the destination
diff --git a/gcc/genopinit.cc b/gcc/genopinit.cc
index 0c1b6859ca0..9aeebd66724 100644
--- a/gcc/genopinit.cc
+++ b/gcc/genopinit.cc
@@ -376,7 +376,8 @@  main (int argc, const char **argv)
 
   fprintf (s_file,
 	   "/* Returns TRUE if the target supports any of the partial vector\n"
-	   "   optabs: while_ult_optab, len_load_optab or len_store_optab,\n"
+	   "   optabs: while_ult_optab, len_load_optab, len_store_optab,\n"
+	   "   len_maskload_optab or len_maskstore_optab,\n"
 	   "   for any mode.  */\n"
 	   "bool\npartial_vectors_supported_p (void)\n{\n");
   bool any_match = false;
@@ -386,7 +387,8 @@  main (int argc, const char **argv)
     {
 #define CMP_NAME(N) !strncmp (p->name, (N), strlen ((N)))
       if (CMP_NAME("while_ult") || CMP_NAME ("len_load")
-	  || CMP_NAME ("len_store"))
+	  || CMP_NAME ("len_store") || CMP_NAME ("len_maskload")
+	  || CMP_NAME ("len_maskstore"))
 	{
 	  if (first)
 	    fprintf (s_file, " HAVE_%s", p->name);
diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
index 581575b65ec..a2c2ad5bfe7 100644
--- a/gcc/gimple-fold.cc
+++ b/gcc/gimple-fold.cc
@@ -5370,8 +5370,8 @@  arith_overflowed_p (enum tree_code code, const_tree type,
   return wi::min_precision (wres, sign) > TYPE_PRECISION (type);
 }
 
-/* If IFN_{MASK,LEN}_LOAD/STORE call CALL is unconditional, return a MEM_REF
-   for the memory it references, otherwise return null.  VECTYPE is the
+/* If IFN_{MASK,LEN,LEN_MASK}_LOAD/STORE call CALL is unconditional, return a
+   MEM_REF for the memory it references, otherwise return null.  VECTYPE is the
    type of the memory vector.  MASK_P indicates it's for MASK if true,
    otherwise it's for LEN.  */
 
@@ -5383,7 +5383,20 @@  gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p)
   if (!tree_fits_uhwi_p (alias_align))
     return NULL_TREE;
 
-  if (mask_p)
+  if (gimple_call_internal_fn (call) == IFN_LEN_MASK_LOAD
+      || gimple_call_internal_fn (call) == IFN_LEN_MASK_STORE)
+    {
+      tree basic_len = gimple_call_arg (call, 2);
+      if (!poly_int_tree_p (basic_len))
+	return NULL_TREE;
+      if (maybe_ne (tree_to_poly_uint64 (basic_len),
+		    TYPE_VECTOR_SUBPARTS (vectype)))
+	return NULL_TREE;
+      tree mask = gimple_call_arg (call, 3);
+      if (!integer_all_onesp (mask))
+	return NULL_TREE;
+    }
+  else if (mask_p)
     {
       tree mask = gimple_call_arg (call, 2);
       if (!integer_all_onesp (mask))
@@ -5409,7 +5422,7 @@  gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p)
   return fold_build2 (MEM_REF, vectype, ptr, offset);
 }
 
-/* Try to fold IFN_{MASK,LEN}_LOAD call CALL.  Return true on success.
+/* Try to fold IFN_{MASK,LEN,LEN_MASK}_LOAD call CALL.  Return true on success.
    MASK_P indicates it's for MASK if true, otherwise it's for LEN.  */
 
 static bool
@@ -5431,14 +5444,15 @@  gimple_fold_partial_load (gimple_stmt_iterator *gsi, gcall *call, bool mask_p)
   return false;
 }
 
-/* Try to fold IFN_{MASK,LEN}_STORE call CALL.  Return true on success.
+/* Try to fold IFN_{MASK,LEN,LEN_MASK}_STORE call CALL.  Return true on success.
    MASK_P indicates it's for MASK if true, otherwise it's for LEN.  */
 
 static bool
 gimple_fold_partial_store (gimple_stmt_iterator *gsi, gcall *call,
 			   bool mask_p)
 {
-  tree rhs = gimple_call_arg (call, 3);
+  tree rhs = gimple_call_arg (
+    call, gimple_call_internal_fn (call) == IFN_LEN_MASK_STORE ? 4 : 3);
   if (tree lhs
       = gimple_fold_partial_load_store_mem_ref (call, TREE_TYPE (rhs), mask_p))
     {
@@ -5659,9 +5673,11 @@  gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace)
 	  cplx_result = true;
 	  break;
 	case IFN_MASK_LOAD:
+	case IFN_LEN_MASK_LOAD:
 	  changed |= gimple_fold_partial_load (gsi, stmt, true);
 	  break;
 	case IFN_MASK_STORE:
+	case IFN_LEN_MASK_STORE:
 	  changed |= gimple_fold_partial_store (gsi, stmt, true);
 	  break;
 	case IFN_LEN_LOAD:
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index da9b944dd5d..4a9fe388eed 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -165,6 +165,7 @@  init_internal_fns ()
 #define mask_load_lanes_direct { -1, -1, false }
 #define gather_load_direct { 3, 1, false }
 #define len_load_direct { -1, -1, false }
+#define len_maskload_direct { -1, 3, false }
 #define mask_store_direct { 3, 2, false }
 #define store_lanes_direct { 0, 0, false }
 #define mask_store_lanes_direct { 0, 0, false }
@@ -172,6 +173,7 @@  init_internal_fns ()
 #define vec_cond_direct { 2, 0, false }
 #define scatter_store_direct { 3, 1, false }
 #define len_store_direct { 3, 3, false }
+#define len_maskstore_direct { 4, 3, false }
 #define vec_set_direct { 3, 3, false }
 #define unary_direct { 0, 0, true }
 #define unary_convert_direct { -1, 0, true }
@@ -2875,6 +2877,17 @@  expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
       create_input_operand (&ops[3], bias, QImode);
       expand_insn (icode, 4, ops);
     }
+  else if (optab == len_maskload_optab)
+    {
+      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
+      maskt = gimple_call_arg (stmt, 3);
+      mask = expand_normal (maskt);
+      create_input_operand (&ops[3], mask, TYPE_MODE (TREE_TYPE (maskt)));
+      icode = convert_optab_handler (optab, TYPE_MODE (type),
+				     TYPE_MODE (TREE_TYPE (maskt)));
+      expand_insn (icode, 4, ops);
+    }
   else
     {
       create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
@@ -2888,6 +2901,7 @@  expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 #define expand_mask_load_optab_fn expand_partial_load_optab_fn
 #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn
 #define expand_len_load_optab_fn expand_partial_load_optab_fn
+#define expand_len_maskload_optab_fn expand_partial_load_optab_fn
 
 /* Expand MASK_STORE{,_LANES} or LEN_STORE call STMT using optab OPTAB.  */
 
@@ -2900,7 +2914,7 @@  expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
-  rhs = gimple_call_arg (stmt, 3);
+  rhs = gimple_call_arg (stmt, optab == len_maskstore_optab ? 4 : 3);
   type = TREE_TYPE (rhs);
   lhs = expand_call_mem_ref (type, stmt, 0);
 
@@ -2927,6 +2941,16 @@  expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
       create_input_operand (&ops[3], bias, QImode);
       expand_insn (icode, 4, ops);
     }
+  else if (optab == len_maskstore_optab)
+    {
+      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
+      maskt = gimple_call_arg (stmt, 3);
+      mask = expand_normal (maskt);
+      create_input_operand (&ops[3], mask, TYPE_MODE (TREE_TYPE (maskt)));
+      icode = convert_optab_handler (optab, TYPE_MODE (type), GET_MODE (mask));
+      expand_insn (icode, 4, ops);
+    }
   else
     {
       create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
@@ -2937,6 +2961,7 @@  expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 #define expand_mask_store_optab_fn expand_partial_store_optab_fn
 #define expand_mask_store_lanes_optab_fn expand_mask_store_optab_fn
 #define expand_len_store_optab_fn expand_partial_store_optab_fn
+#define expand_len_maskstore_optab_fn expand_partial_store_optab_fn
 
 /* Expand VCOND, VCONDU and VCONDEQ optab internal functions.
    The expansion of STMT happens based on OPTAB table associated.  */
@@ -3890,6 +3915,7 @@  multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
 #define direct_gather_load_optab_supported_p convert_optab_supported_p
 #define direct_len_load_optab_supported_p direct_optab_supported_p
+#define direct_len_maskload_optab_supported_p convert_optab_supported_p
 #define direct_mask_store_optab_supported_p convert_optab_supported_p
 #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
 #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
@@ -3897,6 +3923,7 @@  multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_vec_cond_optab_supported_p convert_optab_supported_p
 #define direct_scatter_store_optab_supported_p convert_optab_supported_p
 #define direct_len_store_optab_supported_p direct_optab_supported_p
+#define direct_len_maskstore_optab_supported_p convert_optab_supported_p
 #define direct_while_optab_supported_p convert_optab_supported_p
 #define direct_fold_extract_optab_supported_p direct_optab_supported_p
 #define direct_fold_left_optab_supported_p direct_optab_supported_p
@@ -4361,6 +4388,7 @@  internal_load_fn_p (internal_fn fn)
     case IFN_GATHER_LOAD:
     case IFN_MASK_GATHER_LOAD:
     case IFN_LEN_LOAD:
+    case IFN_LEN_MASK_LOAD:
       return true;
 
     default:
@@ -4381,6 +4409,7 @@  internal_store_fn_p (internal_fn fn)
     case IFN_SCATTER_STORE:
     case IFN_MASK_SCATTER_STORE:
     case IFN_LEN_STORE:
+    case IFN_LEN_MASK_STORE:
       return true;
 
     default:
@@ -4420,6 +4449,10 @@  internal_fn_mask_index (internal_fn fn)
     case IFN_MASK_STORE_LANES:
       return 2;
 
+    case IFN_LEN_MASK_LOAD:
+    case IFN_LEN_MASK_STORE:
+      return 3;
+
     case IFN_MASK_GATHER_LOAD:
     case IFN_MASK_SCATTER_STORE:
       return 4;
@@ -4444,6 +4477,8 @@  internal_fn_stored_value_index (internal_fn fn)
     case IFN_MASK_SCATTER_STORE:
     case IFN_LEN_STORE:
       return 3;
+    case IFN_LEN_MASK_STORE:
+      return 4;
 
     default:
       return -1;
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 5d638de6d06..cf0bcea5ac7 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -50,12 +50,14 @@  along with GCC; see the file COPYING3.  If not see
    - mask_load_lanes: currently just vec_mask_load_lanes
    - gather_load: used for {mask_,}gather_load
    - len_load: currently just len_load
+   - len_maskload: currently just len_maskload
 
    - mask_store: currently just maskstore
    - store_lanes: currently just vec_store_lanes
    - mask_store_lanes: currently just vec_mask_store_lanes
    - scatter_store: used for {mask_,}scatter_store
    - len_store: currently just len_store
+   - len_maskstore: currently just len_maskstore
 
    - unary: a normal unary optab, such as vec_reverse_<mode>
    - binary: a normal binary optab, such as vec_interleave_lo_<mode>
@@ -157,6 +159,7 @@  DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
 		       mask_gather_load, gather_load)
 
 DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load)
+DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, len_maskload, len_maskload)
 
 DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
 DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
@@ -175,6 +178,7 @@  DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
 DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
 
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
+DEF_INTERNAL_OPTAB_FN (LEN_MASK_STORE, 0, len_maskstore, len_maskstore)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
 DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 276f8408dd7..ec765e78088 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -624,6 +624,45 @@  get_len_load_store_mode (machine_mode mode, bool is_load)
   return opt_machine_mode ();
 }
 
+/* Return true if the target supports length-and-mask load/store for MODE.
+   The length is used for loop control and the mask for flow control.  */
+
+bool
+can_vec_len_mask_load_store_p (machine_mode mode, bool is_load)
+{
+  optab op = is_load ? len_maskload_optab : len_maskstore_optab;
+  machine_mode vmode;
+  machine_mode mask_mode;
+
+  /* If mode is vector mode, check it directly.  */
+  if (VECTOR_MODE_P (mode))
+    return targetm.vectorize.get_mask_mode (mode).exists (&mask_mode)
+	   && convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing;
+
+  scalar_mode smode;
+  if (is_a<scalar_mode> (mode, &smode))
+    /* See if there is any chance the mask load or store might be
+       vectorized.  If not, punt.  */
+    vmode = targetm.vectorize.preferred_simd_mode (smode);
+  else
+    vmode = mode;
+
+  if (VECTOR_MODE_P (vmode)
+      && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
+      && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
+    return true;
+
+  auto_vector_modes vector_modes;
+  targetm.vectorize.autovectorize_vector_modes (&vector_modes, true);
+  for (machine_mode base_mode : vector_modes)
+    if (related_vector_mode (base_mode, smode).exists (&vmode)
+	&& targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
+	&& convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
+      return true;
+
+  return false;
+}
+
 /* Return true if there is a compare_and_swap pattern.  */
 
 bool
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index b266d2fe990..2b9c9b44af2 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -189,6 +189,7 @@  enum insn_code find_widening_optab_handler_and_mode (optab, machine_mode,
 int can_mult_highpart_p (machine_mode, bool);
 bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool);
 opt_machine_mode get_len_load_store_mode (machine_mode, bool);
+bool can_vec_len_mask_load_store_p (machine_mode, bool);
 bool can_compare_and_swap_p (machine_mode, bool);
 bool can_atomic_exchange_p (machine_mode, bool);
 bool can_atomic_load_p (machine_mode);
diff --git a/gcc/optabs.def b/gcc/optabs.def
index f31b69c5d85..f5401aea364 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -91,6 +91,8 @@  OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b")
 OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b")
 OPTAB_CD(maskload_optab, "maskload$a$b")
 OPTAB_CD(maskstore_optab, "maskstore$a$b")
+OPTAB_CD(len_maskload_optab, "len_maskload$a$b")
+OPTAB_CD(len_maskstore_optab, "len_maskstore$a$b")
 OPTAB_CD(gather_load_optab, "gather_load$a$b")
 OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b")
 OPTAB_CD(scatter_store_optab, "scatter_store$a$b")
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index b576cce6db6..99aca44e6a5 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -5816,6 +5816,8 @@  get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
 	    }
 	  case IFN_MASK_LOAD:
 	  case IFN_MASK_STORE:
+	  case IFN_LEN_MASK_LOAD:
+	  case IFN_LEN_MASK_STORE:
 	    break;
 	  default:
 	    clobbers_memory = true;
@@ -5861,11 +5863,13 @@  get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
 	switch (gimple_call_internal_fn (stmt))
 	  {
 	  case IFN_MASK_LOAD:
+	  case IFN_LEN_MASK_LOAD:
 	    if (gimple_call_lhs (stmt) == NULL_TREE)
 	      break;
 	    ref.is_read = true;
 	    /* FALLTHRU */
 	  case IFN_MASK_STORE:
+	  case IFN_LEN_MASK_STORE:
 	    ptr = build_int_cst (TREE_TYPE (gimple_call_arg (stmt, 1)), 0);
 	    align = tree_to_shwi (gimple_call_arg (stmt, 1));
 	    if (ref.is_read)
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 1393ce184e3..0f549fa528d 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -960,6 +960,9 @@  ifcvt_can_use_mask_load_store (gimple *stmt)
   if (can_vec_mask_load_store_p (mode, VOIDmode, is_load))
     return true;
 
+  if (can_vec_len_mask_load_store_p (mode, is_load))
+    return true;
+
   return false;
 }
 
diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
index 79ed956e300..100c4b2e7d9 100644
--- a/gcc/tree-ssa-alias.cc
+++ b/gcc/tree-ssa-alias.cc
@@ -2815,11 +2815,13 @@  ref_maybe_used_by_call_p_1 (gcall *call, ao_ref *ref, bool tbaa_p)
       case IFN_SCATTER_STORE:
       case IFN_MASK_SCATTER_STORE:
       case IFN_LEN_STORE:
+      case IFN_LEN_MASK_STORE:
 	return false;
       case IFN_MASK_STORE_LANES:
 	goto process_args;
       case IFN_MASK_LOAD:
       case IFN_LEN_LOAD:
+      case IFN_LEN_MASK_LOAD:
       case IFN_MASK_LOAD_LANES:
 	{
 	  ao_ref rhs_ref;
@@ -3065,6 +3067,7 @@  call_may_clobber_ref_p_1 (gcall *call, ao_ref *ref, bool tbaa_p)
 	return false;
       case IFN_MASK_STORE:
       case IFN_LEN_STORE:
+      case IFN_LEN_MASK_STORE:
       case IFN_MASK_STORE_LANES:
 	{
 	  tree rhs = gimple_call_arg (call,
diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc
index eabe8ba4522..acaf844b8ef 100644
--- a/gcc/tree-ssa-dse.cc
+++ b/gcc/tree-ssa-dse.cc
@@ -174,6 +174,17 @@  initialize_ao_ref_for_dse (gimple *stmt, ao_ref *write, bool may_def_ok = false)
 	      return true;
 	    }
 	  break;
+	case IFN_LEN_MASK_STORE:
+	  /* We cannot initialize a must-def ao_ref (in all cases) but we
+	     can provide a may-def variant.  */
+	  if (may_def_ok)
+	    {
+	      ao_ref_init_from_ptr_and_size
+		  (write, gimple_call_arg (stmt, 0),
+		   TYPE_SIZE_UNIT (TREE_TYPE (gimple_call_arg (stmt, 4))));
+	      return true;
+	    }
+	  break;
 	default:;
 	}
     }
@@ -1483,6 +1494,7 @@  dse_optimize_stmt (function *fun, gimple_stmt_iterator *gsi, sbitmap live_bytes)
 	{
 	case IFN_LEN_STORE:
 	case IFN_MASK_STORE:
+	case IFN_LEN_MASK_STORE:
 	  {
 	    enum dse_store_status store_status;
 	    store_status = dse_classify_store (&ref, stmt, false, live_bytes);
diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
index 6fbd2d59318..e8e9df1ab74 100644
--- a/gcc/tree-ssa-loop-ivopts.cc
+++ b/gcc/tree-ssa-loop-ivopts.cc
@@ -2439,6 +2439,7 @@  get_mem_type_for_internal_fn (gcall *call, tree *op_p)
     case IFN_MASK_LOAD:
     case IFN_MASK_LOAD_LANES:
     case IFN_LEN_LOAD:
+    case IFN_LEN_MASK_LOAD:
       if (op_p == gimple_call_arg_ptr (call, 0))
 	return TREE_TYPE (gimple_call_lhs (call));
       return NULL_TREE;
@@ -2450,6 +2451,11 @@  get_mem_type_for_internal_fn (gcall *call, tree *op_p)
 	return TREE_TYPE (gimple_call_arg (call, 3));
       return NULL_TREE;
 
+    case IFN_LEN_MASK_STORE:
+      if (op_p == gimple_call_arg_ptr (call, 0))
+	return TREE_TYPE (gimple_call_arg (call, 4));
+      return NULL_TREE;
+
     default:
       return NULL_TREE;
     }
@@ -7555,6 +7561,8 @@  get_alias_ptr_type_for_ptr_address (iv_use *use)
     case IFN_MASK_STORE_LANES:
     case IFN_LEN_LOAD:
     case IFN_LEN_STORE:
+    case IFN_LEN_MASK_LOAD:
+    case IFN_LEN_MASK_STORE:
       /* The second argument contains the correct alias type.  */
       gcc_assert (use->op_p = gimple_call_arg_ptr (call, 0));
       return TREE_TYPE (gimple_call_arg (call, 1));
diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
index 27c84e78fcf..02fbc4a2dfa 100644
--- a/gcc/tree-ssa-sccvn.cc
+++ b/gcc/tree-ssa-sccvn.cc
@@ -3304,6 +3304,12 @@  vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
 	  if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
 	    return (void *)-1;
 	  break;
+	case IFN_LEN_MASK_STORE:
+	  len = gimple_call_arg (call, 2);
+	  mask = gimple_call_arg (call, internal_fn_mask_index (fn));
+	  if (!tree_fits_uhwi_p (len) || TREE_CODE (mask) != VECTOR_CST)
+	    return (void *)-1;
+	  break;
 	default:
 	  return (void *)-1;
 	}
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..fb83446519a 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3039,17 +3039,21 @@  can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
       if (!call2 || !gimple_call_internal_p (call2))
 	return false;
       internal_fn ifn = gimple_call_internal_fn (call1);
-      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
+      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE
+	  && ifn != IFN_LEN_MASK_LOAD && ifn != IFN_LEN_MASK_STORE)
 	return false;
       if (ifn != gimple_call_internal_fn (call2))
 	return false;
 
       /* Check that the masks are the same.  Cope with casts of masks,
 	 like those created by build_mask_conversion.  */
-      tree mask1 = gimple_call_arg (call1, 2);
-      tree mask2 = gimple_call_arg (call2, 2);
+      unsigned int mask_argno
+	= ifn == IFN_LEN_MASK_LOAD || ifn == IFN_LEN_MASK_STORE ? 3 : 2;
+      tree mask1 = gimple_call_arg (call1, mask_argno);
+      tree mask2 = gimple_call_arg (call2, mask_argno);
       if (!operand_equal_p (mask1, mask2, 0)
-          && (ifn == IFN_MASK_STORE || !allow_slp_p))
+	  && (ifn == IFN_MASK_STORE || ifn == IFN_LEN_MASK_STORE
+	      || !allow_slp_p))
 	{
 	  mask1 = strip_conversion (mask1);
 	  if (!mask1)
@@ -4292,7 +4296,9 @@  vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
   if (gcall *call = dyn_cast <gcall *> (stmt))
     if (!gimple_call_internal_p (call)
 	|| (gimple_call_internal_fn (call) != IFN_MASK_LOAD
-	    && gimple_call_internal_fn (call) != IFN_MASK_STORE))
+	    && gimple_call_internal_fn (call) != IFN_MASK_STORE
+	    && gimple_call_internal_fn (call) != IFN_LEN_MASK_LOAD
+	    && gimple_call_internal_fn (call) != IFN_LEN_MASK_STORE))
       {
 	free_data_ref (dr);
 	return opt_result::failure_at (stmt,
@@ -6731,7 +6737,9 @@  vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
     if (gimple_call_internal_p (stmt)
 	&& (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
-	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
+	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE
+	    || gimple_call_internal_fn (stmt) == IFN_LEN_MASK_LOAD
+	    || gimple_call_internal_fn (stmt) == IFN_LEN_MASK_STORE))
       return dr_unaligned_supported;
 
   if (loop_vinfo)
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index ace9e759f5b..03de41d4988 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1296,30 +1296,33 @@  vect_verify_loop_lens (loop_vec_info loop_vinfo)
   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
     return false;
 
-  machine_mode len_load_mode = get_len_load_store_mode
-    (loop_vinfo->vector_mode, true).require ();
-  machine_mode len_store_mode = get_len_load_store_mode
-    (loop_vinfo->vector_mode, false).require ();
+  if (!can_vec_len_mask_load_store_p (loop_vinfo->vector_mode, true)
+      && !can_vec_len_mask_load_store_p (loop_vinfo->vector_mode, false))
+    {
+      machine_mode len_load_mode
+	= get_len_load_store_mode (loop_vinfo->vector_mode, true).require ();
+      machine_mode len_store_mode
+	= get_len_load_store_mode (loop_vinfo->vector_mode, false).require ();
 
-  signed char partial_load_bias = internal_len_load_store_bias
-    (IFN_LEN_LOAD, len_load_mode);
+      signed char partial_load_bias
+	= internal_len_load_store_bias (IFN_LEN_LOAD, len_load_mode);
 
-  signed char partial_store_bias = internal_len_load_store_bias
-    (IFN_LEN_STORE, len_store_mode);
+      signed char partial_store_bias
+	= internal_len_load_store_bias (IFN_LEN_STORE, len_store_mode);
 
-  gcc_assert (partial_load_bias == partial_store_bias);
+      gcc_assert (partial_load_bias == partial_store_bias);
 
-  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
-    return false;
+      if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
+	return false;
 
-  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
-     len_loads with a length of zero.  In order to avoid that we prohibit
-     more than one loop length here.  */
-  if (partial_load_bias == -1
-      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
-    return false;
+      /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
+	 len_loads with a length of zero.  In order to avoid that we prohibit
+	 more than one loop length here.  */
+      if (partial_load_bias == -1 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
+	return false;
 
-  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
+      LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
+    }
 
   unsigned int max_nitems_per_iter = 1;
   unsigned int i;
@@ -11317,7 +11320,8 @@  optimize_mask_stores (class loop *loop)
 	   gsi_next (&gsi))
 	{
 	  stmt = gsi_stmt (gsi);
-	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
+	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE)
+	      || gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
 	    worklist.safe_push (stmt);
 	}
     }
@@ -11340,7 +11344,8 @@  optimize_mask_stores (class loop *loop)
       tree zero;
 
       last = worklist.pop ();
-      mask = gimple_call_arg (last, 2);
+      mask = gimple_call_arg (
+	last, gimple_call_internal_p (last, IFN_LEN_MASK_STORE) ? 3 : 2);
       bb = gimple_bb (last);
       /* Create then_bb and if-then structure in CFG, then_bb belongs to
 	 the same loop as if_bb.  It could be different to LOOP when two
@@ -11473,7 +11478,12 @@  optimize_mask_stores (class loop *loop)
 	    }
 	  /* Put other masked stores with the same mask to STORE_BB.  */
 	  if (worklist.is_empty ()
-	      || gimple_call_arg (worklist.last (), 2) != mask
+	      || gimple_call_arg (worklist.last (),
+				  gimple_call_internal_p (worklist.last (),
+							  IFN_LEN_MASK_STORE)
+				    ? 3
+				    : 2)
+		   != mask
 	      || worklist.last () != stmt1)
 	    break;
 	  last = worklist.pop ();
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index ab89a82f1b3..937b5295df4 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -489,6 +489,7 @@  static const int cond_expr_maps[3][5] = {
 };
 static const int arg1_map[] = { 1, 1 };
 static const int arg2_map[] = { 1, 2 };
+static const int arg3_map[] = { 1, 3 };
 static const int arg1_arg4_map[] = { 2, 1, 4 };
 static const int op1_op0_map[] = { 2, 1, 0 };
 
@@ -524,6 +525,9 @@  vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
 	  case IFN_MASK_LOAD:
 	    return arg2_map;
 
+	  case IFN_LEN_MASK_LOAD:
+	    return arg3_map;
+
 	  case IFN_GATHER_LOAD:
 	    return arg1_map;
 
@@ -1779,6 +1783,7 @@  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
     {
       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
 	gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
+		    || gimple_call_internal_p (stmt, IFN_LEN_MASK_LOAD)
 		    || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
 		    || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
       else
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a7acc032d47..9b797c61c88 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1837,6 +1837,15 @@  check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       using_partial_vectors_p = true;
     }
 
+  if (can_vec_len_mask_load_store_p (vecmode, is_load))
+    {
+      nvectors = group_memory_nvectors (group_size * vf, nunits);
+      /* The length is used for loop control and the mask for flow control.  */
+      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+      vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+      using_partial_vectors_p = true;
+    }
+
   if (!using_partial_vectors_p)
     {
       if (dump_enabled_p ())
@@ -7978,8 +7987,9 @@  vectorizable_store (vec_info *vinfo,
       if (memory_access_type == VMAT_CONTIGUOUS)
 	{
 	  if (!VECTOR_MODE_P (vec_mode)
-	      || !can_vec_mask_load_store_p (vec_mode,
-					     TYPE_MODE (mask_vectype), false))
+	      || (!can_vec_mask_load_store_p (vec_mode,
+					      TYPE_MODE (mask_vectype), false)
+		  && !can_vec_len_mask_load_store_p (vec_mode, false)))
 	    return false;
 	}
       else if (memory_access_type != VMAT_LOAD_STORE_LANES
@@ -8942,7 +8952,38 @@  vectorizable_store (vec_info *vinfo,
 		}
 
 	      /* Arguments are ready.  Create the new vector stmt.  */
-	      if (final_mask)
+	      if (can_vec_len_mask_load_store_p (TYPE_MODE (vectype), false)
+		  && (final_mask || loop_lens))
+		{
+		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
+		  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+		  if (!final_mask)
+		    {
+		      machine_mode mask_mode
+			= targetm.vectorize.get_mask_mode (TYPE_MODE (vectype))
+			    .require ();
+		      mask_vectype
+			= build_truth_vector_type_for_mode (nunits, mask_mode);
+		      tree mask = build_int_cst (TREE_TYPE (mask_vectype), -1);
+		      final_mask = build_vector_from_val (mask_vectype, mask);
+		    }
+		  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+		  tree final_len;
+		  if (loop_lens)
+		    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						   vec_num * ncopies, vectype,
+						   vec_num * j + i, 1);
+		  else
+		    final_len = build_int_cst (iv_type, nunits);
+		  gcall *call
+		    = gimple_build_call_internal (IFN_LEN_MASK_STORE, 5,
+						  dataref_ptr, ptr, final_len,
+						  final_mask, vec_oprnd);
+		  gimple_call_set_nothrow (call, true);
+		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+		  new_stmt = call;
+		}
+	      else if (final_mask)
 		{
 		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
 		  gcall *call
@@ -9407,8 +9448,9 @@  vectorizable_load (vec_info *vinfo,
 	{
 	  machine_mode vec_mode = TYPE_MODE (vectype);
 	  if (!VECTOR_MODE_P (vec_mode)
-	      || !can_vec_mask_load_store_p (vec_mode,
-					     TYPE_MODE (mask_vectype), true))
+	      || (!can_vec_mask_load_store_p (vec_mode,
+					      TYPE_MODE (mask_vectype), true)
+		  && !can_vec_len_mask_load_store_p (vec_mode, true)))
 	    return false;
 	}
       else if (memory_access_type != VMAT_LOAD_STORE_LANES
@@ -10301,7 +10343,47 @@  vectorizable_load (vec_info *vinfo,
 					      align, misalign);
 		    align = least_bit_hwi (misalign | align);
 
-		    if (final_mask)
+		    if (can_vec_len_mask_load_store_p (TYPE_MODE (vectype),
+						       true)
+			&& (final_mask || loop_lens)
+			&& memory_access_type != VMAT_INVARIANT)
+		      {
+			tree ptr
+			  = build_int_cst (ref_type, align * BITS_PER_UNIT);
+			poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+			if (!final_mask)
+			  {
+			    machine_mode mask_mode
+			      = targetm.vectorize
+				  .get_mask_mode (TYPE_MODE (vectype))
+				  .require ();
+			    mask_vectype
+			      = build_truth_vector_type_for_mode (nunits,
+								  mask_mode);
+			    tree mask
+			      = build_int_cst (TREE_TYPE (mask_vectype), -1);
+			    final_mask
+			      = build_vector_from_val (mask_vectype, mask);
+			  }
+			tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+			tree final_len;
+			if (loop_lens)
+			  final_len
+			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						 vec_num * ncopies, vectype,
+						 vec_num * j + i, 1);
+			else
+			  final_len = build_int_cst (iv_type, nunits);
+
+			gcall *call
+			  = gimple_build_call_internal (IFN_LEN_MASK_LOAD, 4,
+							dataref_ptr, ptr,
+							final_len, final_mask);
+			gimple_call_set_nothrow (call, true);
+			new_stmt = call;
+			data_ref = NULL_TREE;
+		      }
+		    else if (final_mask)
 		      {
 			tree ptr = build_int_cst (ref_type,
 						  align * BITS_PER_UNIT);
@@ -13027,7 +13109,8 @@  vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
 
   if (gimple_get_lhs (stmt) == NULL_TREE
       /* MASK_STORE has no lhs, but is ok.  */
-      && !gimple_call_internal_p (stmt, IFN_MASK_STORE))
+      && !gimple_call_internal_p (stmt, IFN_MASK_STORE)
+      && !gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
     {
       if (is_a <gcall *> (stmt))
 	{
@@ -13071,6 +13154,8 @@  vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
 	scalar_type = TREE_TYPE (DR_REF (dr));
       else if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 	scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
+      else if (gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
+	scalar_type = TREE_TYPE (gimple_call_arg (stmt, 4));
       else
 	scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 
diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
index a048e9d8917..19312404ac4 100644
--- a/gcc/tree-vectorizer.cc
+++ b/gcc/tree-vectorizer.cc
@@ -1101,6 +1101,8 @@  try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 		{
 		  internal_fn ifn = gimple_call_internal_fn (call);
 		  if (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE
+		      || ifn == IFN_LEN_MASK_LOAD
+		      || ifn == IFN_LEN_MASK_STORE
 		      /* Don't keep the if-converted parts when the ifn with
 			 specifc type is not supported by the backend.  */
 		      || (direct_internal_fn_p (ifn)