diff mbox series

[V4] VECT: Support loop len control on EXTRACT_LAST vectorization

Message ID 20230811142448.2913622-1-juzhe.zhong@rivai.ai
State New
Headers show
Series [V4] VECT: Support loop len control on EXTRACT_LAST vectorization | expand

Commit Message

juzhe.zhong@rivai.ai Aug. 11, 2023, 2:24 p.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.

This patch add support live vectorization by VEC_EXTRACT for LEN loop control.

Consider this following case:

#include <stdint.h>

#define EXTRACT_LAST(TYPE)			\
  TYPE __attribute__ ((noinline, noclone))	\
  test_##TYPE (TYPE *x, int n, TYPE value)	\
  {						\
    TYPE last;					\
    for (int j = 0; j < n; ++j)			\
      {						\
	last = x[j];				\
	x[j] = last * value;			\
      }						\
    return last;				\
  }

#define TEST_ALL(T)				\
  T (uint8_t)					\

TEST_ALL (EXTRACT_LAST)

ARM SVE IR:

Preheader:
  max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });

Loop:
  ...
  # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
  ...
  vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
  vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
  .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
  ...
  next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
  ...

Epilogue:
  _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);

For RVV since we prefer len in loop control, after this patch for RVV:

Loop:
  ...
  loop_len_22 = SELECT_VL;
  vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
  vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
  .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
  ...

Epilogue:
  _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);

Details of this approach:

1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
            for LEN loop control.
   
   This function we check whether target support:
    - Use LEN as the loop control.
    - Support VEC_EXTRACT optab.

2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.

3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).

The only difference between mask and len is that len is using length generated by SELECT_VL and
use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.

gcc/ChangeLog:

	* tree-vect-loop.cc (vectorizable_live_operation): Add loop len control.

---
 gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 62 insertions(+), 16 deletions(-)

Comments

Richard Biener Aug. 14, 2023, 6:53 a.m. UTC | #1
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> This patch add support live vectorization by VEC_EXTRACT for LEN loop control.

OK.

Thanks,
Richard.

> Consider this following case:
> 
> #include <stdint.h>
> 
> #define EXTRACT_LAST(TYPE)			\
>   TYPE __attribute__ ((noinline, noclone))	\
>   test_##TYPE (TYPE *x, int n, TYPE value)	\
>   {						\
>     TYPE last;					\
>     for (int j = 0; j < n; ++j)			\
>       {						\
> 	last = x[j];				\
> 	x[j] = last * value;			\
>       }						\
>     return last;				\
>   }
> 
> #define TEST_ALL(T)				\
>   T (uint8_t)					\
> 
> TEST_ALL (EXTRACT_LAST)
> 
> ARM SVE IR:
> 
> Preheader:
>   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> 
> Loop:
>   ...
>   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>   ...
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>   ...
>   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> 
> For RVV since we prefer len in loop control, after this patch for RVV:
> 
> Loop:
>   ...
>   loop_len_22 = SELECT_VL;
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>   ...
> 
> Epilogue:
>   _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
> 
> Details of this approach:
> 
> 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
>             for LEN loop control.
>    
>    This function we check whether target support:
>     - Use LEN as the loop control.
>     - Support VEC_EXTRACT optab.
> 
> 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> 
> 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
> 
> The only difference between mask and len is that len is using length generated by SELECT_VL and
> use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-loop.cc (vectorizable_live_operation): Add loop len control.
> 
> ---
>  gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++---------
>  1 file changed, 62 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bf8d677b584..a011e2dacb2 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        /* No transformation required.  */
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  	{
> -	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -					       OPTIMIZE_FOR_SPEED))
> -	    {
> -	      if (dump_enabled_p ())
> -		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -				 "can't operate on partial vectors "
> -				 "because the target doesn't support extract "
> -				 "last reduction.\n");
> -	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> -	    }
> -	  else if (slp_node)
> +	  if (slp_node)
>  	    {
>  	      if (dump_enabled_p ())
>  		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  	  else
>  	    {
>  	      gcc_assert (ncopies == 1 && !slp_node);
> -	      vect_record_loop_mask (loop_vinfo,
> -				     &LOOP_VINFO_MASKS (loop_vinfo),
> -				     1, vectype, NULL);
> +	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> +						  OPTIMIZE_FOR_SPEED))
> +		vect_record_loop_mask (loop_vinfo,
> +				       &LOOP_VINFO_MASKS (loop_vinfo),
> +				       1, vectype, NULL);
> +	      else if (convert_optab_handler (vec_extract_optab,
> +					      TYPE_MODE (vectype),
> +					      TYPE_MODE (TREE_TYPE (vectype)))
> +		       != CODE_FOR_nothing)
> +		vect_record_loop_len (loop_vinfo,
> +				      &LOOP_VINFO_LENS (loop_vinfo),
> +				      1, vectype, 1);
> +	      else
> +		{
> +		  if (dump_enabled_p ())
> +		    dump_printf_loc (
> +		      MSG_MISSED_OPTIMIZATION, vect_location,
> +		      "can't operate on partial vectors "
> +		      "because the target doesn't support extract "
> +		      "last reduction.\n");
> +		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +		}
>  	    }
>  	}
>        /* ???  Enable for loop costing as well.  */
> @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +	{
> +	  /* Emit:
> +
> +	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> +	     where VEC_LHS is the vectorized live-out result and MASK is
> +	     the loop mask for the final iteration.  */
> +	  gcc_assert (ncopies == 1 && !slp_node);
> +	  gimple_seq tem = NULL;
> +	  gimple_stmt_iterator gsi = gsi_last (tem);
> +	  tree len
> +	    = vect_get_loop_len (loop_vinfo, &gsi,
> +				 &LOOP_VINFO_LENS (loop_vinfo),
> +				 1, vectype, 0, 0);
> +
> +	  /* BIAS - 1.  */
> +	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +	  tree bias_minus_one
> +	    = int_const_binop (MINUS_EXPR,
> +			       build_int_cst (TREE_TYPE (len), biasval),
> +			       build_one_cst (TREE_TYPE (len)));
> +
> +	  /* LAST_INDEX = LEN + (BIAS - 1).  */
> +	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> +					  len, bias_minus_one);
> +
> +	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> +	  tree scalar_res
> +	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +			    vec_lhs_phi, last_index);
> +
> +	  /* Convert the extracted vector element to the scalar type.  */
> +	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +	}
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	{
>  	  /* Emit:
>  
>
juzhe.zhong@rivai.ai Aug. 14, 2023, 7:09 a.m. UTC | #2
Thanks Richi.

CC kewen to see whether this patch is suitable for powerpc and s390.



juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-14 14:53
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
 
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
 
OK.
 
Thanks,
Richard.
 
> Consider this following case:
> 
> #include <stdint.h>
> 
> #define EXTRACT_LAST(TYPE) \
>   TYPE __attribute__ ((noinline, noclone)) \
>   test_##TYPE (TYPE *x, int n, TYPE value) \
>   { \
>     TYPE last; \
>     for (int j = 0; j < n; ++j) \
>       { \
> last = x[j]; \
> x[j] = last * value; \
>       } \
>     return last; \
>   }
> 
> #define TEST_ALL(T) \
>   T (uint8_t) \
> 
> TEST_ALL (EXTRACT_LAST)
> 
> ARM SVE IR:
> 
> Preheader:
>   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> 
> Loop:
>   ...
>   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>   ...
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>   ...
>   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> 
> For RVV since we prefer len in loop control, after this patch for RVV:
> 
> Loop:
>   ...
>   loop_len_22 = SELECT_VL;
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>   ...
> 
> Epilogue:
>   _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
> 
> Details of this approach:
> 
> 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
>             for LEN loop control.
>    
>    This function we check whether target support:
>     - Use LEN as the loop control.
>     - Support VEC_EXTRACT optab.
> 
> 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> 
> 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
> 
> The only difference between mask and len is that len is using length generated by SELECT_VL and
> use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
> 
> gcc/ChangeLog:
> 
> * tree-vect-loop.cc (vectorizable_live_operation): Add loop len control.
> 
> ---
>  gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++---------
>  1 file changed, 62 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bf8d677b584..a011e2dacb2 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        /* No transformation required.  */
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  {
> -   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -        OPTIMIZE_FOR_SPEED))
> -     {
> -       if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "can't operate on partial vectors "
> - "because the target doesn't support extract "
> - "last reduction.\n");
> -       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> -     }
> -   else if (slp_node)
> +   if (slp_node)
>      {
>        if (dump_enabled_p ())
>  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    else
>      {
>        gcc_assert (ncopies == 1 && !slp_node);
> -       vect_record_loop_mask (loop_vinfo,
> -      &LOOP_VINFO_MASKS (loop_vinfo),
> -      1, vectype, NULL);
> +       if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> +   OPTIMIZE_FOR_SPEED))
> + vect_record_loop_mask (loop_vinfo,
> +        &LOOP_VINFO_MASKS (loop_vinfo),
> +        1, vectype, NULL);
> +       else if (convert_optab_handler (vec_extract_optab,
> +       TYPE_MODE (vectype),
> +       TYPE_MODE (TREE_TYPE (vectype)))
> +        != CODE_FOR_nothing)
> + vect_record_loop_len (loop_vinfo,
> +       &LOOP_VINFO_LENS (loop_vinfo),
> +       1, vectype, 1);
> +       else
> + {
> +   if (dump_enabled_p ())
> +     dump_printf_loc (
> +       MSG_MISSED_OPTIMIZATION, vect_location,
> +       "can't operate on partial vectors "
> +       "because the target doesn't support extract "
> +       "last reduction.\n");
> +   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> + }
>      }
>  }
>        /* ???  Enable for loop costing as well.  */
> @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> + {
> +   /* Emit:
> +
> +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> +      where VEC_LHS is the vectorized live-out result and MASK is
> +      the loop mask for the final iteration.  */
> +   gcc_assert (ncopies == 1 && !slp_node);
> +   gimple_seq tem = NULL;
> +   gimple_stmt_iterator gsi = gsi_last (tem);
> +   tree len
> +     = vect_get_loop_len (loop_vinfo, &gsi,
> + &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 0, 0);
> +
> +   /* BIAS - 1.  */
> +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +   tree bias_minus_one
> +     = int_const_binop (MINUS_EXPR,
> +        build_int_cst (TREE_TYPE (len), biasval),
> +        build_one_cst (TREE_TYPE (len)));
> +
> +   /* LAST_INDEX = LEN + (BIAS - 1).  */
> +   tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> +   len, bias_minus_one);
> +
> +   /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> +   tree scalar_res
> +     = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +     vec_lhs_phi, last_index);
> +
> +   /* Convert the extracted vector element to the scalar type.  */
> +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> + }
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  {
>    /* Emit:
>  
>
Kewen.Lin Aug. 14, 2023, 8:51 a.m. UTC | #3
Hi Juzhe,

on 2023/8/14 15:09, juzhe.zhong@rivai.ai wrote:
> Thanks Richi.
> 
> CC kewen to see whether this patch is suitable for powerpc and s390.

I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.

A short list looks like:

< FAIL: gcc.c-torture/compile/20150108.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn,
at internal-fn.cc:3164)
< FAIL: gcc.c-torture/compile/20150108.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.c-torture/compile/20150108.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.c-torture/compile/20150108.c   -O3 -g  (test for excess errors)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn,
at internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/20011126-2.c   -O3 -g  (test for excess errors)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn, at
internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.c-torture/execute/pr58419.c   -O3 -g  (test for excess errors)
< FAIL: gcc.dg/pr84321.c (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/pr84321.c (test for excess errors)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr108793.c   -O3 -g  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn, at
internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr51070-2.c   -O3 -g  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
< FAIL: gcc.dg/torture/pr51070.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for excess errors)
< FAIL: gcc.dg/torture/pr51070.c   -O3 -g  (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164)
....

> 
> ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> juzhe.zhong@rivai.ai
> 
>      
>     *From:* Richard Biener <mailto:rguenther@suse.de>
>     *Date:* 2023-08-14 14:53
>     *To:* Ju-Zhe Zhong <mailto:juzhe.zhong@rivai.ai>
>     *CC:* gcc-patches <mailto:gcc-patches@gcc.gnu.org>; richard.sandiford <mailto:richard.sandiford@arm.com>
>     *Subject:* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
>     On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
>      
>     > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>     >
>     > Hi, Richard and Richi.
>     >
>     > This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
>      
>     OK.
>      
>     Thanks,
>     Richard.
>      
>     > Consider this following case:
>     >
>     > #include <stdint.h>
>     >
>     > #define EXTRACT_LAST(TYPE) \
>     >   TYPE __attribute__ ((noinline, noclone)) \
>     >   test_##TYPE (TYPE *x, int n, TYPE value) \
>     >   { \
>     >     TYPE last; \
>     >     for (int j = 0; j < n; ++j) \
>     >       { \
>     > last = x[j]; \
>     > x[j] = last * value; \
>     >       } \
>     >     return last; \
>     >   }
>     >
>     > #define TEST_ALL(T) \
>     >   T (uint8_t) \
>     >
>     > TEST_ALL (EXTRACT_LAST)
>     >
>     > ARM SVE IR:
>     >
>     > Preheader:
>     >   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
>     >
>     > Loop:
>     >   ...
>     >   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>     >   ...
>     >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>     >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>     >   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>     >   ...
>     >   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>     >   ...
>     >
>     > Epilogue:
>     >   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
>     >
>     > For RVV since we prefer len in loop control, after this patch for RVV:
>     >
>     > Loop:
>     >   ...
>     >   loop_len_22 = SELECT_VL;
>     >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>     >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>     >   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>     >   ...
>     >
>     > Epilogue:
>     >   _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
>     >
>     > Details of this approach:
>     >
>     > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
>     >             for LEN loop control.
>     >   
>     >    This function we check whether target support:
>     >     - Use LEN as the loop control.
>     >     - Support VEC_EXTRACT optab.
>     >
>     > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
>     >
>     > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
>     >
>     > The only difference between mask and len is that len is using length generated by SELECT_VL and
>     > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
>     >
>     > gcc/ChangeLog:
>     >
>     > * tree-vect-loop.cc (vectorizable_live_operation): Add loop len control.
>     >
>     > ---
>     >  gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++---------
>     >  1 file changed, 62 insertions(+), 16 deletions(-)
>     >
>     > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>     > index bf8d677b584..a011e2dacb2 100644
>     > --- a/gcc/tree-vect-loop.cc
>     > +++ b/gcc/tree-vect-loop.cc
>     > @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>     >        /* No transformation required.  */
>     >        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>     >  {
>     > -   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
>     > -        OPTIMIZE_FOR_SPEED))
>     > -     {
>     > -       if (dump_enabled_p ())
>     > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>     > - "can't operate on partial vectors "
>     > - "because the target doesn't support extract "
>     > - "last reduction.\n");
>     > -       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>     > -     }
>     > -   else if (slp_node)
>     > +   if (slp_node)
>     >      {
>     >        if (dump_enabled_p ())
>     >  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>     > @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>     >    else
>     >      {
>     >        gcc_assert (ncopies == 1 && !slp_node);
>     > -       vect_record_loop_mask (loop_vinfo,
>     > -      &LOOP_VINFO_MASKS (loop_vinfo),
>     > -      1, vectype, NULL);
>     > +       if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
>     > +   OPTIMIZE_FOR_SPEED))
>     > + vect_record_loop_mask (loop_vinfo,
>     > +        &LOOP_VINFO_MASKS (loop_vinfo),
>     > +        1, vectype, NULL);
>     > +       else if (convert_optab_handler (vec_extract_optab,
>     > +       TYPE_MODE (vectype),
>     > +       TYPE_MODE (TREE_TYPE (vectype)))
>     > +        != CODE_FOR_nothing)
>     > + vect_record_loop_len (loop_vinfo,
>     > +       &LOOP_VINFO_LENS (loop_vinfo),
>     > +       1, vectype, 1);
>     > +       else
>     > + {
>     > +   if (dump_enabled_p ())
>     > +     dump_printf_loc (
>     > +       MSG_MISSED_OPTIMIZATION, vect_location,
>     > +       "can't operate on partial vectors "
>     > +       "because the target doesn't support extract "
>     > +       "last reduction.\n");
>     > +   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>     > + }
>     >      }
>     >  }
>     >        /* ???  Enable for loop costing as well.  */
>     > @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>     >    gimple *vec_stmt;
>     >    if (slp_node)
>     >      {
>     > -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
>     > +      gcc_assert (!loop_vinfo
>     > +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
>     > +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>     > 
>     >        /* Get the correct slp vectorized stmt.  */
>     >        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
>     > @@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>     > 
>     >        gimple_seq stmts = NULL;
>     >        tree new_tree;
>     > -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>     > +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>     > + {
>     > +   /* Emit:
>     > +
>     > +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
>     > +
>     > +      where VEC_LHS is the vectorized live-out result and MASK is
>     > +      the loop mask for the final iteration.  */
>     > +   gcc_assert (ncopies == 1 && !slp_node);
>     > +   gimple_seq tem = NULL;
>     > +   gimple_stmt_iterator gsi = gsi_last (tem);
>     > +   tree len
>     > +     = vect_get_loop_len (loop_vinfo, &gsi,
>     > + &LOOP_VINFO_LENS (loop_vinfo),
>     > + 1, vectype, 0, 0);
>     > +
>     > +   /* BIAS - 1.  */
>     > +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>     > +   tree bias_minus_one
>     > +     = int_const_binop (MINUS_EXPR,
>     > +        build_int_cst (TREE_TYPE (len), biasval),
>     > +        build_one_cst (TREE_TYPE (len)));
>     > +
>     > +   /* LAST_INDEX = LEN + (BIAS - 1).  */
>     > +   tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
>     > +   len, bias_minus_one);
>     > +
>     > +   /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
>     > +   tree scalar_res
>     > +     = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
>     > +     vec_lhs_phi, last_index);
>     > +
>     > +   /* Convert the extracted vector element to the scalar type.  */
>     > +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
>     > + }
>     > +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>     >  {
>     >    /* Emit:
>     > 
>     >
>      
>     -- 
>     Richard Biener <rguenther@suse.de>
>     SUSE Software Solutions Germany GmbH,
>     Frankenstrasse 146, 90461 Nuernberg, Germany;
>     GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
>      
>
Robin Dapp Aug. 14, 2023, 8:58 a.m. UTC | #4
Hi Kewen,

> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.

I think the problem is that just like for vec_set we're expecting
the vec_extract expander not to fail.  It is probably passed not a
const int here anymore and therefore fails to expand?

can_vec_extract_var_idx_p is supposed to check if the backend
supports extracting a variable index.

Regards
 Robin
Richard Biener Aug. 14, 2023, 9:15 a.m. UTC | #5
On Mon, 14 Aug 2023, Robin Dapp wrote:

> Hi Kewen,
> 
> > I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?
> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.

expansion does

  enum insn_code icode = convert_optab_handler (optab, outermode,
                                                extract_mode);

  if (icode != CODE_FOR_nothing)
    {
      create_output_operand (&ops[0], target, extract_mode);
      create_input_operand (&ops[1], src, outermode);
      create_convert_operand_from (&ops[2], pos,
                                   TYPE_MODE (TREE_TYPE (op1)), true);
      if (maybe_expand_insn (icode, 3, ops))
        {
          if (!rtx_equal_p (target, ops[0].value))
            emit_move_insn (target, ops[0].value);
          return;
        }

<--- here

    }
  gcc_unreachable ();

so if maybe_expand_insn fails that would need to be sth we need
to cover in the predicate to check.  But that looks possibly
target dependent?  What does actually fail here?

Richard.
Kewen.Lin Aug. 14, 2023, 9:19 a.m. UTC | #6
Hi Robin,

on 2023/8/14 16:58, Robin Dapp wrote:
> Hi Kewen,
> 
>> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?

Thanks for the comments!  Yeah, I think the expectation doesn't hold
on Power, as our vec_extract optab only support const index, that
is:

(define_expand "vec_extract<mode><VEC_base_l>"
  [(match_operand:<VEC_base> 0 "register_operand")
   (match_operand:VEC_E 1 "vlogical_operand")
   (match_operand 2 "const_int_operand")]
  "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
{
  rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
  DONE;
})

> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.

OK, it sounds that this new capability needs to further check with
function can_vec_extract_var_idx_p to ensure the ifn expanding work
as expected.  I re-spined by adding the below as your comments:

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 07f3717ed9d..80ba5cae84a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
               else if (convert_optab_handler (vec_extract_optab,
                                               TYPE_MODE (vectype),
                                               TYPE_MODE (TREE_TYPE (vectype)))
-                       != CODE_FOR_nothing)
+                         != CODE_FOR_nothing
+                       && can_vec_extract_var_idx_p (
+                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
                 vect_record_loop_len (loop_vinfo,
                                       &LOOP_VINFO_LENS (loop_vinfo),
                                       1, vectype, 1);

BR,
Kewen
juzhe.zhong@rivai.ai Aug. 14, 2023, 9:26 a.m. UTC | #7
-                       != CODE_FOR_nothing)
+                         != CODE_FOR_nothing
+                       && can_vec_extract_var_idx_p (
+                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))

I think maybe 'can_vec_extract_var_idx_p' check is enough and remove convert_optab_handler (vec_extract_optab,... check.
Looking forward Richi's more comments.

Thanks.


juzhe.zhong@rivai.ai
 
From: Kewen.Lin
Date: 2023-08-14 17:19
To: Robin Dapp
CC: gcc-patches; richard.sandiford; rguenther; juzhe.zhong@rivai.ai
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
Hi Robin,
 
on 2023/8/14 16:58, Robin Dapp wrote:
> Hi Kewen,
> 
>> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?
 
Thanks for the comments!  Yeah, I think the expectation doesn't hold
on Power, as our vec_extract optab only support const index, that
is:
 
(define_expand "vec_extract<mode><VEC_base_l>"
  [(match_operand:<VEC_base> 0 "register_operand")
   (match_operand:VEC_E 1 "vlogical_operand")
   (match_operand 2 "const_int_operand")]
  "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
{
  rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
  DONE;
})
 
> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.
 
OK, it sounds that this new capability needs to further check with
function can_vec_extract_var_idx_p to ensure the ifn expanding work
as expected.  I re-spined by adding the below as your comments:
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 07f3717ed9d..80ba5cae84a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
               else if (convert_optab_handler (vec_extract_optab,
                                               TYPE_MODE (vectype),
                                               TYPE_MODE (TREE_TYPE (vectype)))
-                       != CODE_FOR_nothing)
+                         != CODE_FOR_nothing
+                       && can_vec_extract_var_idx_p (
+                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
                 vect_record_loop_len (loop_vinfo,
                                       &LOOP_VINFO_LENS (loop_vinfo),
                                       1, vectype, 1);
 
BR,
Kewen
Richard Biener Aug. 14, 2023, 12:07 p.m. UTC | #8
On Mon, 14 Aug 2023, juzhe.zhong@rivai.ai wrote:

> -                       != CODE_FOR_nothing)
> +                         != CODE_FOR_nothing
> +                       && can_vec_extract_var_idx_p (
> +                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
> 
> I think maybe 'can_vec_extract_var_idx_p' check is enough and remove convert_optab_handler (vec_extract_optab,... check.
> Looking forward Richi's more comments.

Yes, I think can_vec_extract_var_idx_p already does that so no need to
duplicate it here.

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Kewen.Lin
> Date: 2023-08-14 17:19
> To: Robin Dapp
> CC: gcc-patches; richard.sandiford; rguenther; juzhe.zhong@rivai.ai
> Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
> Hi Robin,
>  
> on 2023/8/14 16:58, Robin Dapp wrote:
> > Hi Kewen,
> > 
> >> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
> > 
> > I think the problem is that just like for vec_set we're expecting
> > the vec_extract expander not to fail.  It is probably passed not a
> > const int here anymore and therefore fails to expand?
>  
> Thanks for the comments!  Yeah, I think the expectation doesn't hold
> on Power, as our vec_extract optab only support const index, that
> is:
>  
> (define_expand "vec_extract<mode><VEC_base_l>"
>   [(match_operand:<VEC_base> 0 "register_operand")
>    (match_operand:VEC_E 1 "vlogical_operand")
>    (match_operand 2 "const_int_operand")]
>   "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
> {
>   rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
>   DONE;
> })
>  
> > 
> > can_vec_extract_var_idx_p is supposed to check if the backend
> > supports extracting a variable index.
>  
> OK, it sounds that this new capability needs to further check with
> function can_vec_extract_var_idx_p to ensure the ifn expanding work
> as expected.  I re-spined by adding the below as your comments:
>  
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 07f3717ed9d..80ba5cae84a 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>                else if (convert_optab_handler (vec_extract_optab,
>                                                TYPE_MODE (vectype),
>                                                TYPE_MODE (TREE_TYPE (vectype)))
> -                       != CODE_FOR_nothing)
> +                         != CODE_FOR_nothing
> +                       && can_vec_extract_var_idx_p (
> +                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
>                  vect_record_loop_len (loop_vinfo,
>                                        &LOOP_VINFO_LENS (loop_vinfo),
>                                        1, vectype, 1);
>  
> BR,
> Kewen
>  
>
juzhe.zhong@rivai.ai Aug. 14, 2023, 12:08 p.m. UTC | #9
Hi, Kewin.

Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing?

Thanks.


juzhe.zhong@rivai.ai
 
From: Kewen.Lin
Date: 2023-08-14 17:19
To: Robin Dapp
CC: gcc-patches; richard.sandiford; rguenther; juzhe.zhong@rivai.ai
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
Hi Robin,
 
on 2023/8/14 16:58, Robin Dapp wrote:
> Hi Kewen,
> 
>> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
> 
> I think the problem is that just like for vec_set we're expecting
> the vec_extract expander not to fail.  It is probably passed not a
> const int here anymore and therefore fails to expand?
 
Thanks for the comments!  Yeah, I think the expectation doesn't hold
on Power, as our vec_extract optab only support const index, that
is:
 
(define_expand "vec_extract<mode><VEC_base_l>"
  [(match_operand:<VEC_base> 0 "register_operand")
   (match_operand:VEC_E 1 "vlogical_operand")
   (match_operand 2 "const_int_operand")]
  "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
{
  rs6000_expand_vector_extract (operands[0], operands[1], operands[2]);
  DONE;
})
 
> 
> can_vec_extract_var_idx_p is supposed to check if the backend
> supports extracting a variable index.
 
OK, it sounds that this new capability needs to further check with
function can_vec_extract_var_idx_p to ensure the ifn expanding work
as expected.  I re-spined by adding the below as your comments:
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 07f3717ed9d..80ba5cae84a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
               else if (convert_optab_handler (vec_extract_optab,
                                               TYPE_MODE (vectype),
                                               TYPE_MODE (TREE_TYPE (vectype)))
-                       != CODE_FOR_nothing)
+                         != CODE_FOR_nothing
+                       && can_vec_extract_var_idx_p (
+                         TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
                 vect_record_loop_len (loop_vinfo,
                                       &LOOP_VINFO_LENS (loop_vinfo),
                                       1, vectype, 1);
 
BR,
Kewen
Kewen.Lin Aug. 14, 2023, 12:45 p.m. UTC | #10
Hi Juzhe,

on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote:
> Hi, Kewin.
> 
> Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing?

The below diff was bootstrapped and regress-tested on Power10 LE.  Comparing to the
previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and
its required new includes as below:

+#include "memmodel.h"
+#include "optabs.h"
 
Could you have a double check?

Since I just tested it on Power10, you have the full ownership on the patch, I'd leave
the v5 posting to you.  Thanks!

BR,
Kewen
-----
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bc3063c3615..5ae9f69c7eb 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-pass.h"
 #include "ssa.h"
 #include "optabs-tree.h"
+#include "memmodel.h"
+#include "optabs.h"
 #include "diagnostic-core.h"
 #include "fold-const.h"
 #include "stor-layout.h"
@@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       /* No transformation required.  */
       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
 	{
-	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-					       OPTIMIZE_FOR_SPEED))
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "can't operate on partial vectors "
-				 "because the target doesn't support extract "
-				 "last reduction.\n");
-	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
-	    }
-	  else if (slp_node)
+	  if (slp_node)
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	  else
 	    {
 	      gcc_assert (ncopies == 1 && !slp_node);
-	      vect_record_loop_mask (loop_vinfo,
-				     &LOOP_VINFO_MASKS (loop_vinfo),
-				     1, vectype, NULL);
+	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+						  OPTIMIZE_FOR_SPEED))
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo),
+				       1, vectype, NULL);
+	      else if (can_vec_extract_var_idx_p (
+			 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
+		vect_record_loop_len (loop_vinfo,
+				      &LOOP_VINFO_LENS (loop_vinfo),
+				      1, vectype, 1);
+	      else
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (
+		      MSG_MISSED_OPTIMIZATION, vect_location,
+		      "can't operate on partial vectors "
+		      "because the target doesn't support extract "
+		      "last reduction.\n");
+		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+		}
 	    }
 	}
       /* ???  Enable for loop costing as well.  */
@@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo
+		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));

       /* Get the correct slp vectorized stmt.  */
       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,

       gimple_seq stmts = NULL;
       tree new_tree;
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	{
+	  /* Emit:
+
+	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+	     where VEC_LHS is the vectorized live-out result and MASK is
+	     the loop mask for the final iteration.  */
+	  gcc_assert (ncopies == 1 && !slp_node);
+	  gimple_seq tem = NULL;
+	  gimple_stmt_iterator gsi = gsi_last (tem);
+	  tree len
+	    = vect_get_loop_len (loop_vinfo, &gsi,
+				 &LOOP_VINFO_LENS (loop_vinfo),
+				 1, vectype, 0, 0);
+
+	  /* BIAS - 1.  */
+	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	  tree bias_minus_one
+	    = int_const_binop (MINUS_EXPR,
+			       build_int_cst (TREE_TYPE (len), biasval),
+			       build_one_cst (TREE_TYPE (len)));
+
+	  /* LAST_INDEX = LEN + (BIAS - 1).  */
+	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+					  len, bias_minus_one);
+
+	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+	  tree scalar_res
+	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+			    vec_lhs_phi, last_index);
+
+	  /* Convert the extracted vector element to the scalar type.  */
+	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+	}
+      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	{
 	  /* Emit:
juzhe.zhong@rivai.ai Aug. 14, 2023, 1:28 p.m. UTC | #11
Thanks Kewen.

But I saw there is 2 more files include:

+#include "memmodel.h"
+#include "optabs.h"

Not sure whether Richard and Richi ok with that change ?

Thanks.



juzhe.zhong@rivai.ai
 
From: Kewen.Lin
Date: 2023-08-14 20:45
To: juzhe.zhong@rivai.ai
CC: Robin Dapp; richard.sandiford; rguenther; GCC Patches
Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
Hi Juzhe,
 
on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote:
> Hi, Kewin.
> 
> Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing?
 
The below diff was bootstrapped and regress-tested on Power10 LE.  Comparing to the
previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and
its required new includes as below:
 
+#include "memmodel.h"
+#include "optabs.h"
Could you have a double check?
 
Since I just tested it on Power10, you have the full ownership on the patch, I'd leave
the v5 posting to you.  Thanks!
 
BR,
Kewen
-----
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bc3063c3615..5ae9f69c7eb 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
+#include "memmodel.h"
+#include "optabs.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
@@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       /* No transformation required.  */
       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
-   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-        OPTIMIZE_FOR_SPEED))
-     {
-       if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't operate on partial vectors "
- "because the target doesn't support extract "
- "last reduction.\n");
-       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
-     }
-   else if (slp_node)
+   if (slp_node)
    {
      if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
  else
    {
      gcc_assert (ncopies == 1 && !slp_node);
-       vect_record_loop_mask (loop_vinfo,
-      &LOOP_VINFO_MASKS (loop_vinfo),
-      1, vectype, NULL);
+       if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+   OPTIMIZE_FOR_SPEED))
+ vect_record_loop_mask (loop_vinfo,
+        &LOOP_VINFO_MASKS (loop_vinfo),
+        1, vectype, NULL);
+       else if (can_vec_extract_var_idx_p (
+ TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
+ vect_record_loop_len (loop_vinfo,
+       &LOOP_VINFO_LENS (loop_vinfo),
+       1, vectype, 1);
+       else
+ {
+   if (dump_enabled_p ())
+     dump_printf_loc (
+       MSG_MISSED_OPTIMIZATION, vect_location,
+       "can't operate on partial vectors "
+       "because the target doesn't support extract "
+       "last reduction.\n");
+   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
    }
}
       /* ???  Enable for loop costing as well.  */
@@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo
+   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
 
       /* Get the correct slp vectorized stmt.  */
       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 
       gimple_seq stmts = NULL;
       tree new_tree;
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+ {
+   /* Emit:
+
+        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+      where VEC_LHS is the vectorized live-out result and MASK is
+      the loop mask for the final iteration.  */
+   gcc_assert (ncopies == 1 && !slp_node);
+   gimple_seq tem = NULL;
+   gimple_stmt_iterator gsi = gsi_last (tem);
+   tree len
+     = vect_get_loop_len (loop_vinfo, &gsi,
+ &LOOP_VINFO_LENS (loop_vinfo),
+ 1, vectype, 0, 0);
+
+   /* BIAS - 1.  */
+   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+   tree bias_minus_one
+     = int_const_binop (MINUS_EXPR,
+        build_int_cst (TREE_TYPE (len), biasval),
+        build_one_cst (TREE_TYPE (len)));
+
+   /* LAST_INDEX = LEN + (BIAS - 1).  */
+   tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+   len, bias_minus_one);
+
+   /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+   tree scalar_res
+     = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+     vec_lhs_phi, last_index);
+
+   /* Convert the extracted vector element to the scalar type.  */
+   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+ }
+      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
{
  /* Emit:
Richard Biener Aug. 14, 2023, 1:44 p.m. UTC | #12
On Mon, 14 Aug 2023, ??? wrote:

> Thanks Kewen.
> 
> But I saw there is 2 more files include:
> 
> +#include "memmodel.h"
> +#include "optabs.h"
> 
> Not sure whether Richard and Richi ok with that change ?

Yes, please just apply some common sense.

> Thanks.
> 
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Kewen.Lin
> Date: 2023-08-14 20:45
> To: juzhe.zhong@rivai.ai
> CC: Robin Dapp; richard.sandiford; rguenther; GCC Patches
> Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization
> Hi Juzhe,
>  
> on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote:
> > Hi, Kewin.
> > 
> > Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing?
>  
> The below diff was bootstrapped and regress-tested on Power10 LE.  Comparing to the
> previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and
> its required new includes as below:
>  
> +#include "memmodel.h"
> +#include "optabs.h"
> Could you have a double check?
>  
> Since I just tested it on Power10, you have the full ownership on the patch, I'd leave
> the v5 posting to you.  Thanks!
>  
> BR,
> Kewen
> -----
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bc3063c3615..5ae9f69c7eb 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
> #include "tree-pass.h"
> #include "ssa.h"
> #include "optabs-tree.h"
> +#include "memmodel.h"
> +#include "optabs.h"
> #include "diagnostic-core.h"
> #include "fold-const.h"
> #include "stor-layout.h"
> @@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        /* No transformation required.  */
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> {
> -   if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -        OPTIMIZE_FOR_SPEED))
> -     {
> -       if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "can't operate on partial vectors "
> - "because the target doesn't support extract "
> - "last reduction.\n");
> -       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> -     }
> -   else if (slp_node)
> +   if (slp_node)
>     {
>       if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>   else
>     {
>       gcc_assert (ncopies == 1 && !slp_node);
> -       vect_record_loop_mask (loop_vinfo,
> -      &LOOP_VINFO_MASKS (loop_vinfo),
> -      1, vectype, NULL);
> +       if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> +   OPTIMIZE_FOR_SPEED))
> + vect_record_loop_mask (loop_vinfo,
> +        &LOOP_VINFO_MASKS (loop_vinfo),
> +        1, vectype, NULL);
> +       else if (can_vec_extract_var_idx_p (
> + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
> + vect_record_loop_len (loop_vinfo,
> +       &LOOP_VINFO_LENS (loop_vinfo),
> +       1, vectype, 1);
> +       else
> + {
> +   if (dump_enabled_p ())
> +     dump_printf_loc (
> +       MSG_MISSED_OPTIMIZATION, vect_location,
> +       "can't operate on partial vectors "
> +       "because the target doesn't support extract "
> +       "last reduction.\n");
> +   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> + }
>     }
> }
>        /* ???  Enable for loop costing as well.  */
> @@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> + {
> +   /* Emit:
> +
> +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> +      where VEC_LHS is the vectorized live-out result and MASK is
> +      the loop mask for the final iteration.  */
> +   gcc_assert (ncopies == 1 && !slp_node);
> +   gimple_seq tem = NULL;
> +   gimple_stmt_iterator gsi = gsi_last (tem);
> +   tree len
> +     = vect_get_loop_len (loop_vinfo, &gsi,
> + &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 0, 0);
> +
> +   /* BIAS - 1.  */
> +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +   tree bias_minus_one
> +     = int_const_binop (MINUS_EXPR,
> +        build_int_cst (TREE_TYPE (len), biasval),
> +        build_one_cst (TREE_TYPE (len)));
> +
> +   /* LAST_INDEX = LEN + (BIAS - 1).  */
> +   tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> +   len, bias_minus_one);
> +
> +   /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> +   tree scalar_res
> +     = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +     vec_lhs_phi, last_index);
> +
> +   /* Convert the extracted vector element to the scalar type.  */
> +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> + }
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> {
>   /* Emit:
>  
>
Stefan Schulze Frielinghaus Aug. 14, 2023, 6:51 p.m. UTC | #13
Hi everyone,

I have bootstrapped and regtested the patch below on s390.  For the
64-bit target I do not see any changes regarding the testsuite.  For the
31-bit target I see the following failures:

FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable

I've randomely picked pr50451.c and ran gcc against it which results in:

during GIMPLE pass: vect
dump file: pr50451.c.174t.vect
/gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ‘foo’:
/gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
0x1265d21 opt_mode<scalar_int_mode>::require() const
        /gcc-verify-workdir/patched/src/gcc/machmode.h:313
0x1d7e4e9 opt_mode<machine_mode>::require() const
        /gcc-verify-workdir/patched/src/gcc/vec.h:955
0x1d7e4e9 vect_verify_loop_lens
        /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
0x1da29ab vect_analyze_loop_2
        /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
0x1da40c7 vect_analyze_loop_1
        /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
        /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
0x1deed27 try_vectorize_loop_1
        /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
0x1deed27 try_vectorize_loop
        /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
0x1def5c1 execute
        /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
Please submit a full bug report, with preprocessed source (by using -freport-bug).
Please include the complete backtrace with any bug report.
See <https://gcc.gnu.org/bugs/> for instructions.

I will come back to this tomorrow.

Cheers,
Stefan

On Mon, Aug 14, 2023 at 08:45:21PM +0800, Kewen.Lin via Gcc-patches wrote:
> Hi Juzhe,
> 
> on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote:
> > Hi, Kewin.
> > 
> > Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing?
> 
> The below diff was bootstrapped and regress-tested on Power10 LE.  Comparing to the
> previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and
> its required new includes as below:
> 
> +#include "memmodel.h"
> +#include "optabs.h"
>  
> Could you have a double check?
> 
> Since I just tested it on Power10, you have the full ownership on the patch, I'd leave
> the v5 posting to you.  Thanks!
> 
> BR,
> Kewen
> -----
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bc3063c3615..5ae9f69c7eb 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -32,6 +32,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-pass.h"
>  #include "ssa.h"
>  #include "optabs-tree.h"
> +#include "memmodel.h"
> +#include "optabs.h"
>  #include "diagnostic-core.h"
>  #include "fold-const.h"
>  #include "stor-layout.h"
> @@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        /* No transformation required.  */
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  	{
> -	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -					       OPTIMIZE_FOR_SPEED))
> -	    {
> -	      if (dump_enabled_p ())
> -		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -				 "can't operate on partial vectors "
> -				 "because the target doesn't support extract "
> -				 "last reduction.\n");
> -	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> -	    }
> -	  else if (slp_node)
> +	  if (slp_node)
>  	    {
>  	      if (dump_enabled_p ())
>  		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  	  else
>  	    {
>  	      gcc_assert (ncopies == 1 && !slp_node);
> -	      vect_record_loop_mask (loop_vinfo,
> -				     &LOOP_VINFO_MASKS (loop_vinfo),
> -				     1, vectype, NULL);
> +	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> +						  OPTIMIZE_FOR_SPEED))
> +		vect_record_loop_mask (loop_vinfo,
> +				       &LOOP_VINFO_MASKS (loop_vinfo),
> +				       1, vectype, NULL);
> +	      else if (can_vec_extract_var_idx_p (
> +			 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
> +		vect_record_loop_len (loop_vinfo,
> +				      &LOOP_VINFO_LENS (loop_vinfo),
> +				      1, vectype, 1);
> +	      else
> +		{
> +		  if (dump_enabled_p ())
> +		    dump_printf_loc (
> +		      MSG_MISSED_OPTIMIZATION, vect_location,
> +		      "can't operate on partial vectors "
> +		      "because the target doesn't support extract "
> +		      "last reduction.\n");
> +		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +		}
>  	    }
>  	}
>        /* ???  Enable for loop costing as well.  */
> @@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> 
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> 
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +	{
> +	  /* Emit:
> +
> +	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> +	     where VEC_LHS is the vectorized live-out result and MASK is
> +	     the loop mask for the final iteration.  */
> +	  gcc_assert (ncopies == 1 && !slp_node);
> +	  gimple_seq tem = NULL;
> +	  gimple_stmt_iterator gsi = gsi_last (tem);
> +	  tree len
> +	    = vect_get_loop_len (loop_vinfo, &gsi,
> +				 &LOOP_VINFO_LENS (loop_vinfo),
> +				 1, vectype, 0, 0);
> +
> +	  /* BIAS - 1.  */
> +	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +	  tree bias_minus_one
> +	    = int_const_binop (MINUS_EXPR,
> +			       build_int_cst (TREE_TYPE (len), biasval),
> +			       build_one_cst (TREE_TYPE (len)));
> +
> +	  /* LAST_INDEX = LEN + (BIAS - 1).  */
> +	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> +					  len, bias_minus_one);
> +
> +	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> +	  tree scalar_res
> +	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +			    vec_lhs_phi, last_index);
> +
> +	  /* Convert the extracted vector element to the scalar type.  */
> +	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +	}
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	{
>  	  /* Emit:
Kewen.Lin Aug. 15, 2023, 6:25 a.m. UTC | #14
Hi Stefan,

on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote:
> Hi everyone,
> 
> I have bootstrapped and regtested the patch below on s390.  For the
> 64-bit target I do not see any changes regarding the testsuite.  For the
> 31-bit target I see the following failures:
> 
> FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
> FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
> FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
> FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
> FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
> FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
> FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
> FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
> FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
> UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
> UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
> UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
> UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
> UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
> UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable
> 
> I've randomely picked pr50451.c and ran gcc against it which results in:
> 
> during GIMPLE pass: vect
> dump file: pr50451.c.174t.vect
> /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ‘foo’:
> /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
> 0x1265d21 opt_mode<scalar_int_mode>::require() const
>         /gcc-verify-workdir/patched/src/gcc/machmode.h:313
> 0x1d7e4e9 opt_mode<machine_mode>::require() const
>         /gcc-verify-workdir/patched/src/gcc/vec.h:955
> 0x1d7e4e9 vect_verify_loop_lens
>         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
> 0x1da29ab vect_analyze_loop_2
>         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
> 0x1da40c7 vect_analyze_loop_1
>         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
> 0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
>         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
> 0x1deed27 try_vectorize_loop_1
>         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
> 0x1deed27 try_vectorize_loop
>         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
> 0x1def5c1 execute
>         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
> Please submit a full bug report, with preprocessed source (by using -freport-bug).
> Please include the complete backtrace with any bug report.
> See <https://gcc.gnu.org/bugs/> for instructions.
> 

It looks like s390 supports variable index vec_extract at -m31 but
no vector with length.  It seems we need to further check the vector
with length capability, with something like:

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5ae9f69c7eb..ef754467baf 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
                 vect_record_loop_mask (loop_vinfo,
                                        &LOOP_VINFO_MASKS (loop_vinfo),
                                        1, vectype, NULL);
-              else if (can_vec_extract_var_idx_p (
+              else if (get_len_load_store_mode (TYPE_MODE (vectype), true)
+                         .exists ()
+                       && can_vec_extract_var_idx_p (
                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
-                vect_record_loop_len (loop_vinfo,
-                                      &LOOP_VINFO_LENS (loop_vinfo),
+                vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
                                       1, vectype, 1);
               else
                 {

sigh, the formatting looks odd.

BR,
Kewen
Richard Biener Aug. 15, 2023, 6:58 a.m. UTC | #15
On Tue, 15 Aug 2023, Kewen.Lin wrote:

> Hi Stefan,
> 
> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote:
> > Hi everyone,
> > 
> > I have bootstrapped and regtested the patch below on s390.  For the
> > 64-bit target I do not see any changes regarding the testsuite.  For the
> > 31-bit target I see the following failures:
> > 
> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable
> > 
> > I've randomely picked pr50451.c and ran gcc against it which results in:
> > 
> > during GIMPLE pass: vect
> > dump file: pr50451.c.174t.vect
> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?:
> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
> > 0x1265d21 opt_mode<scalar_int_mode>::require() const
> >         /gcc-verify-workdir/patched/src/gcc/machmode.h:313
> > 0x1d7e4e9 opt_mode<machine_mode>::require() const
> >         /gcc-verify-workdir/patched/src/gcc/vec.h:955
> > 0x1d7e4e9 vect_verify_loop_lens
> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
> > 0x1da29ab vect_analyze_loop_2
> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
> > 0x1da40c7 vect_analyze_loop_1
> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
> > 0x1deed27 try_vectorize_loop_1
> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
> > 0x1deed27 try_vectorize_loop
> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
> > 0x1def5c1 execute
> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
> > Please submit a full bug report, with preprocessed source (by using -freport-bug).
> > Please include the complete backtrace with any bug report.
> > See <https://gcc.gnu.org/bugs/> for instructions.
> > 
> 
> It looks like s390 supports variable index vec_extract at -m31 but
> no vector with length.  It seems we need to further check the vector
> with length capability, with something like:
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 5ae9f69c7eb..ef754467baf 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>                  vect_record_loop_mask (loop_vinfo,
>                                         &LOOP_VINFO_MASKS (loop_vinfo),
>                                         1, vectype, NULL);
> -              else if (can_vec_extract_var_idx_p (
> +              else if (get_len_load_store_mode (TYPE_MODE (vectype), true)
> +                         .exists ()
> +                       && can_vec_extract_var_idx_p (
>                           TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
> -                vect_record_loop_len (loop_vinfo,
> -                                      &LOOP_VINFO_LENS (loop_vinfo),
> +                vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
>                                        1, vectype, 1);
>                else
>                  {
> 
> sigh, the formatting looks odd.

I think the error is in vect_verify_loop_lens which assumes that
when we record _any_ length related op the target has to support
both len_load and len_store.  Now that we have many other _len
functions that's certainly not true.

Instead a vect_verify_loop_lens-local "fix" would be to not use
.require () but instead when !.exists () simply return false.
That would still effectively require both len-load and len-store
for any -len predicated loop, but at least avoid the ICE.

Richard.
Richard Sandiford Aug. 15, 2023, 8:52 a.m. UTC | #16
Richard Biener <rguenther@suse.de> writes:
> On Tue, 15 Aug 2023, Kewen.Lin wrote:
>
>> Hi Stefan,
>> 
>> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote:
>> > Hi everyone,
>> > 
>> > I have bootstrapped and regtested the patch below on s390.  For the
>> > 64-bit target I do not see any changes regarding the testsuite.  For the
>> > 31-bit target I see the following failures:
>> > 
>> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
>> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
>> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
>> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
>> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
>> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
>> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
>> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
>> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable
>> > 
>> > I've randomely picked pr50451.c and ran gcc against it which results in:
>> > 
>> > during GIMPLE pass: vect
>> > dump file: pr50451.c.174t.vect
>> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?:
>> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
>> > 0x1265d21 opt_mode<scalar_int_mode>::require() const
>> >         /gcc-verify-workdir/patched/src/gcc/machmode.h:313
>> > 0x1d7e4e9 opt_mode<machine_mode>::require() const
>> >         /gcc-verify-workdir/patched/src/gcc/vec.h:955
>> > 0x1d7e4e9 vect_verify_loop_lens
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
>> > 0x1da29ab vect_analyze_loop_2
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
>> > 0x1da40c7 vect_analyze_loop_1
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
>> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
>> > 0x1deed27 try_vectorize_loop_1
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
>> > 0x1deed27 try_vectorize_loop
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
>> > 0x1def5c1 execute
>> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
>> > Please submit a full bug report, with preprocessed source (by using -freport-bug).
>> > Please include the complete backtrace with any bug report.
>> > See <https://gcc.gnu.org/bugs/> for instructions.
>> > 
>> 
>> It looks like s390 supports variable index vec_extract at -m31 but
>> no vector with length.  It seems we need to further check the vector
>> with length capability, with something like:
>> 
>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> index 5ae9f69c7eb..ef754467baf 100644
>> --- a/gcc/tree-vect-loop.cc
>> +++ b/gcc/tree-vect-loop.cc
>> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>>                  vect_record_loop_mask (loop_vinfo,
>>                                         &LOOP_VINFO_MASKS (loop_vinfo),
>>                                         1, vectype, NULL);
>> -              else if (can_vec_extract_var_idx_p (
>> +              else if (get_len_load_store_mode (TYPE_MODE (vectype), true)
>> +                         .exists ()
>> +                       && can_vec_extract_var_idx_p (
>>                           TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
>> -                vect_record_loop_len (loop_vinfo,
>> -                                      &LOOP_VINFO_LENS (loop_vinfo),
>> +                vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
>>                                        1, vectype, 1);
>>                else
>>                  {
>> 
>> sigh, the formatting looks odd.
>
> I think the error is in vect_verify_loop_lens which assumes that
> when we record _any_ length related op the target has to support
> both len_load and len_store.  Now that we have many other _len
> functions that's certainly not true.
>
> Instead a vect_verify_loop_lens-local "fix" would be to not use
> .require () but instead when !.exists () simply return false.
> That would still effectively require both len-load and len-store
> for any -len predicated loop, but at least avoid the ICE.

Yeah, agree that would be the simplest workaround.  But I think
instead we should require vectorizable_load and vectorizable_store
to record the bias that they want to use (perhaps in a hash_set?).
Then vect_verify_loop_lens can return false if the set has more
than one element.  It can use a bias of 0 if the set is empty.

Thanks,
Richard
Richard Biener Aug. 15, 2023, 8:55 a.m. UTC | #17
On Tue, 15 Aug 2023, Richard Sandiford wrote:

> Richard Biener <rguenther@suse.de> writes:
> > On Tue, 15 Aug 2023, Kewen.Lin wrote:
> >
> >> Hi Stefan,
> >> 
> >> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote:
> >> > Hi everyone,
> >> > 
> >> > I have bootstrapped and regtested the patch below on s390.  For the
> >> > 64-bit target I do not see any changes regarding the testsuite.  For the
> >> > 31-bit target I see the following failures:
> >> > 
> >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
> >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
> >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
> >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
> >> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
> >> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
> >> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
> >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
> >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
> >> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable
> >> > 
> >> > I've randomely picked pr50451.c and ran gcc against it which results in:
> >> > 
> >> > during GIMPLE pass: vect
> >> > dump file: pr50451.c.174t.vect
> >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?:
> >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
> >> > 0x1265d21 opt_mode<scalar_int_mode>::require() const
> >> >         /gcc-verify-workdir/patched/src/gcc/machmode.h:313
> >> > 0x1d7e4e9 opt_mode<machine_mode>::require() const
> >> >         /gcc-verify-workdir/patched/src/gcc/vec.h:955
> >> > 0x1d7e4e9 vect_verify_loop_lens
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
> >> > 0x1da29ab vect_analyze_loop_2
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
> >> > 0x1da40c7 vect_analyze_loop_1
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
> >> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
> >> > 0x1deed27 try_vectorize_loop_1
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
> >> > 0x1deed27 try_vectorize_loop
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
> >> > 0x1def5c1 execute
> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
> >> > Please submit a full bug report, with preprocessed source (by using -freport-bug).
> >> > Please include the complete backtrace with any bug report.
> >> > See <https://gcc.gnu.org/bugs/> for instructions.
> >> > 
> >> 
> >> It looks like s390 supports variable index vec_extract at -m31 but
> >> no vector with length.  It seems we need to further check the vector
> >> with length capability, with something like:
> >> 
> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> >> index 5ae9f69c7eb..ef754467baf 100644
> >> --- a/gcc/tree-vect-loop.cc
> >> +++ b/gcc/tree-vect-loop.cc
> >> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> >>                  vect_record_loop_mask (loop_vinfo,
> >>                                         &LOOP_VINFO_MASKS (loop_vinfo),
> >>                                         1, vectype, NULL);
> >> -              else if (can_vec_extract_var_idx_p (
> >> +              else if (get_len_load_store_mode (TYPE_MODE (vectype), true)
> >> +                         .exists ()
> >> +                       && can_vec_extract_var_idx_p (
> >>                           TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
> >> -                vect_record_loop_len (loop_vinfo,
> >> -                                      &LOOP_VINFO_LENS (loop_vinfo),
> >> +                vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
> >>                                        1, vectype, 1);
> >>                else
> >>                  {
> >> 
> >> sigh, the formatting looks odd.
> >
> > I think the error is in vect_verify_loop_lens which assumes that
> > when we record _any_ length related op the target has to support
> > both len_load and len_store.  Now that we have many other _len
> > functions that's certainly not true.
> >
> > Instead a vect_verify_loop_lens-local "fix" would be to not use
> > .require () but instead when !.exists () simply return false.
> > That would still effectively require both len-load and len-store
> > for any -len predicated loop, but at least avoid the ICE.
> 
> Yeah, agree that would be the simplest workaround.  But I think
> instead we should require vectorizable_load and vectorizable_store
> to record the bias that they want to use (perhaps in a hash_set?).
> Then vect_verify_loop_lens can return false if the set has more
> than one element.  It can use a bias of 0 if the set is empty.

But with all the other _LEN fns now also having a bias, never
quering it but using the one from len_{load,store} having
that supported looks like a requirement?  OTOH if we would require
querying it for each used _LEN fn then supporting multiple
different biases wouldn't be an issue either?

Richard.
Richard Sandiford Aug. 15, 2023, 9:05 a.m. UTC | #18
Richard Biener <rguenther@suse.de> writes:
> On Tue, 15 Aug 2023, Richard Sandiford wrote:
>
>> Richard Biener <rguenther@suse.de> writes:
>> > On Tue, 15 Aug 2023, Kewen.Lin wrote:
>> >
>> >> Hi Stefan,
>> >> 
>> >> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote:
>> >> > Hi everyone,
>> >> > 
>> >> > I have bootstrapped and regtested the patch below on s390.  For the
>> >> > 64-bit target I do not see any changes regarding the testsuite.  For the
>> >> > 31-bit target I see the following failures:
>> >> > 
>> >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors)
>> >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors)
>> >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313)
>> >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors)
>> >> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable
>> >> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects  scan-tree-dump-times optimized "\\* 10" 2
>> >> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2
>> >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable
>> >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable
>> >> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable
>> >> > 
>> >> > I've randomely picked pr50451.c and ran gcc against it which results in:
>> >> > 
>> >> > during GIMPLE pass: vect
>> >> > dump file: pr50451.c.174t.vect
>> >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?:
>> >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313
>> >> > 0x1265d21 opt_mode<scalar_int_mode>::require() const
>> >> >         /gcc-verify-workdir/patched/src/gcc/machmode.h:313
>> >> > 0x1d7e4e9 opt_mode<machine_mode>::require() const
>> >> >         /gcc-verify-workdir/patched/src/gcc/vec.h:955
>> >> > 0x1d7e4e9 vect_verify_loop_lens
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471
>> >> > 0x1da29ab vect_analyze_loop_2
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929
>> >> > 0x1da40c7 vect_analyze_loop_1
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330
>> >> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*)
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484
>> >> > 0x1deed27 try_vectorize_loop_1
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064
>> >> > 0x1deed27 try_vectorize_loop
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180
>> >> > 0x1def5c1 execute
>> >> >         /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296
>> >> > Please submit a full bug report, with preprocessed source (by using -freport-bug).
>> >> > Please include the complete backtrace with any bug report.
>> >> > See <https://gcc.gnu.org/bugs/> for instructions.
>> >> > 
>> >> 
>> >> It looks like s390 supports variable index vec_extract at -m31 but
>> >> no vector with length.  It seems we need to further check the vector
>> >> with length capability, with something like:
>> >> 
>> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> >> index 5ae9f69c7eb..ef754467baf 100644
>> >> --- a/gcc/tree-vect-loop.cc
>> >> +++ b/gcc/tree-vect-loop.cc
>> >> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>> >>                  vect_record_loop_mask (loop_vinfo,
>> >>                                         &LOOP_VINFO_MASKS (loop_vinfo),
>> >>                                         1, vectype, NULL);
>> >> -              else if (can_vec_extract_var_idx_p (
>> >> +              else if (get_len_load_store_mode (TYPE_MODE (vectype), true)
>> >> +                         .exists ()
>> >> +                       && can_vec_extract_var_idx_p (
>> >>                           TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
>> >> -                vect_record_loop_len (loop_vinfo,
>> >> -                                      &LOOP_VINFO_LENS (loop_vinfo),
>> >> +                vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
>> >>                                        1, vectype, 1);
>> >>                else
>> >>                  {
>> >> 
>> >> sigh, the formatting looks odd.
>> >
>> > I think the error is in vect_verify_loop_lens which assumes that
>> > when we record _any_ length related op the target has to support
>> > both len_load and len_store.  Now that we have many other _len
>> > functions that's certainly not true.
>> >
>> > Instead a vect_verify_loop_lens-local "fix" would be to not use
>> > .require () but instead when !.exists () simply return false.
>> > That would still effectively require both len-load and len-store
>> > for any -len predicated loop, but at least avoid the ICE.
>> 
>> Yeah, agree that would be the simplest workaround.  But I think
>> instead we should require vectorizable_load and vectorizable_store
>> to record the bias that they want to use (perhaps in a hash_set?).
>> Then vect_verify_loop_lens can return false if the set has more
>> than one element.  It can use a bias of 0 if the set is empty.
>
> But with all the other _LEN fns now also having a bias, never
> quering it but using the one from len_{load,store} having
> that supported looks like a requirement?  OTOH if we would require
> querying it for each used _LEN fn then supporting multiple
> different biases wouldn't be an issue either?

Yeah, the OTOH is what I think we should do.  The bias stuff dates
back to when loads and stores were the only LEN_* operations.  The reason
it checks both loads and stores is to ensure consistency between "all"
length operations.

If instead we took the position that LEN_LOAD determines the biases
for everything, we wouldn't need to check LEN_STORE biases.

But the biases need to be consistent because they're built into the IVs.
In principle we could create a different LEN input for each bias,
but that's pointless when no target needs it.

Thanks,
Richard
diff mbox series

Patch

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bf8d677b584..a011e2dacb2 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10278,17 +10278,7 @@  vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       /* No transformation required.  */
       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
 	{
-	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-					       OPTIMIZE_FOR_SPEED))
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "can't operate on partial vectors "
-				 "because the target doesn't support extract "
-				 "last reduction.\n");
-	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
-	    }
-	  else if (slp_node)
+	  if (slp_node)
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10308,9 +10298,28 @@  vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	  else
 	    {
 	      gcc_assert (ncopies == 1 && !slp_node);
-	      vect_record_loop_mask (loop_vinfo,
-				     &LOOP_VINFO_MASKS (loop_vinfo),
-				     1, vectype, NULL);
+	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+						  OPTIMIZE_FOR_SPEED))
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo),
+				       1, vectype, NULL);
+	      else if (convert_optab_handler (vec_extract_optab,
+					      TYPE_MODE (vectype),
+					      TYPE_MODE (TREE_TYPE (vectype)))
+		       != CODE_FOR_nothing)
+		vect_record_loop_len (loop_vinfo,
+				      &LOOP_VINFO_LENS (loop_vinfo),
+				      1, vectype, 1);
+	      else
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (
+		      MSG_MISSED_OPTIMIZATION, vect_location,
+		      "can't operate on partial vectors "
+		      "because the target doesn't support extract "
+		      "last reduction.\n");
+		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+		}
 	    }
 	}
       /* ???  Enable for loop costing as well.  */
@@ -10336,7 +10345,9 @@  vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo
+		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
 
       /* Get the correct slp vectorized stmt.  */
       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10380,7 +10391,42 @@  vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 
       gimple_seq stmts = NULL;
       tree new_tree;
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	{
+	  /* Emit:
+
+	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+	     where VEC_LHS is the vectorized live-out result and MASK is
+	     the loop mask for the final iteration.  */
+	  gcc_assert (ncopies == 1 && !slp_node);
+	  gimple_seq tem = NULL;
+	  gimple_stmt_iterator gsi = gsi_last (tem);
+	  tree len
+	    = vect_get_loop_len (loop_vinfo, &gsi,
+				 &LOOP_VINFO_LENS (loop_vinfo),
+				 1, vectype, 0, 0);
+
+	  /* BIAS - 1.  */
+	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	  tree bias_minus_one
+	    = int_const_binop (MINUS_EXPR,
+			       build_int_cst (TREE_TYPE (len), biasval),
+			       build_one_cst (TREE_TYPE (len)));
+
+	  /* LAST_INDEX = LEN + (BIAS - 1).  */
+	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+					  len, bias_minus_one);
+
+	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+	  tree scalar_res
+	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+			    vec_lhs_phi, last_index);
+
+	  /* Convert the extracted vector element to the scalar type.  */
+	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+	}
+      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	{
 	  /* Emit: