diff mbox series

VECT: Support loop len control on EXTRACT_LAST vectorization

Message ID 20230809063622.316743-1-juzhe.zhong@rivai.ai
State New
Headers show
Series VECT: Support loop len control on EXTRACT_LAST vectorization | expand

Commit Message

juzhe.zhong@rivai.ai Aug. 9, 2023, 6:36 a.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, this patch is adding loop len control on extract_last autovectorization.

Consider this following case:

#include <stdint.h>

#define EXTRACT_LAST(TYPE)			\
  TYPE __attribute__ ((noinline, noclone))	\
  test_##TYPE (TYPE *x, int n, TYPE value)	\
  {						\
    TYPE last;					\
    for (int j = 0; j < n; ++j)			\
      {						\
	last = x[j];				\
	x[j] = last * value;			\
      }						\
    return last;				\
  }

#define TEST_ALL(T)				\
  T (uint8_t)					\

TEST_ALL (EXTRACT_LAST)

ARM SVE IR:

Preheader:
  max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });

Loop:
  ...
  # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
  ...
  vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
  vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
  .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
  ...
  next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
  ...

Epilogue:
  _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);

For RVV since we prefer len in loop control, after this patch for RVV:

Loop:
  ...
  loop_len_22 = SELECT_VL;
  vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
  vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
  .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
  ...

Epilogue:
  _25 = .EXTRACT_LAST (loop_len_22, vect_last_12.8_23);

This patch does not add a new pattern for length-based loop control of extract_last.
Instead, we reuse the current extract_last pattern.

Here is the code:

Step 1 - Enable length and record length for extract_last:

+	      machine_mode vec_mode = TYPE_MODE (vectype);
+	      if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode))
+		vect_record_loop_len (loop_vinfo,
+				      &LOOP_VINFO_LENS (loop_vinfo), 1,
+				      vectype, 1);
+	      else
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo), 1,
+				       vectype, NULL);

We use 'get_len_load_store_mode' to check whether targets support loop len control or not.
If yes, record a loop len.

Step 2 - Build EXTRACT_LAST with len:

-	  tree mask = vect_get_loop_mask (loop_vinfo, gsi,
-					  &LOOP_VINFO_MASKS (loop_vinfo),
-					  1, vectype, 0);
+	  tree control;
+	  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	    control = vect_get_loop_len (loop_vinfo, gsi,
+					 &LOOP_VINFO_LENS (loop_vinfo), 1,
+					 vectype, 0, 0);
+	  else
+	    control = vect_get_loop_mask (loop_vinfo, gsi,
+					  &LOOP_VINFO_MASKS (loop_vinfo), 1,
+					  vectype, 0);
 	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-					  mask, vec_lhs_phi);
+					  control, vec_lhs_phi);

Reuse the current code (building EXTRACT_LAST with a mask), but build a length instead
when 'LOOP_VINFO_FULLY_WITH_LENGTH_P' is true.

This patch has been fully tested in RISC-V port.

Bootstrap and Regression on X86 passed.

Ok for trunk ?

gcc/ChangeLog:

        * tree-vect-loop.cc (vectorizable_live_operation): Add length control.

---
 gcc/tree-vect-loop.cc | 40 ++++++++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 12 deletions(-)

Comments

Richard Biener Aug. 9, 2023, 11 a.m. UTC | #1
On Wed, 9 Aug 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, this patch is adding loop len control on extract_last autovectorization.
> 
> Consider this following case:
> 
> #include <stdint.h>
> 
> #define EXTRACT_LAST(TYPE)			\
>   TYPE __attribute__ ((noinline, noclone))	\
>   test_##TYPE (TYPE *x, int n, TYPE value)	\
>   {						\
>     TYPE last;					\
>     for (int j = 0; j < n; ++j)			\
>       {						\
> 	last = x[j];				\
> 	x[j] = last * value;			\
>       }						\
>     return last;				\
>   }
> 
> #define TEST_ALL(T)				\
>   T (uint8_t)					\
> 
> TEST_ALL (EXTRACT_LAST)
> 
> ARM SVE IR:
> 
> Preheader:
>   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> 
> Loop:
>   ...
>   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>   ...
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>   ...
>   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> 
> For RVV since we prefer len in loop control, after this patch for RVV:
> 
> Loop:
>   ...
>   loop_len_22 = SELECT_VL;
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_len_22, vect_last_12.8_23);
> 
> This patch didn't add a new pattern for length loop control of extract_last.
> Instead we reuse current extract_last.
> 
> Here is the code:
> 
> Step 1 - Enable length and record length for extract_last:
> 
> +	      machine_mode vec_mode = TYPE_MODE (vectype);
> +	      if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode))
> +		vect_record_loop_len (loop_vinfo,
> +				      &LOOP_VINFO_LENS (loop_vinfo), 1,
> +				      vectype, 1);
> +	      else
> +		vect_record_loop_mask (loop_vinfo,
> +				       &LOOP_VINFO_MASKS (loop_vinfo), 1,
> +				       vectype, NULL);
> 
> We use 'get_len_load_store_mode' to check whether targets support loop len control or not.
> If yes, record a loop len.
> 
> Step 2 - Build EXTRACT_LAST with len:
> 
> -	  tree mask = vect_get_loop_mask (loop_vinfo, gsi,
> -					  &LOOP_VINFO_MASKS (loop_vinfo),
> -					  1, vectype, 0);
> +	  tree control;
> +	  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +	    control = vect_get_loop_len (loop_vinfo, gsi,
> +					 &LOOP_VINFO_LENS (loop_vinfo), 1,
> +					 vectype, 0, 0);
> +	  else
> +	    control = vect_get_loop_mask (loop_vinfo, gsi,
> +					  &LOOP_VINFO_MASKS (loop_vinfo), 1,
> +					  vectype, 0);
>  	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> -					  mask, vec_lhs_phi);
> +					  control, vec_lhs_phi);
> 
> Reuse the current codes (build EXTRACT_LAST with mask), build length instead if
> 'LOOP_VINFO_FULLY_WITH_LENGTH_P' is true.
> 
> This patch has been fully tested in RISC-V port.
> 
> Bootstrap and Regression on X86 passed.
> 
> Ok for trunk ?
> 
> gcc/ChangeLog:
> 
>         * tree-vect-loop.cc (vectorizable_live_operation): Add length control.
> 
> ---
>  gcc/tree-vect-loop.cc | 40 ++++++++++++++++++++++++++++------------
>  1 file changed, 28 insertions(+), 12 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 00058c3c13e..fde098cafde 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10311,9 +10311,15 @@ vectorizable_live_operation (vec_info *vinfo,
>  	  else
>  	    {
>  	      gcc_assert (ncopies == 1 && !slp_node);
> -	      vect_record_loop_mask (loop_vinfo,
> -				     &LOOP_VINFO_MASKS (loop_vinfo),
> -				     1, vectype, NULL);
> +	      machine_mode vec_mode = TYPE_MODE (vectype);
> +	      if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode))
> +		vect_record_loop_len (loop_vinfo,
> +				      &LOOP_VINFO_LENS (loop_vinfo), 1,
> +				      vectype, 1);
> +	      else
> +		vect_record_loop_mask (loop_vinfo,
> +				       &LOOP_VINFO_MASKS (loop_vinfo), 1,
> +				       vectype, NULL);
>  	    }
>  	}
>        /* ???  Enable for loop costing as well.  */
> @@ -10339,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +		  || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +		  || !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo));

that should be

  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 
      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))

I think.  It seems to imply that SLP isn't supported with 
masking/lengthing.

>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10383,21 +10391,29 @@ vectorizable_live_operation (vec_info *vinfo,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  /* Emit:
>  
> -	       SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> +	       SCALAR_RES = EXTRACT_LAST <VEC_LHS, CONTROL>
>  
> -	     where VEC_LHS is the vectorized live-out result and MASK is
> -	     the loop mask for the final iteration.  */
> +	     where VEC_LHS is the vectorized live-out result and CONTROL can
> +	     be either the loop mask for the final iteration or the loop len
> +	     for the final iteration.  */
>  	  gcc_assert (ncopies == 1 && !slp_node);
>  	  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> -	  tree mask = vect_get_loop_mask (loop_vinfo, gsi,
> -					  &LOOP_VINFO_MASKS (loop_vinfo),
> -					  1, vectype, 0);
> +	  tree control;
> +	  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +	    control = vect_get_loop_len (loop_vinfo, gsi,
> +					 &LOOP_VINFO_LENS (loop_vinfo), 1,
> +					 vectype, 0, 0);
> +	  else
> +	    control = vect_get_loop_mask (loop_vinfo, gsi,
> +					  &LOOP_VINFO_MASKS (loop_vinfo), 1,
> +					  vectype, 0);
>  	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> -					  mask, vec_lhs_phi);
> +					  control, vec_lhs_phi);

Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
Don't you need some CFN_LEN_EXTRACT_LAST instead?

>  	  /* Convert the extracted vector element to the scalar type.  */
>  	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> 

Otherwise looks OK to me.

Thanks,
Richard.
juzhe.zhong@rivai.ai Aug. 9, 2023, 11:15 a.m. UTC | #2
Hi, Richi.

>> that should be

>>   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
>>       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))

>> I think.  It seems to imply that SLP isn't supported with
>> masking/lengthing.

Oh, yes.  At first glance, the original code is quite suspicious and your comments make sense to me.

>> Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
>> Don't you need some CFN_LEN_EXTRACT_LAST instead?

I think CFN_EXTRACT_LAST always has either loop mask or loop len.

When both mask and length are not needed, IMHO, I think current BIT_FIELD_REF flow is good enough:
https://godbolt.org/z/Yr5M9hcc6

So I think we don't need CFN_LEN_EXTRACT_LAST. 

Instead, I think we will need CFN_LEN_FOLD_EXTRACT_LAST in the next patch.

Feel free to correct me if I am wrong.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-09 19:00
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH] VECT: Support loop len control on EXTRACT_LAST vectorization
On Wed, 9 Aug 2023, juzhe.zhong@rivai.ai wrote:
 
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, this patch is adding loop len control on extract_last autovectorization.
> 
> Consider this following case:
> 
> #include <stdint.h>
> 
> #define EXTRACT_LAST(TYPE) \
>   TYPE __attribute__ ((noinline, noclone)) \
>   test_##TYPE (TYPE *x, int n, TYPE value) \
>   { \
>     TYPE last; \
>     for (int j = 0; j < n; ++j) \
>       { \
> last = x[j]; \
> x[j] = last * value; \
>       } \
>     return last; \
>   }
> 
> #define TEST_ALL(T) \
>   T (uint8_t) \
> 
> TEST_ALL (EXTRACT_LAST)
> 
> ARM SVE IR:
> 
> Preheader:
>   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> 
> Loop:
>   ...
>   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>   ...
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>   ...
>   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> 
> For RVV since we prefer len in loop control, after this patch for RVV:
> 
> Loop:
>   ...
>   loop_len_22 = SELECT_VL;
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_len_22, vect_last_12.8_23);
> 
> This patch didn't add a new pattern for length loop control of extract_last.
> Instead we reuse current extract_last.
> 
> Here is the code:
> 
> Step 1 - Enable length and record length for extract_last:
> 
> +       machine_mode vec_mode = TYPE_MODE (vectype);
> +       if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode))
> + vect_record_loop_len (loop_vinfo,
> +       &LOOP_VINFO_LENS (loop_vinfo), 1,
> +       vectype, 1);
> +       else
> + vect_record_loop_mask (loop_vinfo,
> +        &LOOP_VINFO_MASKS (loop_vinfo), 1,
> +        vectype, NULL);
> 
> We use 'get_len_load_store_mode' to check whether targets support loop len control or not.
> If yes, record a loop len.
> 
> Step 2 - Build EXTRACT_LAST with len:
> 
> -   tree mask = vect_get_loop_mask (loop_vinfo, gsi,
> -   &LOOP_VINFO_MASKS (loop_vinfo),
> -   1, vectype, 0);
> +   tree control;
> +   if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +     control = vect_get_loop_len (loop_vinfo, gsi,
> + &LOOP_VINFO_LENS (loop_vinfo), 1,
> + vectype, 0, 0);
> +   else
> +     control = vect_get_loop_mask (loop_vinfo, gsi,
> +   &LOOP_VINFO_MASKS (loop_vinfo), 1,
> +   vectype, 0);
>    tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> -   mask, vec_lhs_phi);
> +   control, vec_lhs_phi);
> 
> Reuse the current codes (build EXTRACT_LAST with mask), build length instead if
> 'LOOP_VINFO_FULLY_WITH_LENGTH_P' is true.
> 
> This patch has been fully tested in RISC-V port.
> 
> Bootstrap and Regression on X86 passed.
> 
> Ok for trunk ?
> 
> gcc/ChangeLog:
> 
>         * tree-vect-loop.cc (vectorizable_live_operation): Add length control.
> 
> ---
>  gcc/tree-vect-loop.cc | 40 ++++++++++++++++++++++++++++------------
>  1 file changed, 28 insertions(+), 12 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 00058c3c13e..fde098cafde 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10311,9 +10311,15 @@ vectorizable_live_operation (vec_info *vinfo,
>    else
>      {
>        gcc_assert (ncopies == 1 && !slp_node);
> -       vect_record_loop_mask (loop_vinfo,
> -      &LOOP_VINFO_MASKS (loop_vinfo),
> -      1, vectype, NULL);
> +       machine_mode vec_mode = TYPE_MODE (vectype);
> +       if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode))
> + vect_record_loop_len (loop_vinfo,
> +       &LOOP_VINFO_LENS (loop_vinfo), 1,
> +       vectype, 1);
> +       else
> + vect_record_loop_mask (loop_vinfo,
> +        &LOOP_VINFO_MASKS (loop_vinfo), 1,
> +        vectype, NULL);
>      }
>  }
>        /* ???  Enable for loop costing as well.  */
> @@ -10339,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +   || !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo));
 
that should be
 
  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 
      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
 
I think.  It seems to imply that SLP isn't supported with 
masking/lengthing.
 
>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10383,21 +10391,29 @@ vectorizable_live_operation (vec_info *vinfo,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +   || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  {
>    /* Emit:
>  
> -        SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> +        SCALAR_RES = EXTRACT_LAST <VEC_LHS, CONTROL>
>  
> -      where VEC_LHS is the vectorized live-out result and MASK is
> -      the loop mask for the final iteration.  */
> +      where VEC_LHS is the vectorized live-out result and CONTROL can
> +      be either the loop mask for the final iteration or the loop len
> +      for the final iteration.  */
>    gcc_assert (ncopies == 1 && !slp_node);
>    tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> -   tree mask = vect_get_loop_mask (loop_vinfo, gsi,
> -   &LOOP_VINFO_MASKS (loop_vinfo),
> -   1, vectype, 0);
> +   tree control;
> +   if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +     control = vect_get_loop_len (loop_vinfo, gsi,
> + &LOOP_VINFO_LENS (loop_vinfo), 1,
> + vectype, 0, 0);
> +   else
> +     control = vect_get_loop_mask (loop_vinfo, gsi,
> +   &LOOP_VINFO_MASKS (loop_vinfo), 1,
> +   vectype, 0);
>    tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> -   mask, vec_lhs_phi);
> +   control, vec_lhs_phi);
 
Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
Don't you need some CFN_LEN_EXTRACT_LAST instead?
 
>    /* Convert the extracted vector element to the scalar type.  */
>    new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> 
 
Otherwise looks OK to me.
 
Thanks,
Richard.
Richard Biener Aug. 9, 2023, 12:21 p.m. UTC | #3
On Wed, 9 Aug 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.
> 
> >> that should be
> 
> >>   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> >>       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> 
> >> I think.  It seems to imply that SLP isn't supported with
> >> masking/lengthing.
> 
> Oh, yes.  At first glance, the original code is quite suspicious and your comments make sense to me.
> 
> >> Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
> >> Don't you need some CFN_LEN_EXTRACT_LAST instead?
> 
> I think CFN_EXTRACT_LAST always has either loop mask or loop len.
> 
> When both mask and length are not needed, IMHO, I think current BIT_FIELD_REF flow is good enough:
> https://godbolt.org/z/Yr5M9hcc6
> 
> So I think we don't need CFN_LEN_EXTRACT_LAST. 
> 
> Instead, I think we will need CFN_LEN_FOLD_EXTRACT_LAST in the next patch.
> 
> Feel free to correct me if I am wrong.

Richard S. should know best, but I don't think FOLD_EXTRACT_LAST is
any different from EXTRACT_LAST (the value for EXTRACT_LAST with
all zeros mask seems unspecified?).
Note the expanders are documented
as to receive 'mask' operands, not 'len' ones (and we'd miss BIAS).

As for SLP support the loop mask should have all SLP lanes
consistently masked/unmasked (same for 'len' I suppose), but we
want to extract a specific SLP lane only.  For masks I think
producing a mask that has all 'i' SLP lanes enabled and AND
that to the mask would select the proper lane for EXTRACT_LAST.
Not sure how to handle this for 'len' - I guess since 'len'
covers all SLP lanes as well we could just subtract
SLP_TREE_LANES (node) - slp_index from it?  I'll note we don't
handle ncopies > 1 which I think we could with using FOLD_EXTRACT_LAST?

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-08-09 19:00
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford
> Subject: Re: [PATCH] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Wed, 9 Aug 2023, juzhe.zhong@rivai.ai wrote:
>  
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > 
> > Hi, this patch is adding loop len control on extract_last autovectorization.
> > 
> > Consider this following case:
> > 
> > #include <stdint.h>
> > 
> > #define EXTRACT_LAST(TYPE) \
> >   TYPE __attribute__ ((noinline, noclone)) \
> >   test_##TYPE (TYPE *x, int n, TYPE value) \
> >   { \
> >     TYPE last; \
> >     for (int j = 0; j < n; ++j) \
> >       { \
> > last = x[j]; \
> > x[j] = last * value; \
> >       } \
> >     return last; \
> >   }
> > 
> > #define TEST_ALL(T) \
> >   T (uint8_t) \
> > 
> > TEST_ALL (EXTRACT_LAST)
> > 
> > ARM SVE IR:
> > 
> > Preheader:
> >   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> > 
> > Loop:
> >   ...
> >   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> >   ...
> >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> >   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> >   ...
> >   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> >   ...
> > 
> > Epilogue:
> >   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> > 
> > For RVV since we prefer len in loop control, after this patch for RVV:
> > 
> > Loop:
> >   ...
> >   loop_len_22 = SELECT_VL;
> >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> >   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> >   ...
> > 
> > Epilogue:
> >   _25 = .EXTRACT_LAST (loop_len_22, vect_last_12.8_23);
> > 
> > This patch didn't add a new pattern for length loop control of extract_last.
> > Instead we reuse current extract_last.
> > 
> > Here is the code:
> > 
> > Step 1 - Enable length and record length for extract_last:
> > 
> > +       machine_mode vec_mode = TYPE_MODE (vectype);
> > +       if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode))
> > + vect_record_loop_len (loop_vinfo,
> > +       &LOOP_VINFO_LENS (loop_vinfo), 1,
> > +       vectype, 1);
> > +       else
> > + vect_record_loop_mask (loop_vinfo,
> > +        &LOOP_VINFO_MASKS (loop_vinfo), 1,
> > +        vectype, NULL);
> > 
> > We use 'get_len_load_store_mode' to check whether targets support loop len control or not.
> > If yes, record a loop len.
> > 
> > Step 2 - Build EXTRACT_LAST with len:
> > 
> > -   tree mask = vect_get_loop_mask (loop_vinfo, gsi,
> > -   &LOOP_VINFO_MASKS (loop_vinfo),
> > -   1, vectype, 0);
> > +   tree control;
> > +   if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > +     control = vect_get_loop_len (loop_vinfo, gsi,
> > + &LOOP_VINFO_LENS (loop_vinfo), 1,
> > + vectype, 0, 0);
> > +   else
> > +     control = vect_get_loop_mask (loop_vinfo, gsi,
> > +   &LOOP_VINFO_MASKS (loop_vinfo), 1,
> > +   vectype, 0);
> >    tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > -   mask, vec_lhs_phi);
> > +   control, vec_lhs_phi);
> > 
> > Reuse the current codes (build EXTRACT_LAST with mask), build length instead if
> > 'LOOP_VINFO_FULLY_WITH_LENGTH_P' is true.
> > 
> > This patch has been fully tested in RISC-V port.
> > 
> > Bootstrap and Regression on X86 passed.
> > 
> > Ok for trunk ?
> > 
> > gcc/ChangeLog:
> > 
> >         * tree-vect-loop.cc (vectorizable_live_operation): Add length control.
> > 
> > ---
> >  gcc/tree-vect-loop.cc | 40 ++++++++++++++++++++++++++++------------
> >  1 file changed, 28 insertions(+), 12 deletions(-)
> > 
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 00058c3c13e..fde098cafde 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -10311,9 +10311,15 @@ vectorizable_live_operation (vec_info *vinfo,
> >    else
> >      {
> >        gcc_assert (ncopies == 1 && !slp_node);
> > -       vect_record_loop_mask (loop_vinfo,
> > -      &LOOP_VINFO_MASKS (loop_vinfo),
> > -      1, vectype, NULL);
> > +       machine_mode vec_mode = TYPE_MODE (vectype);
> > +       if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode))
> > + vect_record_loop_len (loop_vinfo,
> > +       &LOOP_VINFO_LENS (loop_vinfo), 1,
> > +       vectype, 1);
> > +       else
> > + vect_record_loop_mask (loop_vinfo,
> > +        &LOOP_VINFO_MASKS (loop_vinfo), 1,
> > +        vectype, NULL);
> >      }
> >  }
> >        /* ???  Enable for loop costing as well.  */
> > @@ -10339,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo,
> >    gimple *vec_stmt;
> >    if (slp_node)
> >      {
> > -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > +      gcc_assert (!loop_vinfo
> > +   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > +   || !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo));
>  
> that should be
>  
>   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 
>       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  
> I think.  It seems to imply that SLP isn't supported with 
> masking/lengthing.
>  
> >  
> >        /* Get the correct slp vectorized stmt.  */
> >        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > @@ -10383,21 +10391,29 @@ vectorizable_live_operation (vec_info *vinfo,
> >  
> >        gimple_seq stmts = NULL;
> >        tree new_tree;
> > -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > +   || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> >  {
> >    /* Emit:
> >  
> > -        SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> > +        SCALAR_RES = EXTRACT_LAST <VEC_LHS, CONTROL>
> >  
> > -      where VEC_LHS is the vectorized live-out result and MASK is
> > -      the loop mask for the final iteration.  */
> > +      where VEC_LHS is the vectorized live-out result and CONTROL can
> > +      be either the loop mask for the final iteration or the loop len
> > +      for the final iteration.  */
> >    gcc_assert (ncopies == 1 && !slp_node);
> >    tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> > -   tree mask = vect_get_loop_mask (loop_vinfo, gsi,
> > -   &LOOP_VINFO_MASKS (loop_vinfo),
> > -   1, vectype, 0);
> > +   tree control;
> > +   if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > +     control = vect_get_loop_len (loop_vinfo, gsi,
> > + &LOOP_VINFO_LENS (loop_vinfo), 1,
> > + vectype, 0, 0);
> > +   else
> > +     control = vect_get_loop_mask (loop_vinfo, gsi,
> > +   &LOOP_VINFO_MASKS (loop_vinfo), 1,
> > +   vectype, 0);
> >    tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > -   mask, vec_lhs_phi);
> > +   control, vec_lhs_phi);
>  
> Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
> Don't you need some CFN_LEN_EXTRACT_LAST instead?
>  
> >    /* Convert the extracted vector element to the scalar type.  */
> >    new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > 
>  
> Otherwise looks OK to me.
>  
> Thanks,
> Richard.
>  
>
juzhe.zhong@rivai.ai Aug. 9, 2023, 12:34 p.m. UTC | #4
Hi, Richi.
>> Note the expanders are documented
>> as to receive 'mask' operands, not 'len' ones (and we'd miss BIAS).
Oh, yes. This patch reuses the current EXTRACT_LAST and generates a loop len instead of a loop mask.
It seems this patch missed 'BIAS'. If we need 'BIAS', we may need LEN_EXTRACT_LAST pattern.

 
>> Richard S. should know best,
>> As for SLP support the loop mask should have all SLP lanes
>> consistently masked/unmasked (same for 'len' I suppose), but we
>> want to extract a specific SLP lane only.  For masks I think
>> producing a mask that has all 'i' SLP lanes enabled and AND
>> that to the mask would select the proper lane for EXTRACT_LAST.
>> Not sure how to handle this for 'len' - I guess since 'len'
>> covers all SLP lanes as well we could just subtract
>> SLP_TREE_LANES (node) - slp_index from it?  I'll note we don't
handle ncopies > 1 which I think we could with using FOLD_EXTRACT_LAST?

For SLP stuff, I am not sure.
And I agree that we need to wait for Richard S review.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-09 20:21
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford
Subject: Re: Re: [PATCH] VECT: Support loop len control on EXTRACT_LAST vectorization
On Wed, 9 Aug 2023, juzhe.zhong@rivai.ai wrote:
 
> Hi, Richi.
> 
> >> that should be
> 
> >>   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> >>       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> 
> >> I think.  It seems to imply that SLP isn't supported with
> >> masking/lengthing.
> 
> Oh, yes.  At first glance, the original code is quite suspicious and your comments make sense to me.
> 
> >> Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
> >> Don't you need some CFN_LEN_EXTRACT_LAST instead?
> 
> I think CFN_EXTRACT_LAST always has either loop mask or loop len.
> 
> When both mask and length are not needed, IMHO, I think current BIT_FIELD_REF flow is good enough:
> https://godbolt.org/z/Yr5M9hcc6
> 
> So I think we don't need CFN_LEN_EXTRACT_LAST. 
> 
> Instead, I think we will need CFN_LEN_FOLD_EXTRACT_LAST in the next patch.
> 
> Feel free to correct me if I am wrong.
 
Richard S. should know best, but I don't think FOLD_EXTRACT_LAST is
any different from EXTRACT_LAST (the value for EXTRACT_LAST with
all zeros mask seems unspecified?).
Note the expanders are documented
as to receive 'mask' operands, not 'len' ones (and we'd miss BIAS).
 
As for SLP support the loop mask should have all SLP lanes
consistently masked/unmasked (same for 'len' I suppose), but we
want to extract a specific SLP lane only.  For masks I think
producing a mask that has all 'i' SLP lanes enabled and AND
that to the mask would select the proper lane for EXTRACT_LAST.
Not sure how to handle this for 'len' - I guess since 'len'
covers all SLP lanes as well we could just subtract
SLP_TREE_LANES (node) - slp_index from it?  I'll note we don't
handle ncopies > 1 which I think we could with using FOLD_EXTRACT_LAST?
 
Richard.
 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-08-09 19:00
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford
> Subject: Re: [PATCH] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Wed, 9 Aug 2023, juzhe.zhong@rivai.ai wrote:
>  
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > 
> > Hi, this patch is adding loop len control on extract_last autovectorization.
> > 
> > Consider this following case:
> > 
> > #include <stdint.h>
> > 
> > #define EXTRACT_LAST(TYPE) \
> >   TYPE __attribute__ ((noinline, noclone)) \
> >   test_##TYPE (TYPE *x, int n, TYPE value) \
> >   { \
> >     TYPE last; \
> >     for (int j = 0; j < n; ++j) \
> >       { \
> > last = x[j]; \
> > x[j] = last * value; \
> >       } \
> >     return last; \
> >   }
> > 
> > #define TEST_ALL(T) \
> >   T (uint8_t) \
> > 
> > TEST_ALL (EXTRACT_LAST)
> > 
> > ARM SVE IR:
> > 
> > Preheader:
> >   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> > 
> > Loop:
> >   ...
> >   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> >   ...
> >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> >   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> >   ...
> >   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> >   ...
> > 
> > Epilogue:
> >   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> > 
> > For RVV since we prefer len in loop control, after this patch for RVV:
> > 
> > Loop:
> >   ...
> >   loop_len_22 = SELECT_VL;
> >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> >   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> >   ...
> > 
> > Epilogue:
> >   _25 = .EXTRACT_LAST (loop_len_22, vect_last_12.8_23);
> > 
> > This patch didn't add a new pattern for length loop control of extract_last.
> > Instead we reuse current extract_last.
> > 
> > Here is the code:
> > 
> > Step 1 - Enable length and record length for extract_last:
> > 
> > +       machine_mode vec_mode = TYPE_MODE (vectype);
> > +       if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode))
> > + vect_record_loop_len (loop_vinfo,
> > +       &LOOP_VINFO_LENS (loop_vinfo), 1,
> > +       vectype, 1);
> > +       else
> > + vect_record_loop_mask (loop_vinfo,
> > +        &LOOP_VINFO_MASKS (loop_vinfo), 1,
> > +        vectype, NULL);
> > 
> > We use 'get_len_load_store_mode' to check whether targets support loop len control or not.
> > If yes, record a loop len.
> > 
> > Step 2 - Build EXTRACT_LAST with len:
> > 
> > -   tree mask = vect_get_loop_mask (loop_vinfo, gsi,
> > -   &LOOP_VINFO_MASKS (loop_vinfo),
> > -   1, vectype, 0);
> > +   tree control;
> > +   if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > +     control = vect_get_loop_len (loop_vinfo, gsi,
> > + &LOOP_VINFO_LENS (loop_vinfo), 1,
> > + vectype, 0, 0);
> > +   else
> > +     control = vect_get_loop_mask (loop_vinfo, gsi,
> > +   &LOOP_VINFO_MASKS (loop_vinfo), 1,
> > +   vectype, 0);
> >    tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > -   mask, vec_lhs_phi);
> > +   control, vec_lhs_phi);
> > 
> > Reuse the current codes (build EXTRACT_LAST with mask), build length instead if
> > 'LOOP_VINFO_FULLY_WITH_LENGTH_P' is true.
> > 
> > This patch has been fully tested in RISC-V port.
> > 
> > Bootstrap and Regression on X86 passed.
> > 
> > Ok for trunk ?
> > 
> > gcc/ChangeLog:
> > 
> >         * tree-vect-loop.cc (vectorizable_live_operation): Add length control.
> > 
> > ---
> >  gcc/tree-vect-loop.cc | 40 ++++++++++++++++++++++++++++------------
> >  1 file changed, 28 insertions(+), 12 deletions(-)
> > 
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 00058c3c13e..fde098cafde 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -10311,9 +10311,15 @@ vectorizable_live_operation (vec_info *vinfo,
> >    else
> >      {
> >        gcc_assert (ncopies == 1 && !slp_node);
> > -       vect_record_loop_mask (loop_vinfo,
> > -      &LOOP_VINFO_MASKS (loop_vinfo),
> > -      1, vectype, NULL);
> > +       machine_mode vec_mode = TYPE_MODE (vectype);
> > +       if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode))
> > + vect_record_loop_len (loop_vinfo,
> > +       &LOOP_VINFO_LENS (loop_vinfo), 1,
> > +       vectype, 1);
> > +       else
> > + vect_record_loop_mask (loop_vinfo,
> > +        &LOOP_VINFO_MASKS (loop_vinfo), 1,
> > +        vectype, NULL);
> >      }
> >  }
> >        /* ???  Enable for loop costing as well.  */
> > @@ -10339,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo,
> >    gimple *vec_stmt;
> >    if (slp_node)
> >      {
> > -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > +      gcc_assert (!loop_vinfo
> > +   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > +   || !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo));
>  
> that should be
>  
>   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 
>       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  
> I think.  It seems to imply that SLP isn't supported with 
> masking/lengthing.
>  
> >  
> >        /* Get the correct slp vectorized stmt.  */
> >        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > @@ -10383,21 +10391,29 @@ vectorizable_live_operation (vec_info *vinfo,
> >  
> >        gimple_seq stmts = NULL;
> >        tree new_tree;
> > -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > +   || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> >  {
> >    /* Emit:
> >  
> > -        SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> > +        SCALAR_RES = EXTRACT_LAST <VEC_LHS, CONTROL>
> >  
> > -      where VEC_LHS is the vectorized live-out result and MASK is
> > -      the loop mask for the final iteration.  */
> > +      where VEC_LHS is the vectorized live-out result and CONTROL can
> > +      be either the loop mask for the final iteration or the loop len
> > +      for the final iteration.  */
> >    gcc_assert (ncopies == 1 && !slp_node);
> >    tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> > -   tree mask = vect_get_loop_mask (loop_vinfo, gsi,
> > -   &LOOP_VINFO_MASKS (loop_vinfo),
> > -   1, vectype, 0);
> > +   tree control;
> > +   if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > +     control = vect_get_loop_len (loop_vinfo, gsi,
> > + &LOOP_VINFO_LENS (loop_vinfo), 1,
> > + vectype, 0, 0);
> > +   else
> > +     control = vect_get_loop_mask (loop_vinfo, gsi,
> > +   &LOOP_VINFO_MASKS (loop_vinfo), 1,
> > +   vectype, 0);
> >    tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > -   mask, vec_lhs_phi);
> > +   control, vec_lhs_phi);
>  
> Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
> Don't you need some CFN_LEN_EXTRACT_LAST instead?
>  
> >    /* Convert the extracted vector element to the scalar type.  */
> >    new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > 
>  
> Otherwise looks OK to me.
>  
> Thanks,
> Richard.
>  
>
Richard Sandiford Aug. 9, 2023, 1:30 p.m. UTC | #5
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Hi, Richi.
>
>>> that should be
>
>>>   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
>>>       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>
>>> I think.  It seems to imply that SLP isn't supported with
>>> masking/lengthing.
>
> Oh, yes.  At first glance, the original code is quite suspicious and your comments make sense to me.
>
>>> Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
>>> Don't you need some CFN_LEN_EXTRACT_LAST instead?
>
> I think CFN_EXTRACT_LAST always has either loop mask or loop len.
>
> When both mask and length are not needed, IMHO, I think current BIT_FIELD_REF flow is good enough:
> https://godbolt.org/z/Yr5M9hcc6

I'm a bit behind of email, but why isn't BIT_FIELD_REF enough for
the case that the patch is handling?  It seems that:

  .EXTRACT_LAST (len, vec)

is equivalent to:

  vec[len - 1]

I think eventually there'll be the temptation to lower/fold it like that.

FWIW, I agree a IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN would be OK,
with a mask, vector, length and bias.  But even then, I think there'll
be a temptation to lower calls with all-1 masks to val[len - 1 - bias].
So I think the function only makes sense if we have a use case where
the mask might not be all-1s.

Thanks,
Richard
juzhe.zhong@rivai.ai Aug. 9, 2023, 2:11 p.m. UTC | #6
Hi, Richard.

>> I'm a bit behind of email, but why isn't BIT_FIELD_REF enough for
>> the case that the patch is handling?  It seems that:

>>   .EXTRACT_LAST (len, vec)

>> is equivalent to:

>>   vec[len - 1]

>> I think eventually there'll be the temptation to lower/fold it like that.

Current BIT_FIELD_REF doesn't make use of LOOP_LEN.

Consider this following case:

#include <stdint.h>
#define EXTRACT_LAST(TYPE)			\
  TYPE __attribute__ ((noinline, noclone))	\
  test_##TYPE (TYPE *x, int n, TYPE value)	\
  {						\
    TYPE last;					\
    for (int j = 0; j < n; ++j)			\
      {						\
	last = x[j];				\
	x[j] = last * value;			\
      }						\
    return last;				\
  }
#define TEST_ALL(T)				\
  T (uint8_t)					\
TEST_ALL (EXTRACT_LAST)

The assembly:
https://godbolt.org/z/z1PPT948b

test_uint8_t:
        mv      a3,a0
        ble     a1,zero,.L10
        addiw   a5,a1,-1
        li      a4,14
        sext.w  a0,a1
        bleu    a5,a4,.L11
        srliw   a4,a0,4
        slli    a4,a4,4
        mv      a5,a3
        add     a4,a4,a3
        vsetivli        zero,16,e8,m1,ta,ma
        vmv.v.x v3,a2
.L4:
        vl1re8.v        v1,0(a5)
        vmul.vv v2,v1,v3
        vs1r.v  v2,0(a5)
        addi    a5,a5,16
        bne     a4,a5,.L4
        andi    a4,a1,-16
        mv      a5,a4
        vsetivli        zero,8,e8,mf2,ta,ma
        beq     a0,a4,.L16
.L3:
        subw    a0,a0,a4
        addiw   a7,a0,-1
        li      a6,6
        bleu    a7,a6,.L7
        slli    a4,a4,32
        srli    a4,a4,32
        add     a4,a3,a4
        andi    a6,a0,-8
        vle8.v  v2,0(a4)
        vmv.v.x v1,a2
        andi    a0,a0,7
        vmul.vv v1,v1,v2
        vse8.v  v1,0(a4)
        addw    a5,a6,a5
        beq     a0,zero,.L8
.L7:
        add     a6,a3,a5
        lbu     a0,0(a6)
        addiw   a4,a5,1
        mulw    a7,a0,a2
        sb      a7,0(a6)
        bge     a4,a1,.L14
        add     a4,a3,a4
        lbu     a0,0(a4)
        addiw   a6,a5,2
        mulw    a7,a2,a0
        sb      a7,0(a4)
        ble     a1,a6,.L14
        add     a6,a3,a6
        lbu     a0,0(a6)
        addiw   a4,a5,3
        mulw    a7,a2,a0
        sb      a7,0(a6)
        ble     a1,a4,.L14
        add     a4,a3,a4
        lbu     a0,0(a4)
        addiw   a6,a5,4
        mulw    a7,a2,a0
        sb      a7,0(a4)
        ble     a1,a6,.L14
        add     a6,a3,a6
        lbu     a0,0(a6)
        addiw   a4,a5,5
        mulw    a7,a2,a0
        sb      a7,0(a6)
        ble     a1,a4,.L14
        add     a4,a3,a4
        lbu     a0,0(a4)
        addiw   a5,a5,6
        mulw    a6,a2,a0
        sb      a6,0(a4)
        ble     a1,a5,.L14
        add     a3,a3,a5
        lbu     a0,0(a3)
        mulw    a2,a2,a0
        sb      a2,0(a3)
        ret
.L10:
        li      a0,0
.L14:
        ret
.L8:
        vslidedown.vi   v2,v2,7
        vmv.x.s a0,v2
        andi    a0,a0,0xff
        ret
.L11:
        li      a4,0
        li      a5,0
        vsetivli        zero,8,e8,mf2,ta,ma
        j       .L3
.L16:
        vsetivli        zero,16,e8,m1,ta,ma
        vslidedown.vi   v1,v1,15
        vmv.x.s a0,v1
        andi    a0,a0,0xff
        ret


This patch is trying to optimize the codegen for RVV for length control,
after this patch:

Gimple IR:

  <bb 4> [local count: 955630224]:
  # vectp_x.6_22 = PHI <vectp_x.6_23(4), x_11(D)(3)>
  # vectp_x.10_30 = PHI <vectp_x.10_31(4), x_11(D)(3)>
  # ivtmp_34 = PHI <ivtmp_35(4), _33(3)>
  _36 = .SELECT_VL (ivtmp_34, 16);
  vect_last_12.8_24 = .MASK_LEN_LOAD (vectp_x.6_22, 8B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, _36, 0);
  vect__4.9_28 = vect_last_12.8_24 * vect_cst__27;
  .MASK_LEN_STORE (vectp_x.10_30, 8B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, _36, 0, vect__4.9_28);
  vectp_x.6_23 = vectp_x.6_22 + _36;
  vectp_x.10_31 = vectp_x.10_30 + _36;
  ivtmp_35 = ivtmp_34 - _36;
  if (ivtmp_35 != 0)
    goto <bb 4>; [89.00%]
  else
    goto <bb 5>; [11.00%]

  <bb 5> [local count: 105119324]:
  _26 = .EXTRACT_LAST (_36, vect_last_12.8_24); [tail call]

ASM:
test_uint8_t:
ble a1,zero,.L4
mv a4,a0
vsetivli zero,16,e8,m1,ta,ma
vmv.v.x v3,a2
.L3:
vsetvli a5,a1,e8,m1,ta,ma
vle8.v v1,0(a0)
vsetivli zero,16,e8,m1,ta,ma
sub a1,a1,a5
vmul.vv v2,v1,v3
vsetvli zero,a5,e8,m1,ta,ma
vse8.v v2,0(a4)
add a0,a0,a5
add a4,a4,a5
bne a1,zero,.L3
addi a5,a5,-1
vsetivli zero,16,e8,m1,ta,ma
vslidedown.vx v1,v1,a5
vmv.x.s a0,v1
andi a0,a0,0xff
ret
.L4:
li a0,0
ret

I think this codegen is much better with this patch.

>> FWIW, I agree a IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN would be OK,
>> with a mask, vector, length and bias.  But even then, I think there'll
>> be a temptation to lower calls with all-1 masks to val[len - 1 - bias].
>> So I think the function only makes sense if we have a use case where
>> the mask might not be all-1s.

I am wondering that, instead of adding IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN.
Does it make sense I use VEC_EXTRACT as follows :?

Loop:
loop_len_22 = SELECT_VL.
....

Epilogue:
new_loop_len_22 = PHI <loop_len_22>
scalar_res = VEC_EXTRACT (v, new_loop_len_22  - 1 - BIAS)

Feel free to correct me if I am wrong.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-08-09 21:30
To: juzhe.zhong\@rivai.ai
CC: rguenther; gcc-patches
Subject: Re: [PATCH] VECT: Support loop len control on EXTRACT_LAST vectorization
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Hi, Richi.
>
>>> that should be
>
>>>   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
>>>       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>
>>> I think.  It seems to imply that SLP isn't supported with
>>> masking/lengthing.
>
> Oh, yes.  At first glance, the original code is quite suspicious and your comments make sense to me.
>
>>> Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
>>> Don't you need some CFN_LEN_EXTRACT_LAST instead?
>
> I think CFN_EXTRACT_LAST always has either loop mask or loop len.
>
> When both mask and length are not needed, IMHO, I think current BIT_FIELD_REF flow is good enough:
> https://godbolt.org/z/Yr5M9hcc6
 
I'm a bit behind of email, but why isn't BIT_FIELD_REF enough for
the case that the patch is handling?  It seems that:
 
  .EXTRACT_LAST (len, vec)
 
is equivalent to:
 
  vec[len - 1]
 
I think eventually there'll be the temptation to lower/fold it like that.
 
FWIW, I agree a IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN would be OK,
with a mask, vector, length and bias.  But even then, I think there'll
be a temptation to lower calls with all-1 masks to val[len - 1 - bias].
So I think the function only makes sense if we have a use case where
the mask might not be all-1s.
 
Thanks,
Richard
Richard Biener Aug. 9, 2023, 2:17 p.m. UTC | #7
On Wed, 9 Aug 2023, ??? wrote:

> Hi, Richard.
> 
> >> I'm a bit behind of email, but why isn't BIT_FIELD_REF enough for
> >> the case that the patch is handling?  It seems that:
> 
> >>   .EXTRACT_LAST (len, vec)
> 
> >> is equivalent to:
> 
> >>   vec[len - 1]
> 
> >> I think eventually there'll be the temptation to lower/fold it like that.
> 
> Current BIT_FIELD_REF doesn't make use of LOOP_LEN.

Yes, BIT_FIELD_REF doesn't support variable offset.

> Consider this following case:
> 
> #include <stdint.h>
> #define EXTRACT_LAST(TYPE)			\
>   TYPE __attribute__ ((noinline, noclone))	\
>   test_##TYPE (TYPE *x, int n, TYPE value)	\
>   {						\
>     TYPE last;					\
>     for (int j = 0; j < n; ++j)			\
>       {						\
> 	last = x[j];				\
> 	x[j] = last * value;			\
>       }						\
>     return last;				\
>   }
> #define TEST_ALL(T)				\
>   T (uint8_t)					\
> TEST_ALL (EXTRACT_LAST)
> 
> The assembly:
> https://godbolt.org/z/z1PPT948b
> 
> test_uint8_t:
>         mv      a3,a0
>         ble     a1,zero,.L10
>         addiw   a5,a1,-1
>         li      a4,14
>         sext.w  a0,a1
>         bleu    a5,a4,.L11
>         srliw   a4,a0,4
>         slli    a4,a4,4
>         mv      a5,a3
>         add     a4,a4,a3
>         vsetivli        zero,16,e8,m1,ta,ma
>         vmv.v.x v3,a2
> .L4:
>         vl1re8.v        v1,0(a5)
>         vmul.vv v2,v1,v3
>         vs1r.v  v2,0(a5)
>         addi    a5,a5,16
>         bne     a4,a5,.L4
>         andi    a4,a1,-16
>         mv      a5,a4
>         vsetivli        zero,8,e8,mf2,ta,ma
>         beq     a0,a4,.L16
> .L3:
>         subw    a0,a0,a4
>         addiw   a7,a0,-1
>         li      a6,6
>         bleu    a7,a6,.L7
>         slli    a4,a4,32
>         srli    a4,a4,32
>         add     a4,a3,a4
>         andi    a6,a0,-8
>         vle8.v  v2,0(a4)
>         vmv.v.x v1,a2
>         andi    a0,a0,7
>         vmul.vv v1,v1,v2
>         vse8.v  v1,0(a4)
>         addw    a5,a6,a5
>         beq     a0,zero,.L8
> .L7:
>         add     a6,a3,a5
>         lbu     a0,0(a6)
>         addiw   a4,a5,1
>         mulw    a7,a0,a2
>         sb      a7,0(a6)
>         bge     a4,a1,.L14
>         add     a4,a3,a4
>         lbu     a0,0(a4)
>         addiw   a6,a5,2
>         mulw    a7,a2,a0
>         sb      a7,0(a4)
>         ble     a1,a6,.L14
>         add     a6,a3,a6
>         lbu     a0,0(a6)
>         addiw   a4,a5,3
>         mulw    a7,a2,a0
>         sb      a7,0(a6)
>         ble     a1,a4,.L14
>         add     a4,a3,a4
>         lbu     a0,0(a4)
>         addiw   a6,a5,4
>         mulw    a7,a2,a0
>         sb      a7,0(a4)
>         ble     a1,a6,.L14
>         add     a6,a3,a6
>         lbu     a0,0(a6)
>         addiw   a4,a5,5
>         mulw    a7,a2,a0
>         sb      a7,0(a6)
>         ble     a1,a4,.L14
>         add     a4,a3,a4
>         lbu     a0,0(a4)
>         addiw   a5,a5,6
>         mulw    a6,a2,a0
>         sb      a6,0(a4)
>         ble     a1,a5,.L14
>         add     a3,a3,a5
>         lbu     a0,0(a3)
>         mulw    a2,a2,a0
>         sb      a2,0(a3)
>         ret
> .L10:
>         li      a0,0
> .L14:
>         ret
> .L8:
>         vslidedown.vi   v2,v2,7
>         vmv.x.s a0,v2
>         andi    a0,a0,0xff
>         ret
> .L11:
>         li      a4,0
>         li      a5,0
>         vsetivli        zero,8,e8,mf2,ta,ma
>         j       .L3
> .L16:
>         vsetivli        zero,16,e8,m1,ta,ma
>         vslidedown.vi   v1,v1,15
>         vmv.x.s a0,v1
>         andi    a0,a0,0xff
>         ret
> 
> 
> This patch is trying to optimize the codegen for RVV for length control,
> after this patch:
> 
> Gimple IR:
> 
>   <bb 4> [local count: 955630224]:
>   # vectp_x.6_22 = PHI <vectp_x.6_23(4), x_11(D)(3)>
>   # vectp_x.10_30 = PHI <vectp_x.10_31(4), x_11(D)(3)>
>   # ivtmp_34 = PHI <ivtmp_35(4), _33(3)>
>   _36 = .SELECT_VL (ivtmp_34, 16);
>   vect_last_12.8_24 = .MASK_LEN_LOAD (vectp_x.6_22, 8B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, _36, 0);
>   vect__4.9_28 = vect_last_12.8_24 * vect_cst__27;
>   .MASK_LEN_STORE (vectp_x.10_30, 8B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, _36, 0, vect__4.9_28);
>   vectp_x.6_23 = vectp_x.6_22 + _36;
>   vectp_x.10_31 = vectp_x.10_30 + _36;
>   ivtmp_35 = ivtmp_34 - _36;
>   if (ivtmp_35 != 0)
>     goto <bb 4>; [89.00%]
>   else
>     goto <bb 5>; [11.00%]
> 
>   <bb 5> [local count: 105119324]:
>   _26 = .EXTRACT_LAST (_36, vect_last_12.8_24); [tail call]
> 
> ASM:
> test_uint8_t:
> ble a1,zero,.L4
> mv a4,a0
> vsetivli zero,16,e8,m1,ta,ma
> vmv.v.x v3,a2
> .L3:
> vsetvli a5,a1,e8,m1,ta,ma
> vle8.v v1,0(a0)
> vsetivli zero,16,e8,m1,ta,ma
> sub a1,a1,a5
> vmul.vv v2,v1,v3
> vsetvli zero,a5,e8,m1,ta,ma
> vse8.v v2,0(a4)
> add a0,a0,a5
> add a4,a4,a5
> bne a1,zero,.L3
> addi a5,a5,-1
> vsetivli zero,16,e8,m1,ta,ma
> vslidedown.vx v1,v1,a5
> vmv.x.s a0,v1
> andi a0,a0,0xff
> ret
> .L4:
> li a0,0
> ret
> 
> I think this codegen is much better with this patch.
> 
> >> FWIW, I agree a IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN would be OK,
> >> with a mask, vector, length and bias.  But even then, I think there'll
> >> be a temptation to lower calls with all-1 masks to val[len - 1 - bias].
> >> So I think the function only makes sense if we have a use case where
> >> the mask might not be all-1s.
> 
> I am wondering that, instead of adding IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN.
> Does it make sense I use VEC_EXTRACT as follows :?
> 
> Loop:
> loop_len_22 = SELECT_VL.
> ....
> 
> Epilogue:
> new_loop_len_22 = PHI <loop_len_22>
> scalar_res = VEC_EXTRACT (v, new_loop_len_22  - 1 - BIAS)

Yes, I think VEC_EXTRACT is suitable for this.

Richard.

> Feel free to correct me if I am wrong.
> 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Sandiford
> Date: 2023-08-09 21:30
> To: juzhe.zhong\@rivai.ai
> CC: rguenther; gcc-patches
> Subject: Re: [PATCH] VECT: Support loop len control on EXTRACT_LAST vectorization
> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> > Hi, Richi.
> >
> >>> that should be
> >
> >>>   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> >>>       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> >
> >>> I think.  It seems to imply that SLP isn't supported with
> >>> masking/lengthing.
> >
> > Oh, yes.  At first glance, the original code is quite suspicious and your comments make sense to me.
> >
> >>> Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
> >>> Don't you need some CFN_LEN_EXTRACT_LAST instead?
> >
> > I think CFN_EXTRACT_LAST always has either loop mask or loop len.
> >
> > When both mask and length are not needed, IMHO, I think current BIT_FIELD_REF flow is good enough:
> > https://godbolt.org/z/Yr5M9hcc6
>  
> I'm a bit behind of email, but why isn't BIT_FIELD_REF enough for
> the case that the patch is handling?  It seems that:
>  
>   .EXTRACT_LAST (len, vec)
>  
> is equivalent to:
>  
>   vec[len - 1]
>  
> I think eventually there'll be the temptation to lower/fold it like that.
>  
> FWIW, I agree a IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN would be OK,
> with a mask, vector, length and bias.  But even then, I think there'll
> be a temptation to lower calls with all-1 masks to val[len - 1 - bias].
> So I think the function only makes sense if we have a use case where
> the mask might not be all-1s.
>  
> Thanks,
> Richard
>  
>
juzhe.zhong@rivai.ai Aug. 9, 2023, 2:35 p.m. UTC | #8
Hi, Richard.

>> Yes, I think VEC_EXTRACT is suitable for this.

Thanks for suggestion.

Actually I was trying to use VEC_EXTRACT yesterday but it ICE since GCC failed to create LCSSA PHI for VEC_EXTRACT.

For example, like ARM SVE:
https://godbolt.org/z/rfbb4rfKv 

vect dump IR:
;; Created LCSSA PHI: loop_mask_36 = PHI <loop_mask_22(3)>
  <bb 8> [local count: 105119324]:
  # vect_last_12.8_24 = PHI <vect_last_12.8_23(3)>
  # loop_mask_36 = PHI <loop_mask_22(3)>
  _25 = .EXTRACT_LAST (loop_mask_36, vect_last_12.8_24);

Then it can work.

I was trying to do the similar thing but with VEC_EXTRACT as follows for RVV:

...
loop_len_36 = SELECT_VL
....
# vect_last_12.8_24 = PHI <vect_last_12.8_23(3)>
// Missed a LCSSA PHI.
_25 = .VEC_EXTRACT (loop_len_36, vect_last_12.8_24);

This Gimple IR will cause an ICE.

When I use EXTRACT_LAST instead of VEC_EXTRACT, then:
...
loop_len_36 = SELECT_VL
....
# vect_last_12.8_24 = PHI <vect_last_12.8_23(3)>
# loop_len_22 = PHI <loop_len_36 (3)>
_25 = .VEC_EXTRACT (loop_len_22, vect_last_12.8_24);

Then it works.

I didn't figure out where to make GCC recognize VEC_EXTRACT to generate LCSSA PHI for VEC_EXTRACT.

Could you give me some help for this?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-09 22:17
To: 钟居哲
CC: richard.sandiford; gcc-patches
Subject: Re: Re: [PATCH] VECT: Support loop len control on EXTRACT_LAST vectorization
On Wed, 9 Aug 2023, ??? wrote:
 
> Hi, Richard.
> 
> >> I'm a bit behind of email, but why isn't BIT_FIELD_REF enough for
> >> the case that the patch is handling?  It seems that:
> 
> >>   .EXTRACT_LAST (len, vec)
> 
> >> is equivalent to:
> 
> >>   vec[len - 1]
> 
> >> I think eventually there'll be the temptation to lower/fold it like that.
> 
> Current BIT_FIELD_REF doesn't make use of LOOP_LEN.
 
Yes, BIT_FIELD_REF doesn't support variable offset.
 
> Consider this following case:
> 
> #include <stdint.h>
> #define EXTRACT_LAST(TYPE) \
>   TYPE __attribute__ ((noinline, noclone)) \
>   test_##TYPE (TYPE *x, int n, TYPE value) \
>   { \
>     TYPE last; \
>     for (int j = 0; j < n; ++j) \
>       { \
> last = x[j]; \
> x[j] = last * value; \
>       } \
>     return last; \
>   }
> #define TEST_ALL(T) \
>   T (uint8_t) \
> TEST_ALL (EXTRACT_LAST)
> 
> The assembly:
> https://godbolt.org/z/z1PPT948b
> 
> test_uint8_t:
>         mv      a3,a0
>         ble     a1,zero,.L10
>         addiw   a5,a1,-1
>         li      a4,14
>         sext.w  a0,a1
>         bleu    a5,a4,.L11
>         srliw   a4,a0,4
>         slli    a4,a4,4
>         mv      a5,a3
>         add     a4,a4,a3
>         vsetivli        zero,16,e8,m1,ta,ma
>         vmv.v.x v3,a2
> .L4:
>         vl1re8.v        v1,0(a5)
>         vmul.vv v2,v1,v3
>         vs1r.v  v2,0(a5)
>         addi    a5,a5,16
>         bne     a4,a5,.L4
>         andi    a4,a1,-16
>         mv      a5,a4
>         vsetivli        zero,8,e8,mf2,ta,ma
>         beq     a0,a4,.L16
> .L3:
>         subw    a0,a0,a4
>         addiw   a7,a0,-1
>         li      a6,6
>         bleu    a7,a6,.L7
>         slli    a4,a4,32
>         srli    a4,a4,32
>         add     a4,a3,a4
>         andi    a6,a0,-8
>         vle8.v  v2,0(a4)
>         vmv.v.x v1,a2
>         andi    a0,a0,7
>         vmul.vv v1,v1,v2
>         vse8.v  v1,0(a4)
>         addw    a5,a6,a5
>         beq     a0,zero,.L8
> .L7:
>         add     a6,a3,a5
>         lbu     a0,0(a6)
>         addiw   a4,a5,1
>         mulw    a7,a0,a2
>         sb      a7,0(a6)
>         bge     a4,a1,.L14
>         add     a4,a3,a4
>         lbu     a0,0(a4)
>         addiw   a6,a5,2
>         mulw    a7,a2,a0
>         sb      a7,0(a4)
>         ble     a1,a6,.L14
>         add     a6,a3,a6
>         lbu     a0,0(a6)
>         addiw   a4,a5,3
>         mulw    a7,a2,a0
>         sb      a7,0(a6)
>         ble     a1,a4,.L14
>         add     a4,a3,a4
>         lbu     a0,0(a4)
>         addiw   a6,a5,4
>         mulw    a7,a2,a0
>         sb      a7,0(a4)
>         ble     a1,a6,.L14
>         add     a6,a3,a6
>         lbu     a0,0(a6)
>         addiw   a4,a5,5
>         mulw    a7,a2,a0
>         sb      a7,0(a6)
>         ble     a1,a4,.L14
>         add     a4,a3,a4
>         lbu     a0,0(a4)
>         addiw   a5,a5,6
>         mulw    a6,a2,a0
>         sb      a6,0(a4)
>         ble     a1,a5,.L14
>         add     a3,a3,a5
>         lbu     a0,0(a3)
>         mulw    a2,a2,a0
>         sb      a2,0(a3)
>         ret
> .L10:
>         li      a0,0
> .L14:
>         ret
> .L8:
>         vslidedown.vi   v2,v2,7
>         vmv.x.s a0,v2
>         andi    a0,a0,0xff
>         ret
> .L11:
>         li      a4,0
>         li      a5,0
>         vsetivli        zero,8,e8,mf2,ta,ma
>         j       .L3
> .L16:
>         vsetivli        zero,16,e8,m1,ta,ma
>         vslidedown.vi   v1,v1,15
>         vmv.x.s a0,v1
>         andi    a0,a0,0xff
>         ret
> 
> 
> This patch is trying to optimize the codegen for RVV for length control,
> after this patch:
> 
> Gimple IR:
> 
>   <bb 4> [local count: 955630224]:
>   # vectp_x.6_22 = PHI <vectp_x.6_23(4), x_11(D)(3)>
>   # vectp_x.10_30 = PHI <vectp_x.10_31(4), x_11(D)(3)>
>   # ivtmp_34 = PHI <ivtmp_35(4), _33(3)>
>   _36 = .SELECT_VL (ivtmp_34, 16);
>   vect_last_12.8_24 = .MASK_LEN_LOAD (vectp_x.6_22, 8B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, _36, 0);
>   vect__4.9_28 = vect_last_12.8_24 * vect_cst__27;
>   .MASK_LEN_STORE (vectp_x.10_30, 8B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, _36, 0, vect__4.9_28);
>   vectp_x.6_23 = vectp_x.6_22 + _36;
>   vectp_x.10_31 = vectp_x.10_30 + _36;
>   ivtmp_35 = ivtmp_34 - _36;
>   if (ivtmp_35 != 0)
>     goto <bb 4>; [89.00%]
>   else
>     goto <bb 5>; [11.00%]
> 
>   <bb 5> [local count: 105119324]:
>   _26 = .EXTRACT_LAST (_36, vect_last_12.8_24); [tail call]
> 
> ASM:
> test_uint8_t:
> ble a1,zero,.L4
> mv a4,a0
> vsetivli zero,16,e8,m1,ta,ma
> vmv.v.x v3,a2
> .L3:
> vsetvli a5,a1,e8,m1,ta,ma
> vle8.v v1,0(a0)
> vsetivli zero,16,e8,m1,ta,ma
> sub a1,a1,a5
> vmul.vv v2,v1,v3
> vsetvli zero,a5,e8,m1,ta,ma
> vse8.v v2,0(a4)
> add a0,a0,a5
> add a4,a4,a5
> bne a1,zero,.L3
> addi a5,a5,-1
> vsetivli zero,16,e8,m1,ta,ma
> vslidedown.vx v1,v1,a5
> vmv.x.s a0,v1
> andi a0,a0,0xff
> ret
> .L4:
> li a0,0
> ret
> 
> I think this codegen is much better with this patch.
> 
> >> FWIW, I agree a IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN would be OK,
> >> with a mask, vector, length and bias.  But even then, I think there'll
> >> be a temptation to lower calls with all-1 masks to val[len - 1 - bias].
> >> So I think the function only makes sense if we have a use case where
> >> the mask might not be all-1s.
> 
> I am wondering that, instead of adding IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN.
> Does it make sense I use VEC_EXTRACT as follows :?
> 
> Loop:
> loop_len_22 = SELECT_VL.
> ....
> 
> Epilogue:
> new_loop_len_22 = PHI <loop_len_22>
> scalar_res = VEC_EXTRACT (v, new_loop_len_22  - 1 - BIAS)
 
Yes, I think VEC_EXTRACT is suitable for this.
 
Richard.
 
> Feel free to correct me if I am wrong.
> 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Sandiford
> Date: 2023-08-09 21:30
> To: juzhe.zhong\@rivai.ai
> CC: rguenther; gcc-patches
> Subject: Re: [PATCH] VECT: Support loop len control on EXTRACT_LAST vectorization
> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> > Hi, Richi.
> >
> >>> that should be
> >
> >>>   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> >>>       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> >
> >>> I think.  It seems to imply that SLP isn't supported with
> >>> masking/lengthing.
> >
> > Oh, yes.  At first glance, the original code is quite suspicious and your comments make sense to me.
> >
> >>> Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
> >>> Don't you need some CFN_LEN_EXTRACT_LAST instead?
> >
> > I think CFN_EXTRACT_LAST always has either loop mask or loop len.
> >
> > When both mask and length are not needed, IMHO, I think current BIT_FIELD_REF flow is good enough:
> > https://godbolt.org/z/Yr5M9hcc6
>  
> I'm a bit behind of email, but why isn't BIT_FIELD_REF enough for
> the case that the patch is handling?  It seems that:
>  
>   .EXTRACT_LAST (len, vec)
>  
> is equivalent to:
>  
>   vec[len - 1]
>  
> I think eventually there'll be the temptation to lower/fold it like that.
>  
> FWIW, I agree a IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN would be OK,
> with a mask, vector, length and bias.  But even then, I think there'll
> be a temptation to lower calls with all-1 masks to val[len - 1 - bias].
> So I think the function only makes sense if we have a use case where
> the mask might not be all-1s.
>  
> Thanks,
> Richard
>  
>
diff mbox series

Patch

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 00058c3c13e..fde098cafde 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10311,9 +10311,15 @@  vectorizable_live_operation (vec_info *vinfo,
 	  else
 	    {
 	      gcc_assert (ncopies == 1 && !slp_node);
-	      vect_record_loop_mask (loop_vinfo,
-				     &LOOP_VINFO_MASKS (loop_vinfo),
-				     1, vectype, NULL);
+	      machine_mode vec_mode = TYPE_MODE (vectype);
+	      if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode))
+		vect_record_loop_len (loop_vinfo,
+				      &LOOP_VINFO_LENS (loop_vinfo), 1,
+				      vectype, 1);
+	      else
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo), 1,
+				       vectype, NULL);
 	    }
 	}
       /* ???  Enable for loop costing as well.  */
@@ -10339,7 +10345,9 @@  vectorizable_live_operation (vec_info *vinfo,
   gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo
+		  || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+		  || !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo));
 
       /* Get the correct slp vectorized stmt.  */
       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10383,21 +10391,29 @@  vectorizable_live_operation (vec_info *vinfo,
 
       gimple_seq stmts = NULL;
       tree new_tree;
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
 	{
 	  /* Emit:
 
-	       SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+	       SCALAR_RES = EXTRACT_LAST <VEC_LHS, CONTROL>
 
-	     where VEC_LHS is the vectorized live-out result and MASK is
-	     the loop mask for the final iteration.  */
+	     where VEC_LHS is the vectorized live-out result and CONTROL can
+	     be either the loop mask for the final iteration or the loop len
+	     for the final iteration.  */
 	  gcc_assert (ncopies == 1 && !slp_node);
 	  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
-	  tree mask = vect_get_loop_mask (loop_vinfo, gsi,
-					  &LOOP_VINFO_MASKS (loop_vinfo),
-					  1, vectype, 0);
+	  tree control;
+	  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	    control = vect_get_loop_len (loop_vinfo, gsi,
+					 &LOOP_VINFO_LENS (loop_vinfo), 1,
+					 vectype, 0, 0);
+	  else
+	    control = vect_get_loop_mask (loop_vinfo, gsi,
+					  &LOOP_VINFO_MASKS (loop_vinfo), 1,
+					  vectype, 0);
 	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-					  mask, vec_lhs_phi);
+					  control, vec_lhs_phi);
 
 	  /* Convert the extracted vector element to the scalar type.  */
 	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);