diff mbox series

[V12] VECT: Add decrement IV iteration loop control by variable amount support

Message ID 20230522083814.1647787-1-juzhe.zhong@rivai.ai
State New
Headers show
Series [V12] VECT: Add decrement IV iteration loop control by variable amount support | expand

Commit Message

juzhe.zhong@rivai.ai May 22, 2023, 8:38 a.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

gcc/ChangeLog:

        * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function.
        (vect_set_loop_controls_directly): Add decrement IV support.
        (vect_set_loop_condition_partial_vectors): Ditto.
        * tree-vect-loop.cc: Ditto.
        * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.

---
 gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++-
 gcc/tree-vect-loop.cc       |  10 ++
 gcc/tree-vectorizer.h       |   8 ++
 3 files changed, 199 insertions(+), 3 deletions(-)

Comments

juzhe.zhong@rivai.ai May 23, 2023, 12:32 p.m. UTC | #1
Bootstrap on X86 passed.
Ok for trunk?

Thanks.


juzhe.zhong@rivai.ai
 
From: juzhe.zhong
Date: 2023-05-22 16:38
To: gcc-patches
CC: richard.sandiford; rguenther; Ju-Zhe Zhong
Subject: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
 
gcc/ChangeLog:
 
        * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function.
        (vect_set_loop_controls_directly): Add decrement IV support.
        (vect_set_loop_condition_partial_vectors): Ditto.
        * tree-vect-loop.cc: Ditto.
        * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.
 
---
gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++-
gcc/tree-vect-loop.cc       |  10 ++
gcc/tree-vectorizer.h       |   8 ++
3 files changed, 199 insertions(+), 3 deletions(-)
 
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index ff6159e08d5..94b38d1e0fb 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -385,6 +385,66 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
   return false;
}
+/* Try to use adjust loop lens for non-SLP multiple-rgroups.
+
+     _36 = MIN_EXPR <ivtmp_34, VF>;
+
+     First length (MIN (X, VF/N)):
+       loop_len_15 = MIN_EXPR <_36, VF/N>;
+
+     Second length:
+       tmp = _36 - loop_len_15;
+       loop_len_16 = MIN (tmp, VF/N);
+
+     Third length:
+       tmp2 = tmp - loop_len_16;
+       loop_len_17 = MIN (tmp2, VF/N);
+
+     Last length:
+       loop_len_18 = tmp2 - loop_len_17;
+*/
+
+static void
+vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
+        rgroup_controls *dest_rgm,
+        rgroup_controls *src_rgm, tree step)
+{
+  tree ctrl_type = dest_rgm->type;
+  poly_uint64 nitems_per_ctrl
+    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
+  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
+
+  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
+    {
+      if (!step)
+ step = src_rgm->controls[i / dest_rgm->controls.length ()];
+      tree ctrl = dest_rgm->controls[i];
+      if (i == 0)
+ {
+   /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
+   gassign *assign
+     = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
+   gimple_seq_add_stmt (seq, assign);
+ }
+      else if (i == dest_rgm->controls.length () - 1)
+ {
+   /* Last iteration: Remain capped to the range [0, VF/N].  */
+   gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
+ dest_rgm->controls[i - 1]);
+   gimple_seq_add_stmt (seq, assign);
+ }
+      else
+ {
+   /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
+   step = gimple_build (seq, MINUS_EXPR, iv_type, step,
+        dest_rgm->controls[i - 1]);
+   gassign *assign
+     = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
+   gimple_seq_add_stmt (seq, assign);
+ }
+    }
+}
+
/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
    for all the rgroup controls in RGC and return a control that is nonzero
    when the loop needs to iterate.  Add any new preheader statements to
@@ -468,9 +528,78 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
-      loop, &incr_gsi, insert_after, &index_before_incr,
-      &index_after_incr);
+  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+    {
+      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+      tree step = make_ssa_name (iv_type);
+      /* Create decrement IV.  */
+      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
+ insert_after, &index_before_incr, &index_after_incr);
+      tree temp = gimple_build (header_seq, MIN_EXPR, iv_type,
+ index_before_incr, nitems_step);
+      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp));
+
+      if (rgc->max_nscalars_per_iter == 1)
+ {
+   /* single rgroup:
+      ...
+      _10 = (unsigned long) count_12(D);
+      ...
+      # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
+      _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
+      ...
+      vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
+      ...
+      ivtmp_35 = ivtmp_9 - _36;
+      ...
+      if (ivtmp_35 != 0)
+        goto <bb 4>; [83.33%]
+      else
+        goto <bb 5>; [16.67%]
+   */
+   gassign *assign = gimple_build_assign (rgc->controls[0], step);
+   gimple_seq_add_stmt (header_seq, assign);
+ }
+      else
+ {
+   /* Multiple rgroup (SLP):
+      ...
+      _38 = (unsigned long) bnd.7_29;
+      _39 = _38 * 2;
+      ...
+      # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
+      ...
+      _43 = MIN_EXPR <ivtmp_41, 32>;
+      loop_len_26 = MIN_EXPR <_43, 16>;
+      loop_len_25 = _43 - loop_len_26;
+      ...
+      .LEN_STORE (_6, 8B, loop_len_26, ...);
+      ...
+      .LEN_STORE (_25, 8B, loop_len_25, ...);
+      _33 = loop_len_26 / 2;
+      ...
+      .LEN_STORE (_8, 16B, _33, ...);
+      _36 = loop_len_25 / 2;
+      ...
+      .LEN_STORE (_15, 16B, _36, ...);
+      ivtmp_42 = ivtmp_41 - _43;
+      ...
+      if (ivtmp_42 != 0)
+        goto <bb 4>; [83.33%]
+      else
+       goto <bb 5>; [16.67%]
+   */
+   vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step);
+ }
+      return index_after_incr;
+    }
+  else
+    {
+      /* Create increment IV.  */
+      create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
+ loop, &incr_gsi, insert_after, &index_before_incr,
+ &index_after_incr);
+    }
   tree zero_index = build_int_cst (compare_type, 0);
   tree test_index, test_limit, first_limit;
@@ -753,6 +882,55 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
      continue;
  }
+ if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+     && rgc->max_nscalars_per_iter == 1
+     && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
+   {
+     /* Multiple rgroup (non-SLP):
+       ...
+       _38 = (unsigned long) n_12(D);
+       ...
+       # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
+       ...
+       _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
+       loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
+       _41 = _40 - loop_len_21;
+       loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
+       _42 = _40 - loop_len_20;
+       loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
+       _43 = _40 - loop_len_19;
+       loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
+       ...
+       vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
+       ...
+       vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
+       ...
+       vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
+       ...
+       vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
+       vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
+       vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
+       vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
+       ...
+       .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
+       ivtmp_39 = ivtmp_38 - _40;
+       ...
+       if (ivtmp_39 != 0)
+ goto <bb 3>; [92.31%]
+       else
+ goto <bb 4>; [7.69%]
+     */
+     rgroup_controls *sub_rgc
+       = &(*controls)[nmasks / rgc->controls.length () - 1];
+     if (!sub_rgc->controls.is_empty ())
+       {
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+ vect_adjust_loop_lens_control (iv_type, &header_seq, rgc,
+        sub_rgc, NULL_TREE);
+ continue;
+       }
+   }
+
/* See whether zero-based IV would ever generate all-false masks
   or zero length before wrapping around.  */
bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index cf10132b0bf..53ac4d2a6c2 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2725,6 +2725,16 @@ start_over:
       && !vect_verify_loop_lens (loop_vinfo))
     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  /* If we're vectorizing an loop that uses length "controls" and
+     can iterate more than once, we apply decrementing IV approach
+     in loop control.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
+      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+    && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
+    LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
+
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 02d2ad6fba1..fba09b9ffd3 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -818,6 +818,13 @@ public:
      the vector loop can handle fewer than VF scalars.  */
   bool using_partial_vectors_p;
+  /* True if we've decided to use a decrementing loop control IV that counts
+     scalars. This can be done for any loop that:
+
+ (a) uses length "controls"; and
+ (b) can iterate more than once.  */
+  bool using_decrementing_iv_p;
+
   /* True if we've decided to use partially-populated vectors for the
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
@@ -890,6 +897,7 @@ public:
#define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
#define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
#define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
#define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
   (L)->epil_using_partial_vectors_p
#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
Richard Sandiford May 24, 2023, 11:23 a.m. UTC | #2
Sorry for the slow review.  I needed some time to go through this
patch and surrounding code to understand it, and to understand
why it wasn't structured the way I was expecting.

I've got some specific comments below, and then a general comment
about how I think we should structure this.

juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> gcc/ChangeLog:
>
>         * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function.
>         (vect_set_loop_controls_directly): Add decrement IV support.
>         (vect_set_loop_condition_partial_vectors): Ditto.
>         * tree-vect-loop.cc: Ditto.
>         * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.
>
> ---
>  gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++-
>  gcc/tree-vect-loop.cc       |  10 ++
>  gcc/tree-vectorizer.h       |   8 ++
>  3 files changed, 199 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index ff6159e08d5..94b38d1e0fb 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -385,6 +385,66 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
>    return false;
>  }
>  
> +/* Try to use adjust loop lens for non-SLP multiple-rgroups.
> +
> +     _36 = MIN_EXPR <ivtmp_34, VF>;
> +
> +     First length (MIN (X, VF/N)):
> +       loop_len_15 = MIN_EXPR <_36, VF/N>;
> +
> +     Second length:
> +       tmp = _36 - loop_len_15;
> +       loop_len_16 = MIN (tmp, VF/N);
> +
> +     Third length:
> +       tmp2 = tmp - loop_len_16;
> +       loop_len_17 = MIN (tmp2, VF/N);
> +
> +     Last length:
> +       loop_len_18 = tmp2 - loop_len_17;
> +*/
> +
> +static void
> +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
> +			       rgroup_controls *dest_rgm,
> +			       rgroup_controls *src_rgm, tree step)
> +{
> +  tree ctrl_type = dest_rgm->type;
> +  poly_uint64 nitems_per_ctrl
> +    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
> +
> +  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> +    {
> +      if (!step)
> +	step = src_rgm->controls[i / dest_rgm->controls.length ()];

Could you explain this index?  It looks like it will always be 0
due to the range of i.

Since this is the only use of src_rgm, it might be cleaner to drop
src_rgm and only pass the step.

> +      tree ctrl = dest_rgm->controls[i];
> +      if (i == 0)
> +	{
> +	  /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
> +	  gassign *assign
> +	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +	  gimple_seq_add_stmt (seq, assign);
> +	}
> +      else if (i == dest_rgm->controls.length () - 1)
> +	{
> +	  /* Last iteration: Remain capped to the range [0, VF/N].  */
> +	  gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
> +						 dest_rgm->controls[i - 1]);
> +	  gimple_seq_add_stmt (seq, assign);
> +	}
> +      else
> +	{
> +	  /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
> +	  step = gimple_build (seq, MINUS_EXPR, iv_type, step,
> +			       dest_rgm->controls[i - 1]);
> +	  gassign *assign
> +	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +	  gimple_seq_add_stmt (seq, assign);
> +	}
> +    }
> +}

Not your fault, but the structure seems kind-of awkward, since
it's really a MINUS_EXPR for i != 0 followed by a MIN_EXPR for i != last.
But I agree that it probably has to be written as above given that
the final destination is fixed in advance.

> +
>  /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
>     for all the rgroup controls in RGC and return a control that is nonzero
>     when the loop needs to iterate.  Add any new preheader statements to
> @@ -468,9 +528,78 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    gimple_stmt_iterator incr_gsi;
>    bool insert_after;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> -  create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> -	     loop, &incr_gsi, insert_after, &index_before_incr,
> -	     &index_after_incr);
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +    {
> +      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +      tree step = make_ssa_name (iv_type);
> +      /* Create decrement IV.  */
> +      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
> +		 insert_after, &index_before_incr, &index_after_incr);
> +      tree temp = gimple_build (header_seq, MIN_EXPR, iv_type,
> +				index_before_incr, nitems_step);
> +      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp));

Probably simpler to build the MIN_EXPR as a gimple_build_assign,
with "step" as its destination.  No simplification is likely given
that the input is a freshly-minted IV.

> +
> +      if (rgc->max_nscalars_per_iter == 1)

I'm not sure this is the correct condition.  Isn't the split really
between whether rgc->controls.length () is 1 or greater than 1?
It's possible (even common) for rgc->controls.length () == 1
&& rgc->max_nscalars_per_iter != 1, and in that case,
vect_adjust_loop_lens_control will create an unnecessary MIN_EXPR.

When rgc->controls.length () is 1, it would be good to use
rgc->controls[0] as "step" above, rather than make a new SSA name.

Thanks for putting the code here though.  It looks like the right
place to me.

More on this below.

> +	{
> +	  /* single rgroup:
> +	     ...
> +	     _10 = (unsigned long) count_12(D);
> +	     ...
> +	     # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> +	     _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> +	     ...
> +	     vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
> +	     ...
> +	     ivtmp_35 = ivtmp_9 - _36;
> +	     ...
> +	     if (ivtmp_35 != 0)
> +	       goto <bb 4>; [83.33%]
> +	     else
> +	       goto <bb 5>; [16.67%]
> +	  */
> +	  gassign *assign = gimple_build_assign (rgc->controls[0], step);
> +	  gimple_seq_add_stmt (header_seq, assign);
> +	}
> +      else
> +	{
> +	  /* Multiple rgroup (SLP):
> +	     ...
> +	     _38 = (unsigned long) bnd.7_29;
> +	     _39 = _38 * 2;
> +	     ...
> +	     # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
> +	     ...
> +	     _43 = MIN_EXPR <ivtmp_41, 32>;
> +	     loop_len_26 = MIN_EXPR <_43, 16>;
> +	     loop_len_25 = _43 - loop_len_26;
> +	     ...
> +	     .LEN_STORE (_6, 8B, loop_len_26, ...);
> +	     ...
> +	     .LEN_STORE (_25, 8B, loop_len_25, ...);
> +	     _33 = loop_len_26 / 2;
> +	     ...
> +	     .LEN_STORE (_8, 16B, _33, ...);
> +	     _36 = loop_len_25 / 2;
> +	     ...
> +	     .LEN_STORE (_15, 16B, _36, ...);
> +	     ivtmp_42 = ivtmp_41 - _43;
> +	     ...
> +	     if (ivtmp_42 != 0)
> +	       goto <bb 4>; [83.33%]
> +	     else
> +	      goto <bb 5>; [16.67%]
> +	  */
> +	  vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step);
> +	}
> +      return index_after_incr;
> +    }
> +  else
> +    {
> +      /* Create increment IV.  */
> +      create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> +		 loop, &incr_gsi, insert_after, &index_before_incr,
> +		 &index_after_incr);
> +    }

Probably better to have no "else" and instead fall through to this
create_iv.  It makes it clearer that the decrementing IV case
doesn't continue beyond the "if" above.

>  
>    tree zero_index = build_int_cst (compare_type, 0);
>    tree test_index, test_limit, first_limit;
> @@ -753,6 +882,55 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>  	      continue;
>  	  }
>  
> +	if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +	    && rgc->max_nscalars_per_iter == 1
> +	    && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
> +	  {
> +	    /* Multiple rgroup (non-SLP):
> +	      ...
> +	      _38 = (unsigned long) n_12(D);
> +	      ...
> +	      # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> +	      ...
> +	      _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> +	      loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> +	      _41 = _40 - loop_len_21;
> +	      loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> +	      _42 = _40 - loop_len_20;
> +	      loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> +	      _43 = _40 - loop_len_19;
> +	      loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> +	      ...
> +	      vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> +	      ...
> +	      vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> +	      ...
> +	      vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> +	      ...
> +	      vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> +	      vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
> +	      vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
> +	      vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
> +	      ...
> +	      .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> +	      ivtmp_39 = ivtmp_38 - _40;
> +	      ...
> +	      if (ivtmp_39 != 0)
> +		goto <bb 3>; [92.31%]
> +	      else
> +		goto <bb 4>; [7.69%]
> +	    */
> +	    rgroup_controls *sub_rgc
> +	      = &(*controls)[nmasks / rgc->controls.length () - 1];
> +	    if (!sub_rgc->controls.is_empty ())
> +	      {
> +		tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +		vect_adjust_loop_lens_control (iv_type, &header_seq, rgc,
> +					       sub_rgc, NULL_TREE);
> +		continue;
> +	      }
> +	  }

I don't think this is right, since if we have 1-control, 2-control
and 4-control groups, the fan-out for the 4-control group must be
based on the 1-control group rather than any general sub_rgc.
Using any other rgroup would mean that the input was already
clamped by a MIN_EXPR (because that input would itself have been
created by vect_adjust_loop_lens_control).

> +
>  	/* See whether zero-based IV would ever generate all-false masks
>  	   or zero length before wrapping around.  */
>  	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index cf10132b0bf..53ac4d2a6c2 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -2725,6 +2725,16 @@ start_over:
>        && !vect_verify_loop_lens (loop_vinfo))
>      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  
> +  /* If we're vectorizing an loop that uses length "controls" and
> +     can iterate more than once, we apply decrementing IV approach
> +     in loop control.  */
> +  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
> +      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +	   && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
> +			LOOP_VINFO_VECT_FACTOR (loop_vinfo))))

I think this also needs to check:

 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0

-----

I think another way to think about the patch is this:

Things would be easy if there was always a 1-control group.
Then we could create a decrementing IV for that 1-control group,
in the way that you do above.  All other groups would use the
1-control group as input.  They would fan that input out using
vect_adjust_loop_lens_control.

However, there isn't guaranteed to be 1-control group, and creating
a fake one might not be easy.  We'd need to calculate valid values
for the factor, type, nscalars_per_iter, etc. fields.

So when there is no 1-control group, it's probably easier to create
a decrementing IV based on the first non-empty group.  This is what
your patch does.  And I think it's the right approach.

However, the IV created in this way is "special".  It doesn't exist
as part of the normal rgroup framework.  It's something created
outside the rgroup framework to make the controls easier to populate.

In your case, this is the case where "step" is created and is not
assigned directly to rgc->controls.

Given that, I think we should structure the code as follows,
changing the behaviour only for LOOP_VINFO_USING_DECREMENTING_IV_P:

(1) In vect_set_loop_condition_partial_vectors, for the first iteration of:

  FOR_EACH_VEC_ELT (*controls, i, rgc)
    if (!rgc->controls.is_empty ())

call vect_set_loop_controls_directly.  That is:

	/* See whether zero-based IV would ever generate all-false masks
	   or zero length before wrapping around.  */
	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);

	/* Set up all controls for this group.  */
	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
						     &preheader_seq,
						     &header_seq,
						     loop_cond_gsi, rgc,
						     niters, niters_skip,
						     might_wrap_p);

needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P)
is only executed on the first iteration.

(2) vect_set_loop_controls_directly calculates "step" as in your patch.
If rgc has 1 control, this step is the SSA name created for that control.
Otherwise the step is a fresh SSA name, as in your patch.

(3) vect_set_loop_controls_directly stores this step somewhere for later
use, probably in LOOP_VINFO.  Let's use "S" to refer to this stored step.

(4) After the vect_set_loop_controls_directly call above, and outside
the "if" statement that now contains vect_set_loop_controls_directly,
check whether rgc->controls.length () > 1.  If so, use
vect_adjust_loop_lens_control to set the controls based on S.

Then the only caller of vect_adjust_loop_lens_control is
vect_set_loop_condition_partial_vectors.  And the starting
step for vect_adjust_loop_lens_control is always S.

It sounds complicated when I write it like that, but I think it
will be clearer in code.

Hope that makes sense.  Please let me know if not.

Thanks,
Richard
juzhe.zhong@rivai.ai May 24, 2023, 11:52 a.m. UTC | #3
Hi, Richard.  It's quite complicated for me and I am not sure whether I can catch up with you.
So I will rather split the work step by step to  implement the decrement IV

For the first step you mentioned:

>> (1) In vect_set_loop_condition_partial_vectors, for the first iteration of:

 >>  FOR_EACH_VEC_ELT (*controls, i, rgc)
 >>    if (!rgc->controls.is_empty ())

>> call vect_set_loop_controls_directly.  That is:

>> >> /* See whether zero-based IV would ever generate all-false masks
>>    or zero length before wrapping around.  */
>> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
>> 
/* Set up all controls for this group.  */
>> test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
 >>     &preheader_seq,
  >>    &header_seq,
 >>     loop_cond_gsi, rgc,
 >>     niters, niters_skip,
 >>     might_wrap_p);

>> needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P)
>> is only executed on the first iteration.

Is it correct like this?

  FOR_EACH_VEC_ELT (*controls, i, rgc)
    if (!rgc->controls.is_empty ())
      {
        /* First try using permutes.  This adds a single vector
           instruction to the loop for each mask, but needs no extra
           loop invariants or IVs.  */
        unsigned int nmasks = i + 1;
        if (use_masks_p && (nmasks & 1) == 0)
          {
            rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
            if (!half_rgc->controls.is_empty ()
                && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
              continue;
          }

        /* See whether zero-based IV would ever generate all-false masks
           or zero length before wrapping around.  */
        bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);

        /* Set up all controls for this group.  */
        test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
                                                     &preheader_seq,
                                                     &header_seq,
                                                     loop_cond_gsi, rgc,
                                                     niters, niters_skip,
                                                     might_wrap_p);

        /* Decrement IV only run vect_set_loop_controls_directly once.  */
        if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
          break;
      }

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-24 19:23
To: juzhe.zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support
Sorry for the slow review.  I needed some time to go through this
patch and surrounding code to understand it, and to understand
why it wasn't structured the way I was expecting.
 
I've got some specific comments below, and then a general comment
about how I think we should structure this.
 
juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> gcc/ChangeLog:
>
>         * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function.
>         (vect_set_loop_controls_directly): Add decrement IV support.
>         (vect_set_loop_condition_partial_vectors): Ditto.
>         * tree-vect-loop.cc: Ditto.
>         * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.
>
> ---
>  gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++-
>  gcc/tree-vect-loop.cc       |  10 ++
>  gcc/tree-vectorizer.h       |   8 ++
>  3 files changed, 199 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index ff6159e08d5..94b38d1e0fb 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -385,6 +385,66 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
>    return false;
>  }
>  
> +/* Try to use adjust loop lens for non-SLP multiple-rgroups.
> +
> +     _36 = MIN_EXPR <ivtmp_34, VF>;
> +
> +     First length (MIN (X, VF/N)):
> +       loop_len_15 = MIN_EXPR <_36, VF/N>;
> +
> +     Second length:
> +       tmp = _36 - loop_len_15;
> +       loop_len_16 = MIN (tmp, VF/N);
> +
> +     Third length:
> +       tmp2 = tmp - loop_len_16;
> +       loop_len_17 = MIN (tmp2, VF/N);
> +
> +     Last length:
> +       loop_len_18 = tmp2 - loop_len_17;
> +*/
> +
> +static void
> +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
> +        rgroup_controls *dest_rgm,
> +        rgroup_controls *src_rgm, tree step)
> +{
> +  tree ctrl_type = dest_rgm->type;
> +  poly_uint64 nitems_per_ctrl
> +    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
> +
> +  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> +    {
> +      if (!step)
> + step = src_rgm->controls[i / dest_rgm->controls.length ()];
 
Could you explain this index?  It looks like it will always be 0
due to the range of i.
 
Since this is the only use of src_rgm, it might be cleaner to drop
src_rgm and only pass the step.
 
> +      tree ctrl = dest_rgm->controls[i];
> +      if (i == 0)
> + {
> +   /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
> +   gassign *assign
> +     = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +   gimple_seq_add_stmt (seq, assign);
> + }
> +      else if (i == dest_rgm->controls.length () - 1)
> + {
> +   /* Last iteration: Remain capped to the range [0, VF/N].  */
> +   gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
> + dest_rgm->controls[i - 1]);
> +   gimple_seq_add_stmt (seq, assign);
> + }
> +      else
> + {
> +   /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
> +   step = gimple_build (seq, MINUS_EXPR, iv_type, step,
> +        dest_rgm->controls[i - 1]);
> +   gassign *assign
> +     = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +   gimple_seq_add_stmt (seq, assign);
> + }
> +    }
> +}
 
Not your fault, but the structure seems kind-of awkward, since
it's really a MINUS_EXPR for i != 0 followed by a MIN_EXPR for i != last.
But I agree that it probably has to be written as above given that
the final destination is fixed in advance.
 
> +
>  /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
>     for all the rgroup controls in RGC and return a control that is nonzero
>     when the loop needs to iterate.  Add any new preheader statements to
> @@ -468,9 +528,78 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    gimple_stmt_iterator incr_gsi;
>    bool insert_after;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> -  create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> -      loop, &incr_gsi, insert_after, &index_before_incr,
> -      &index_after_incr);
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +    {
> +      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +      tree step = make_ssa_name (iv_type);
> +      /* Create decrement IV.  */
> +      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
> + insert_after, &index_before_incr, &index_after_incr);
> +      tree temp = gimple_build (header_seq, MIN_EXPR, iv_type,
> + index_before_incr, nitems_step);
> +      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp));
 
Probably simpler to build the MIN_EXPR as a gimple_build_assign,
with "step" as its destination.  No simplification is likely given
that the input is a freshly-minted IV.
 
> +
> +      if (rgc->max_nscalars_per_iter == 1)
 
I'm not sure this is the correct condition.  Isn't the split really
between whether rgc->controls.length () is 1 or greater than 1?
It's possible (even common) for rgc->controls.length () == 1
&& rgc->max_nscalars_per_iter != 1, and in that case,
vect_adjust_loop_lens_control will create an unnecessary MIN_EXPR.
 
When rgc->controls.length () is 1, it would be good to use
rgc->controls[0] as "step" above, rather than make a new SSA name.
 
Thanks for putting the code here though.  It looks like the right
place to me.
 
More on this below.
 
> + {
> +   /* single rgroup:
> +      ...
> +      _10 = (unsigned long) count_12(D);
> +      ...
> +      # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> +      _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> +      ...
> +      vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
> +      ...
> +      ivtmp_35 = ivtmp_9 - _36;
> +      ...
> +      if (ivtmp_35 != 0)
> +        goto <bb 4>; [83.33%]
> +      else
> +        goto <bb 5>; [16.67%]
> +   */
> +   gassign *assign = gimple_build_assign (rgc->controls[0], step);
> +   gimple_seq_add_stmt (header_seq, assign);
> + }
> +      else
> + {
> +   /* Multiple rgroup (SLP):
> +      ...
> +      _38 = (unsigned long) bnd.7_29;
> +      _39 = _38 * 2;
> +      ...
> +      # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
> +      ...
> +      _43 = MIN_EXPR <ivtmp_41, 32>;
> +      loop_len_26 = MIN_EXPR <_43, 16>;
> +      loop_len_25 = _43 - loop_len_26;
> +      ...
> +      .LEN_STORE (_6, 8B, loop_len_26, ...);
> +      ...
> +      .LEN_STORE (_25, 8B, loop_len_25, ...);
> +      _33 = loop_len_26 / 2;
> +      ...
> +      .LEN_STORE (_8, 16B, _33, ...);
> +      _36 = loop_len_25 / 2;
> +      ...
> +      .LEN_STORE (_15, 16B, _36, ...);
> +      ivtmp_42 = ivtmp_41 - _43;
> +      ...
> +      if (ivtmp_42 != 0)
> +        goto <bb 4>; [83.33%]
> +      else
> +       goto <bb 5>; [16.67%]
> +   */
> +   vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step);
> + }
> +      return index_after_incr;
> +    }
> +  else
> +    {
> +      /* Create increment IV.  */
> +      create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> + loop, &incr_gsi, insert_after, &index_before_incr,
> + &index_after_incr);
> +    }
 
Probably better to have no "else" and instead fall through to this
create_iv.  It makes it clearer that the decrementing IV case
doesn't continue beyond the "if" above.
 
>  
>    tree zero_index = build_int_cst (compare_type, 0);
>    tree test_index, test_limit, first_limit;
> @@ -753,6 +882,55 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>        continue;
>    }
>  
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +     && rgc->max_nscalars_per_iter == 1
> +     && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
> +   {
> +     /* Multiple rgroup (non-SLP):
> +       ...
> +       _38 = (unsigned long) n_12(D);
> +       ...
> +       # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> +       ...
> +       _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> +       loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> +       _41 = _40 - loop_len_21;
> +       loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> +       _42 = _40 - loop_len_20;
> +       loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> +       _43 = _40 - loop_len_19;
> +       loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> +       ...
> +       vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> +       ...
> +       vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> +       ...
> +       vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> +       ...
> +       vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> +       vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
> +       vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
> +       vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
> +       ...
> +       .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> +       ivtmp_39 = ivtmp_38 - _40;
> +       ...
> +       if (ivtmp_39 != 0)
> + goto <bb 3>; [92.31%]
> +       else
> + goto <bb 4>; [7.69%]
> +     */
> +     rgroup_controls *sub_rgc
> +       = &(*controls)[nmasks / rgc->controls.length () - 1];
> +     if (!sub_rgc->controls.is_empty ())
> +       {
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc,
> +        sub_rgc, NULL_TREE);
> + continue;
> +       }
> +   }
 
I don't think this is right, since if we have 1-control, 2-control
and 4-control groups, the fan-out for the 4-control group must be
based on the 1-control group rather than any general sub_rgc.
Using any other rgroup would mean that the input was already
clamped by a MIN_EXPR (because that input would itself have been
created by vect_adjust_loop_lens_control).
 
> +
>  /* See whether zero-based IV would ever generate all-false masks
>     or zero length before wrapping around.  */
>  bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index cf10132b0bf..53ac4d2a6c2 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -2725,6 +2725,16 @@ start_over:
>        && !vect_verify_loop_lens (loop_vinfo))
>      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  
> +  /* If we're vectorizing an loop that uses length "controls" and
> +     can iterate more than once, we apply decrementing IV approach
> +     in loop control.  */
> +  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
> +      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +    && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
> + LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
 
I think this also needs to check:
 
LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
 
-----
 
I think another way to think about the patch is this:
 
Things would be easy if there was always a 1-control group.
Then we could create a decrementing IV for that 1-control group,
in the way that you do above.  All other groups would use the
1-control group as input.  They would fan that input out using
vect_adjust_loop_lens_control.
 
However, there isn't guaranteed to be 1-control group, and creating
a fake one might not be easy.  We'd need to calculate valid values
for the factor, type, nscalars_per_iter, etc. fields.
 
So when there is no 1-control group, it's probably easier to create
a decrementing IV based on the first non-empty group.  This is what
your patch does.  And I think it's the right approach.
 
However, the IV created in this way is "special".  It doesn't exist
as part of the normal rgroup framework.  It's something created
outside the rgroup framework to make the controls easier to populate.
 
In your case, this is the case where "step" is created and is not
assigned directly to rgc->controls.
 
Given that, I think we should structure the code as follows,
changing the behaviour only for LOOP_VINFO_USING_DECREMENTING_IV_P:
 
(1) In vect_set_loop_condition_partial_vectors, for the first iteration of:
 
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    if (!rgc->controls.is_empty ())
 
call vect_set_loop_controls_directly.  That is:
 
/* See whether zero-based IV would ever generate all-false masks
   or zero length before wrapping around.  */
bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 
/* Set up all controls for this group.  */
test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
     &preheader_seq,
     &header_seq,
     loop_cond_gsi, rgc,
     niters, niters_skip,
     might_wrap_p);
 
needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P)
is only executed on the first iteration.
 
(2) vect_set_loop_controls_directly calculates "step" as in your patch.
If rgc has 1 control, this step is the SSA name created for that control.
Otherwise the step is a fresh SSA name, as in your patch.
 
(3) vect_set_loop_controls_directly stores this step somewhere for later
use, probably in LOOP_VINFO.  Let's use "S" to refer to this stored step.
 
(4) After the vect_set_loop_controls_directly call above, and outside
the "if" statement that now contains vect_set_loop_controls_directly,
check whether rgc->controls.length () > 1.  If so, use
vect_adjust_loop_lens_control to set the controls based on S.
 
Then the only caller of vect_adjust_loop_lens_control is
vect_set_loop_condition_partial_vectors.  And the starting
step for vect_adjust_loop_lens_control is always S.
 
It sounds complicated when I write it like that, but I think it
will be clearer in code.
 
Hope that makes sense.  Please let me know if not.
 
Thanks,
Richard
juzhe.zhong@rivai.ai May 24, 2023, 12:19 p.m. UTC | #4
Hi, Richard.
For step 1. I have write this patch. Could you take a look at it?

Thanks.



juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-24 19:23
To: juzhe.zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support
Sorry for the slow review.  I needed some time to go through this
patch and surrounding code to understand it, and to understand
why it wasn't structured the way I was expecting.
 
I've got some specific comments below, and then a general comment
about how I think we should structure this.
 
juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> gcc/ChangeLog:
>
>         * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function.
>         (vect_set_loop_controls_directly): Add decrement IV support.
>         (vect_set_loop_condition_partial_vectors): Ditto.
>         * tree-vect-loop.cc: Ditto.
>         * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.
>
> ---
>  gcc/tree-vect-loop-manip.cc | 184 +++++++++++++++++++++++++++++++++++-
>  gcc/tree-vect-loop.cc       |  10 ++
>  gcc/tree-vectorizer.h       |   8 ++
>  3 files changed, 199 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index ff6159e08d5..94b38d1e0fb 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -385,6 +385,66 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
>    return false;
>  }
>  
> +/* Try to use adjust loop lens for non-SLP multiple-rgroups.
> +
> +     _36 = MIN_EXPR <ivtmp_34, VF>;
> +
> +     First length (MIN (X, VF/N)):
> +       loop_len_15 = MIN_EXPR <_36, VF/N>;
> +
> +     Second length:
> +       tmp = _36 - loop_len_15;
> +       loop_len_16 = MIN (tmp, VF/N);
> +
> +     Third length:
> +       tmp2 = tmp - loop_len_16;
> +       loop_len_17 = MIN (tmp2, VF/N);
> +
> +     Last length:
> +       loop_len_18 = tmp2 - loop_len_17;
> +*/
> +
> +static void
> +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
> +        rgroup_controls *dest_rgm,
> +        rgroup_controls *src_rgm, tree step)
> +{
> +  tree ctrl_type = dest_rgm->type;
> +  poly_uint64 nitems_per_ctrl
> +    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
> +
> +  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> +    {
> +      if (!step)
> + step = src_rgm->controls[i / dest_rgm->controls.length ()];
 
Could you explain this index?  It looks like it will always be 0
due to the range of i.
 
Since this is the only use of src_rgm, it might be cleaner to drop
src_rgm and only pass the step.
 
> +      tree ctrl = dest_rgm->controls[i];
> +      if (i == 0)
> + {
> +   /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
> +   gassign *assign
> +     = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +   gimple_seq_add_stmt (seq, assign);
> + }
> +      else if (i == dest_rgm->controls.length () - 1)
> + {
> +   /* Last iteration: Remain capped to the range [0, VF/N].  */
> +   gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
> + dest_rgm->controls[i - 1]);
> +   gimple_seq_add_stmt (seq, assign);
> + }
> +      else
> + {
> +   /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
> +   step = gimple_build (seq, MINUS_EXPR, iv_type, step,
> +        dest_rgm->controls[i - 1]);
> +   gassign *assign
> +     = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +   gimple_seq_add_stmt (seq, assign);
> + }
> +    }
> +}
 
Not your fault, but the structure seems kind-of awkward, since
it's really a MINUS_EXPR for i != 0 followed by a MIN_EXPR for i != last.
But I agree that it probably has to be written as above given that
the final destination is fixed in advance.
 
> +
>  /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
>     for all the rgroup controls in RGC and return a control that is nonzero
>     when the loop needs to iterate.  Add any new preheader statements to
> @@ -468,9 +528,78 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    gimple_stmt_iterator incr_gsi;
>    bool insert_after;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> -  create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> -      loop, &incr_gsi, insert_after, &index_before_incr,
> -      &index_after_incr);
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +    {
> +      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +      tree step = make_ssa_name (iv_type);
> +      /* Create decrement IV.  */
> +      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
> + insert_after, &index_before_incr, &index_after_incr);
> +      tree temp = gimple_build (header_seq, MIN_EXPR, iv_type,
> + index_before_incr, nitems_step);
> +      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp));
 
Probably simpler to build the MIN_EXPR as a gimple_build_assign,
with "step" as its destination.  No simplification is likely given
that the input is a freshly-minted IV.
 
> +
> +      if (rgc->max_nscalars_per_iter == 1)
 
I'm not sure this is the correct condition.  Isn't the split really
between whether rgc->controls.length () is 1 or greater than 1?
It's possible (even common) for rgc->controls.length () == 1
&& rgc->max_nscalars_per_iter != 1, and in that case,
vect_adjust_loop_lens_control will create an unnecessary MIN_EXPR.
 
When rgc->controls.length () is 1, it would be good to use
rgc->controls[0] as "step" above, rather than make a new SSA name.
 
Thanks for putting the code here though.  It looks like the right
place to me.
 
More on this below.
 
> + {
> +   /* single rgroup:
> +      ...
> +      _10 = (unsigned long) count_12(D);
> +      ...
> +      # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> +      _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> +      ...
> +      vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
> +      ...
> +      ivtmp_35 = ivtmp_9 - _36;
> +      ...
> +      if (ivtmp_35 != 0)
> +        goto <bb 4>; [83.33%]
> +      else
> +        goto <bb 5>; [16.67%]
> +   */
> +   gassign *assign = gimple_build_assign (rgc->controls[0], step);
> +   gimple_seq_add_stmt (header_seq, assign);
> + }
> +      else
> + {
> +   /* Multiple rgroup (SLP):
> +      ...
> +      _38 = (unsigned long) bnd.7_29;
> +      _39 = _38 * 2;
> +      ...
> +      # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
> +      ...
> +      _43 = MIN_EXPR <ivtmp_41, 32>;
> +      loop_len_26 = MIN_EXPR <_43, 16>;
> +      loop_len_25 = _43 - loop_len_26;
> +      ...
> +      .LEN_STORE (_6, 8B, loop_len_26, ...);
> +      ...
> +      .LEN_STORE (_25, 8B, loop_len_25, ...);
> +      _33 = loop_len_26 / 2;
> +      ...
> +      .LEN_STORE (_8, 16B, _33, ...);
> +      _36 = loop_len_25 / 2;
> +      ...
> +      .LEN_STORE (_15, 16B, _36, ...);
> +      ivtmp_42 = ivtmp_41 - _43;
> +      ...
> +      if (ivtmp_42 != 0)
> +        goto <bb 4>; [83.33%]
> +      else
> +       goto <bb 5>; [16.67%]
> +   */
> +   vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step);
> + }
> +      return index_after_incr;
> +    }
> +  else
> +    {
> +      /* Create increment IV.  */
> +      create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> + loop, &incr_gsi, insert_after, &index_before_incr,
> + &index_after_incr);
> +    }
 
Probably better to have no "else" and instead fall through to this
create_iv.  It makes it clearer that the decrementing IV case
doesn't continue beyond the "if" above.
 
>  
>    tree zero_index = build_int_cst (compare_type, 0);
>    tree test_index, test_limit, first_limit;
> @@ -753,6 +882,55 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>        continue;
>    }
>  
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +     && rgc->max_nscalars_per_iter == 1
> +     && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
> +   {
> +     /* Multiple rgroup (non-SLP):
> +       ...
> +       _38 = (unsigned long) n_12(D);
> +       ...
> +       # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> +       ...
> +       _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> +       loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> +       _41 = _40 - loop_len_21;
> +       loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> +       _42 = _40 - loop_len_20;
> +       loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> +       _43 = _40 - loop_len_19;
> +       loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> +       ...
> +       vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> +       ...
> +       vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> +       ...
> +       vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> +       ...
> +       vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> +       vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
> +       vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
> +       vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
> +       ...
> +       .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> +       ivtmp_39 = ivtmp_38 - _40;
> +       ...
> +       if (ivtmp_39 != 0)
> + goto <bb 3>; [92.31%]
> +       else
> + goto <bb 4>; [7.69%]
> +     */
> +     rgroup_controls *sub_rgc
> +       = &(*controls)[nmasks / rgc->controls.length () - 1];
> +     if (!sub_rgc->controls.is_empty ())
> +       {
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc,
> +        sub_rgc, NULL_TREE);
> + continue;
> +       }
> +   }
 
I don't think this is right, since if we have 1-control, 2-control
and 4-control groups, the fan-out for the 4-control group must be
based on the 1-control group rather than any general sub_rgc.
Using any other rgroup would mean that the input was already
clamped by a MIN_EXPR (because that input would itself have been
created by vect_adjust_loop_lens_control).
 
> +
>  /* See whether zero-based IV would ever generate all-false masks
>     or zero length before wrapping around.  */
>  bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index cf10132b0bf..53ac4d2a6c2 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -2725,6 +2725,16 @@ start_over:
>        && !vect_verify_loop_lens (loop_vinfo))
>      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  
> +  /* If we're vectorizing an loop that uses length "controls" and
> +     can iterate more than once, we apply decrementing IV approach
> +     in loop control.  */
> +  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
> +      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +    && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
> + LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
 
I think this also needs to check:
 
LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
 
-----
 
I think another way to think about the patch is this:
 
Things would be easy if there was always a 1-control group.
Then we could create a decrementing IV for that 1-control group,
in the way that you do above.  All other groups would use the
1-control group as input.  They would fan that input out using
vect_adjust_loop_lens_control.
 
However, there isn't guaranteed to be 1-control group, and creating
a fake one might not be easy.  We'd need to calculate valid values
for the factor, type, nscalars_per_iter, etc. fields.
 
So when there is no 1-control group, it's probably easier to create
a decrementing IV based on the first non-empty group.  This is what
your patch does.  And I think it's the right approach.
 
However, the IV created in this way is "special".  It doesn't exist
as part of the normal rgroup framework.  It's something created
outside the rgroup framework to make the controls easier to populate.
 
In your case, this is the case where "step" is created and is not
assigned directly to rgc->controls.
 
Given that, I think we should structure the code as follows,
changing the behaviour only for LOOP_VINFO_USING_DECREMENTING_IV_P:
 
(1) In vect_set_loop_condition_partial_vectors, for the first iteration of:
 
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    if (!rgc->controls.is_empty ())
 
call vect_set_loop_controls_directly.  That is:
 
/* See whether zero-based IV would ever generate all-false masks
   or zero length before wrapping around.  */
bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 
/* Set up all controls for this group.  */
test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
     &preheader_seq,
     &header_seq,
     loop_cond_gsi, rgc,
     niters, niters_skip,
     might_wrap_p);
 
needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P)
is only executed on the first iteration.
 
(2) vect_set_loop_controls_directly calculates "step" as in your patch.
If rgc has 1 control, this step is the SSA name created for that control.
Otherwise the step is a fresh SSA name, as in your patch.
 
(3) vect_set_loop_controls_directly stores this step somewhere for later
use, probably in LOOP_VINFO.  Let's use "S" to refer to this stored step.
 
(4) After the vect_set_loop_controls_directly call above, and outside
the "if" statement that now contains vect_set_loop_controls_directly,
check whether rgc->controls.length () > 1.  If so, use
vect_adjust_loop_lens_control to set the controls based on S.
 
Then the only caller of vect_adjust_loop_lens_control is
vect_set_loop_condition_partial_vectors.  And the starting
step for vect_adjust_loop_lens_control is always S.
 
It sounds complicated when I write it like that, but I think it
will be clearer in code.
 
Hope that makes sense.  Please let me know if not.
 
Thanks,
Richard
Richard Sandiford May 24, 2023, 12:41 p.m. UTC | #5
Sorry, I realised later that I had an implicit assumption here:
if there are multiple rgroups, it's better to have a single IV
for the smallest rgroup and scale that up to bigger rgroups.

E.g. if the loop control IV is taken from an N-control rgroup
and has a step S, an N*M-control rgroup would be based on M*S.

Of course, it's also OK to create multiple IVs if you prefer.
It's just a question of which approach gives the best output
in practice.

Another way of going from an N-control rgroup ("G1") to an N*M-control
rgroup ("G2") would be to reuse all N controls from G1.  E.g. the
first M controls in G2 would come from G1[0], the next M from
G1[1], etc.  That might lower the longest dependency chain.

But whatever we do, it doesn't feel like max_nscalars_per_iter
should be part of the decision.  (I realise it will be part of
the decision for the follow-on SELECT_IV patch.  But that's
because we require the number of elements processed in each
iteration to be a multiple of max_nscalars_per_iter, and AIUI
SELECT_IV wouldn't guarantee that.  max_nscalars_per_iter shouldn't
matter for the current patch though.)

钟居哲 <juzhe.zhong@rivai.ai> writes:
> Hi, Richard.  It's quite complicated for me and I am not sure whether I can catch up with you.
> So I will rather split the work step by step to  implement the decrement IV
>
> For the first step you mentioned:
>
>>> (1) In vect_set_loop_condition_partial_vectors, for the first iteration of:
>
>  >>  FOR_EACH_VEC_ELT (*controls, i, rgc)
>  >>    if (!rgc->controls.is_empty ())
>
>>> call vect_set_loop_controls_directly.  That is:
>
>>> >> /* See whether zero-based IV would ever generate all-false masks
>>>    or zero length before wrapping around.  */
>>> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
>>> 
> /* Set up all controls for this group.  */
>>> test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
>  >>     &preheader_seq,
>   >>    &header_seq,
>  >>     loop_cond_gsi, rgc,
>  >>     niters, niters_skip,
>  >>     might_wrap_p);
>
>>> needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P)
>>> is only executed on the first iteration.
>
> Is it correct like this?
>
>   FOR_EACH_VEC_ELT (*controls, i, rgc)
>     if (!rgc->controls.is_empty ())
>       {
>         /* First try using permutes.  This adds a single vector
>            instruction to the loop for each mask, but needs no extra
>            loop invariants or IVs.  */
>         unsigned int nmasks = i + 1;
>         if (use_masks_p && (nmasks & 1) == 0)
>           {
>             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
>             if (!half_rgc->controls.is_empty ()
>                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
>               continue;
>           }
>
>         /* See whether zero-based IV would ever generate all-false masks
>            or zero length before wrapping around.  */
>         bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
>
>         /* Set up all controls for this group.  */
>         test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
>                                                      &preheader_seq,
>                                                      &header_seq,
>                                                      loop_cond_gsi, rgc,
>                                                      niters, niters_skip,
>                                                      might_wrap_p);
>
>         /* Decrement IV only run vect_set_loop_controls_directly once.  */
>         if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
>           break;
>       }

I meant something like:

  FOR_EACH_VEC_ELT (*controls, i, rgc)
    if (!rgc->controls.is_empty ())
      {
        /* First try using permutes.  This adds a single vector
           instruction to the loop for each mask, but needs no extra
           loop invariants or IVs.  */
        unsigned int nmasks = i + 1;
        if (use_masks_p && (nmasks & 1) == 0)
          {
            rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
            if (!half_rgc->controls.is_empty ()
                && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
              continue;
          }

        if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
            || !LOOP_VINFO_DECREMENTING_IV_STEP (loop_info))
	  {
            /* See whether zero-based IV would ever generate all-false masks
               or zero length before wrapping around.  */
            bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);

            /* Set up all controls for this group.  */
            test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
                                                         &preheader_seq,
                                                         &header_seq,
                                                         loop_cond_gsi, rgc,
                                                         niters, niters_skip,
                                                         might_wrap_p);
	  }
	
        if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
	    && rgc->controls.length () > 1)
	  ...use vect_adjust_loop_lens_control...
      }

where LOOP_VINFO_DECREMENTING_IV_STEP (loop_info) is "S" from my
previous review.

vect_set_loop_controls_directly would then set
LOOP_VINFO_DECREMENTING_IV_STEP but would not call
vect_adjust_loop_lens_control.

But like I say, this is all based on the assumption that we should
have a single IV and scale it up for later rgroups.  If you'd prefer
separate IVs then that's fine.  But then I think it's less clear
why we have:

> +	if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +	    && rgc->max_nscalars_per_iter == 1
> +	    && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
> +	  {
> +	    /* Multiple rgroup (non-SLP):
> +	      ...
> +	      _38 = (unsigned long) n_12(D);
> +	      ...
> +	      # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> +	      ...
> +	      _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> +	      loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> +	      _41 = _40 - loop_len_21;
> +	      loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> +	      _42 = _40 - loop_len_20;
> +	      loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> +	      _43 = _40 - loop_len_19;
> +	      loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> +	      ...
> +	      vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> +	      ...
> +	      vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> +	      ...
> +	      vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> +	      ...
> +	      vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> +	      vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
> +	      vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
> +	      vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
> +	      ...
> +	      .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> +	      ivtmp_39 = ivtmp_38 - _40;
> +	      ...
> +	      if (ivtmp_39 != 0)
> +		goto <bb 3>; [92.31%]
> +	      else
> +		goto <bb 4>; [7.69%]
> +	    */
> +	    rgroup_controls *sub_rgc
> +	      = &(*controls)[nmasks / rgc->controls.length () - 1];
> +	    if (!sub_rgc->controls.is_empty ())
> +	      {
> +		tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +		vect_adjust_loop_lens_control (iv_type, &header_seq, rgc,
> +					       sub_rgc, NULL_TREE);
> +		continue;
> +	      }
> +	  }

In other words, why is this different from what
vect_set_loop_controls_directly would do?

Thanks,
Richard
Richard Biener May 24, 2023, 12:51 p.m. UTC | #6
On Wed, 24 May 2023, Richard Sandiford wrote:

> Sorry, I realised later that I had an implicit assumption here:
> if there are multiple rgroups, it's better to have a single IV
> for the smallest rgroup and scale that up to bigger rgroups.
> 
> E.g. if the loop control IV is taken from an N-control rgroup
> and has a step S, an N*M-control rgroup would be based on M*S.
> 
> Of course, it's also OK to create multiple IVs if you prefer.
> It's just a question of which approach gives the best output
> in practice.

One thing to check is whether IVOPTs is ever able to eliminate
one such IV using another.  You can then also check whether
when presented with a single IV it already considers the
others you can create as candidates so you get the optimal
selection in the end.

Richard.
juzhe.zhong@rivai.ai May 24, 2023, 1:26 p.m. UTC | #7
>> In other words, why is this different from what
>>vect_set_loop_controls_directly would do?
Oh, I see.  You are confused that why I do not make multiple-rgroup vec_trunk
handling inside "vect_set_loop_controls_directly".

Well. Frankly, I just replicate the handling of ARM SVE:
unsigned int nmasks = i + 1;
        if (use_masks_p && (nmasks & 1) == 0)
          {
            rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
            if (!half_rgc->controls.is_empty ()
                && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
              continue;
          }

/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
                               rgroup_controls *src_rgm)
{
  tree src_masktype = src_rgm->type;
  tree dest_masktype = dest_rgm->type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  insn_code icode1, icode2;
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && (icode1 = optab_handler (vec_unpacku_hi_optab,
                                  src_mode)) != CODE_FOR_nothing
      && (icode2 = optab_handler (vec_unpacku_lo_optab,
                                  src_mode)) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
         we need.  We can then VIEW_CONVERT any excess bits away.  */
      machine_mode dest_mode = insn_data[icode1].operand[0].mode;
      gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
      for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
        {
          tree src = src_rgm->controls[i / 2];
          tree dest = dest_rgm->controls[i];
          tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
                            ? VEC_UNPACK_HI_EXPR
                            : VEC_UNPACK_LO_EXPR);
          gassign *stmt;
          if (dest_masktype == unpack_masktype)
            stmt = gimple_build_assign (dest, code, src);
          else
            {
              tree temp = make_ssa_name (unpack_masktype);
              stmt = gimple_build_assign (temp, code, src);
              gimple_seq_add_stmt (seq, stmt);
              stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
                                          build1 (VIEW_CONVERT_EXPR,
                                                  dest_masktype, temp));
            }
          gimple_seq_add_stmt (seq, stmt);
        }
      return true;
    }
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
         we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
        masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
        {
          tree src = src_rgm->controls[i / 2];
          tree dest = dest_rgm->controls[i];
          gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
                                              src, src, masks[i & 1]);
          gimple_seq_add_stmt (seq, stmt);
        }
      return true;
    }
  return false;
}

I know this is just optimization for ARM SVE with sub_rgc (int16)  is half size of rgc (int8).
But when I just copy the codes from ARM SVE and make it general for all cases (int8 <-> int64).
They all work well and codegen is good. 

If you don't like this way, would you mind give me some suggestions?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-24 20:41
To: 钟居哲
CC: gcc-patches; rguenther
Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support
Sorry, I realised later that I had an implicit assumption here:
if there are multiple rgroups, it's better to have a single IV
for the smallest rgroup and scale that up to bigger rgroups.
 
E.g. if the loop control IV is taken from an N-control rgroup
and has a step S, an N*M-control rgroup would be based on M*S.
 
Of course, it's also OK to create multiple IVs if you prefer.
It's just a question of which approach gives the best output
in practice.
 
Another way of going from an N-control rgroup ("G1") to an N*M-control
rgroup ("G2") would be to reuse all N controls from G1.  E.g. the
first M controls in G2 would come from G1[0], the next M from
G1[1], etc.  That might lower the longest dependency chain.
 
But whatever we do, it doesn't feel like max_nscalars_per_iter
should be part of the decision.  (I realise it will be part of
the decision for the follow-on SELECT_IV patch.  But that's
because we require the number of elements processed in each
iteration to be a multiple of max_nscalars_per_iter, and AIUI
SELECT_IV wouldn't guarantee that.  max_nscalars_per_iter shouldn't
matter for the current patch though.)
 
钟居哲 <juzhe.zhong@rivai.ai> writes:
> Hi, Richard.  It's quite complicated for me and I am not sure whether I can catch up with you.
> So I will rather split the work step by step to  implement the decrement IV
>
> For the first step you mentioned:
>
>>> (1) In vect_set_loop_condition_partial_vectors, for the first iteration of:
>
>  >>  FOR_EACH_VEC_ELT (*controls, i, rgc)
>  >>    if (!rgc->controls.is_empty ())
>
>>> call vect_set_loop_controls_directly.  That is:
>
>>> >> /* See whether zero-based IV would ever generate all-false masks
>>>    or zero length before wrapping around.  */
>>> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
>>> 
> /* Set up all controls for this group.  */
>>> test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
>  >>     &preheader_seq,
>   >>    &header_seq,
>  >>     loop_cond_gsi, rgc,
>  >>     niters, niters_skip,
>  >>     might_wrap_p);
>
>>> needs to be an "if" that (for LOOP_VINFO_USING_DECREMENTING_IV_P)
>>> is only executed on the first iteration.
>
> Is it correct like this?
>
>   FOR_EACH_VEC_ELT (*controls, i, rgc)
>     if (!rgc->controls.is_empty ())
>       {
>         /* First try using permutes.  This adds a single vector
>            instruction to the loop for each mask, but needs no extra
>            loop invariants or IVs.  */
>         unsigned int nmasks = i + 1;
>         if (use_masks_p && (nmasks & 1) == 0)
>           {
>             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
>             if (!half_rgc->controls.is_empty ()
>                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
>               continue;
>           }
>
>         /* See whether zero-based IV would ever generate all-false masks
>            or zero length before wrapping around.  */
>         bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
>
>         /* Set up all controls for this group.  */
>         test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
>                                                      &preheader_seq,
>                                                      &header_seq,
>                                                      loop_cond_gsi, rgc,
>                                                      niters, niters_skip,
>                                                      might_wrap_p);
>
>         /* Decrement IV only run vect_set_loop_controls_directly once.  */
>         if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
>           break;
>       }
 
I meant something like:
 
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    if (!rgc->controls.is_empty ())
      {
        /* First try using permutes.  This adds a single vector
           instruction to the loop for each mask, but needs no extra
           loop invariants or IVs.  */
        unsigned int nmasks = i + 1;
        if (use_masks_p && (nmasks & 1) == 0)
          {
            rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
            if (!half_rgc->controls.is_empty ()
                && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
              continue;
          }
 
        if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
            || !LOOP_VINFO_DECREMENTING_IV_STEP (loop_info))
  {
            /* See whether zero-based IV would ever generate all-false masks
               or zero length before wrapping around.  */
            bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 
            /* Set up all controls for this group.  */
            test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
                                                         &preheader_seq,
                                                         &header_seq,
                                                         loop_cond_gsi, rgc,
                                                         niters, niters_skip,
                                                         might_wrap_p);
  }
        if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
    && rgc->controls.length () > 1)
  ...use vect_adjust_loop_lens_control...
      }
 
where LOOP_VINFO_DECREMENTING_IV_STEP (loop_info) is "S" from my
previous review.
 
vect_set_loop_controls_directly would then set
LOOP_VINFO_DECREMENTING_IV_STEP but would not call
vect_adjust_loop_lens_control.
 
But like I say, this is all based on the assumption that we should
have a single IV and scale it up for later rgroups.  If you'd prefer
separate IVs then that's fine.  But then I think it's less clear
why we have:
 
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +     && rgc->max_nscalars_per_iter == 1
> +     && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
> +   {
> +     /* Multiple rgroup (non-SLP):
> +       ...
> +       _38 = (unsigned long) n_12(D);
> +       ...
> +       # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> +       ...
> +       _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> +       loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> +       _41 = _40 - loop_len_21;
> +       loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> +       _42 = _40 - loop_len_20;
> +       loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> +       _43 = _40 - loop_len_19;
> +       loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> +       ...
> +       vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> +       ...
> +       vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> +       ...
> +       vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> +       ...
> +       vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> +       vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
> +       vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
> +       vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
> +       ...
> +       .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> +       ivtmp_39 = ivtmp_38 - _40;
> +       ...
> +       if (ivtmp_39 != 0)
> + goto <bb 3>; [92.31%]
> +       else
> + goto <bb 4>; [7.69%]
> +     */
> +     rgroup_controls *sub_rgc
> +       = &(*controls)[nmasks / rgc->controls.length () - 1];
> +     if (!sub_rgc->controls.is_empty ())
> +       {
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> + vect_adjust_loop_lens_control (iv_type, &header_seq, rgc,
> +        sub_rgc, NULL_TREE);
> + continue;
> +       }
> +   }
 
In other words, why is this different from what
vect_set_loop_controls_directly would do?
 
Thanks,
Richard
juzhe.zhong@rivai.ai May 24, 2023, 1:27 p.m. UTC | #8
OK. Thanks. I am gonna refine the patch following Richard's idea and test it.
Thanks both Richard and Richi.



juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-05-24 20:51
To: Richard Sandiford
CC: 钟居哲; gcc-patches
Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support
On Wed, 24 May 2023, Richard Sandiford wrote:
 
> Sorry, I realised later that I had an implicit assumption here:
> if there are multiple rgroups, it's better to have a single IV
> for the smallest rgroup and scale that up to bigger rgroups.
> 
> E.g. if the loop control IV is taken from an N-control rgroup
> and has a step S, an N*M-control rgroup would be based on M*S.
> 
> Of course, it's also OK to create multiple IVs if you prefer.
> It's just a question of which approach gives the best output
> in practice.
 
One thing to check is whether IVOPTs is ever able to eliminate
one such IV using another.  You can then also check whether
when presented with a single IV it already considers the
others you can create as candidates so you get the optimal
selection in the end.
 
Richard.
Richard Sandiford May 24, 2023, 2:01 p.m. UTC | #9
钟居哲 <juzhe.zhong@rivai.ai> writes:
>>> In other words, why is this different from what
>>>vect_set_loop_controls_directly would do?
> Oh, I see.  You are confused that why I do not make multiple-rgroup vec_trunk
> handling inside "vect_set_loop_controls_directly".
>
> Well. Frankly, I just replicate the handling of ARM SVE:
> unsigned int nmasks = i + 1;
>         if (use_masks_p && (nmasks & 1) == 0)
>           {
>             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
>             if (!half_rgc->controls.is_empty ()
>                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
>               continue;
>           }
>
> /* Try to use permutes to define the masks in DEST_RGM using the masks
>    in SRC_RGM, given that the former has twice as many masks as the
>    latter.  Return true on success, adding any new statements to SEQ.  */
>
> static bool
> vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
>                                rgroup_controls *src_rgm)
> {
>   tree src_masktype = src_rgm->type;
>   tree dest_masktype = dest_rgm->type;
>   machine_mode src_mode = TYPE_MODE (src_masktype);
>   insn_code icode1, icode2;
>   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
>       && (icode1 = optab_handler (vec_unpacku_hi_optab,
>                                   src_mode)) != CODE_FOR_nothing
>       && (icode2 = optab_handler (vec_unpacku_lo_optab,
>                                   src_mode)) != CODE_FOR_nothing)
>     {
>       /* Unpacking the source masks gives at least as many mask bits as
>          we need.  We can then VIEW_CONVERT any excess bits away.  */
>       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
>       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
>       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
>       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
>         {
>           tree src = src_rgm->controls[i / 2];
>           tree dest = dest_rgm->controls[i];
>           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
>                             ? VEC_UNPACK_HI_EXPR
>                             : VEC_UNPACK_LO_EXPR);
>           gassign *stmt;
>           if (dest_masktype == unpack_masktype)
>             stmt = gimple_build_assign (dest, code, src);
>           else
>             {
>               tree temp = make_ssa_name (unpack_masktype);
>               stmt = gimple_build_assign (temp, code, src);
>               gimple_seq_add_stmt (seq, stmt);
>               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
>                                           build1 (VIEW_CONVERT_EXPR,
>                                                   dest_masktype, temp));
>             }
>           gimple_seq_add_stmt (seq, stmt);
>         }
>       return true;
>     }
>   vec_perm_indices indices[2];
>   if (dest_masktype == src_masktype
>       && interleave_supported_p (&indices[0], src_masktype, 0)
>       && interleave_supported_p (&indices[1], src_masktype, 1))
>     {
>       /* The destination requires twice as many mask bits as the source, so
>          we can use interleaving permutes to double up the number of bits.  */
>       tree masks[2];
>       for (unsigned int i = 0; i < 2; ++i)
>         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
>       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
>         {
>           tree src = src_rgm->controls[i / 2];
>           tree dest = dest_rgm->controls[i];
>           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
>                                               src, src, masks[i & 1]);
>           gimple_seq_add_stmt (seq, stmt);
>         }
>       return true;
>     }
>   return false;
> }
>
> I know this is just optimization for ARM SVE with sub_rgc (int16)  is half size of rgc (int8).
> But when I just copy the codes from ARM SVE and make it general for all cases (int8 <-> int64).
> They all work well and codegen is good. 
>
> If you don't like this way, would you mind give me some suggestions?

It's not a case of disliking one approach or disliking another.
There are two separate parts of this: one specific and one general.

The specific part is that the code had:

	    rgroup_controls *sub_rgc
	      = &(*controls)[nmasks / rgc->controls.length () - 1];
	    if (!sub_rgc->controls.is_empty ())
	      {
		tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
		vect_adjust_loop_lens_control (iv_type, &header_seq, rgc,
					       sub_rgc, NULL_TREE);
		continue;
	      }

But AIUI, nmasks is always equal to rgc->controls.length ()
(if rgc->controls is non-empty).  So I think this always used
(*controls)[0] as the source rgroup.  And I think that's the
only case that would work, since vect_adjust_loop_lens_control
only reads from sub_rgc once.  Is that right?

It would be possible to make other source rgroups work, but it
would involve reading every element of sub_rgc->controls, not just
sub_rgc->controls[0].  (This is also what the SVE code does.)

The general point is that there are two extreme approaches:

(1) Use one IV, and base everything off that.

(2) Use a separate IV for every rgroup.

It's also possible to do something between these two extremes.
(And we should, if it gives better code.)

The code above reuses an existing IV.  Doing that for all multi-control
rgroups would give (1).  At the other extreme, (2) would involve calling
vect_set_loop_controls_directly for every rgroup.

Both approaches are fine.  I'm not against one or the other.

What I didn't understand was why your patch only reuses existing IVs
for max_nscalars_per_iter == 1.  Was it to avoid having to do a
multiplication (well, really a shift left) when moving from one
rgroup to another?  E.g. if one rgroup had;

  nscalars_per_iter == 2 && factor == 1

and another had:

  nscalars_per_iter == 4 && factor == 1

then we would need to mulitply by 2 when going from the first rgroup
to the second.

If so, avoiding a multiplication seems like a good reason for the choice
you were making in the path.  But we then need to check
max_nscalars_per_iter == 1 for both the source rgroup and the
destination rgroup, not just the destination.  And I think the
condition for “no multiplication needed” should be that:

  max_nscalars_per_iter * factor

(aka "nitems_per_iter") is the same for both groups.

Thanks,
Richard
juzhe.zhong@rivai.ai May 24, 2023, 2:10 p.m. UTC | #10
>> Both approaches are fine.  I'm not against one or the other.

>> What I didn't understand was why your patch only reuses existing IVs
>> for max_nscalars_per_iter == 1.  Was it to avoid having to do a
>> multiplication (well, really a shift left) when moving from one
>> rgroup to another?  E.g. if one rgroup had;

>>   nscalars_per_iter == 2 && factor == 1

>> and another had:

>>   nscalars_per_iter == 4 && factor == 1

>> then we would need to mulitply by 2 when going from the first rgroup
>> to the second.

>> If so, avoiding a multiplication seems like a good reason for the choice
>> you were making in the path.  But we then need to check
>> max_nscalars_per_iter == 1 for both the source rgroup and the
>> destination rgroup, not just the destination.  And I think the
>> condition for “no multiplication needed” should be that:

Oh, I didn't realize such complicated problem. Frankly, I didn't understand well
rgroup. Sorry about that :).

I just remember last time you said I need to handle multiple-rgroup
not only for SLP but also non-SLP (which is vec_pack_trunk that I tested).
Then I asked you when is non-SLP, you said max_nscalars_per_iter == 1.
Then I use max_nscalars_per_iter == 1 here (I didn't really lean very well from this, just add it as you said). 

Actually, I just want to hanlde multip-rgroup for non-SLP here, I am trying to avoid  multiplication and I think
scalar multiplication (not cost too much) is fine in modern CPU.

So, what do you suggest that I handle multiple-rgroup for non-SLP.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-24 22:01
To: 钟居哲
CC: gcc-patches; rguenther
Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support
钟居哲 <juzhe.zhong@rivai.ai> writes:
>>> In other words, why is this different from what
>>>vect_set_loop_controls_directly would do?
> Oh, I see.  You are confused that why I do not make multiple-rgroup vec_trunk
> handling inside "vect_set_loop_controls_directly".
>
> Well. Frankly, I just replicate the handling of ARM SVE:
> unsigned int nmasks = i + 1;
>         if (use_masks_p && (nmasks & 1) == 0)
>           {
>             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
>             if (!half_rgc->controls.is_empty ()
>                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
>               continue;
>           }
>
> /* Try to use permutes to define the masks in DEST_RGM using the masks
>    in SRC_RGM, given that the former has twice as many masks as the
>    latter.  Return true on success, adding any new statements to SEQ.  */
>
> static bool
> vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
>                                rgroup_controls *src_rgm)
> {
>   tree src_masktype = src_rgm->type;
>   tree dest_masktype = dest_rgm->type;
>   machine_mode src_mode = TYPE_MODE (src_masktype);
>   insn_code icode1, icode2;
>   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
>       && (icode1 = optab_handler (vec_unpacku_hi_optab,
>                                   src_mode)) != CODE_FOR_nothing
>       && (icode2 = optab_handler (vec_unpacku_lo_optab,
>                                   src_mode)) != CODE_FOR_nothing)
>     {
>       /* Unpacking the source masks gives at least as many mask bits as
>          we need.  We can then VIEW_CONVERT any excess bits away.  */
>       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
>       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
>       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
>       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
>         {
>           tree src = src_rgm->controls[i / 2];
>           tree dest = dest_rgm->controls[i];
>           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
>                             ? VEC_UNPACK_HI_EXPR
>                             : VEC_UNPACK_LO_EXPR);
>           gassign *stmt;
>           if (dest_masktype == unpack_masktype)
>             stmt = gimple_build_assign (dest, code, src);
>           else
>             {
>               tree temp = make_ssa_name (unpack_masktype);
>               stmt = gimple_build_assign (temp, code, src);
>               gimple_seq_add_stmt (seq, stmt);
>               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
>                                           build1 (VIEW_CONVERT_EXPR,
>                                                   dest_masktype, temp));
>             }
>           gimple_seq_add_stmt (seq, stmt);
>         }
>       return true;
>     }
>   vec_perm_indices indices[2];
>   if (dest_masktype == src_masktype
>       && interleave_supported_p (&indices[0], src_masktype, 0)
>       && interleave_supported_p (&indices[1], src_masktype, 1))
>     {
>       /* The destination requires twice as many mask bits as the source, so
>          we can use interleaving permutes to double up the number of bits.  */
>       tree masks[2];
>       for (unsigned int i = 0; i < 2; ++i)
>         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
>       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
>         {
>           tree src = src_rgm->controls[i / 2];
>           tree dest = dest_rgm->controls[i];
>           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
>                                               src, src, masks[i & 1]);
>           gimple_seq_add_stmt (seq, stmt);
>         }
>       return true;
>     }
>   return false;
> }
>
> I know this is just optimization for ARM SVE with sub_rgc (int16)  is half size of rgc (int8).
> But when I just copy the codes from ARM SVE and make it general for all cases (int8 <-> int64).
> They all work well and codegen is good. 
>
> If you don't like this way, would you mind give me some suggestions?
 
It's not a case of disliking one approach or disliking another.
There are two separate parts of this: one specific and one general.
 
The specific part is that the code had:
 
    rgroup_controls *sub_rgc
      = &(*controls)[nmasks / rgc->controls.length () - 1];
    if (!sub_rgc->controls.is_empty ())
      {
tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
vect_adjust_loop_lens_control (iv_type, &header_seq, rgc,
       sub_rgc, NULL_TREE);
continue;
      }
 
But AIUI, nmasks is always equal to rgc->controls.length ()
(if rgc->controls is non-empty).  So I think this always used
(*controls)[0] as the source rgroup.  And I think that's the
only case that would work, since vect_adjust_loop_lens_control
only reads from sub_rgc once.  Is that right?
 
It would be possible to make other source rgroups work, but it
would involve reading every element of sub_rgc->controls, not just
sub_rgc->controls[0].  (This is also what the SVE code does.)
 
The general point is that there are two extreme approaches:
 
(1) Use one IV, and base everything off that.
 
(2) Use a separate IV for every rgroup.
 
It's also possible to do something between these two extremes.
(And we should, if it gives better code.)
 
The code above reuses an existing IV.  Doing that for all multi-control
rgroups would give (1).  At the other extreme, (2) would involve calling
vect_set_loop_controls_directly for every rgroup.
 
Both approaches are fine.  I'm not against one or the other.
 
What I didn't understand was why your patch only reuses existing IVs
for max_nscalars_per_iter == 1.  Was it to avoid having to do a
multiplication (well, really a shift left) when moving from one
rgroup to another?  E.g. if one rgroup had;
 
  nscalars_per_iter == 2 && factor == 1
 
and another had:
 
  nscalars_per_iter == 4 && factor == 1
 
then we would need to mulitply by 2 when going from the first rgroup
to the second.
 
If so, avoiding a multiplication seems like a good reason for the choice
you were making in the path.  But we then need to check
max_nscalars_per_iter == 1 for both the source rgroup and the
destination rgroup, not just the destination.  And I think the
condition for “no multiplication needed” should be that:
 
  max_nscalars_per_iter * factor
 
(aka "nitems_per_iter") is the same for both groups.
 
Thanks,
Richard
juzhe.zhong@rivai.ai May 24, 2023, 2:35 p.m. UTC | #11
Hi. Richard. I have sent V13:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619475.html 
It looks more reasonable now.
Could you continue review it again?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-24 22:01
To: 钟居哲
CC: gcc-patches; rguenther
Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support
钟居哲 <juzhe.zhong@rivai.ai> writes:
>>> In other words, why is this different from what
>>>vect_set_loop_controls_directly would do?
> Oh, I see.  You are confused that why I do not make multiple-rgroup vec_trunk
> handling inside "vect_set_loop_controls_directly".
>
> Well. Frankly, I just replicate the handling of ARM SVE:
> unsigned int nmasks = i + 1;
>         if (use_masks_p && (nmasks & 1) == 0)
>           {
>             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
>             if (!half_rgc->controls.is_empty ()
>                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
>               continue;
>           }
>
> /* Try to use permutes to define the masks in DEST_RGM using the masks
>    in SRC_RGM, given that the former has twice as many masks as the
>    latter.  Return true on success, adding any new statements to SEQ.  */
>
> static bool
> vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
>                                rgroup_controls *src_rgm)
> {
>   tree src_masktype = src_rgm->type;
>   tree dest_masktype = dest_rgm->type;
>   machine_mode src_mode = TYPE_MODE (src_masktype);
>   insn_code icode1, icode2;
>   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
>       && (icode1 = optab_handler (vec_unpacku_hi_optab,
>                                   src_mode)) != CODE_FOR_nothing
>       && (icode2 = optab_handler (vec_unpacku_lo_optab,
>                                   src_mode)) != CODE_FOR_nothing)
>     {
>       /* Unpacking the source masks gives at least as many mask bits as
>          we need.  We can then VIEW_CONVERT any excess bits away.  */
>       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
>       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
>       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
>       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
>         {
>           tree src = src_rgm->controls[i / 2];
>           tree dest = dest_rgm->controls[i];
>           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
>                             ? VEC_UNPACK_HI_EXPR
>                             : VEC_UNPACK_LO_EXPR);
>           gassign *stmt;
>           if (dest_masktype == unpack_masktype)
>             stmt = gimple_build_assign (dest, code, src);
>           else
>             {
>               tree temp = make_ssa_name (unpack_masktype);
>               stmt = gimple_build_assign (temp, code, src);
>               gimple_seq_add_stmt (seq, stmt);
>               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
>                                           build1 (VIEW_CONVERT_EXPR,
>                                                   dest_masktype, temp));
>             }
>           gimple_seq_add_stmt (seq, stmt);
>         }
>       return true;
>     }
>   vec_perm_indices indices[2];
>   if (dest_masktype == src_masktype
>       && interleave_supported_p (&indices[0], src_masktype, 0)
>       && interleave_supported_p (&indices[1], src_masktype, 1))
>     {
>       /* The destination requires twice as many mask bits as the source, so
>          we can use interleaving permutes to double up the number of bits.  */
>       tree masks[2];
>       for (unsigned int i = 0; i < 2; ++i)
>         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
>       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
>         {
>           tree src = src_rgm->controls[i / 2];
>           tree dest = dest_rgm->controls[i];
>           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
>                                               src, src, masks[i & 1]);
>           gimple_seq_add_stmt (seq, stmt);
>         }
>       return true;
>     }
>   return false;
> }
>
> I know this is just optimization for ARM SVE with sub_rgc (int16)  is half size of rgc (int8).
> But when I just copy the codes from ARM SVE and make it general for all cases (int8 <-> int64).
> They all work well and codegen is good. 
>
> If you don't like this way, would you mind give me some suggestions?
 
It's not a case of disliking one approach or disliking another.
There are two separate parts of this: one specific and one general.
 
The specific part is that the code had:
 
    rgroup_controls *sub_rgc
      = &(*controls)[nmasks / rgc->controls.length () - 1];
    if (!sub_rgc->controls.is_empty ())
      {
tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
vect_adjust_loop_lens_control (iv_type, &header_seq, rgc,
       sub_rgc, NULL_TREE);
continue;
      }
 
But AIUI, nmasks is always equal to rgc->controls.length ()
(if rgc->controls is non-empty).  So I think this always used
(*controls)[0] as the source rgroup.  And I think that's the
only case that would work, since vect_adjust_loop_lens_control
only reads from sub_rgc once.  Is that right?
 
It would be possible to make other source rgroups work, but it
would involve reading every element of sub_rgc->controls, not just
sub_rgc->controls[0].  (This is also what the SVE code does.)
 
The general point is that there are two extreme approaches:
 
(1) Use one IV, and base everything off that.
 
(2) Use a separate IV for every rgroup.
 
It's also possible to do something between these two extremes.
(And we should, if it gives better code.)
 
The code above reuses an existing IV.  Doing that for all multi-control
rgroups would give (1).  At the other extreme, (2) would involve calling
vect_set_loop_controls_directly for every rgroup.
 
Both approaches are fine.  I'm not against one or the other.
 
What I didn't understand was why your patch only reuses existing IVs
for max_nscalars_per_iter == 1.  Was it to avoid having to do a
multiplication (well, really a shift left) when moving from one
rgroup to another?  E.g. if one rgroup had;
 
  nscalars_per_iter == 2 && factor == 1
 
and another had:
 
  nscalars_per_iter == 4 && factor == 1
 
then we would need to mulitply by 2 when going from the first rgroup
to the second.
 
If so, avoiding a multiplication seems like a good reason for the choice
you were making in the path.  But we then need to check
max_nscalars_per_iter == 1 for both the source rgroup and the
destination rgroup, not just the destination.  And I think the
condition for “no multiplication needed” should be that:
 
  max_nscalars_per_iter * factor
 
(aka "nitems_per_iter") is the same for both groups.
 
Thanks,
Richard
Richard Sandiford May 24, 2023, 2:57 p.m. UTC | #12
钟居哲 <juzhe.zhong@rivai.ai> writes:
>>> Both approaches are fine.  I'm not against one or the other.
>
>>> What I didn't understand was why your patch only reuses existing IVs
>>> for max_nscalars_per_iter == 1.  Was it to avoid having to do a
>>> multiplication (well, really a shift left) when moving from one
>>> rgroup to another?  E.g. if one rgroup had;
>
>>>   nscalars_per_iter == 2 && factor == 1
>
>>> and another had:
>
>>>   nscalars_per_iter == 4 && factor == 1
>
>>> then we would need to mulitply by 2 when going from the first rgroup
>>> to the second.
>
>>> If so, avoiding a multiplication seems like a good reason for the choice
>>> you were making in the path.  But we then need to check
>>> max_nscalars_per_iter == 1 for both the source rgroup and the
>>> destination rgroup, not just the destination.  And I think the
>>> condition for “no multiplication needed” should be that:
>
> Oh, I didn't realize such complicated problem. Frankly, I didn't understand well
> rgroup. Sorry about that :).
>
> I just remember last time you said I need to handle multiple-rgroup
> not only for SLP but also non-SLP (which is vec_pack_trunk that I tested).
> Then I asked you when is non-SLP, you said max_nscalars_per_iter == 1.

Yeah, max_nscalars_per_iter == 1 is the right way of checking for non-SLP.

But I'm never been convinced that SLP vs. non-SLP is a meaningful
distinction for this patch (that is, the parts that don't use
SELECT_VL).

SLP vs. non-SLP matters for SELECT_VL.  But the rgroup abstraction
should mean that SLP vs. non-SLP doesn't matter otherwise.

Thanks,
Richard
juzhe.zhong@rivai.ai May 24, 2023, 2:58 p.m. UTC | #13
Yeah. Thanks. I have sent V14:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619478.html 
which I found there is no distinction between SLP and non-SLP.

Could you review it? I think it's more reasonable now.

Thanks.



juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-24 22:57
To: 钟居哲
CC: gcc-patches; rguenther
Subject: Re: [PATCH V12] VECT: Add decrement IV iteration loop control by variable amount support
钟居哲 <juzhe.zhong@rivai.ai> writes:
>>> Both approaches are fine.  I'm not against one or the other.
>
>>> What I didn't understand was why your patch only reuses existing IVs
>>> for max_nscalars_per_iter == 1.  Was it to avoid having to do a
>>> multiplication (well, really a shift left) when moving from one
>>> rgroup to another?  E.g. if one rgroup had;
>
>>>   nscalars_per_iter == 2 && factor == 1
>
>>> and another had:
>
>>>   nscalars_per_iter == 4 && factor == 1
>
>>> then we would need to mulitply by 2 when going from the first rgroup
>>> to the second.
>
>>> If so, avoiding a multiplication seems like a good reason for the choice
>>> you were making in the path.  But we then need to check
>>> max_nscalars_per_iter == 1 for both the source rgroup and the
>>> destination rgroup, not just the destination.  And I think the
>>> condition for “no multiplication needed” should be that:
>
> Oh, I didn't realize such complicated problem. Frankly, I didn't understand well
> rgroup. Sorry about that :).
>
> I just remember last time you said I need to handle multiple-rgroup
> not only for SLP but also non-SLP (which is vec_pack_trunk that I tested).
> Then I asked you when is non-SLP, you said max_nscalars_per_iter == 1.
 
Yeah, max_nscalars_per_iter == 1 is the right way of checking for non-SLP.
 
But I'm never been convinced that SLP vs. non-SLP is a meaningful
distinction for this patch (that is, the parts that don't use
SELECT_VL).
 
SLP vs. non-SLP matters for SELECT_VL.  But the rgroup abstraction
should mean that SLP vs. non-SLP doesn't matter otherwise.
 
Thanks,
Richard
diff mbox series

Patch

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index ff6159e08d5..94b38d1e0fb 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -385,6 +385,66 @@  vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
   return false;
 }
 
+/* Try to use adjust loop lens for non-SLP multiple-rgroups.
+
+     _36 = MIN_EXPR <ivtmp_34, VF>;
+
+     First length (MIN (X, VF/N)):
+       loop_len_15 = MIN_EXPR <_36, VF/N>;
+
+     Second length:
+       tmp = _36 - loop_len_15;
+       loop_len_16 = MIN (tmp, VF/N);
+
+     Third length:
+       tmp2 = tmp - loop_len_16;
+       loop_len_17 = MIN (tmp2, VF/N);
+
+     Last length:
+       loop_len_18 = tmp2 - loop_len_17;
+*/
+
+static void
+vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
+			       rgroup_controls *dest_rgm,
+			       rgroup_controls *src_rgm, tree step)
+{
+  tree ctrl_type = dest_rgm->type;
+  poly_uint64 nitems_per_ctrl
+    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
+  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
+
+  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
+    {
+      if (!step)
+	step = src_rgm->controls[i / dest_rgm->controls.length ()];
+      tree ctrl = dest_rgm->controls[i];
+      if (i == 0)
+	{
+	  /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
+	  gassign *assign
+	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
+	  gimple_seq_add_stmt (seq, assign);
+	}
+      else if (i == dest_rgm->controls.length () - 1)
+	{
+	  /* Last iteration: Remain capped to the range [0, VF/N].  */
+	  gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
+						 dest_rgm->controls[i - 1]);
+	  gimple_seq_add_stmt (seq, assign);
+	}
+      else
+	{
+	  /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
+	  step = gimple_build (seq, MINUS_EXPR, iv_type, step,
+			       dest_rgm->controls[i - 1]);
+	  gassign *assign
+	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
+	  gimple_seq_add_stmt (seq, assign);
+	}
+    }
+}
+
 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
    for all the rgroup controls in RGC and return a control that is nonzero
    when the loop needs to iterate.  Add any new preheader statements to
@@ -468,9 +528,78 @@  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
-	     loop, &incr_gsi, insert_after, &index_before_incr,
-	     &index_after_incr);
+  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+    {
+      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+      tree step = make_ssa_name (iv_type);
+      /* Create decrement IV.  */
+      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
+		 insert_after, &index_before_incr, &index_after_incr);
+      tree temp = gimple_build (header_seq, MIN_EXPR, iv_type,
+				index_before_incr, nitems_step);
+      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp));
+
+      if (rgc->max_nscalars_per_iter == 1)
+	{
+	  /* single rgroup:
+	     ...
+	     _10 = (unsigned long) count_12(D);
+	     ...
+	     # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
+	     _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
+	     ...
+	     vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
+	     ...
+	     ivtmp_35 = ivtmp_9 - _36;
+	     ...
+	     if (ivtmp_35 != 0)
+	       goto <bb 4>; [83.33%]
+	     else
+	       goto <bb 5>; [16.67%]
+	  */
+	  gassign *assign = gimple_build_assign (rgc->controls[0], step);
+	  gimple_seq_add_stmt (header_seq, assign);
+	}
+      else
+	{
+	  /* Multiple rgroup (SLP):
+	     ...
+	     _38 = (unsigned long) bnd.7_29;
+	     _39 = _38 * 2;
+	     ...
+	     # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
+	     ...
+	     _43 = MIN_EXPR <ivtmp_41, 32>;
+	     loop_len_26 = MIN_EXPR <_43, 16>;
+	     loop_len_25 = _43 - loop_len_26;
+	     ...
+	     .LEN_STORE (_6, 8B, loop_len_26, ...);
+	     ...
+	     .LEN_STORE (_25, 8B, loop_len_25, ...);
+	     _33 = loop_len_26 / 2;
+	     ...
+	     .LEN_STORE (_8, 16B, _33, ...);
+	     _36 = loop_len_25 / 2;
+	     ...
+	     .LEN_STORE (_15, 16B, _36, ...);
+	     ivtmp_42 = ivtmp_41 - _43;
+	     ...
+	     if (ivtmp_42 != 0)
+	       goto <bb 4>; [83.33%]
+	     else
+	      goto <bb 5>; [16.67%]
+	  */
+	  vect_adjust_loop_lens_control (iv_type, header_seq, rgc, NULL, step);
+	}
+      return index_after_incr;
+    }
+  else
+    {
+      /* Create increment IV.  */
+      create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
+		 loop, &incr_gsi, insert_after, &index_before_incr,
+		 &index_after_incr);
+    }
 
   tree zero_index = build_int_cst (compare_type, 0);
   tree test_index, test_limit, first_limit;
@@ -753,6 +882,55 @@  vect_set_loop_condition_partial_vectors (class loop *loop,
 	      continue;
 	  }
 
+	if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+	    && rgc->max_nscalars_per_iter == 1
+	    && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
+	  {
+	    /* Multiple rgroup (non-SLP):
+	      ...
+	      _38 = (unsigned long) n_12(D);
+	      ...
+	      # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
+	      ...
+	      _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
+	      loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
+	      _41 = _40 - loop_len_21;
+	      loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
+	      _42 = _40 - loop_len_20;
+	      loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
+	      _43 = _40 - loop_len_19;
+	      loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
+	      ...
+	      vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
+	      ...
+	      vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
+	      ...
+	      vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
+	      ...
+	      vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
+	      vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
+	      vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
+	      vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
+	      ...
+	      .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
+	      ivtmp_39 = ivtmp_38 - _40;
+	      ...
+	      if (ivtmp_39 != 0)
+		goto <bb 3>; [92.31%]
+	      else
+		goto <bb 4>; [7.69%]
+	    */
+	    rgroup_controls *sub_rgc
+	      = &(*controls)[nmasks / rgc->controls.length () - 1];
+	    if (!sub_rgc->controls.is_empty ())
+	      {
+		tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+		vect_adjust_loop_lens_control (iv_type, &header_seq, rgc,
+					       sub_rgc, NULL_TREE);
+		continue;
+	      }
+	  }
+
 	/* See whether zero-based IV would ever generate all-false masks
 	   or zero length before wrapping around.  */
 	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index cf10132b0bf..53ac4d2a6c2 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2725,6 +2725,16 @@  start_over:
       && !vect_verify_loop_lens (loop_vinfo))
     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 
+  /* If we're vectorizing an loop that uses length "controls" and
+     can iterate more than once, we apply decrementing IV approach
+     in loop control.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
+      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+	   && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
+			LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
+    LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
+
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 02d2ad6fba1..fba09b9ffd3 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -818,6 +818,13 @@  public:
      the vector loop can handle fewer than VF scalars.  */
   bool using_partial_vectors_p;
 
+  /* True if we've decided to use a decrementing loop control IV that counts
+     scalars. This can be done for any loop that:
+
+	(a) uses length "controls"; and
+	(b) can iterate more than once.  */
+  bool using_decrementing_iv_p;
+
   /* True if we've decided to use partially-populated vectors for the
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
@@ -890,6 +897,7 @@  public:
 #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
 #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
 #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
   (L)->epil_using_partial_vectors_p
 #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias