diff mbox series

[V14] VECT: Add decrement IV iteration loop control by variable amount support

Message ID 20230524144801.73537-1-juzhe.zhong@rivai.ai
State New
Headers show
Series [V14] VECT: Add decrement IV iteration loop control by variable amount support | expand

Commit Message

juzhe.zhong@rivai.ai May 24, 2023, 2:48 p.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

This patch is supporting decrement IV by following the flow designed by Richard:

(1) In vect_set_loop_condition_partial_vectors, for the first iteration of:
    call vect_set_loop_controls_directly.

(2) vect_set_loop_controls_directly calculates "step" as in your patch.
If rgc has 1 control, this step is the SSA name created for that control.
Otherwise the step is a fresh SSA name, as in your patch.

(3) vect_set_loop_controls_directly stores this step somewhere for later
use, probably in LOOP_VINFO.  Let's use "S" to refer to this stored step.

(4) After the vect_set_loop_controls_directly call above, and outside
the "if" statement that now contains vect_set_loop_controls_directly,
check whether rgc->controls.length () > 1.  If so, use
vect_adjust_loop_lens_control to set the controls based on S.

Then the only caller of vect_adjust_loop_lens_control is
vect_set_loop_condition_partial_vectors.  And the starting
step for vect_adjust_loop_lens_control is always S.

This patch has well tested for single-rgroup and multiple-rgroup (SLP) and
passed all testcase in RISC-V port.

Also, pass tests for multiple-rgroup (non-SLP) tested on vec_pack_trunk.

---
 gcc/tree-vect-loop-manip.cc | 178 +++++++++++++++++++++++++++++++++---
 gcc/tree-vect-loop.cc       |  13 +++
 gcc/tree-vectorizer.h       |  12 +++
 3 files changed, 192 insertions(+), 11 deletions(-)

Comments

Richard Sandiford May 24, 2023, 3:07 p.m. UTC | #1
Thanks for trying it.  I'm still surprised that no multiplication
is needed though.  Does the patch work for:

short x[100];
int y[200];

void f() {
  for (int i = 0, j = 0; i < 100; i += 2, j += 4) {
    x[i + 0] += 1;
    x[i + 1] += 2;
    y[j + 0] += 1;
    y[j + 1] += 2;
    y[j + 2] += 3;
    y[j + 3] += 4;
  }
}

?  Here, there should be a single-control rgroup for x, counting
2 units per scalar iteration.  I'd expect the IV to use this scale.

There should also be a 4-control rgroup for y, counting 4 units per
scalar iteration.  So I think the IV would need to be multiplied by 2
before being used for the y rgroup.

Thanks,
Richard

juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> This patch is supporting decrement IV by following the flow designed by Richard:
>
> (1) In vect_set_loop_condition_partial_vectors, for the first iteration of:
>     call vect_set_loop_controls_directly.
>
> (2) vect_set_loop_controls_directly calculates "step" as in your patch.
> If rgc has 1 control, this step is the SSA name created for that control.
> Otherwise the step is a fresh SSA name, as in your patch.
>
> (3) vect_set_loop_controls_directly stores this step somewhere for later
> use, probably in LOOP_VINFO.  Let's use "S" to refer to this stored step.
>
> (4) After the vect_set_loop_controls_directly call above, and outside
> the "if" statement that now contains vect_set_loop_controls_directly,
> check whether rgc->controls.length () > 1.  If so, use
> vect_adjust_loop_lens_control to set the controls based on S.
>
> Then the only caller of vect_adjust_loop_lens_control is
> vect_set_loop_condition_partial_vectors.  And the starting
> step for vect_adjust_loop_lens_control is always S.
>
> This patch has well tested for single-rgroup and multiple-rgroup (SLP) and
> passed all testcase in RISC-V port.
>
> Also, pass tests for multiple-rgroup (non-SLP) tested on vec_pack_trunk.
>
> ---
>  gcc/tree-vect-loop-manip.cc | 178 +++++++++++++++++++++++++++++++++---
>  gcc/tree-vect-loop.cc       |  13 +++
>  gcc/tree-vectorizer.h       |  12 +++
>  3 files changed, 192 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index ff6159e08d5..578ac5b783e 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -468,6 +468,38 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    gimple_stmt_iterator incr_gsi;
>    bool insert_after;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +    {
> +      /* single rgroup:
> +	 ...
> +	 _10 = (unsigned long) count_12(D);
> +	 ...
> +	 # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> +	 _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> +	 ...
> +	 vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
> +	 ...
> +	 ivtmp_35 = ivtmp_9 - _36;
> +	 ...
> +	 if (ivtmp_35 != 0)
> +	   goto <bb 4>; [83.33%]
> +	 else
> +	   goto <bb 5>; [16.67%]
> +      */
> +      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +      tree step = rgc->controls.length () == 1 ? rgc->controls[0]
> +					       : make_ssa_name (iv_type);
> +      /* Create decrement IV.  */
> +      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
> +		 insert_after, &index_before_incr, &index_after_incr);
> +      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
> +							    index_before_incr,
> +							    nitems_step));
> +      LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo) = step;
> +      return index_after_incr;
> +    }
> +
> +  /* Create increment IV.  */
>    create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
>  	     loop, &incr_gsi, insert_after, &index_before_incr,
>  	     &index_after_incr);
> @@ -683,6 +715,63 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    return next_ctrl;
>  }
>  
> +/* Try to use adjust loop lens for multiple-rgroups.
> +
> +     _36 = MIN_EXPR <ivtmp_34, VF>;
> +
> +     First length (MIN (X, VF/N)):
> +       loop_len_15 = MIN_EXPR <_36, VF/N>;
> +
> +     Second length:
> +       tmp = _36 - loop_len_15;
> +       loop_len_16 = MIN (tmp, VF/N);
> +
> +     Third length:
> +       tmp2 = tmp - loop_len_16;
> +       loop_len_17 = MIN (tmp2, VF/N);
> +
> +     Last length:
> +       loop_len_18 = tmp2 - loop_len_17;
> +*/
> +
> +static void
> +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
> +			       rgroup_controls *dest_rgm, tree step)
> +{
> +  tree ctrl_type = dest_rgm->type;
> +  poly_uint64 nitems_per_ctrl
> +    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
> +
> +  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> +    {
> +      tree ctrl = dest_rgm->controls[i];
> +      if (i == 0)
> +	{
> +	  /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
> +	  gassign *assign
> +	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +	  gimple_seq_add_stmt (seq, assign);
> +	}
> +      else if (i == dest_rgm->controls.length () - 1)
> +	{
> +	  /* Last iteration: Remain capped to the range [0, VF/N].  */
> +	  gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
> +						 dest_rgm->controls[i - 1]);
> +	  gimple_seq_add_stmt (seq, assign);
> +	}
> +      else
> +	{
> +	  /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
> +	  step = gimple_build (seq, MINUS_EXPR, iv_type, step,
> +			       dest_rgm->controls[i - 1]);
> +	  gassign *assign
> +	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +	  gimple_seq_add_stmt (seq, assign);
> +	}
> +    }
> +}
> +
>  /* Set up the iteration condition and rgroup controls for LOOP, given
>     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
>     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> @@ -753,17 +842,84 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>  	      continue;
>  	  }
>  
> -	/* See whether zero-based IV would ever generate all-false masks
> -	   or zero length before wrapping around.  */
> -	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> -
> -	/* Set up all controls for this group.  */
> -	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> -						     &preheader_seq,
> -						     &header_seq,
> -						     loop_cond_gsi, rgc,
> -						     niters, niters_skip,
> -						     might_wrap_p);
> +	if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +	    || !LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo))
> +	  {
> +	    /* See whether zero-based IV would ever generate all-false masks
> +	       or zero length before wrapping around.  */
> +	    bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> +
> +	    /* Set up all controls for this group.  */
> +	    test_ctrl
> +	      = vect_set_loop_controls_directly (loop, loop_vinfo,
> +						 &preheader_seq, &header_seq,
> +						 loop_cond_gsi, rgc, niters,
> +						 niters_skip, might_wrap_p);
> +	  }
> +
> +	/* Decrement IV only run vect_set_loop_controls_directly once.  */
> +	if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +	    && rgc->controls.length () > 1)
> +	  {
> +	    /*
> +	       - Multiple rgroup (SLP):
> +		 ...
> +		 _38 = (unsigned long) bnd.7_29;
> +		 _39 = _38 * 2;
> +		 ...
> +		 # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
> +		 ...
> +		 _43 = MIN_EXPR <ivtmp_41, 32>;
> +		 loop_len_26 = MIN_EXPR <_43, 16>;
> +		 loop_len_25 = _43 - loop_len_26;
> +		 ...
> +		 .LEN_STORE (_6, 8B, loop_len_26, ...);
> +		 ...
> +		 .LEN_STORE (_25, 8B, loop_len_25, ...);
> +		 _33 = loop_len_26 / 2;
> +		 ...
> +		 .LEN_STORE (_8, 16B, _33, ...);
> +		 _36 = loop_len_25 / 2;
> +		 ...
> +		 .LEN_STORE (_15, 16B, _36, ...);
> +		 ivtmp_42 = ivtmp_41 - _43;
> +		 ...
> +
> +	       - Multiple rgroup (non-SLP):
> +		 ...
> +		 _38 = (unsigned long) n_12(D);
> +		 ...
> +		 # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> +		 ...
> +		 _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> +		 loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> +		 _41 = _40 - loop_len_21;
> +		 loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> +		 _42 = _40 - loop_len_20;
> +		 loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> +		 _43 = _40 - loop_len_19;
> +		 loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> +		 ...
> +		 vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> +		 ...
> +		 vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> +		 ...
> +		 vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> +		 ...
> +		 vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> +		 vect__7.13_31 = VEC_PACK_TRUNC_EXPR <...>,
> +		 vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
> +		 vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
> +		 ...
> +		 .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> +		 ivtmp_39 = ivtmp_38 - _40;
> +		 ...
> +	    */
> +	    tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +	    tree step = LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo);
> +	    gcc_assert (step);
> +	    vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, step);
> +	  }
>        }
>  
>    /* Emit all accumulated statements.  */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index cf10132b0bf..456f50fa7cc 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -973,6 +973,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
>      vectorizable (false),
>      can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
>      using_partial_vectors_p (false),
> +    using_decrementing_iv_p (false),
> +    decrementing_iv_step (NULL_TREE),
>      epil_using_partial_vectors_p (false),
>      partial_load_store_bias (0),
>      peeling_for_gaps (false),
> @@ -2725,6 +2727,17 @@ start_over:
>        && !vect_verify_loop_lens (loop_vinfo))
>      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  
> +  /* If we're vectorizing an loop that uses length "controls" and
> +     can iterate more than once, we apply decrementing IV approach
> +     in loop control.  */
> +  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
> +      && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
> +      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +	   && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
> +			LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
> +    LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
> +
>    /* If we're vectorizing an epilogue loop, the vectorized loop either needs
>       to be able to handle fewer than VF scalars, or needs to have a lower VF
>       than the main loop.  */
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 02d2ad6fba1..7ed079f543a 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -818,6 +818,16 @@ public:
>       the vector loop can handle fewer than VF scalars.  */
>    bool using_partial_vectors_p;
>  
> +  /* True if we've decided to use a decrementing loop control IV that counts
> +     scalars. This can be done for any loop that:
> +
> +	(a) uses length "controls"; and
> +	(b) can iterate more than once.  */
> +  bool using_decrementing_iv_p;
> +
> +  /* The variable amount step for decrement IV.  */
> +  tree decrementing_iv_step;
> +
>    /* True if we've decided to use partially-populated vectors for the
>       epilogue of loop.  */
>    bool epil_using_partial_vectors_p;
> @@ -890,6 +900,8 @@ public:
>  #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
>  #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
> +#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
> +#define LOOP_VINFO_DECREMENTING_IV_STEP(L) (L)->decrementing_iv_step
>  #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
>    (L)->epil_using_partial_vectors_p
>  #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
juzhe.zhong@rivai.ai May 24, 2023, 3:13 p.m. UTC | #2
Hi, the .optimized dump is like this:

  <bb 2> [local count: 21045336]:
  ivtmp.26_36 = (unsigned long) &x;
  ivtmp.27_3 = (unsigned long) &y;
  ivtmp.30_6 = (unsigned long) &MEM <int[200]> [(void *)&y + 16B];
  ivtmp.31_10 = (unsigned long) &MEM <int[200]> [(void *)&y + 32B];
  ivtmp.32_14 = (unsigned long) &MEM <int[200]> [(void *)&y + 48B];

  <bb 3> [local count: 273589366]:
  # ivtmp_72 = PHI <ivtmp_73(3), 100(2)>
  # ivtmp.26_41 = PHI <ivtmp.26_37(3), ivtmp.26_36(2)>
  # ivtmp.27_1 = PHI <ivtmp.27_2(3), ivtmp.27_3(2)>
  # ivtmp.30_4 = PHI <ivtmp.30_5(3), ivtmp.30_6(2)>
  # ivtmp.31_8 = PHI <ivtmp.31_9(3), ivtmp.31_10(2)>
  # ivtmp.32_12 = PHI <ivtmp.32_13(3), ivtmp.32_14(2)>
  loop_len_34 = MIN_EXPR <ivtmp_72, 8>;
  loop_len_48 = MIN_EXPR <loop_len_34, 4>;
  _74 = loop_len_34 - loop_len_48;
  loop_len_49 = MIN_EXPR <_74, 4>;
  _75 = _74 - loop_len_49;
  loop_len_50 = MIN_EXPR <_75, 4>;
  loop_len_51 = _75 - loop_len_50;
  _16 = (void *) ivtmp.26_41;
  _17 = &MEM <vector(8) short int> [(short int *)_16];
  vect__1.6_33 = .LEN_LOAD (_17, 16B, loop_len_34, 0);
  vect__2.7_23 = VIEW_CONVERT_EXPR<vector(8) unsigned short>(vect__1.6_33);
  vect__3.8_22 = vect__2.7_23 + { 1, 2, 1, 2, 1, 2, 1, 2 };
  vect__4.9_21 = VIEW_CONVERT_EXPR<vector(8) short int>(vect__3.8_22);
  .LEN_STORE (_17, 16B, loop_len_34, vect__4.9_21, 0);
  _20 = (void *) ivtmp.27_1;
  _31 = &MEM <vector(4) int> [(int *)_20];
  vect__10.14_52 = .LEN_LOAD (_31, 32B, loop_len_48, 0);
  _30 = (void *) ivtmp.30_4;
  _29 = &MEM <vector(4) int> [(int *)_30];
  vect__10.15_54 = .LEN_LOAD (_29, 32B, loop_len_49, 0);
  _26 = (void *) ivtmp.31_8;
  _25 = &MEM <vector(4) int> [(int *)_26];
  vect__10.16_56 = .LEN_LOAD (_25, 32B, loop_len_50, 0);
  _78 = (void *) ivtmp.32_12;
  _79 = &MEM <vector(4) int> [(int *)_78];
  vect__10.17_58 = .LEN_LOAD (_79, 32B, loop_len_51, 0);
  vect__11.18_59 = vect__10.14_52 + { 1, 2, 3, 4 };
  vect__11.18_60 = vect__10.15_54 + { 1, 2, 3, 4 };
  vect__11.18_61 = vect__10.16_56 + { 1, 2, 3, 4 };
  vect__11.18_62 = vect__10.17_58 + { 1, 2, 3, 4 };
  .LEN_STORE (_31, 32B, loop_len_48, vect__11.18_59, 0);
  .LEN_STORE (_29, 32B, loop_len_49, vect__11.18_60, 0);
  .LEN_STORE (_25, 32B, loop_len_50, vect__11.18_61, 0);
  .LEN_STORE (_79, 32B, loop_len_51, vect__11.18_62, 0);
  ivtmp_73 = ivtmp_72 - loop_len_34;
  ivtmp.26_37 = ivtmp.26_41 + 16;
  ivtmp.27_2 = ivtmp.27_1 + 64;
  ivtmp.30_5 = ivtmp.30_4 + 64;
  ivtmp.31_9 = ivtmp.31_8 + 64;
  ivtmp.32_13 = ivtmp.32_12 + 64;
  if (ivtmp_73 != 0)
    goto <bb 3>; [92.31%]
  else
    goto <bb 4>; [7.69%]

I am still check about it but I send it to you earlier.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-24 23:07
To: juzhe.zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH V14] VECT: Add decrement IV iteration loop control by variable amount support
Thanks for trying it.  I'm still surprised that no multiplication
is needed though.  Does the patch work for:
 
short x[100];
int y[200];
 
void f() {
  for (int i = 0, j = 0; i < 100; i += 2, j += 4) {
    x[i + 0] += 1;
    x[i + 1] += 2;
    y[j + 0] += 1;
    y[j + 1] += 2;
    y[j + 2] += 3;
    y[j + 3] += 4;
  }
}
 
?  Here, there should be a single-control rgroup for x, counting
2 units per scalar iteration.  I'd expect the IV to use this scale.
 
There should also be a 4-control rgroup for y, counting 4 units per
scalar iteration.  So I think the IV would need to be multiplied by 2
before being used for the y rgroup.
 
Thanks,
Richard
 
juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> This patch is supporting decrement IV by following the flow designed by Richard:
>
> (1) In vect_set_loop_condition_partial_vectors, for the first iteration of:
>     call vect_set_loop_controls_directly.
>
> (2) vect_set_loop_controls_directly calculates "step" as in your patch.
> If rgc has 1 control, this step is the SSA name created for that control.
> Otherwise the step is a fresh SSA name, as in your patch.
>
> (3) vect_set_loop_controls_directly stores this step somewhere for later
> use, probably in LOOP_VINFO.  Let's use "S" to refer to this stored step.
>
> (4) After the vect_set_loop_controls_directly call above, and outside
> the "if" statement that now contains vect_set_loop_controls_directly,
> check whether rgc->controls.length () > 1.  If so, use
> vect_adjust_loop_lens_control to set the controls based on S.
>
> Then the only caller of vect_adjust_loop_lens_control is
> vect_set_loop_condition_partial_vectors.  And the starting
> step for vect_adjust_loop_lens_control is always S.
>
> This patch has well tested for single-rgroup and multiple-rgroup (SLP) and
> passed all testcase in RISC-V port.
>
> Also, pass tests for multiple-rgroup (non-SLP) tested on vec_pack_trunk.
>
> ---
>  gcc/tree-vect-loop-manip.cc | 178 +++++++++++++++++++++++++++++++++---
>  gcc/tree-vect-loop.cc       |  13 +++
>  gcc/tree-vectorizer.h       |  12 +++
>  3 files changed, 192 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index ff6159e08d5..578ac5b783e 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -468,6 +468,38 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    gimple_stmt_iterator incr_gsi;
>    bool insert_after;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +    {
> +      /* single rgroup:
> + ...
> + _10 = (unsigned long) count_12(D);
> + ...
> + # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> + _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> + ...
> + vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
> + ...
> + ivtmp_35 = ivtmp_9 - _36;
> + ...
> + if (ivtmp_35 != 0)
> +    goto <bb 4>; [83.33%]
> + else
> +    goto <bb 5>; [16.67%]
> +      */
> +      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +      tree step = rgc->controls.length () == 1 ? rgc->controls[0]
> +        : make_ssa_name (iv_type);
> +      /* Create decrement IV.  */
> +      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
> + insert_after, &index_before_incr, &index_after_incr);
> +      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
> +     index_before_incr,
> +     nitems_step));
> +      LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo) = step;
> +      return index_after_incr;
> +    }
> +
> +  /* Create increment IV.  */
>    create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
>       loop, &incr_gsi, insert_after, &index_before_incr,
>       &index_after_incr);
> @@ -683,6 +715,63 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    return next_ctrl;
>  }
>  
> +/* Try to use adjust loop lens for multiple-rgroups.
> +
> +     _36 = MIN_EXPR <ivtmp_34, VF>;
> +
> +     First length (MIN (X, VF/N)):
> +       loop_len_15 = MIN_EXPR <_36, VF/N>;
> +
> +     Second length:
> +       tmp = _36 - loop_len_15;
> +       loop_len_16 = MIN (tmp, VF/N);
> +
> +     Third length:
> +       tmp2 = tmp - loop_len_16;
> +       loop_len_17 = MIN (tmp2, VF/N);
> +
> +     Last length:
> +       loop_len_18 = tmp2 - loop_len_17;
> +*/
> +
> +static void
> +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
> +        rgroup_controls *dest_rgm, tree step)
> +{
> +  tree ctrl_type = dest_rgm->type;
> +  poly_uint64 nitems_per_ctrl
> +    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
> +
> +  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> +    {
> +      tree ctrl = dest_rgm->controls[i];
> +      if (i == 0)
> + {
> +   /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
> +   gassign *assign
> +     = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +   gimple_seq_add_stmt (seq, assign);
> + }
> +      else if (i == dest_rgm->controls.length () - 1)
> + {
> +   /* Last iteration: Remain capped to the range [0, VF/N].  */
> +   gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
> + dest_rgm->controls[i - 1]);
> +   gimple_seq_add_stmt (seq, assign);
> + }
> +      else
> + {
> +   /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
> +   step = gimple_build (seq, MINUS_EXPR, iv_type, step,
> +        dest_rgm->controls[i - 1]);
> +   gassign *assign
> +     = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +   gimple_seq_add_stmt (seq, assign);
> + }
> +    }
> +}
> +
>  /* Set up the iteration condition and rgroup controls for LOOP, given
>     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
>     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> @@ -753,17 +842,84 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>        continue;
>    }
>  
> - /* See whether zero-based IV would ever generate all-false masks
> -    or zero length before wrapping around.  */
> - bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> -
> - /* Set up all controls for this group.  */
> - test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> -      &preheader_seq,
> -      &header_seq,
> -      loop_cond_gsi, rgc,
> -      niters, niters_skip,
> -      might_wrap_p);
> + if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +     || !LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo))
> +   {
> +     /* See whether zero-based IV would ever generate all-false masks
> +        or zero length before wrapping around.  */
> +     bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> +
> +     /* Set up all controls for this group.  */
> +     test_ctrl
> +       = vect_set_loop_controls_directly (loop, loop_vinfo,
> + &preheader_seq, &header_seq,
> + loop_cond_gsi, rgc, niters,
> + niters_skip, might_wrap_p);
> +   }
> +
> + /* Decrement IV only run vect_set_loop_controls_directly once.  */
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +     && rgc->controls.length () > 1)
> +   {
> +     /*
> +        - Multiple rgroup (SLP):
> + ...
> + _38 = (unsigned long) bnd.7_29;
> + _39 = _38 * 2;
> + ...
> + # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
> + ...
> + _43 = MIN_EXPR <ivtmp_41, 32>;
> + loop_len_26 = MIN_EXPR <_43, 16>;
> + loop_len_25 = _43 - loop_len_26;
> + ...
> + .LEN_STORE (_6, 8B, loop_len_26, ...);
> + ...
> + .LEN_STORE (_25, 8B, loop_len_25, ...);
> + _33 = loop_len_26 / 2;
> + ...
> + .LEN_STORE (_8, 16B, _33, ...);
> + _36 = loop_len_25 / 2;
> + ...
> + .LEN_STORE (_15, 16B, _36, ...);
> + ivtmp_42 = ivtmp_41 - _43;
> + ...
> +
> +        - Multiple rgroup (non-SLP):
> + ...
> + _38 = (unsigned long) n_12(D);
> + ...
> + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> + ...
> + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> + _41 = _40 - loop_len_21;
> + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> + _42 = _40 - loop_len_20;
> + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> + _43 = _40 - loop_len_19;
> + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> + ...
> + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> + ...
> + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> + ...
> + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> + ...
> + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <...>,
> + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
> + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
> + ...
> + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> + ivtmp_39 = ivtmp_38 - _40;
> + ...
> +     */
> +     tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +     tree step = LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo);
> +     gcc_assert (step);
> +     vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, step);
> +   }
>        }
>  
>    /* Emit all accumulated statements.  */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index cf10132b0bf..456f50fa7cc 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -973,6 +973,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
>      vectorizable (false),
>      can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
>      using_partial_vectors_p (false),
> +    using_decrementing_iv_p (false),
> +    decrementing_iv_step (NULL_TREE),
>      epil_using_partial_vectors_p (false),
>      partial_load_store_bias (0),
>      peeling_for_gaps (false),
> @@ -2725,6 +2727,17 @@ start_over:
>        && !vect_verify_loop_lens (loop_vinfo))
>      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  
> +  /* If we're vectorizing an loop that uses length "controls" and
> +     can iterate more than once, we apply decrementing IV approach
> +     in loop control.  */
> +  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
> +      && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
> +      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +    && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
> + LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
> +    LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
> +
>    /* If we're vectorizing an epilogue loop, the vectorized loop either needs
>       to be able to handle fewer than VF scalars, or needs to have a lower VF
>       than the main loop.  */
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 02d2ad6fba1..7ed079f543a 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -818,6 +818,16 @@ public:
>       the vector loop can handle fewer than VF scalars.  */
>    bool using_partial_vectors_p;
>  
> +  /* True if we've decided to use a decrementing loop control IV that counts
> +     scalars. This can be done for any loop that:
> +
> + (a) uses length "controls"; and
> + (b) can iterate more than once.  */
> +  bool using_decrementing_iv_p;
> +
> +  /* The variable amount step for decrement IV.  */
> +  tree decrementing_iv_step;
> +
>    /* True if we've decided to use partially-populated vectors for the
>       epilogue of loop.  */
>    bool epil_using_partial_vectors_p;
> @@ -890,6 +900,8 @@ public:
>  #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
>  #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
> +#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
> +#define LOOP_VINFO_DECREMENTING_IV_STEP(L) (L)->decrementing_iv_step
>  #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
>    (L)->epil_using_partial_vectors_p
>  #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
Richard Sandiford May 24, 2023, 3:31 p.m. UTC | #3
钟居哲 <juzhe.zhong@rivai.ai> writes:
> Hi, the .optimized dump is like this:
>
>   <bb 2> [local count: 21045336]:
>   ivtmp.26_36 = (unsigned long) &x;
>   ivtmp.27_3 = (unsigned long) &y;
>   ivtmp.30_6 = (unsigned long) &MEM <int[200]> [(void *)&y + 16B];
>   ivtmp.31_10 = (unsigned long) &MEM <int[200]> [(void *)&y + 32B];
>   ivtmp.32_14 = (unsigned long) &MEM <int[200]> [(void *)&y + 48B];
>
>   <bb 3> [local count: 273589366]:
>   # ivtmp_72 = PHI <ivtmp_73(3), 100(2)>
>   # ivtmp.26_41 = PHI <ivtmp.26_37(3), ivtmp.26_36(2)>
>   # ivtmp.27_1 = PHI <ivtmp.27_2(3), ivtmp.27_3(2)>
>   # ivtmp.30_4 = PHI <ivtmp.30_5(3), ivtmp.30_6(2)>
>   # ivtmp.31_8 = PHI <ivtmp.31_9(3), ivtmp.31_10(2)>
>   # ivtmp.32_12 = PHI <ivtmp.32_13(3), ivtmp.32_14(2)>
>   loop_len_34 = MIN_EXPR <ivtmp_72, 8>;
>   loop_len_48 = MIN_EXPR <loop_len_34, 4>;
>   _74 = loop_len_34 - loop_len_48;

Yeah, I think this needs to be:

  loop_len_48 = MIN_EXPR <loop_len_34 * 2, 4>;
  _74 = loop_len_34 * 2 - loop_len_48;
  
(as valid gimple).  The point is that...

>   loop_len_49 = MIN_EXPR <_74, 4>;
>   _75 = _74 - loop_len_49;
>   loop_len_50 = MIN_EXPR <_75, 4>;
>   loop_len_51 = _75 - loop_len_50;

...there are 4 lengths capped to 4, for a total element count of 16.
But loop_len_34 is never greater than 8.

So for this case we either need to multiply, or we need to create
a fresh IV for the second rgroup.  Both approaches are fine.

Thanks,
Richard
juzhe.zhong@rivai.ai May 24, 2023, 3:42 p.m. UTC | #4
Hi, Richard. I still don't understand it. Sorry about that.

>>  loop_len_48 = MIN_EXPR <loop_len_34 * 2, 4>;
  >>   _74 = loop_len_34 * 2 - loop_len_48;

I have the tests already tested.
We have a MIN_EXPR to calculate the total elements:
loop_len_34 = MIN_EXPR <ivtmp_72, 8>;
I think "8" is already multiplied by 2?

Why do we need loop_len_34 * 2 ?
Could you give me more informations, The similiar tests you present we already have
execution check and passed. I am not sure whether this patch has the issue that I didn't notice.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-24 23:31
To: 钟居哲
CC: gcc-patches; rguenther
Subject: Re: [PATCH V14] VECT: Add decrement IV iteration loop control by variable amount support
钟居哲 <juzhe.zhong@rivai.ai> writes:
> Hi, the .optimized dump is like this:
>
>   <bb 2> [local count: 21045336]:
>   ivtmp.26_36 = (unsigned long) &x;
>   ivtmp.27_3 = (unsigned long) &y;
>   ivtmp.30_6 = (unsigned long) &MEM <int[200]> [(void *)&y + 16B];
>   ivtmp.31_10 = (unsigned long) &MEM <int[200]> [(void *)&y + 32B];
>   ivtmp.32_14 = (unsigned long) &MEM <int[200]> [(void *)&y + 48B];
>
>   <bb 3> [local count: 273589366]:
>   # ivtmp_72 = PHI <ivtmp_73(3), 100(2)>
>   # ivtmp.26_41 = PHI <ivtmp.26_37(3), ivtmp.26_36(2)>
>   # ivtmp.27_1 = PHI <ivtmp.27_2(3), ivtmp.27_3(2)>
>   # ivtmp.30_4 = PHI <ivtmp.30_5(3), ivtmp.30_6(2)>
>   # ivtmp.31_8 = PHI <ivtmp.31_9(3), ivtmp.31_10(2)>
>   # ivtmp.32_12 = PHI <ivtmp.32_13(3), ivtmp.32_14(2)>
>   loop_len_34 = MIN_EXPR <ivtmp_72, 8>;
>   loop_len_48 = MIN_EXPR <loop_len_34, 4>;
>   _74 = loop_len_34 - loop_len_48;
 
Yeah, I think this needs to be:
 
  loop_len_48 = MIN_EXPR <loop_len_34 * 2, 4>;
  _74 = loop_len_34 * 2 - loop_len_48;
  
(as valid gimple).  The point is that...
 
>   loop_len_49 = MIN_EXPR <_74, 4>;
>   _75 = _74 - loop_len_49;
>   loop_len_50 = MIN_EXPR <_75, 4>;
>   loop_len_51 = _75 - loop_len_50;
 
...there are 4 lengths capped to 4, for a total element count of 16.
But loop_len_34 is never greater than 8.
 
So for this case we either need to multiply, or we need to create
a fresh IV for the second rgroup.  Both approaches are fine.
 
Thanks,
Richard
Richard Sandiford May 24, 2023, 3:47 p.m. UTC | #5
钟居哲 <juzhe.zhong@rivai.ai> writes:
> Hi, Richard. I still don't understand it. Sorry about that.
>
>>>  loop_len_48 = MIN_EXPR <loop_len_34 * 2, 4>;
>   >>   _74 = loop_len_34 * 2 - loop_len_48;
>
> I have the tests already tested.
> We have a MIN_EXPR to calculate the total elements:
> loop_len_34 = MIN_EXPR <ivtmp_72, 8>;
> I think "8" is already multiplied by 2?
>
> Why do we need loop_len_34 * 2 ?
> Could you give me more informations, The similiar tests you present we already have
> execution check and passed. I am not sure whether this patch has the issue that I didn't notice.

Think about the maximum values of each SSA name:

   loop_len_34 = MIN_EXPR <ivtmp_72, 8>;       // MAX 8
   loop_len_48 = MIN_EXPR <loop_len_34, 4>;    // MAX 4
   _74 = loop_len_34 - loop_len_48;            // MAX 4
   loop_len_49 = MIN_EXPR <_74, 4>;            // MAX 4 (always == _74)
   _75 = _74 - loop_len_49;                    // 0
   loop_len_50 = MIN_EXPR <_75, 4>;            // 0
   loop_len_51 = _75 - loop_len_50;            // 0

So the final two y vectors will always have 0 controls.

Thanks,
Richard
juzhe.zhong@rivai.ai May 24, 2023, 3:52 p.m. UTC | #6
Oh. I see. Thank you so much for pointing this.
Could you tell me what I should do in the codes?
It seems that I should adjust it in 
vect_adjust_loop_lens_control

muliply by some factor ? Is this correct multiply by max_nscalars_per_iter
?
Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-24 23:47
To: 钟居哲
CC: gcc-patches; rguenther
Subject: Re: [PATCH V14] VECT: Add decrement IV iteration loop control by variable amount support
钟居哲 <juzhe.zhong@rivai.ai> writes:
> Hi, Richard. I still don't understand it. Sorry about that.
>
>>>  loop_len_48 = MIN_EXPR <loop_len_34 * 2, 4>;
>   >>   _74 = loop_len_34 * 2 - loop_len_48;
>
> I have the tests already tested.
> We have a MIN_EXPR to calculate the total elements:
> loop_len_34 = MIN_EXPR <ivtmp_72, 8>;
> I think "8" is already multiplied by 2?
>
> Why do we need loop_len_34 * 2 ?
> Could you give me more informations, The similiar tests you present we already have
> execution check and passed. I am not sure whether this patch has the issue that I didn't notice.
 
Think about the maximum values of each SSA name:
 
   loop_len_34 = MIN_EXPR <ivtmp_72, 8>;       // MAX 8
   loop_len_48 = MIN_EXPR <loop_len_34, 4>;    // MAX 4
   _74 = loop_len_34 - loop_len_48;            // MAX 4
   loop_len_49 = MIN_EXPR <_74, 4>;            // MAX 4 (always == _74)
   _75 = _74 - loop_len_49;                    // 0
   loop_len_50 = MIN_EXPR <_75, 4>;            // 0
   loop_len_51 = _75 - loop_len_50;            // 0
 
So the final two y vectors will always have 0 controls.
 
Thanks,
Richard
Richard Sandiford May 24, 2023, 4 p.m. UTC | #7
钟居哲 <juzhe.zhong@rivai.ai> writes:
> Oh. I see. Thank you so much for pointing this.
> Could you tell me what I should do in the codes?
> It seems that I should adjust it in 
> vect_adjust_loop_lens_control
>
> muliply by some factor ? Is this correct multiply by max_nscalars_per_iter
> ?

max_nscalars_per_iter * factor rather than just max_nscalars_per_iter

Note that it's possible for later max_nscalars_per_iter * factor to
be smaller, so a division might be needed in rare cases.  E.g.:

uint64_t x[100];
uint16_t y[200];

void f() {
  for (int i = 0, j = 0; i < 100; i += 2, j += 4) {
    x[i + 0] += 1;
    x[i + 1] += 2;
    y[j + 0] += 1;
    y[j + 1] += 2;
    y[j + 2] += 3;
    y[j + 3] += 4;
  }
}

where y has a single-control rgroup with max_nscalars_per_iter == 4
and x has a 2-control rgroup with max_nscalars_per_iter == 2

What gives the best code in these cases?  Is emitting a multiplication
better?  Or is using a new IV better?

Thanks,
Richard
juzhe.zhong@rivai.ai May 24, 2023, 4:15 p.m. UTC | #8
Hi, For the first piece of code ,I tried:
  unsigned int nitems_per_iter
    = dest_rgm->max_nscalars_per_iter * dest_rgm->factor;
  step = gimple_build (seq, MULT_EXPR, iv_type, step,
                       build_int_cst (iv_type, nitems_per_iter));

Then optimized IR:
loop_len_34 = MIN_EXPR <ivtmp_72, 8>;
  _74 = loop_len_34 * 4;
  loop_len_51 = _74 + 18446744073709551604;

  _16 = (void *) ivtmp.27_41;
  _17 = &MEM <vector(8) short int> [(short int *)_16];

  vect__1.7_33 = .LEN_LOAD (_17, 16B, loop_len_34, 0);

  vect__2.8_23 = VIEW_CONVERT_EXPR<vector(8) unsigned short>(vect__1.7_33);
  vect__3.9_22 = vect__2.8_23 + { 1, 2, 1, 2, 1, 2, 1, 2 };
  vect__4.10_21 = VIEW_CONVERT_EXPR<vector(8) short int>(vect__3.9_22);
  .LEN_STORE (_17, 16B, loop_len_34, vect__4.10_21, 0);
  _20 = (void *) ivtmp.28_1;
  _31 = &MEM <vector(4) int> [(int *)_20];

  vect__10.15_52 = .LEN_LOAD (_31, 32B, 4, 0);

  _30 = (void *) ivtmp.31_4;
  _29 = &MEM <vector(4) int> [(int *)_30];

  vect__10.16_54 = .LEN_LOAD (_29, 32B, 4, 0);

  _26 = (void *) ivtmp.32_8;
  _25 = &MEM <vector(4) int> [(int *)_26];

  vect__10.17_56 = .LEN_LOAD (_25, 32B, 4, 0);

  _79 = (void *) ivtmp.33_12;
  _80 = &MEM <vector(4) int> [(int *)_79];

  vect__10.18_58 = .LEN_LOAD (_80, 32B, loop_len_51, 0);

Is it correct ? It looks wierd ? 


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-25 00:00
To: 钟居哲
CC: gcc-patches; rguenther
Subject: Re: [PATCH V14] VECT: Add decrement IV iteration loop control by variable amount support
钟居哲 <juzhe.zhong@rivai.ai> writes:
> Oh. I see. Thank you so much for pointing this.
> Could you tell me what I should do in the codes?
> It seems that I should adjust it in 
> vect_adjust_loop_lens_control
>
> muliply by some factor ? Is this correct multiply by max_nscalars_per_iter
> ?
 
max_nscalars_per_iter * factor rather than just max_nscalars_per_iter
 
Note that it's possible for later max_nscalars_per_iter * factor to
be smaller, so a division might be needed in rare cases.  E.g.:
 
uint64_t x[100];
uint16_t y[200];
 
void f() {
  for (int i = 0, j = 0; i < 100; i += 2, j += 4) {
    x[i + 0] += 1;
    x[i + 1] += 2;
    y[j + 0] += 1;
    y[j + 1] += 2;
    y[j + 2] += 3;
    y[j + 3] += 4;
  }
}
 
where y has a single-control rgroup with max_nscalars_per_iter == 4
and x has a 2-control rgroup with max_nscalars_per_iter == 2
 
What gives the best code in these cases?  Is emitting a multiplication
better?  Or is using a new IV better?
 
Thanks,
Richard
juzhe.zhong@rivai.ai May 24, 2023, 4:37 p.m. UTC | #9
Hi, Richard. After I fix codes, now IR is correct I think:

loop_len_34 = MIN_EXPR <ivtmp_72, 8>;
  _74 = loop_len_34 * 2;
  loop_len_48 = MIN_EXPR <_74, 4>;
  _75 = _74 - loop_len_48;
  loop_len_49 = MIN_EXPR <_75, 4>;
  _76 = _75 - loop_len_49;
  loop_len_50 = MIN_EXPR <_76, 4>;
  loop_len_51 = _76 - loop_len_50;
  ...
  vect__1.8_33 = .LEN_LOAD (_17, 16B, loop_len_34, 0);
...
  .LEN_STORE (_17, 16B, loop_len_34, vect__4.11_21, 0);
...

  vect__10.16_52 = .LEN_LOAD (_31, 32B, loop_len_48, 0);
...
  vect__10.17_54 = .LEN_LOAD (_29, 32B, loop_len_49, 0);
...
  vect__10.18_56 = .LEN_LOAD (_25, 32B, loop_len_50, 0);
...
  vect__10.19_58 = .LEN_LOAD (_80, 32B, loop_len_51, 0);


For this case:

uint64_t x2[100];
uint16_t y2[200];

void f2(int n) {
  for (int i = 0, j = 0; i < n; i += 2, j += 4) {
    x2[i + 0] += 1;
    x2[i + 1] += 2;
    y2[j + 0] += 1;
    y2[j + 1] += 2;
    y2[j + 2] += 3;
    y2[j + 3] += 4;
  }
}

The IR is like this:

  loop_len_56 = MIN_EXPR <ivtmp_64, 8>;
  _66 = loop_len_56 * 4;
  loop_len_43 = _66 + 18446744073709551614;
  ...
  vect__1.44_44 = .LEN_LOAD (_6, 64B, 2, 0);
  ...
  vect__1.45_46 = .LEN_LOAD (_14, 64B, loop_len_43, 0);
  vect__2.46_47 = vect__1.44_44 + { 1, 2 };
  vect__2.46_48 = vect__1.45_46 + { 1, 2 };
  .LEN_STORE (_6, 64B, 2, vect__2.46_47, 0);
  .LEN_STORE (_14, 64B, loop_len_43, vect__2.46_48, 0);
  ...
  vect__6.51_57 = .LEN_LOAD (_10, 16B, loop_len_56, 0);

  vect__7.52_58 = vect__6.51_57 + { 1, 2, 3, 4, 1, 2, 3, 4 };
  .LEN_STORE (_10, 16B, loop_len_56, vect__7.52_58, 0);

It seems correct too ?

>> What gives the best code in these cases?  Is emitting a multiplication
>> better?  Or is using a new IV better?
Could you give me more detail information about "new refresh IV" approach.
I'd like to try that.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-25 00:00
To: 钟居哲
CC: gcc-patches; rguenther
Subject: Re: [PATCH V14] VECT: Add decrement IV iteration loop control by variable amount support
钟居哲 <juzhe.zhong@rivai.ai> writes:
> Oh. I see. Thank you so much for pointing this.
> Could you tell me what I should do in the codes?
> It seems that I should adjust it in 
> vect_adjust_loop_lens_control
>
> muliply by some factor ? Is this correct multiply by max_nscalars_per_iter
> ?
 
max_nscalars_per_iter * factor rather than just max_nscalars_per_iter
 
Note that it's possible for later max_nscalars_per_iter * factor to
be smaller, so a division might be needed in rare cases.  E.g.:
 
uint64_t x[100];
uint16_t y[200];
 
void f() {
  for (int i = 0, j = 0; i < 100; i += 2, j += 4) {
    x[i + 0] += 1;
    x[i + 1] += 2;
    y[j + 0] += 1;
    y[j + 1] += 2;
    y[j + 2] += 3;
    y[j + 3] += 4;
  }
}
 
where y has a single-control rgroup with max_nscalars_per_iter == 4
and x has a 2-control rgroup with max_nscalars_per_iter == 2
 
What gives the best code in these cases?  Is emitting a multiplication
better?  Or is using a new IV better?
 
Thanks,
Richard
Richard Sandiford May 24, 2023, 8:05 p.m. UTC | #10
I'll look at the samples tomorrow, but just to address one thing:

钟居哲 <juzhe.zhong@rivai.ai> writes:
>>> What gives the best code in these cases?  Is emitting a multiplication
>>> better?  Or is using a new IV better?
> Could you give me more detail information about "new refresh IV" approach.
> I'd like to try that.

By “using a new IV” I meant calling vect_set_loop_controls_directly
for every rgroup, not just the first.  So in the earlier example,
there would be one decrementing IV for x and one decrementing IV for y.

Thanks,
Richard
juzhe.zhong@rivai.ai May 25, 2023, 3:05 a.m. UTC | #11
Hi, Richard. 
After several tries with your testcases (I already added into V15 patch).
I think "using a new IV" would be better than "multiplication"

Now:
 loop_len_34 = MIN_EXPR <ivtmp_72, 8>;
  _74 = MIN_EXPR <ivtmp_75, 16>;   ------> multiplication approach will changed into  _74 = loop_len_34  * 2;
  loop_len_48 = MIN_EXPR <_74, 4>;
  _77 = _74 - loop_len_48;
  loop_len_49 = MIN_EXPR <_77, 4>;
  _78 = _77 - loop_len_49;
  loop_len_50 = MIN_EXPR <_78, 4>;
  loop_len_51 = _78 - loop_len_50;

I prefer "new IV" since it looks more reasonable and better codegen.
Could you take a look at it:
V15 patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619534.html 
  
Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-25 04:05
To: 钟居哲
CC: gcc-patches; rguenther
Subject: Re: [PATCH V14] VECT: Add decrement IV iteration loop control by variable amount support
I'll look at the samples tomorrow, but just to address one thing:
 
钟居哲 <juzhe.zhong@rivai.ai> writes:
>>> What gives the best code in these cases?  Is emitting a multiplication
>>> better?  Or is using a new IV better?
> Could you give me more detail information about "new refresh IV" approach.
> I'd like to try that.
 
By “using a new IV” I meant calling vect_set_loop_controls_directly
for every rgroup, not just the first.  So in the earlier example,
there would be one decrementing IV for x and one decrementing IV for y.
 
Thanks,
Richard
diff mbox series

Patch

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index ff6159e08d5..578ac5b783e 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -468,6 +468,38 @@  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+    {
+      /* single rgroup:
+	 ...
+	 _10 = (unsigned long) count_12(D);
+	 ...
+	 # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
+	 _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
+	 ...
+	 vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
+	 ...
+	 ivtmp_35 = ivtmp_9 - _36;
+	 ...
+	 if (ivtmp_35 != 0)
+	   goto <bb 4>; [83.33%]
+	 else
+	   goto <bb 5>; [16.67%]
+      */
+      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+      tree step = rgc->controls.length () == 1 ? rgc->controls[0]
+					       : make_ssa_name (iv_type);
+      /* Create decrement IV.  */
+      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
+		 insert_after, &index_before_incr, &index_after_incr);
+      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
+							    index_before_incr,
+							    nitems_step));
+      LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo) = step;
+      return index_after_incr;
+    }
+
+  /* Create increment IV.  */
   create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
 	     loop, &incr_gsi, insert_after, &index_before_incr,
 	     &index_after_incr);
@@ -683,6 +715,63 @@  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   return next_ctrl;
 }
 
+/* Try to use adjust loop lens for multiple-rgroups.
+
+     _36 = MIN_EXPR <ivtmp_34, VF>;
+
+     First length (MIN (X, VF/N)):
+       loop_len_15 = MIN_EXPR <_36, VF/N>;
+
+     Second length:
+       tmp = _36 - loop_len_15;
+       loop_len_16 = MIN (tmp, VF/N);
+
+     Third length:
+       tmp2 = tmp - loop_len_16;
+       loop_len_17 = MIN (tmp2, VF/N);
+
+     Last length:
+       loop_len_18 = tmp2 - loop_len_17;
+*/
+
+static void
+vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
+			       rgroup_controls *dest_rgm, tree step)
+{
+  tree ctrl_type = dest_rgm->type;
+  poly_uint64 nitems_per_ctrl
+    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
+  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
+
+  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
+    {
+      tree ctrl = dest_rgm->controls[i];
+      if (i == 0)
+	{
+	  /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
+	  gassign *assign
+	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
+	  gimple_seq_add_stmt (seq, assign);
+	}
+      else if (i == dest_rgm->controls.length () - 1)
+	{
+	  /* Last iteration: Remain capped to the range [0, VF/N].  */
+	  gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
+						 dest_rgm->controls[i - 1]);
+	  gimple_seq_add_stmt (seq, assign);
+	}
+      else
+	{
+	  /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
+	  step = gimple_build (seq, MINUS_EXPR, iv_type, step,
+			       dest_rgm->controls[i - 1]);
+	  gassign *assign
+	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
+	  gimple_seq_add_stmt (seq, assign);
+	}
+    }
+}
+
 /* Set up the iteration condition and rgroup controls for LOOP, given
    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
@@ -753,17 +842,84 @@  vect_set_loop_condition_partial_vectors (class loop *loop,
 	      continue;
 	  }
 
-	/* See whether zero-based IV would ever generate all-false masks
-	   or zero length before wrapping around.  */
-	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
-
-	/* Set up all controls for this group.  */
-	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
-						     &preheader_seq,
-						     &header_seq,
-						     loop_cond_gsi, rgc,
-						     niters, niters_skip,
-						     might_wrap_p);
+	if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+	    || !LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo))
+	  {
+	    /* See whether zero-based IV would ever generate all-false masks
+	       or zero length before wrapping around.  */
+	    bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
+
+	    /* Set up all controls for this group.  */
+	    test_ctrl
+	      = vect_set_loop_controls_directly (loop, loop_vinfo,
+						 &preheader_seq, &header_seq,
+						 loop_cond_gsi, rgc, niters,
+						 niters_skip, might_wrap_p);
+	  }
+
+	/* Decrement IV only run vect_set_loop_controls_directly once.  */
+	if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+	    && rgc->controls.length () > 1)
+	  {
+	    /*
+	       - Multiple rgroup (SLP):
+		 ...
+		 _38 = (unsigned long) bnd.7_29;
+		 _39 = _38 * 2;
+		 ...
+		 # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
+		 ...
+		 _43 = MIN_EXPR <ivtmp_41, 32>;
+		 loop_len_26 = MIN_EXPR <_43, 16>;
+		 loop_len_25 = _43 - loop_len_26;
+		 ...
+		 .LEN_STORE (_6, 8B, loop_len_26, ...);
+		 ...
+		 .LEN_STORE (_25, 8B, loop_len_25, ...);
+		 _33 = loop_len_26 / 2;
+		 ...
+		 .LEN_STORE (_8, 16B, _33, ...);
+		 _36 = loop_len_25 / 2;
+		 ...
+		 .LEN_STORE (_15, 16B, _36, ...);
+		 ivtmp_42 = ivtmp_41 - _43;
+		 ...
+
+	       - Multiple rgroup (non-SLP):
+		 ...
+		 _38 = (unsigned long) n_12(D);
+		 ...
+		 # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
+		 ...
+		 _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
+		 loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
+		 _41 = _40 - loop_len_21;
+		 loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
+		 _42 = _40 - loop_len_20;
+		 loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
+		 _43 = _40 - loop_len_19;
+		 loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
+		 ...
+		 vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
+		 ...
+		 vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
+		 ...
+		 vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
+		 ...
+		 vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
+		 vect__7.13_31 = VEC_PACK_TRUNC_EXPR <...>,
+		 vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
+		 vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
+		 ...
+		 .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
+		 ivtmp_39 = ivtmp_38 - _40;
+		 ...
+	    */
+	    tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+	    tree step = LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo);
+	    gcc_assert (step);
+	    vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, step);
+	  }
       }
 
   /* Emit all accumulated statements.  */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index cf10132b0bf..456f50fa7cc 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -973,6 +973,8 @@  _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vectorizable (false),
     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
     using_partial_vectors_p (false),
+    using_decrementing_iv_p (false),
+    decrementing_iv_step (NULL_TREE),
     epil_using_partial_vectors_p (false),
     partial_load_store_bias (0),
     peeling_for_gaps (false),
@@ -2725,6 +2727,17 @@  start_over:
       && !vect_verify_loop_lens (loop_vinfo))
     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 
+  /* If we're vectorizing an loop that uses length "controls" and
+     can iterate more than once, we apply decrementing IV approach
+     in loop control.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
+      && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
+      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+	   && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
+			LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
+    LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
+
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 02d2ad6fba1..7ed079f543a 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -818,6 +818,16 @@  public:
      the vector loop can handle fewer than VF scalars.  */
   bool using_partial_vectors_p;
 
+  /* True if we've decided to use a decrementing loop control IV that counts
+     scalars. This can be done for any loop that:
+
+	(a) uses length "controls"; and
+	(b) can iterate more than once.  */
+  bool using_decrementing_iv_p;
+
+  /* The variable amount step for decrement IV.  */
+  tree decrementing_iv_step;
+
   /* True if we've decided to use partially-populated vectors for the
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
@@ -890,6 +900,8 @@  public:
 #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
 #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
+#define LOOP_VINFO_DECREMENTING_IV_STEP(L) (L)->decrementing_iv_step
 #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
   (L)->epil_using_partial_vectors_p
 #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias