diff mbox series

[V6] VECT: Add SELECT_VL support

Message ID 20230609083934.556871-1-juzhe.zhong@rivai.ai
State New
Headers show
Series [V6] VECT: Add SELECT_VL support | expand

Commit Message

juzhe.zhong@rivai.ai June 9, 2023, 8:39 a.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Co-authored-by: Richard Sandiford<richard.sandiford@arm.com>
Co-authored-by: Richard Biener <rguenther@suse.de>

This patch address comments from Richard && Richi and rebase to trunk.

This patch is adding SELECT_VL middle-end support
allow target have target dependent optimization in case of
length calculation.

This patch is inspired by RVV ISA and LLVM:
https://reviews.llvm.org/D99750

The SELECT_VL is same behavior as LLVM "get_vector_length" with
these following properties:

1. Only apply on single-rgroup.
2. non SLP.
3. adjust loop control IV.
4. adjust data reference IV.
5. allow non-vf elements processing in non-final iteration

Code:
   # void vvaddint32(size_t n, const int*x, const int*y, int*z)
    # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }

Take RVV codegen for example:

Before this patch:
vvaddint32:
        ble     a0,zero,.L6
        csrr    a4,vlenb
        srli    a6,a4,2
.L4:
        mv      a5,a0
        bleu    a0,a6,.L3
        mv      a5,a6
.L3:
        vsetvli zero,a5,e32,m1,ta,ma
        vle32.v v2,0(a1)
        vle32.v v1,0(a2)
        vsetvli a7,zero,e32,m1,ta,ma
        sub     a0,a0,a5
        vadd.vv v1,v1,v2
        vsetvli zero,a5,e32,m1,ta,ma
        vse32.v v1,0(a3)
        add     a2,a2,a4
        add     a3,a3,a4
        add     a1,a1,a4
        bne     a0,zero,.L4
.L6:
        ret

After this patch:

vvaddint32:
    vsetvli t0, a0, e32, ta, ma  # Set vector length based on 32-bit vectors
    vle32.v v0, (a1)         # Get first vector
      sub a0, a0, t0         # Decrement number done
      slli t0, t0, 2         # Multiply number done by 4 bytes
      add a1, a1, t0         # Bump pointer
    vle32.v v1, (a2)         # Get second vector
      add a2, a2, t0         # Bump pointer
    vadd.vv v2, v0, v1       # Sum vectors
    vse32.v v2, (a3)         # Store result
      add a3, a3, t0         # Bump pointer
      bnez a0, vvaddint32    # Loop back
      ret                    # Finished

gcc/ChangeLog:

        * doc/md.texi: Add SELECT_VL support.
        * internal-fn.def (SELECT_VL): Ditto.
        * optabs.def (OPTAB_D): Ditto.
        * tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
        * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Ditto.
        * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): Ditto.

---
 gcc/doc/md.texi             | 22 ++++++++++++
 gcc/internal-fn.def         |  1 +
 gcc/optabs.def              |  1 +
 gcc/tree-vect-loop-manip.cc | 32 ++++++++++++-----
 gcc/tree-vect-loop.cc       | 72 +++++++++++++++++++++++++++++++++++++
 gcc/tree-vect-stmts.cc      | 69 +++++++++++++++++++++++++++++++----
 gcc/tree-vectorizer.h       |  6 ++++
 7 files changed, 187 insertions(+), 16 deletions(-)

Comments

juzhe.zhong@rivai.ai June 9, 2023, 9:20 a.m. UTC | #1
Just finish running Boostrap on X86 has passed.
Ok for trunk?

Thanks.


juzhe.zhong@rivai.ai
 
From: juzhe.zhong
Date: 2023-06-09 16:39
To: gcc-patches
CC: richard.sandiford; rguenther; Ju-Zhe Zhong
Subject: [PATCH V6] VECT: Add SELECT_VL support
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
 
Co-authored-by: Richard Sandiford<richard.sandiford@arm.com>
Co-authored-by: Richard Biener <rguenther@suse.de>
 
This patch address comments from Richard && Richi and rebase to trunk.
 
This patch is adding SELECT_VL middle-end support
allow target have target dependent optimization in case of
length calculation.
 
This patch is inspired by RVV ISA and LLVM:
https://reviews.llvm.org/D99750
 
The SELECT_VL is same behavior as LLVM "get_vector_length" with
these following properties:
 
1. Only apply on single-rgroup.
2. non SLP.
3. adjust loop control IV.
4. adjust data reference IV.
5. allow non-vf elements processing in non-final iteration
 
Code:
   # void vvaddint32(size_t n, const int*x, const int*y, int*z)
    # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
 
Take RVV codegen for example:
 
Before this patch:
vvaddint32:
        ble     a0,zero,.L6
        csrr    a4,vlenb
        srli    a6,a4,2
.L4:
        mv      a5,a0
        bleu    a0,a6,.L3
        mv      a5,a6
.L3:
        vsetvli zero,a5,e32,m1,ta,ma
        vle32.v v2,0(a1)
        vle32.v v1,0(a2)
        vsetvli a7,zero,e32,m1,ta,ma
        sub     a0,a0,a5
        vadd.vv v1,v1,v2
        vsetvli zero,a5,e32,m1,ta,ma
        vse32.v v1,0(a3)
        add     a2,a2,a4
        add     a3,a3,a4
        add     a1,a1,a4
        bne     a0,zero,.L4
.L6:
        ret
 
After this patch:
 
vvaddint32:
    vsetvli t0, a0, e32, ta, ma  # Set vector length based on 32-bit vectors
    vle32.v v0, (a1)         # Get first vector
      sub a0, a0, t0         # Decrement number done
      slli t0, t0, 2         # Multiply number done by 4 bytes
      add a1, a1, t0         # Bump pointer
    vle32.v v1, (a2)         # Get second vector
      add a2, a2, t0         # Bump pointer
    vadd.vv v2, v0, v1       # Sum vectors
    vse32.v v2, (a3)         # Store result
      add a3, a3, t0         # Bump pointer
      bnez a0, vvaddint32    # Loop back
      ret                    # Finished
 
gcc/ChangeLog:
 
        * doc/md.texi: Add SELECT_VL support.
        * internal-fn.def (SELECT_VL): Ditto.
        * optabs.def (OPTAB_D): Ditto.
        * tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
        * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Ditto.
        * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): Ditto.
 
---
gcc/doc/md.texi             | 22 ++++++++++++
gcc/internal-fn.def         |  1 +
gcc/optabs.def              |  1 +
gcc/tree-vect-loop-manip.cc | 32 ++++++++++++-----
gcc/tree-vect-loop.cc       | 72 +++++++++++++++++++++++++++++++++++++
gcc/tree-vect-stmts.cc      | 69 +++++++++++++++++++++++++++++++----
gcc/tree-vectorizer.h       |  6 ++++
7 files changed, 187 insertions(+), 16 deletions(-)
 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 6a435eb4461..95f7fe1f802 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4974,6 +4974,28 @@ for (i = 1; i < operand3; i++)
   operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
@end smallexample
+@cindex @code{select_vl@var{m}} instruction pattern
+@item @code{select_vl@var{m}}
+Set operand 0 to the number of scalar iterations that should be handled
+by one iteration of a vector loop.  Operand 1 is the total number of
+scalar iterations that the loop needs to process and operand 2 is a
+maximum bound on the result (also known as the maximum ``vectorization
+factor'').
+
+The maximum value of operand 0 is given by:
+@smallexample
+operand0 = MIN (operand1, operand2)
+@end smallexample
+However, targets might choose a lower value than this, based on
+target-specific criteria.  Each iteration of the vector loop might
+therefore process a different number of scalar iterations, which in turn
+means that induction variables will have a variable step.  Because of
+this, it is generally not useful to define this instruction if it will
+always calculate the maximum value.
+
+This optab is only useful on targets that implement @samp{len_load_@var{m}}
+and/or @samp{len_store_@var{m}}.
+
@cindex @code{check_raw_ptrs@var{m}} instruction pattern
@item @samp{check_raw_ptrs@var{m}}
Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 3ac9d82aace..5d638de6d06 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -177,6 +177,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
       check_raw_ptrs, check_ptrs)
DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 6c064ff4993..f31b69c5d85 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -488,3 +488,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
OPTAB_D (len_load_optab, "len_load_$a")
OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (select_vl_optab, "select_vl$a")
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 3f735945e67..1c8100c1a1c 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -534,7 +534,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   _10 = (unsigned long) count_12(D);
   ...
   # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
-    _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
+    _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
   ...
   vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
   ...
@@ -549,15 +549,28 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
       tree step = rgc->controls.length () == 1 ? rgc->controls[0]
       : make_ssa_name (iv_type);
       /* Create decrement IV.  */
-      create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
- &incr_gsi, insert_after, &index_before_incr,
- &index_after_incr);
-      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
-     index_before_incr,
-     nitems_step));
+      if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+ {
+   create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
+      insert_after, &index_before_incr, &index_after_incr);
+   tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
+    index_before_incr, nitems_step);
+   gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
+ }
+      else
+ {
+   create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
+      &incr_gsi, insert_after, &index_before_incr,
+      &index_after_incr);
+   gimple_seq_add_stmt (header_seq,
+        gimple_build_assign (step, MIN_EXPR,
+     index_before_incr,
+     nitems_step));
+ }
       *iv_step = step;
       *compare_step = nitems_step;
-      return index_before_incr;
+      return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
+        : index_before_incr;
     }
   /* Create increment IV.  */
@@ -888,7 +901,8 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
   /* Get a boolean result that tells us whether to iterate.  */
   edge exit_edge = single_exit (loop);
   gcond *cond_stmt;
-  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+      && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
     {
       gcc_assert (compare_step);
       tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5b7a0da0034..ace9e759f5b 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -974,6 +974,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
     using_partial_vectors_p (false),
     using_decrementing_iv_p (false),
+    using_select_vl_p (false),
     epil_using_partial_vectors_p (false),
     partial_load_store_bias (0),
     peeling_for_gaps (false),
@@ -2737,6 +2738,77 @@ start_over:
LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
+  /* If a loop uses length controls and has a decrementing loop control IV,
+     we will normally pass that IV through a MIN_EXPR to calcaluate the
+     basis for the length controls.  E.g. in a loop that processes one
+     element per scalar iteration, the number of elements would be
+     MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
+
+     This MIN_EXPR approach allows us to use pointer IVs with an invariant
+     step, since only the final iteration of the vector loop can have
+     inactive lanes.
+
+     However, some targets have a dedicated instruction for calculating the
+     preferred length, given the total number of elements that still need to
+     be processed.  This is encapsulated in the SELECT_VL internal function.
+
+     If the target supports SELECT_VL, we can use it instead of MIN_EXPR
+     to determine the basis for the length controls.  However, unlike the
+     MIN_EXPR calculation, the SELECT_VL calculation can decide to make
+     lanes inactive in any iteration of the vector loop, not just the last
+     iteration.  This SELECT_VL approach therefore requires us to use pointer
+     IVs with variable steps.
+
+     Once we've decided how many elements should be processed by one
+     iteration of the vector loop, we need to populate the rgroup controls.
+     If a loop has multiple rgroups, we need to make sure that those rgroups
+     "line up" (that is, they must be consistent about which elements are
+     active and which aren't).  This is done by vect_adjust_loop_lens_control.
+
+     In principle, it would be possible to use vect_adjust_loop_lens_control
+     on either the result of a MIN_EXPR or the result of a SELECT_VL.
+     However:
+
+     (1) In practice, it only makes sense to use SELECT_VL when a vector
+ operation will be controlled directly by the result.  It is not
+ worth using SELECT_VL if it would only be the input to other
+ calculations.
+
+     (2) If we use SELECT_VL for an rgroup that has N controls, each associated
+ pointer IV will need N updates by a variable amount (N-1 updates
+ within the iteration and 1 update to move to the next iteration).
+
+     Because of this, we prefer to use the MIN_EXPR approach whenever there
+     is more than one length control.
+
+     In addition, SELECT_VL always operates to a granularity of 1 unit.
+     If we wanted to use it to control an SLP operation on N consecutive
+     elements, we would need to make the SELECT_VL inputs measure scalar
+     iterations (rather than elements) and then multiply the SELECT_VL
+     result by N.  But using SELECT_VL this way is inefficient because
+     of (1) above.
+
+     2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
+ satisfied:
+
+     (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
+     (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
+
+     Since SELECT_VL (variable step) will make SCEV analysis failed and then
+     we will fail to gain benefits of following unroll optimizations. We prefer
+     using the MIN_EXPR approach in this situation.  */
+  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+    {
+      tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+      if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
+   OPTIMIZE_FOR_SPEED)
+   && LOOP_VINFO_LENS (loop_vinfo).length () == 1
+   && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
+   && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
+ LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
+    }
+
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c7e4e71d3c5..a7acc032d47 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3128,19 +3128,72 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
   *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
}
+/* Prepare the pointer IVs which needs to be updated by a variable amount.
+   Such variable amount is the outcome of .SELECT_VL. In this case, we can
+   allow each iteration process the flexible number of elements as long as
+   the number <= vf elments.
+
+   Return data reference according to SELECT_VL.
+   If new statements are needed, insert them before GSI.  */
+
+static tree
+vect_get_loop_variant_data_ptr_increment (
+  vec_info *vinfo, tree aggr_type, gimple_stmt_iterator *gsi,
+  vec_loop_lens *loop_lens, dr_vec_info *dr_info,
+  vect_memory_access_type memory_access_type)
+{
+  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
+  tree step = vect_dr_behavior (vinfo, dr_info)->step;
+
+  /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
+     IVs are updated by variable amount but we will support them in the future.
+   */
+  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
+       && memory_access_type != VMAT_LOAD_STORE_LANES);
+
+  /* When we support SELECT_VL pattern, we dynamic adjust
+     the memory address by .SELECT_VL result.
+
+     The result of .SELECT_VL is the number of elements to
+     be processed of each iteration. So the memory address
+     adjustment operation should be:
+
+     addr = addr + .SELECT_VL (ARG..) * step;
+  */
+  tree loop_len
+    = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0, 0);
+  tree len_type = TREE_TYPE (loop_len);
+  /* Since the outcome of .SELECT_VL is element size, we should adjust
+     it into bytesize so that it can be used in address pointer variable
+     amount IVs adjustment.  */
+  tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
+   wide_int_to_tree (len_type, wi::to_widest (step)));
+  tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
+  gassign *assign = gimple_build_assign (bump, tmp);
+  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+  return bump;
+}
+
/* Return the amount that should be added to a vector pointer to move
    to the next or previous copy of AGGR_TYPE.  DR_INFO is the data reference
    being vectorized and MEMORY_ACCESS_TYPE describes the type of
    vectorization.  */
static tree
-vect_get_data_ptr_increment (vec_info *vinfo,
+vect_get_data_ptr_increment (vec_info *vinfo, gimple_stmt_iterator *gsi,
     dr_vec_info *dr_info, tree aggr_type,
-      vect_memory_access_type memory_access_type)
+      vect_memory_access_type memory_access_type,
+      vec_loop_lens *loop_lens = nullptr)
{
   if (memory_access_type == VMAT_INVARIANT)
     return size_zero_node;
+  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
+  if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+    return vect_get_loop_variant_data_ptr_increment (vinfo, aggr_type, gsi,
+      loop_lens, dr_info,
+      memory_access_type);
+
   tree iv_step = TYPE_SIZE_UNIT (aggr_type);
   tree step = vect_dr_behavior (vinfo, dr_info)->step;
   if (tree_int_cst_sgn (step) == -1)
@@ -7629,7 +7682,7 @@ vectorizable_scan_store (vec_info *vinfo,
   tree vec_oprnd3 = NULL_TREE;
   tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr);
   tree dataref_offset = build_int_cst (ref_type, 0);
-  tree bump = vect_get_data_ptr_increment (vinfo, dr_info,
+  tree bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info,
   vectype, VMAT_CONTIGUOUS);
   tree ldataref_ptr = NULL_TREE;
   tree orig = NULL_TREE;
@@ -8539,8 +8592,8 @@ vectorizable_store (vec_info *vinfo,
aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
       else
aggr_type = vectype;
-      bump = vect_get_data_ptr_increment (vinfo, dr_info, aggr_type,
-   memory_access_type);
+      bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
+   memory_access_type, loop_lens);
     }
   if (mask)
@@ -8663,6 +8716,7 @@ vectorizable_store (vec_info *vinfo,
}
       else
{
+   gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));  
  /* For interleaved stores we created vectorized defs for all the
     defs stored in OPRNDS in the previous iteration (previous copy).
     DR_CHAIN is then used as an input to vect_permute_store_chain().
@@ -9975,8 +10029,8 @@ vectorizable_load (vec_info *vinfo,
aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
       else
aggr_type = vectype;
-      bump = vect_get_data_ptr_increment (vinfo, dr_info, aggr_type,
-   memory_access_type);
+      bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
+   memory_access_type, loop_lens);
     }
   auto_vec<tree> vec_offsets;
@@ -10056,6 +10110,7 @@ vectorizable_load (vec_info *vinfo,
}
       else
{
+   gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
  if (dataref_offset)
    dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
      bump);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1e44f8542d7..af25d20bd7e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -825,6 +825,11 @@ public:
(b) can iterate more than once.  */
   bool using_decrementing_iv_p;
+  /* True if we've decided to use output of select_vl to adjust IV of
+     both loop control and data reference pointer. This is only true
+     for single-rgroup control.  */
+  bool using_select_vl_p;
+
   /* True if we've decided to use partially-populated vectors for the
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
@@ -898,6 +903,7 @@ public:
#define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
#define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
+#define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p
#define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
   (L)->epil_using_partial_vectors_p
#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
Richard Biener June 9, 2023, 11:02 a.m. UTC | #2
On Fri, 9 Jun 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Co-authored-by: Richard Sandiford<richard.sandiford@arm.com>
> Co-authored-by: Richard Biener <rguenther@suse.de>
> 
> This patch address comments from Richard && Richi and rebase to trunk.
> 
> This patch is adding SELECT_VL middle-end support
> allow target have target dependent optimization in case of
> length calculation.
> 
> This patch is inspired by RVV ISA and LLVM:
> https://reviews.llvm.org/D99750
> 
> The SELECT_VL is same behavior as LLVM "get_vector_length" with
> these following properties:
> 
> 1. Only apply on single-rgroup.
> 2. non SLP.
> 3. adjust loop control IV.
> 4. adjust data reference IV.
> 5. allow non-vf elements processing in non-final iteration
> 
> Code:
>    # void vvaddint32(size_t n, const int*x, const int*y, int*z)
>     # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
> 
> Take RVV codegen for example:
> 
> Before this patch:
> vvaddint32:
>         ble     a0,zero,.L6
>         csrr    a4,vlenb
>         srli    a6,a4,2
> .L4:
>         mv      a5,a0
>         bleu    a0,a6,.L3
>         mv      a5,a6
> .L3:
>         vsetvli zero,a5,e32,m1,ta,ma
>         vle32.v v2,0(a1)
>         vle32.v v1,0(a2)
>         vsetvli a7,zero,e32,m1,ta,ma
>         sub     a0,a0,a5
>         vadd.vv v1,v1,v2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vse32.v v1,0(a3)
>         add     a2,a2,a4
>         add     a3,a3,a4
>         add     a1,a1,a4
>         bne     a0,zero,.L4
> .L6:
>         ret
> 
> After this patch:
> 
> vvaddint32:
>     vsetvli t0, a0, e32, ta, ma  # Set vector length based on 32-bit vectors
>     vle32.v v0, (a1)         # Get first vector
>       sub a0, a0, t0         # Decrement number done
>       slli t0, t0, 2         # Multiply number done by 4 bytes
>       add a1, a1, t0         # Bump pointer
>     vle32.v v1, (a2)         # Get second vector
>       add a2, a2, t0         # Bump pointer
>     vadd.vv v2, v0, v1       # Sum vectors
>     vse32.v v2, (a3)         # Store result
>       add a3, a3, t0         # Bump pointer
>       bnez a0, vvaddint32    # Loop back
>       ret                    # Finished

OK.

Thanks,
Richard.

> gcc/ChangeLog:
> 
>         * doc/md.texi: Add SELECT_VL support.
>         * internal-fn.def (SELECT_VL): Ditto.
>         * optabs.def (OPTAB_D): Ditto.
>         * tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
>         * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Ditto.
>         * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): Ditto.
>         (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): Ditto.
> 
> ---
>  gcc/doc/md.texi             | 22 ++++++++++++
>  gcc/internal-fn.def         |  1 +
>  gcc/optabs.def              |  1 +
>  gcc/tree-vect-loop-manip.cc | 32 ++++++++++++-----
>  gcc/tree-vect-loop.cc       | 72 +++++++++++++++++++++++++++++++++++++
>  gcc/tree-vect-stmts.cc      | 69 +++++++++++++++++++++++++++++++----
>  gcc/tree-vectorizer.h       |  6 ++++
>  7 files changed, 187 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 6a435eb4461..95f7fe1f802 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -4974,6 +4974,28 @@ for (i = 1; i < operand3; i++)
>    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
>  @end smallexample
>  
> +@cindex @code{select_vl@var{m}} instruction pattern
> +@item @code{select_vl@var{m}}
> +Set operand 0 to the number of scalar iterations that should be handled
> +by one iteration of a vector loop.  Operand 1 is the total number of
> +scalar iterations that the loop needs to process and operand 2 is a
> +maximum bound on the result (also known as the maximum ``vectorization
> +factor'').
> +
> +The maximum value of operand 0 is given by:
> +@smallexample
> +operand0 = MIN (operand1, operand2)
> +@end smallexample
> +However, targets might choose a lower value than this, based on
> +target-specific criteria.  Each iteration of the vector loop might
> +therefore process a different number of scalar iterations, which in turn
> +means that induction variables will have a variable step.  Because of
> +this, it is generally not useful to define this instruction if it will
> +always calculate the maximum value.
> +
> +This optab is only useful on targets that implement @samp{len_load_@var{m}}
> +and/or @samp{len_store_@var{m}}.
> +
>  @cindex @code{check_raw_ptrs@var{m}} instruction pattern
>  @item @samp{check_raw_ptrs@var{m}}
>  Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 3ac9d82aace..5d638de6d06 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -177,6 +177,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
>  DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>  
>  DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> +DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
>  DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
>  		       check_raw_ptrs, check_ptrs)
>  DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 6c064ff4993..f31b69c5d85 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -488,3 +488,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
>  OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
>  OPTAB_D (len_load_optab, "len_load_$a")
>  OPTAB_D (len_store_optab, "len_store_$a")
> +OPTAB_D (select_vl_optab, "select_vl$a")
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 3f735945e67..1c8100c1a1c 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -534,7 +534,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>  	   _10 = (unsigned long) count_12(D);
>  	   ...
>  	   # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> -	   _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> +	   _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
>  	   ...
>  	   vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
>  	   ...
> @@ -549,15 +549,28 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>        tree step = rgc->controls.length () == 1 ? rgc->controls[0]
>  					       : make_ssa_name (iv_type);
>        /* Create decrement IV.  */
> -      create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
> -		 &incr_gsi, insert_after, &index_before_incr,
> -		 &index_after_incr);
> -      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
> -							    index_before_incr,
> -							    nitems_step));
> +      if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> +	{
> +	  create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
> +		     insert_after, &index_before_incr, &index_after_incr);
> +	  tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
> +				   index_before_incr, nitems_step);
> +	  gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
> +	}
> +      else
> +	{
> +	  create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
> +		     &incr_gsi, insert_after, &index_before_incr,
> +		     &index_after_incr);
> +	  gimple_seq_add_stmt (header_seq,
> +			       gimple_build_assign (step, MIN_EXPR,
> +						    index_before_incr,
> +						    nitems_step));
> +	}
>        *iv_step = step;
>        *compare_step = nitems_step;
> -      return index_before_incr;
> +      return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
> +						       : index_before_incr;
>      }
>  
>    /* Create increment IV.  */
> @@ -888,7 +901,8 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>    /* Get a boolean result that tells us whether to iterate.  */
>    edge exit_edge = single_exit (loop);
>    gcond *cond_stmt;
> -  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +      && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
>      {
>        gcc_assert (compare_step);
>        tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 5b7a0da0034..ace9e759f5b 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -974,6 +974,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
>      can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
>      using_partial_vectors_p (false),
>      using_decrementing_iv_p (false),
> +    using_select_vl_p (false),
>      epil_using_partial_vectors_p (false),
>      partial_load_store_bias (0),
>      peeling_for_gaps (false),
> @@ -2737,6 +2738,77 @@ start_over:
>  			LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
>      LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
>  
> +  /* If a loop uses length controls and has a decrementing loop control IV,
> +     we will normally pass that IV through a MIN_EXPR to calcaluate the
> +     basis for the length controls.  E.g. in a loop that processes one
> +     element per scalar iteration, the number of elements would be
> +     MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
> +
> +     This MIN_EXPR approach allows us to use pointer IVs with an invariant
> +     step, since only the final iteration of the vector loop can have
> +     inactive lanes.
> +
> +     However, some targets have a dedicated instruction for calculating the
> +     preferred length, given the total number of elements that still need to
> +     be processed.  This is encapsulated in the SELECT_VL internal function.
> +
> +     If the target supports SELECT_VL, we can use it instead of MIN_EXPR
> +     to determine the basis for the length controls.  However, unlike the
> +     MIN_EXPR calculation, the SELECT_VL calculation can decide to make
> +     lanes inactive in any iteration of the vector loop, not just the last
> +     iteration.  This SELECT_VL approach therefore requires us to use pointer
> +     IVs with variable steps.
> +
> +     Once we've decided how many elements should be processed by one
> +     iteration of the vector loop, we need to populate the rgroup controls.
> +     If a loop has multiple rgroups, we need to make sure that those rgroups
> +     "line up" (that is, they must be consistent about which elements are
> +     active and which aren't).  This is done by vect_adjust_loop_lens_control.
> +
> +     In principle, it would be possible to use vect_adjust_loop_lens_control
> +     on either the result of a MIN_EXPR or the result of a SELECT_VL.
> +     However:
> +
> +     (1) In practice, it only makes sense to use SELECT_VL when a vector
> +	 operation will be controlled directly by the result.  It is not
> +	 worth using SELECT_VL if it would only be the input to other
> +	 calculations.
> +
> +     (2) If we use SELECT_VL for an rgroup that has N controls, each associated
> +	 pointer IV will need N updates by a variable amount (N-1 updates
> +	 within the iteration and 1 update to move to the next iteration).
> +
> +     Because of this, we prefer to use the MIN_EXPR approach whenever there
> +     is more than one length control.
> +
> +     In addition, SELECT_VL always operates to a granularity of 1 unit.
> +     If we wanted to use it to control an SLP operation on N consecutive
> +     elements, we would need to make the SELECT_VL inputs measure scalar
> +     iterations (rather than elements) and then multiply the SELECT_VL
> +     result by N.  But using SELECT_VL this way is inefficient because
> +     of (1) above.
> +
> +     2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
> +	satisfied:
> +
> +     (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
> +     (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
> +
> +     Since SELECT_VL (variable step) will make SCEV analysis failed and then
> +     we will fail to gain benefits of following unroll optimizations. We prefer
> +     using the MIN_EXPR approach in this situation.  */
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +    {
> +      tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +      if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
> +					  OPTIMIZE_FOR_SPEED)
> +	  && LOOP_VINFO_LENS (loop_vinfo).length () == 1
> +	  && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
> +	  && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +	      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
> +	LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
> +    }
> +
>    /* If we're vectorizing an epilogue loop, the vectorized loop either needs
>       to be able to handle fewer than VF scalars, or needs to have a lower VF
>       than the main loop.  */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c7e4e71d3c5..a7acc032d47 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -3128,19 +3128,72 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
>    *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
>  }
>  
> +/* Prepare the pointer IVs which needs to be updated by a variable amount.
> +   Such variable amount is the outcome of .SELECT_VL. In this case, we can
> +   allow each iteration process the flexible number of elements as long as
> +   the number <= vf elments.
> +
> +   Return data reference according to SELECT_VL.
> +   If new statements are needed, insert them before GSI.  */
> +
> +static tree
> +vect_get_loop_variant_data_ptr_increment (
> +  vec_info *vinfo, tree aggr_type, gimple_stmt_iterator *gsi,
> +  vec_loop_lens *loop_lens, dr_vec_info *dr_info,
> +  vect_memory_access_type memory_access_type)
> +{
> +  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
> +  tree step = vect_dr_behavior (vinfo, dr_info)->step;
> +
> +  /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
> +     IVs are updated by variable amount but we will support them in the future.
> +   */
> +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> +	      && memory_access_type != VMAT_LOAD_STORE_LANES);
> +
> +  /* When we support SELECT_VL pattern, we dynamic adjust
> +     the memory address by .SELECT_VL result.
> +
> +     The result of .SELECT_VL is the number of elements to
> +     be processed of each iteration. So the memory address
> +     adjustment operation should be:
> +
> +     addr = addr + .SELECT_VL (ARG..) * step;
> +  */
> +  tree loop_len
> +    = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0, 0);
> +  tree len_type = TREE_TYPE (loop_len);
> +  /* Since the outcome of .SELECT_VL is element size, we should adjust
> +     it into bytesize so that it can be used in address pointer variable
> +     amount IVs adjustment.  */
> +  tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
> +			  wide_int_to_tree (len_type, wi::to_widest (step)));
> +  tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
> +  gassign *assign = gimple_build_assign (bump, tmp);
> +  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> +  return bump;
> +}
> +
>  /* Return the amount that should be added to a vector pointer to move
>     to the next or previous copy of AGGR_TYPE.  DR_INFO is the data reference
>     being vectorized and MEMORY_ACCESS_TYPE describes the type of
>     vectorization.  */
>  
>  static tree
> -vect_get_data_ptr_increment (vec_info *vinfo,
> +vect_get_data_ptr_increment (vec_info *vinfo, gimple_stmt_iterator *gsi,
>  			     dr_vec_info *dr_info, tree aggr_type,
> -			     vect_memory_access_type memory_access_type)
> +			     vect_memory_access_type memory_access_type,
> +			     vec_loop_lens *loop_lens = nullptr)
>  {
>    if (memory_access_type == VMAT_INVARIANT)
>      return size_zero_node;
>  
> +  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
> +  if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> +    return vect_get_loop_variant_data_ptr_increment (vinfo, aggr_type, gsi,
> +						     loop_lens, dr_info,
> +						     memory_access_type);
> +
>    tree iv_step = TYPE_SIZE_UNIT (aggr_type);
>    tree step = vect_dr_behavior (vinfo, dr_info)->step;
>    if (tree_int_cst_sgn (step) == -1)
> @@ -7629,7 +7682,7 @@ vectorizable_scan_store (vec_info *vinfo,
>    tree vec_oprnd3 = NULL_TREE;
>    tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr);
>    tree dataref_offset = build_int_cst (ref_type, 0);
> -  tree bump = vect_get_data_ptr_increment (vinfo, dr_info,
> +  tree bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info,
>  					   vectype, VMAT_CONTIGUOUS);
>    tree ldataref_ptr = NULL_TREE;
>    tree orig = NULL_TREE;
> @@ -8539,8 +8592,8 @@ vectorizable_store (vec_info *vinfo,
>  	aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
>        else
>  	aggr_type = vectype;
> -      bump = vect_get_data_ptr_increment (vinfo, dr_info, aggr_type,
> -					  memory_access_type);
> +      bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
> +					  memory_access_type, loop_lens);
>      }
>  
>    if (mask)
> @@ -8663,6 +8716,7 @@ vectorizable_store (vec_info *vinfo,
>  	}
>        else
>  	{
> +	  gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));  
>  	  /* For interleaved stores we created vectorized defs for all the
>  	     defs stored in OPRNDS in the previous iteration (previous copy).
>  	     DR_CHAIN is then used as an input to vect_permute_store_chain().
> @@ -9975,8 +10029,8 @@ vectorizable_load (vec_info *vinfo,
>  	aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
>        else
>  	aggr_type = vectype;
> -      bump = vect_get_data_ptr_increment (vinfo, dr_info, aggr_type,
> -					  memory_access_type);
> +      bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
> +					  memory_access_type, loop_lens);
>      }
>  
>    auto_vec<tree> vec_offsets;
> @@ -10056,6 +10110,7 @@ vectorizable_load (vec_info *vinfo,
>  	}
>        else
>  	{
> +	  gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>  	  if (dataref_offset)
>  	    dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
>  					      bump);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 1e44f8542d7..af25d20bd7e 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -825,6 +825,11 @@ public:
>  	(b) can iterate more than once.  */
>    bool using_decrementing_iv_p;
>  
> +  /* True if we've decided to use output of select_vl to adjust IV of
> +     both loop control and data reference pointer. This is only true
> +     for single-rgroup control.  */
> +  bool using_select_vl_p;
> +
>    /* True if we've decided to use partially-populated vectors for the
>       epilogue of loop.  */
>    bool epil_using_partial_vectors_p;
> @@ -898,6 +903,7 @@ public:
>  #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
>  #define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
> +#define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p
>  #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
>    (L)->epil_using_partial_vectors_p
>  #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
>
juzhe.zhong@rivai.ai June 9, 2023, 11:16 a.m. UTC | #3
Thanks, Richi.

Should I wait for Richard ACK gain ? 
Since the last email of this patch, he just asked me to adjust comment no codes change.
I am not sure whether he is ok.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-06-09 19:02
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH V6] VECT: Add SELECT_VL support
On Fri, 9 Jun 2023, juzhe.zhong@rivai.ai wrote:
 
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Co-authored-by: Richard Sandiford<richard.sandiford@arm.com>
> Co-authored-by: Richard Biener <rguenther@suse.de>
> 
> This patch address comments from Richard && Richi and rebase to trunk.
> 
> This patch is adding SELECT_VL middle-end support
> allow target have target dependent optimization in case of
> length calculation.
> 
> This patch is inspired by RVV ISA and LLVM:
> https://reviews.llvm.org/D99750
> 
> The SELECT_VL is same behavior as LLVM "get_vector_length" with
> these following properties:
> 
> 1. Only apply on single-rgroup.
> 2. non SLP.
> 3. adjust loop control IV.
> 4. adjust data reference IV.
> 5. allow non-vf elements processing in non-final iteration
> 
> Code:
>    # void vvaddint32(size_t n, const int*x, const int*y, int*z)
>     # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
> 
> Take RVV codegen for example:
> 
> Before this patch:
> vvaddint32:
>         ble     a0,zero,.L6
>         csrr    a4,vlenb
>         srli    a6,a4,2
> .L4:
>         mv      a5,a0
>         bleu    a0,a6,.L3
>         mv      a5,a6
> .L3:
>         vsetvli zero,a5,e32,m1,ta,ma
>         vle32.v v2,0(a1)
>         vle32.v v1,0(a2)
>         vsetvli a7,zero,e32,m1,ta,ma
>         sub     a0,a0,a5
>         vadd.vv v1,v1,v2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vse32.v v1,0(a3)
>         add     a2,a2,a4
>         add     a3,a3,a4
>         add     a1,a1,a4
>         bne     a0,zero,.L4
> .L6:
>         ret
> 
> After this patch:
> 
> vvaddint32:
>     vsetvli t0, a0, e32, ta, ma  # Set vector length based on 32-bit vectors
>     vle32.v v0, (a1)         # Get first vector
>       sub a0, a0, t0         # Decrement number done
>       slli t0, t0, 2         # Multiply number done by 4 bytes
>       add a1, a1, t0         # Bump pointer
>     vle32.v v1, (a2)         # Get second vector
>       add a2, a2, t0         # Bump pointer
>     vadd.vv v2, v0, v1       # Sum vectors
>     vse32.v v2, (a3)         # Store result
>       add a3, a3, t0         # Bump pointer
>       bnez a0, vvaddint32    # Loop back
>       ret                    # Finished
 
OK.
 
Thanks,
Richard.
 
> gcc/ChangeLog:
> 
>         * doc/md.texi: Add SELECT_VL support.
>         * internal-fn.def (SELECT_VL): Ditto.
>         * optabs.def (OPTAB_D): Ditto.
>         * tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
>         * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Ditto.
>         * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): Ditto.
>         (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): Ditto.
> 
> ---
>  gcc/doc/md.texi             | 22 ++++++++++++
>  gcc/internal-fn.def         |  1 +
>  gcc/optabs.def              |  1 +
>  gcc/tree-vect-loop-manip.cc | 32 ++++++++++++-----
>  gcc/tree-vect-loop.cc       | 72 +++++++++++++++++++++++++++++++++++++
>  gcc/tree-vect-stmts.cc      | 69 +++++++++++++++++++++++++++++++----
>  gcc/tree-vectorizer.h       |  6 ++++
>  7 files changed, 187 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 6a435eb4461..95f7fe1f802 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -4974,6 +4974,28 @@ for (i = 1; i < operand3; i++)
>    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
>  @end smallexample
>  
> +@cindex @code{select_vl@var{m}} instruction pattern
> +@item @code{select_vl@var{m}}
> +Set operand 0 to the number of scalar iterations that should be handled
> +by one iteration of a vector loop.  Operand 1 is the total number of
> +scalar iterations that the loop needs to process and operand 2 is a
> +maximum bound on the result (also known as the maximum ``vectorization
> +factor'').
> +
> +The maximum value of operand 0 is given by:
> +@smallexample
> +operand0 = MIN (operand1, operand2)
> +@end smallexample
> +However, targets might choose a lower value than this, based on
> +target-specific criteria.  Each iteration of the vector loop might
> +therefore process a different number of scalar iterations, which in turn
> +means that induction variables will have a variable step.  Because of
> +this, it is generally not useful to define this instruction if it will
> +always calculate the maximum value.
> +
> +This optab is only useful on targets that implement @samp{len_load_@var{m}}
> +and/or @samp{len_store_@var{m}}.
> +
>  @cindex @code{check_raw_ptrs@var{m}} instruction pattern
>  @item @samp{check_raw_ptrs@var{m}}
>  Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 3ac9d82aace..5d638de6d06 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -177,6 +177,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
>  DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>  
>  DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> +DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
>  DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
>         check_raw_ptrs, check_ptrs)
>  DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 6c064ff4993..f31b69c5d85 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -488,3 +488,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
>  OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
>  OPTAB_D (len_load_optab, "len_load_$a")
>  OPTAB_D (len_store_optab, "len_store_$a")
> +OPTAB_D (select_vl_optab, "select_vl$a")
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 3f735945e67..1c8100c1a1c 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -534,7 +534,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>     _10 = (unsigned long) count_12(D);
>     ...
>     # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> -    _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> +    _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
>     ...
>     vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
>     ...
> @@ -549,15 +549,28 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>        tree step = rgc->controls.length () == 1 ? rgc->controls[0]
>         : make_ssa_name (iv_type);
>        /* Create decrement IV.  */
> -      create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
> - &incr_gsi, insert_after, &index_before_incr,
> - &index_after_incr);
> -      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
> -     index_before_incr,
> -     nitems_step));
> +      if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> + {
> +   create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
> +      insert_after, &index_before_incr, &index_after_incr);
> +   tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
> +    index_before_incr, nitems_step);
> +   gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
> + }
> +      else
> + {
> +   create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
> +      &incr_gsi, insert_after, &index_before_incr,
> +      &index_after_incr);
> +   gimple_seq_add_stmt (header_seq,
> +        gimple_build_assign (step, MIN_EXPR,
> +     index_before_incr,
> +     nitems_step));
> + }
>        *iv_step = step;
>        *compare_step = nitems_step;
> -      return index_before_incr;
> +      return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
> +        : index_before_incr;
>      }
>  
>    /* Create increment IV.  */
> @@ -888,7 +901,8 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>    /* Get a boolean result that tells us whether to iterate.  */
>    edge exit_edge = single_exit (loop);
>    gcond *cond_stmt;
> -  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +      && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
>      {
>        gcc_assert (compare_step);
>        tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 5b7a0da0034..ace9e759f5b 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -974,6 +974,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
>      can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
>      using_partial_vectors_p (false),
>      using_decrementing_iv_p (false),
> +    using_select_vl_p (false),
>      epil_using_partial_vectors_p (false),
>      partial_load_store_bias (0),
>      peeling_for_gaps (false),
> @@ -2737,6 +2738,77 @@ start_over:
>  LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
>      LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
>  
> +  /* If a loop uses length controls and has a decrementing loop control IV,
> +     we will normally pass that IV through a MIN_EXPR to calcaluate the
> +     basis for the length controls.  E.g. in a loop that processes one
> +     element per scalar iteration, the number of elements would be
> +     MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
> +
> +     This MIN_EXPR approach allows us to use pointer IVs with an invariant
> +     step, since only the final iteration of the vector loop can have
> +     inactive lanes.
> +
> +     However, some targets have a dedicated instruction for calculating the
> +     preferred length, given the total number of elements that still need to
> +     be processed.  This is encapsulated in the SELECT_VL internal function.
> +
> +     If the target supports SELECT_VL, we can use it instead of MIN_EXPR
> +     to determine the basis for the length controls.  However, unlike the
> +     MIN_EXPR calculation, the SELECT_VL calculation can decide to make
> +     lanes inactive in any iteration of the vector loop, not just the last
> +     iteration.  This SELECT_VL approach therefore requires us to use pointer
> +     IVs with variable steps.
> +
> +     Once we've decided how many elements should be processed by one
> +     iteration of the vector loop, we need to populate the rgroup controls.
> +     If a loop has multiple rgroups, we need to make sure that those rgroups
> +     "line up" (that is, they must be consistent about which elements are
> +     active and which aren't).  This is done by vect_adjust_loop_lens_control.
> +
> +     In principle, it would be possible to use vect_adjust_loop_lens_control
> +     on either the result of a MIN_EXPR or the result of a SELECT_VL.
> +     However:
> +
> +     (1) In practice, it only makes sense to use SELECT_VL when a vector
> + operation will be controlled directly by the result.  It is not
> + worth using SELECT_VL if it would only be the input to other
> + calculations.
> +
> +     (2) If we use SELECT_VL for an rgroup that has N controls, each associated
> + pointer IV will need N updates by a variable amount (N-1 updates
> + within the iteration and 1 update to move to the next iteration).
> +
> +     Because of this, we prefer to use the MIN_EXPR approach whenever there
> +     is more than one length control.
> +
> +     In addition, SELECT_VL always operates to a granularity of 1 unit.
> +     If we wanted to use it to control an SLP operation on N consecutive
> +     elements, we would need to make the SELECT_VL inputs measure scalar
> +     iterations (rather than elements) and then multiply the SELECT_VL
> +     result by N.  But using SELECT_VL this way is inefficient because
> +     of (1) above.
> +
> +     2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
> + satisfied:
> +
> +     (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
> +     (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
> +
> +     Since SELECT_VL (variable step) will make SCEV analysis failed and then
> +     we will fail to gain benefits of following unroll optimizations. We prefer
> +     using the MIN_EXPR approach in this situation.  */
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +    {
> +      tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +      if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
> +   OPTIMIZE_FOR_SPEED)
> +   && LOOP_VINFO_LENS (loop_vinfo).length () == 1
> +   && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
> +   && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
> + LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
> +    }
> +
>    /* If we're vectorizing an epilogue loop, the vectorized loop either needs
>       to be able to handle fewer than VF scalars, or needs to have a lower VF
>       than the main loop.  */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c7e4e71d3c5..a7acc032d47 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -3128,19 +3128,72 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
>    *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
>  }
>  
> +/* Prepare the pointer IVs which needs to be updated by a variable amount.
> +   Such variable amount is the outcome of .SELECT_VL. In this case, we can
> +   allow each iteration process the flexible number of elements as long as
> +   the number <= vf elments.
> +
> +   Return data reference according to SELECT_VL.
> +   If new statements are needed, insert them before GSI.  */
> +
> +static tree
> +vect_get_loop_variant_data_ptr_increment (
> +  vec_info *vinfo, tree aggr_type, gimple_stmt_iterator *gsi,
> +  vec_loop_lens *loop_lens, dr_vec_info *dr_info,
> +  vect_memory_access_type memory_access_type)
> +{
> +  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
> +  tree step = vect_dr_behavior (vinfo, dr_info)->step;
> +
> +  /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
> +     IVs are updated by variable amount but we will support them in the future.
> +   */
> +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> +       && memory_access_type != VMAT_LOAD_STORE_LANES);
> +
> +  /* When we support SELECT_VL pattern, we dynamic adjust
> +     the memory address by .SELECT_VL result.
> +
> +     The result of .SELECT_VL is the number of elements to
> +     be processed of each iteration. So the memory address
> +     adjustment operation should be:
> +
> +     addr = addr + .SELECT_VL (ARG..) * step;
> +  */
> +  tree loop_len
> +    = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0, 0);
> +  tree len_type = TREE_TYPE (loop_len);
> +  /* Since the outcome of .SELECT_VL is element size, we should adjust
> +     it into bytesize so that it can be used in address pointer variable
> +     amount IVs adjustment.  */
> +  tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
> +   wide_int_to_tree (len_type, wi::to_widest (step)));
> +  tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
> +  gassign *assign = gimple_build_assign (bump, tmp);
> +  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> +  return bump;
> +}
> +
>  /* Return the amount that should be added to a vector pointer to move
>     to the next or previous copy of AGGR_TYPE.  DR_INFO is the data reference
>     being vectorized and MEMORY_ACCESS_TYPE describes the type of
>     vectorization.  */
>  
>  static tree
> -vect_get_data_ptr_increment (vec_info *vinfo,
> +vect_get_data_ptr_increment (vec_info *vinfo, gimple_stmt_iterator *gsi,
>       dr_vec_info *dr_info, tree aggr_type,
> -      vect_memory_access_type memory_access_type)
> +      vect_memory_access_type memory_access_type,
> +      vec_loop_lens *loop_lens = nullptr)
>  {
>    if (memory_access_type == VMAT_INVARIANT)
>      return size_zero_node;
>  
> +  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
> +  if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> +    return vect_get_loop_variant_data_ptr_increment (vinfo, aggr_type, gsi,
> +      loop_lens, dr_info,
> +      memory_access_type);
> +
>    tree iv_step = TYPE_SIZE_UNIT (aggr_type);
>    tree step = vect_dr_behavior (vinfo, dr_info)->step;
>    if (tree_int_cst_sgn (step) == -1)
> @@ -7629,7 +7682,7 @@ vectorizable_scan_store (vec_info *vinfo,
>    tree vec_oprnd3 = NULL_TREE;
>    tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr);
>    tree dataref_offset = build_int_cst (ref_type, 0);
> -  tree bump = vect_get_data_ptr_increment (vinfo, dr_info,
> +  tree bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info,
>     vectype, VMAT_CONTIGUOUS);
>    tree ldataref_ptr = NULL_TREE;
>    tree orig = NULL_TREE;
> @@ -8539,8 +8592,8 @@ vectorizable_store (vec_info *vinfo,
>  aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
>        else
>  aggr_type = vectype;
> -      bump = vect_get_data_ptr_increment (vinfo, dr_info, aggr_type,
> -   memory_access_type);
> +      bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
> +   memory_access_type, loop_lens);
>      }
>  
>    if (mask)
> @@ -8663,6 +8716,7 @@ vectorizable_store (vec_info *vinfo,
>  }
>        else
>  {
> +   gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));  
>    /* For interleaved stores we created vectorized defs for all the
>       defs stored in OPRNDS in the previous iteration (previous copy).
>       DR_CHAIN is then used as an input to vect_permute_store_chain().
> @@ -9975,8 +10029,8 @@ vectorizable_load (vec_info *vinfo,
>  aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
>        else
>  aggr_type = vectype;
> -      bump = vect_get_data_ptr_increment (vinfo, dr_info, aggr_type,
> -   memory_access_type);
> +      bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
> +   memory_access_type, loop_lens);
>      }
>  
>    auto_vec<tree> vec_offsets;
> @@ -10056,6 +10110,7 @@ vectorizable_load (vec_info *vinfo,
>  }
>        else
>  {
> +   gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>    if (dataref_offset)
>      dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
>        bump);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 1e44f8542d7..af25d20bd7e 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -825,6 +825,11 @@ public:
>  (b) can iterate more than once.  */
>    bool using_decrementing_iv_p;
>  
> +  /* True if we've decided to use output of select_vl to adjust IV of
> +     both loop control and data reference pointer. This is only true
> +     for single-rgroup control.  */
> +  bool using_select_vl_p;
> +
>    /* True if we've decided to use partially-populated vectors for the
>       epilogue of loop.  */
>    bool epil_using_partial_vectors_p;
> @@ -898,6 +903,7 @@ public:
>  #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
>  #define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
> +#define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p
>  #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
>    (L)->epil_using_partial_vectors_p
>  #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
>
juzhe.zhong@rivai.ai June 9, 2023, 11:32 a.m. UTC | #4
Thanks a lot Richi.

Even though last time Richard asked me no need to wait for 2nd ACK,
I am still want to wait for Richard final approval since I am not sure this patch is ok for him.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-06-09 19:02
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH V6] VECT: Add SELECT_VL support
On Fri, 9 Jun 2023, juzhe.zhong@rivai.ai wrote:
 
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Co-authored-by: Richard Sandiford<richard.sandiford@arm.com>
> Co-authored-by: Richard Biener <rguenther@suse.de>
> 
> This patch address comments from Richard && Richi and rebase to trunk.
> 
> This patch is adding SELECT_VL middle-end support
> allow target have target dependent optimization in case of
> length calculation.
> 
> This patch is inspired by RVV ISA and LLVM:
> https://reviews.llvm.org/D99750
> 
> The SELECT_VL is same behavior as LLVM "get_vector_length" with
> these following properties:
> 
> 1. Only apply on single-rgroup.
> 2. non SLP.
> 3. adjust loop control IV.
> 4. adjust data reference IV.
> 5. allow non-vf elements processing in non-final iteration
> 
> Code:
>    # void vvaddint32(size_t n, const int*x, const int*y, int*z)
>     # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
> 
> Take RVV codegen for example:
> 
> Before this patch:
> vvaddint32:
>         ble     a0,zero,.L6
>         csrr    a4,vlenb
>         srli    a6,a4,2
> .L4:
>         mv      a5,a0
>         bleu    a0,a6,.L3
>         mv      a5,a6
> .L3:
>         vsetvli zero,a5,e32,m1,ta,ma
>         vle32.v v2,0(a1)
>         vle32.v v1,0(a2)
>         vsetvli a7,zero,e32,m1,ta,ma
>         sub     a0,a0,a5
>         vadd.vv v1,v1,v2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vse32.v v1,0(a3)
>         add     a2,a2,a4
>         add     a3,a3,a4
>         add     a1,a1,a4
>         bne     a0,zero,.L4
> .L6:
>         ret
> 
> After this patch:
> 
> vvaddint32:
>     vsetvli t0, a0, e32, ta, ma  # Set vector length based on 32-bit vectors
>     vle32.v v0, (a1)         # Get first vector
>       sub a0, a0, t0         # Decrement number done
>       slli t0, t0, 2         # Multiply number done by 4 bytes
>       add a1, a1, t0         # Bump pointer
>     vle32.v v1, (a2)         # Get second vector
>       add a2, a2, t0         # Bump pointer
>     vadd.vv v2, v0, v1       # Sum vectors
>     vse32.v v2, (a3)         # Store result
>       add a3, a3, t0         # Bump pointer
>       bnez a0, vvaddint32    # Loop back
>       ret                    # Finished
 
OK.
 
Thanks,
Richard.
 
> gcc/ChangeLog:
> 
>         * doc/md.texi: Add SELECT_VL support.
>         * internal-fn.def (SELECT_VL): Ditto.
>         * optabs.def (OPTAB_D): Ditto.
>         * tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
>         * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Ditto.
>         * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): Ditto.
>         (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): Ditto.
> 
> ---
>  gcc/doc/md.texi             | 22 ++++++++++++
>  gcc/internal-fn.def         |  1 +
>  gcc/optabs.def              |  1 +
>  gcc/tree-vect-loop-manip.cc | 32 ++++++++++++-----
>  gcc/tree-vect-loop.cc       | 72 +++++++++++++++++++++++++++++++++++++
>  gcc/tree-vect-stmts.cc      | 69 +++++++++++++++++++++++++++++++----
>  gcc/tree-vectorizer.h       |  6 ++++
>  7 files changed, 187 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 6a435eb4461..95f7fe1f802 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -4974,6 +4974,28 @@ for (i = 1; i < operand3; i++)
>    operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
>  @end smallexample
>  
> +@cindex @code{select_vl@var{m}} instruction pattern
> +@item @code{select_vl@var{m}}
> +Set operand 0 to the number of scalar iterations that should be handled
> +by one iteration of a vector loop.  Operand 1 is the total number of
> +scalar iterations that the loop needs to process and operand 2 is a
> +maximum bound on the result (also known as the maximum ``vectorization
> +factor'').
> +
> +The maximum value of operand 0 is given by:
> +@smallexample
> +operand0 = MIN (operand1, operand2)
> +@end smallexample
> +However, targets might choose a lower value than this, based on
> +target-specific criteria.  Each iteration of the vector loop might
> +therefore process a different number of scalar iterations, which in turn
> +means that induction variables will have a variable step.  Because of
> +this, it is generally not useful to define this instruction if it will
> +always calculate the maximum value.
> +
> +This optab is only useful on targets that implement @samp{len_load_@var{m}}
> +and/or @samp{len_store_@var{m}}.
> +
>  @cindex @code{check_raw_ptrs@var{m}} instruction pattern
>  @item @samp{check_raw_ptrs@var{m}}
>  Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 3ac9d82aace..5d638de6d06 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -177,6 +177,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
>  DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>  
>  DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> +DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
>  DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
>         check_raw_ptrs, check_ptrs)
>  DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 6c064ff4993..f31b69c5d85 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -488,3 +488,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
>  OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
>  OPTAB_D (len_load_optab, "len_load_$a")
>  OPTAB_D (len_store_optab, "len_store_$a")
> +OPTAB_D (select_vl_optab, "select_vl$a")
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 3f735945e67..1c8100c1a1c 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -534,7 +534,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>     _10 = (unsigned long) count_12(D);
>     ...
>     # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> -    _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> +    _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
>     ...
>     vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
>     ...
> @@ -549,15 +549,28 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>        tree step = rgc->controls.length () == 1 ? rgc->controls[0]
>         : make_ssa_name (iv_type);
>        /* Create decrement IV.  */
> -      create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
> - &incr_gsi, insert_after, &index_before_incr,
> - &index_after_incr);
> -      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
> -     index_before_incr,
> -     nitems_step));
> +      if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> + {
> +   create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
> +      insert_after, &index_before_incr, &index_after_incr);
> +   tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
> +    index_before_incr, nitems_step);
> +   gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
> + }
> +      else
> + {
> +   create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
> +      &incr_gsi, insert_after, &index_before_incr,
> +      &index_after_incr);
> +   gimple_seq_add_stmt (header_seq,
> +        gimple_build_assign (step, MIN_EXPR,
> +     index_before_incr,
> +     nitems_step));
> + }
>        *iv_step = step;
>        *compare_step = nitems_step;
> -      return index_before_incr;
> +      return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
> +        : index_before_incr;
>      }
>  
>    /* Create increment IV.  */
> @@ -888,7 +901,8 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>    /* Get a boolean result that tells us whether to iterate.  */
>    edge exit_edge = single_exit (loop);
>    gcond *cond_stmt;
> -  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +      && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
>      {
>        gcc_assert (compare_step);
>        tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 5b7a0da0034..ace9e759f5b 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -974,6 +974,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
>      can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
>      using_partial_vectors_p (false),
>      using_decrementing_iv_p (false),
> +    using_select_vl_p (false),
>      epil_using_partial_vectors_p (false),
>      partial_load_store_bias (0),
>      peeling_for_gaps (false),
> @@ -2737,6 +2738,77 @@ start_over:
>  LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
>      LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
>  
> +  /* If a loop uses length controls and has a decrementing loop control IV,
> +     we will normally pass that IV through a MIN_EXPR to calcaluate the
> +     basis for the length controls.  E.g. in a loop that processes one
> +     element per scalar iteration, the number of elements would be
> +     MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
> +
> +     This MIN_EXPR approach allows us to use pointer IVs with an invariant
> +     step, since only the final iteration of the vector loop can have
> +     inactive lanes.
> +
> +     However, some targets have a dedicated instruction for calculating the
> +     preferred length, given the total number of elements that still need to
> +     be processed.  This is encapsulated in the SELECT_VL internal function.
> +
> +     If the target supports SELECT_VL, we can use it instead of MIN_EXPR
> +     to determine the basis for the length controls.  However, unlike the
> +     MIN_EXPR calculation, the SELECT_VL calculation can decide to make
> +     lanes inactive in any iteration of the vector loop, not just the last
> +     iteration.  This SELECT_VL approach therefore requires us to use pointer
> +     IVs with variable steps.
> +
> +     Once we've decided how many elements should be processed by one
> +     iteration of the vector loop, we need to populate the rgroup controls.
> +     If a loop has multiple rgroups, we need to make sure that those rgroups
> +     "line up" (that is, they must be consistent about which elements are
> +     active and which aren't).  This is done by vect_adjust_loop_lens_control.
> +
> +     In principle, it would be possible to use vect_adjust_loop_lens_control
> +     on either the result of a MIN_EXPR or the result of a SELECT_VL.
> +     However:
> +
> +     (1) In practice, it only makes sense to use SELECT_VL when a vector
> + operation will be controlled directly by the result.  It is not
> + worth using SELECT_VL if it would only be the input to other
> + calculations.
> +
> +     (2) If we use SELECT_VL for an rgroup that has N controls, each associated
> + pointer IV will need N updates by a variable amount (N-1 updates
> + within the iteration and 1 update to move to the next iteration).
> +
> +     Because of this, we prefer to use the MIN_EXPR approach whenever there
> +     is more than one length control.
> +
> +     In addition, SELECT_VL always operates to a granularity of 1 unit.
> +     If we wanted to use it to control an SLP operation on N consecutive
> +     elements, we would need to make the SELECT_VL inputs measure scalar
> +     iterations (rather than elements) and then multiply the SELECT_VL
> +     result by N.  But using SELECT_VL this way is inefficient because
> +     of (1) above.
> +
> +     2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
> + satisfied:
> +
> +     (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
> +     (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
> +
> +     Since SELECT_VL (variable step) will make SCEV analysis failed and then
> +     we will fail to gain benefits of following unroll optimizations. We prefer
> +     using the MIN_EXPR approach in this situation.  */
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +    {
> +      tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +      if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
> +   OPTIMIZE_FOR_SPEED)
> +   && LOOP_VINFO_LENS (loop_vinfo).length () == 1
> +   && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
> +   && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
> + LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
> +    }
> +
>    /* If we're vectorizing an epilogue loop, the vectorized loop either needs
>       to be able to handle fewer than VF scalars, or needs to have a lower VF
>       than the main loop.  */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c7e4e71d3c5..a7acc032d47 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -3128,19 +3128,72 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
>    *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
>  }
>  
> +/* Prepare the pointer IVs which needs to be updated by a variable amount.
> +   Such variable amount is the outcome of .SELECT_VL. In this case, we can
> +   allow each iteration process the flexible number of elements as long as
> +   the number <= vf elments.
> +
> +   Return data reference according to SELECT_VL.
> +   If new statements are needed, insert them before GSI.  */
> +
> +static tree
> +vect_get_loop_variant_data_ptr_increment (
> +  vec_info *vinfo, tree aggr_type, gimple_stmt_iterator *gsi,
> +  vec_loop_lens *loop_lens, dr_vec_info *dr_info,
> +  vect_memory_access_type memory_access_type)
> +{
> +  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
> +  tree step = vect_dr_behavior (vinfo, dr_info)->step;
> +
> +  /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
> +     IVs are updated by variable amount but we will support them in the future.
> +   */
> +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> +       && memory_access_type != VMAT_LOAD_STORE_LANES);
> +
> +  /* When we support SELECT_VL pattern, we dynamic adjust
> +     the memory address by .SELECT_VL result.
> +
> +     The result of .SELECT_VL is the number of elements to
> +     be processed of each iteration. So the memory address
> +     adjustment operation should be:
> +
> +     addr = addr + .SELECT_VL (ARG..) * step;
> +  */
> +  tree loop_len
> +    = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0, 0);
> +  tree len_type = TREE_TYPE (loop_len);
> +  /* Since the outcome of .SELECT_VL is element size, we should adjust
> +     it into bytesize so that it can be used in address pointer variable
> +     amount IVs adjustment.  */
> +  tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
> +   wide_int_to_tree (len_type, wi::to_widest (step)));
> +  tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
> +  gassign *assign = gimple_build_assign (bump, tmp);
> +  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> +  return bump;
> +}
> +
>  /* Return the amount that should be added to a vector pointer to move
>     to the next or previous copy of AGGR_TYPE.  DR_INFO is the data reference
>     being vectorized and MEMORY_ACCESS_TYPE describes the type of
>     vectorization.  */
>  
>  static tree
> -vect_get_data_ptr_increment (vec_info *vinfo,
> +vect_get_data_ptr_increment (vec_info *vinfo, gimple_stmt_iterator *gsi,
>       dr_vec_info *dr_info, tree aggr_type,
> -      vect_memory_access_type memory_access_type)
> +      vect_memory_access_type memory_access_type,
> +      vec_loop_lens *loop_lens = nullptr)
>  {
>    if (memory_access_type == VMAT_INVARIANT)
>      return size_zero_node;
>  
> +  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
> +  if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> +    return vect_get_loop_variant_data_ptr_increment (vinfo, aggr_type, gsi,
> +      loop_lens, dr_info,
> +      memory_access_type);
> +
>    tree iv_step = TYPE_SIZE_UNIT (aggr_type);
>    tree step = vect_dr_behavior (vinfo, dr_info)->step;
>    if (tree_int_cst_sgn (step) == -1)
> @@ -7629,7 +7682,7 @@ vectorizable_scan_store (vec_info *vinfo,
>    tree vec_oprnd3 = NULL_TREE;
>    tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr);
>    tree dataref_offset = build_int_cst (ref_type, 0);
> -  tree bump = vect_get_data_ptr_increment (vinfo, dr_info,
> +  tree bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info,
>     vectype, VMAT_CONTIGUOUS);
>    tree ldataref_ptr = NULL_TREE;
>    tree orig = NULL_TREE;
> @@ -8539,8 +8592,8 @@ vectorizable_store (vec_info *vinfo,
>  aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
>        else
>  aggr_type = vectype;
> -      bump = vect_get_data_ptr_increment (vinfo, dr_info, aggr_type,
> -   memory_access_type);
> +      bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
> +   memory_access_type, loop_lens);
>      }
>  
>    if (mask)
> @@ -8663,6 +8716,7 @@ vectorizable_store (vec_info *vinfo,
>  }
>        else
>  {
> +   gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));  
>    /* For interleaved stores we created vectorized defs for all the
>       defs stored in OPRNDS in the previous iteration (previous copy).
>       DR_CHAIN is then used as an input to vect_permute_store_chain().
> @@ -9975,8 +10029,8 @@ vectorizable_load (vec_info *vinfo,
>  aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
>        else
>  aggr_type = vectype;
> -      bump = vect_get_data_ptr_increment (vinfo, dr_info, aggr_type,
> -   memory_access_type);
> +      bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
> +   memory_access_type, loop_lens);
>      }
>  
>    auto_vec<tree> vec_offsets;
> @@ -10056,6 +10110,7 @@ vectorizable_load (vec_info *vinfo,
>  }
>        else
>  {
> +   gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>    if (dataref_offset)
>      dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
>        bump);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 1e44f8542d7..af25d20bd7e 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -825,6 +825,11 @@ public:
>  (b) can iterate more than once.  */
>    bool using_decrementing_iv_p;
>  
> +  /* True if we've decided to use output of select_vl to adjust IV of
> +     both loop control and data reference pointer. This is only true
> +     for single-rgroup control.  */
> +  bool using_select_vl_p;
> +
>    /* True if we've decided to use partially-populated vectors for the
>       epilogue of loop.  */
>    bool epil_using_partial_vectors_p;
> @@ -898,6 +903,7 @@ public:
>  #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
>  #define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
> +#define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p
>  #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
>    (L)->epil_using_partial_vectors_p
>  #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
>
Jeff Law June 9, 2023, 2:42 p.m. UTC | #5
On 6/9/23 05:32, juzhe.zhong@rivai.ai wrote:
> Thanks a lot Richi.
> 
> Even though last time Richard asked me no need to wait for 2nd ACK,
> I am still want to wait for Richard final approval since I am not sure this patch is ok for him.
If Richard had asked you to wait for Richi and you've done updates based 
on Richi's feedback, then it becomes a judgment call -- if the changes 
are significant, then we might want Richard to take another look.  If 
the changes are minor, then getting another ACK isn't necessary.

It's not always clear what the best path forward ought to be and in 
cases where it isn't clear, a bit of caution is appreciated.

So to give some clear guidance.  Based on my understanding both Richard 
and Richi are basically on board with what you've implemented.  So how 
about this, if Richard hasn't chimed in by the start of your day on 
Tuesday, go ahead with the patch.  That gives Richard the rest of today 
and his Monday if there's something he wants to comment on.

jeff
juzhe.zhong@rivai.ai June 9, 2023, 2:43 p.m. UTC | #6
Ok. Thanks Jeff.



juzhe.zhong@rivai.ai
 
From: Jeff Law
Date: 2023-06-09 22:42
To: juzhe.zhong@rivai.ai; rguenther
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH V6] VECT: Add SELECT_VL support
 
 
On 6/9/23 05:32, juzhe.zhong@rivai.ai wrote:
> Thanks a lot Richi.
> 
> Even though last time Richard asked me no need to wait for 2nd ACK,
> I am still want to wait for Richard final approval since I am not sure this patch is ok for him.
If Richard had asked you to wait for Richi and you've done updates based 
on Richi's feedback, then it becomes a judgment call -- if the changes 
are significant, then we might want Richard to take another look.  If 
the changes are minor, then getting another ACK isn't necessary.
 
It's not always clear what the best path forward ought to be and in 
cases where it isn't clear, a bit of caution is appreciated.
 
So to give some clear guidance.  Based on my understanding both Richard 
and Richi are basically on board with what you've implemented.  So how 
about this, if Richard hasn't chimed in by the start of your day on 
Tuesday, go ahead with the patch.  That gives Richard the rest of today 
and his Monday if there's something he wants to comment on.
 
jeff
Richard Sandiford June 9, 2023, 5:44 p.m. UTC | #7
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Thanks, Richi.
>
> Should I wait for Richard ACK gain ? 
> Since the last email of this patch, he just asked me to adjust comment no codes change.
> I am not sure whether he is ok.

Yeah, OK from my POV too, thanks.

Richard
Li, Pan2 via Gcc-patches June 9, 2023, 11:54 p.m. UTC | #8
Committed with bootstrap and regression test passed, thanks all.

Pan

-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Richard Sandiford via Gcc-patches
Sent: Saturday, June 10, 2023 1:44 AM
To: juzhe.zhong@rivai.ai
Cc: rguenther <rguenther@suse.de>; gcc-patches <gcc-patches@gcc.gnu.org>
Subject: Re: [PATCH V6] VECT: Add SELECT_VL support

"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Thanks, Richi.
>
> Should I wait for Richard ACK gain ? 
> Since the last email of this patch, he just asked me to adjust comment no codes change.
> I am not sure whether he is ok.

Yeah, OK from my POV too, thanks.

Richard
diff mbox series

Patch

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 6a435eb4461..95f7fe1f802 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4974,6 +4974,28 @@  for (i = 1; i < operand3; i++)
   operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
 @end smallexample
 
+@cindex @code{select_vl@var{m}} instruction pattern
+@item @code{select_vl@var{m}}
+Set operand 0 to the number of scalar iterations that should be handled
+by one iteration of a vector loop.  Operand 1 is the total number of
+scalar iterations that the loop needs to process and operand 2 is a
+maximum bound on the result (also known as the maximum ``vectorization
+factor'').
+
+The maximum value of operand 0 is given by:
+@smallexample
+operand0 = MIN (operand1, operand2)
+@end smallexample
+However, targets might choose a lower value than this, based on
+target-specific criteria.  Each iteration of the vector loop might
+therefore process a different number of scalar iterations, which in turn
+means that induction variables will have a variable step.  Because of
+this, it is generally not useful to define this instruction if it will
+always calculate the maximum value.
+
+This optab is only useful on targets that implement @samp{len_load_@var{m}}
+and/or @samp{len_store_@var{m}}.
+
 @cindex @code{check_raw_ptrs@var{m}} instruction pattern
 @item @samp{check_raw_ptrs@var{m}}
 Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 3ac9d82aace..5d638de6d06 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -177,6 +177,7 @@  DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
 DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
 		       check_raw_ptrs, check_ptrs)
 DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 6c064ff4993..f31b69c5d85 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -488,3 +488,4 @@  OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
 OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
 OPTAB_D (len_load_optab, "len_load_$a")
 OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (select_vl_optab, "select_vl$a")
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 3f735945e67..1c8100c1a1c 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -534,7 +534,7 @@  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 	   _10 = (unsigned long) count_12(D);
 	   ...
 	   # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
-	   _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
+	   _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
 	   ...
 	   vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
 	   ...
@@ -549,15 +549,28 @@  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
       tree step = rgc->controls.length () == 1 ? rgc->controls[0]
 					       : make_ssa_name (iv_type);
       /* Create decrement IV.  */
-      create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
-		 &incr_gsi, insert_after, &index_before_incr,
-		 &index_after_incr);
-      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
-							    index_before_incr,
-							    nitems_step));
+      if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+	{
+	  create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
+		     insert_after, &index_before_incr, &index_after_incr);
+	  tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
+				   index_before_incr, nitems_step);
+	  gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
+	}
+      else
+	{
+	  create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
+		     &incr_gsi, insert_after, &index_before_incr,
+		     &index_after_incr);
+	  gimple_seq_add_stmt (header_seq,
+			       gimple_build_assign (step, MIN_EXPR,
+						    index_before_incr,
+						    nitems_step));
+	}
       *iv_step = step;
       *compare_step = nitems_step;
-      return index_before_incr;
+      return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
+						       : index_before_incr;
     }
 
   /* Create increment IV.  */
@@ -888,7 +901,8 @@  vect_set_loop_condition_partial_vectors (class loop *loop,
   /* Get a boolean result that tells us whether to iterate.  */
   edge exit_edge = single_exit (loop);
   gcond *cond_stmt;
-  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+      && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
     {
       gcc_assert (compare_step);
       tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5b7a0da0034..ace9e759f5b 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -974,6 +974,7 @@  _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
     using_partial_vectors_p (false),
     using_decrementing_iv_p (false),
+    using_select_vl_p (false),
     epil_using_partial_vectors_p (false),
     partial_load_store_bias (0),
     peeling_for_gaps (false),
@@ -2737,6 +2738,77 @@  start_over:
 			LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
 
+  /* If a loop uses length controls and has a decrementing loop control IV,
+     we will normally pass that IV through a MIN_EXPR to calcaluate the
+     basis for the length controls.  E.g. in a loop that processes one
+     element per scalar iteration, the number of elements would be
+     MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
+
+     This MIN_EXPR approach allows us to use pointer IVs with an invariant
+     step, since only the final iteration of the vector loop can have
+     inactive lanes.
+
+     However, some targets have a dedicated instruction for calculating the
+     preferred length, given the total number of elements that still need to
+     be processed.  This is encapsulated in the SELECT_VL internal function.
+
+     If the target supports SELECT_VL, we can use it instead of MIN_EXPR
+     to determine the basis for the length controls.  However, unlike the
+     MIN_EXPR calculation, the SELECT_VL calculation can decide to make
+     lanes inactive in any iteration of the vector loop, not just the last
+     iteration.  This SELECT_VL approach therefore requires us to use pointer
+     IVs with variable steps.
+
+     Once we've decided how many elements should be processed by one
+     iteration of the vector loop, we need to populate the rgroup controls.
+     If a loop has multiple rgroups, we need to make sure that those rgroups
+     "line up" (that is, they must be consistent about which elements are
+     active and which aren't).  This is done by vect_adjust_loop_lens_control.
+
+     In principle, it would be possible to use vect_adjust_loop_lens_control
+     on either the result of a MIN_EXPR or the result of a SELECT_VL.
+     However:
+
+     (1) In practice, it only makes sense to use SELECT_VL when a vector
+	 operation will be controlled directly by the result.  It is not
+	 worth using SELECT_VL if it would only be the input to other
+	 calculations.
+
+     (2) If we use SELECT_VL for an rgroup that has N controls, each associated
+	 pointer IV will need N updates by a variable amount (N-1 updates
+	 within the iteration and 1 update to move to the next iteration).
+
+     Because of this, we prefer to use the MIN_EXPR approach whenever there
+     is more than one length control.
+
+     In addition, SELECT_VL always operates to a granularity of 1 unit.
+     If we wanted to use it to control an SLP operation on N consecutive
+     elements, we would need to make the SELECT_VL inputs measure scalar
+     iterations (rather than elements) and then multiply the SELECT_VL
+     result by N.  But using SELECT_VL this way is inefficient because
+     of (1) above.
+
+     2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
+	satisfied:
+
+     (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
+     (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
+
+     Since SELECT_VL (variable step) will make SCEV analysis failed and then
+     we will fail to gain benefits of following unroll optimizations. We prefer
+     using the MIN_EXPR approach in this situation.  */
+  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+    {
+      tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+      if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
+					  OPTIMIZE_FOR_SPEED)
+	  && LOOP_VINFO_LENS (loop_vinfo).length () == 1
+	  && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
+	  && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+	      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
+	LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
+    }
+
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c7e4e71d3c5..a7acc032d47 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3128,19 +3128,72 @@  vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
   *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
 }
 
+/* Prepare the pointer IVs which needs to be updated by a variable amount.
+   Such variable amount is the outcome of .SELECT_VL. In this case, we can
+   allow each iteration process the flexible number of elements as long as
+   the number <= vf elments.
+
+   Return data reference according to SELECT_VL.
+   If new statements are needed, insert them before GSI.  */
+
+static tree
+vect_get_loop_variant_data_ptr_increment (
+  vec_info *vinfo, tree aggr_type, gimple_stmt_iterator *gsi,
+  vec_loop_lens *loop_lens, dr_vec_info *dr_info,
+  vect_memory_access_type memory_access_type)
+{
+  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
+  tree step = vect_dr_behavior (vinfo, dr_info)->step;
+
+  /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
+     IVs are updated by variable amount but we will support them in the future.
+   */
+  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
+	      && memory_access_type != VMAT_LOAD_STORE_LANES);
+
+  /* When we support SELECT_VL pattern, we dynamic adjust
+     the memory address by .SELECT_VL result.
+
+     The result of .SELECT_VL is the number of elements to
+     be processed of each iteration. So the memory address
+     adjustment operation should be:
+
+     addr = addr + .SELECT_VL (ARG..) * step;
+  */
+  tree loop_len
+    = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0, 0);
+  tree len_type = TREE_TYPE (loop_len);
+  /* Since the outcome of .SELECT_VL is element size, we should adjust
+     it into bytesize so that it can be used in address pointer variable
+     amount IVs adjustment.  */
+  tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
+			  wide_int_to_tree (len_type, wi::to_widest (step)));
+  tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
+  gassign *assign = gimple_build_assign (bump, tmp);
+  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+  return bump;
+}
+
 /* Return the amount that should be added to a vector pointer to move
    to the next or previous copy of AGGR_TYPE.  DR_INFO is the data reference
    being vectorized and MEMORY_ACCESS_TYPE describes the type of
    vectorization.  */
 
 static tree
-vect_get_data_ptr_increment (vec_info *vinfo,
+vect_get_data_ptr_increment (vec_info *vinfo, gimple_stmt_iterator *gsi,
 			     dr_vec_info *dr_info, tree aggr_type,
-			     vect_memory_access_type memory_access_type)
+			     vect_memory_access_type memory_access_type,
+			     vec_loop_lens *loop_lens = nullptr)
 {
   if (memory_access_type == VMAT_INVARIANT)
     return size_zero_node;
 
+  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
+  if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+    return vect_get_loop_variant_data_ptr_increment (vinfo, aggr_type, gsi,
+						     loop_lens, dr_info,
+						     memory_access_type);
+
   tree iv_step = TYPE_SIZE_UNIT (aggr_type);
   tree step = vect_dr_behavior (vinfo, dr_info)->step;
   if (tree_int_cst_sgn (step) == -1)
@@ -7629,7 +7682,7 @@  vectorizable_scan_store (vec_info *vinfo,
   tree vec_oprnd3 = NULL_TREE;
   tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr);
   tree dataref_offset = build_int_cst (ref_type, 0);
-  tree bump = vect_get_data_ptr_increment (vinfo, dr_info,
+  tree bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info,
 					   vectype, VMAT_CONTIGUOUS);
   tree ldataref_ptr = NULL_TREE;
   tree orig = NULL_TREE;
@@ -8539,8 +8592,8 @@  vectorizable_store (vec_info *vinfo,
 	aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
       else
 	aggr_type = vectype;
-      bump = vect_get_data_ptr_increment (vinfo, dr_info, aggr_type,
-					  memory_access_type);
+      bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
+					  memory_access_type, loop_lens);
     }
 
   if (mask)
@@ -8663,6 +8716,7 @@  vectorizable_store (vec_info *vinfo,
 	}
       else
 	{
+	  gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));  
 	  /* For interleaved stores we created vectorized defs for all the
 	     defs stored in OPRNDS in the previous iteration (previous copy).
 	     DR_CHAIN is then used as an input to vect_permute_store_chain().
@@ -9975,8 +10029,8 @@  vectorizable_load (vec_info *vinfo,
 	aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
       else
 	aggr_type = vectype;
-      bump = vect_get_data_ptr_increment (vinfo, dr_info, aggr_type,
-					  memory_access_type);
+      bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
+					  memory_access_type, loop_lens);
     }
 
   auto_vec<tree> vec_offsets;
@@ -10056,6 +10110,7 @@  vectorizable_load (vec_info *vinfo,
 	}
       else
 	{
+	  gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
 	  if (dataref_offset)
 	    dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
 					      bump);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1e44f8542d7..af25d20bd7e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -825,6 +825,11 @@  public:
 	(b) can iterate more than once.  */
   bool using_decrementing_iv_p;
 
+  /* True if we've decided to use output of select_vl to adjust IV of
+     both loop control and data reference pointer. This is only true
+     for single-rgroup control.  */
+  bool using_select_vl_p;
+
   /* True if we've decided to use partially-populated vectors for the
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
@@ -898,6 +903,7 @@  public:
 #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
 #define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
+#define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p
 #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
   (L)->epil_using_partial_vectors_p
 #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias