
Reducing number of alias checks in vectorization.

Message ID CAK=A3=3sjM_MCqDoXwBXPsDiBDRGPuGh3oBkBOt_3685=dUXPw@mail.gmail.com
State New

Commit Message

Cong Hou Oct. 2, 2013, 2:12 a.m. UTC
When aliasing may exist between data refs in a loop, GCC vectorizes it
by doing loop versioning and adding runtime alias checks. Basically,
for each pair of data refs with a possible data dependence, two
comparisons are generated to make sure there is no aliasing between
them in any iteration of the vectorized loop. If there are many such
data ref pairs, the number of comparisons can be very large, which is a
big overhead.
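
For illustration, each such pair costs a runtime test of roughly the
following shape (a plain C sketch with made-up names, not the GIMPLE
the vectorizer actually builds):

  /* The vectorized loop version is taken only when the two accessed
     segments are disjoint.  */
  static int
  segments_disjoint (char *addr_a, unsigned long seg_a,
                     char *addr_b, unsigned long seg_b)
  {
    return addr_a + seg_a <= addr_b || addr_b + seg_b <= addr_a;
  }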

However, in some cases it is possible to reduce the number of those
comparisons. For example, for the following loop, we can detect that
b[0] and b[1] are two consecutive memory accesses, so we can combine
the alias checks between a[0:100]&b[0] and a[0:100]&b[1] into a single
check of a[0:100]&b[0:2]:

void foo(int*a, int* b)
{
   for (int i = 0; i < 100; ++i)
    a[i] = b[0] + b[1];
}

Actually, the requirement of consecutive memory accesses is too
strict. For the following loop, we can still combine the alias checks
between a[0:100]&b[0] and a[0:100]&b[100]:

void foo(int*a, int* b)
{
   for (int i = 0; i < 100; ++i)
    a[i] = b[0] + b[100];
}

This is because if b[0] is not in a[0:100] and b[100] is not in
a[0:100], then a[0:100] cannot lie between b[0] and b[100] either: the
gap between them (99 elements) is smaller than the 100-element segment
of a. So we only need to check that a[0:100] and b[0:101] don't
overlap.

More generally, consider two pairs of data refs (a, b1) and (a, b2).
Suppose addr_b1 and addr_b2 are the basic addresses of data refs b1 and
b2; offset_b1 and offset_b2 (offset_b1 < offset_b2) are the offsets of
b1 and b2; and segment_length_a, segment_length_b1, and
segment_length_b2 are the segment lengths of a, b1, and b2. Then we can
combine the two comparisons into one if the following condition is
satisfied:

offset_b2 - offset_b1 - segment_length_b1 < segment_length_a
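
For example, for the second foo above (assuming a 4-byte int), the two
checks against b[0] and b[100] collapse into a single check of a[0:100]
against b[0:101]. That combined check is roughly the following plain C
sketch (the real code builds the equivalent trees with
fold_build_pointer_plus):

  /* Nonzero when the vectorized version of the second foo is safe,
     i.e. a[0:100] and b[0:101] do not overlap.  */
  static int
  combined_alias_check (int *a, int *b)
  {
    return (char *) (a + 100) <= (char *) b     /* a[0:100] ends before b[0] */
           || (char *) (b + 101) <= (char *) a; /* b[0:101] ends before a[0] */
  }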


This patch detects those combination opportunities to reduce the
number of alias checks. It is tested on an x86-64 machine.


thanks,
Cong

Comments

Andrew Pinski Oct. 2, 2013, 2:22 a.m. UTC | #1
> On Oct 1, 2013, at 7:12 PM, Cong Hou <congh@google.com> wrote:
> 
> When alias exists between data refs in a loop, to vectorize it GCC
> does loop versioning and adds runtime alias checks. Basically for each
> pair of data refs with possible data dependence, there will be two
> comparisons generated to make sure there is no aliasing between them
> in each iteration of the vectorized loop. If there are many such data
> refs pairs, the number of comparisons can be very large, which is a
> big overhead.
> 
> However, in some cases it is possible to reduce the number of those
> comparisons. For example, for the following loop, we can detect that
> b[0] and b[1] are two consecutive member accesses so that we can
> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
> checking a[0:100]&b[0:2]:
> 
> void foo(int*a, int* b)
> {
>   for (int i = 0; i < 100; ++i)
>    a[i] = b[0] + b[1];
> }
> 
> Actually, the requirement of consecutive memory accesses is too
> strict. For the following loop, we can still combine the alias checks
> between a[0:100]&b[0] and a[0:100]&b[100]:
> 
> void foo(int*a, int* b)
> {
>   for (int i = 0; i < 100; ++i)
>    a[i] = b[0] + b[100];
> }
> 
> This is because if b[0] is not in a[0:100] and b[100] is not in
> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
> to check a[0:100] and b[0:101] don't overlap.
> 
> More generally, consider two pairs of data refs (a, b1) and (a, b2).
> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
> segment length of a, b1, and b2. Then we can combine the two
> comparisons into one if the following condition is satisfied:
> 
> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
> 
> 
> This patch detects those combination opportunities to reduce the
> number of alias checks. It is tested on an x86-64 machine.

I like the idea of this patch, but I am not a fan of using the STL, really.  It seems like a little too much dependence on C++ features for my liking.

Thanks,
Andrew

> 
> 
> thanks,
> Cong
> 
> 
> 
> Index: gcc/tree-vect-loop-manip.c
> ===================================================================
> --- gcc/tree-vect-loop-manip.c (revision 202662)
> +++ gcc/tree-vect-loop-manip.c (working copy)
> @@ -19,6 +19,10 @@ You should have received a copy of the G
> along with GCC; see the file COPYING3.  If not see
> <http://www.gnu.org/licenses/>.  */
> 
> +#include <vector>
> +#include <utility>
> +#include <algorithm>
> +
> #include "config.h"
> #include "system.h"
> #include "coretypes.h"
> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>   return segment_length;
> }
> 
> +namespace
> +{
> +
> +/* struct dr_addr_with_seg_len
> +
> +   A struct storing information of a data reference, including the data
> +   ref itself, its basic address, the access offset and the segment length
> +   for aliasing checks.  */
> +
> +struct dr_addr_with_seg_len
> +{
> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
> +
> +  data_reference* dr;
> +  tree basic_addr;
> +  tree offset;
> +  tree seg_len;
> +};
> +
> +/* Operator == between two dr_addr_with_seg_len objects.
> +
> +   This equality operator is used to make sure two data refs
> +   are the same one so that we will consider to combine the
> +   aliasing checks of those two pairs of data dependent data
> +   refs.  */
> +
> +bool operator == (const dr_addr_with_seg_len& d1,
> +  const dr_addr_with_seg_len& d2)
> +{
> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
> + && operand_equal_p (d1.offset, d2.offset, 0)
> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
> +}
> +
> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
> + dr_addr_with_seg_len_pair_t;
> +
> +
> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
> +
> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
> +   so that we can combine aliasing checks during one scan.  */
> +
> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
> + const dr_addr_with_seg_len_pair_t& p2)
> +{
> +  const dr_addr_with_seg_len& p11 = p1.first;
> +  const dr_addr_with_seg_len& p12 = p1.second;
> +  const dr_addr_with_seg_len& p21 = p2.first;
> +  const dr_addr_with_seg_len& p22 = p2.second;
> +
> +  if (p11.basic_addr != p21.basic_addr)
> +    return p11.basic_addr < p21.basic_addr;
> +  if (p12.basic_addr != p22.basic_addr)
> +    return p12.basic_addr < p22.basic_addr;
> +  if (TREE_CODE (p11.offset) != INTEGER_CST
> +      || TREE_CODE (p21.offset) != INTEGER_CST)
> +    return p11.offset < p21.offset;
> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
> +  if (TREE_CODE (p12.offset) != INTEGER_CST
> +      || TREE_CODE (p22.offset) != INTEGER_CST)
> +    return p12.offset < p22.offset;
> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
> +}
> +
> +}
> 
> /* Function vect_create_cond_for_alias_checks.
> 
> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>   if (may_alias_ddrs.is_empty ())
>     return;
> 
> +
> +  /* Basically, for each pair of dependent data refs store_ptr_0
> +     and load_ptr_0, we create an expression:
> +
> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
> +
> +     for aliasing checks. However, in some cases we can decrease
> +     the number of checks by combining two checks into one. For
> +     example, suppose we have another pair of data refs store_ptr_0
> +     and load_ptr_1, and if the following condition is satisfied:
> +
> +     load_ptr_0 < load_ptr_1  &&
> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
> +
> +     (this condition means, in each iteration of vectorized loop,
> +     the accessed memory of store_ptr_0 cannot be between the memory
> +     of load_ptr_0 and load_ptr_1.)
> +
> +     we then can use only the following expression to finish the
> +     alising checks between store_ptr_0 & load_ptr_0 and
> +     store_ptr_0 & load_ptr_1:
> +
> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
> +
> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
> +     same basic address.  */
> +
> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
> +
> +  /* First, we collect all data ref pairs for aliasing checks.  */
> +
>   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>     {
>       struct data_reference *dr_a, *dr_b;
>       gimple dr_group_first_a, dr_group_first_b;
> -      tree addr_base_a, addr_base_b;
>       tree segment_length_a, segment_length_b;
>       gimple stmt_a, stmt_b;
> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
> 
>       dr_a = DDR_A (ddr);
>       stmt_a = DR_STMT (DDR_A (ddr));
>       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>       if (dr_group_first_a)
> -        {
> + {
>   stmt_a = dr_group_first_a;
>   dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>  }
> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>       stmt_b = DR_STMT (DDR_B (ddr));
>       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>       if (dr_group_first_b)
> -        {
> + {
>   stmt_b = dr_group_first_b;
>   dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>  }
> 
> -      addr_base_a
> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
> -       DR_INIT (dr_a)));
> -      addr_base_b
> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
> -       DR_INIT (dr_b)));
> -
>       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>  length_factor = scalar_loop_iters;
>       else
> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
> 
> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
> +  (dr_addr_with_seg_len
> +       (dr_a, DR_BASE_ADDRESS (dr_a),
> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
> + segment_length_a),
> +   dr_addr_with_seg_len
> +       (dr_b, DR_BASE_ADDRESS (dr_b),
> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
> + segment_length_b));
> +
> +      if (dr_with_seg_len_pair.first.basic_addr >
> +  dr_with_seg_len_pair.second.basic_addr)
> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
> +
> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
> +    }
> +
> +  /* Second, we sort the collected data ref pairs so that we can scan
> +     them once to combine all possible aliasing checks.  */
> +
> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
> +
> +  /* Remove duplicate data ref pairs.  */
> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
> + ddrs_with_seg_len.end()),
> +   ddrs_with_seg_len.end());
> +
> +  /* We then scan the sorted dr pairs and check if we can combine
> +     alias checks of two neighbouring dr pairs.  */
> +
> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
> +    {
> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
> +
> +      if (dr_a1 == dr_a2)
> + {
> +  if (dr_b1.basic_addr != dr_b2.basic_addr
> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
> +    continue;
> +
> +  int diff = int_cst_value (dr_b2.offset) -
> +     int_cst_value (dr_b1.offset);
> +
> +  gcc_assert (diff > 0);
> +
> +  if (diff <= vect_factor
> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
> +  && diff - int_cst_value (dr_b1.seg_len) <
> +     int_cst_value (dr_a1.seg_len)))
> +    {
> +      if (dump_enabled_p ())
> + {
> +  dump_printf_loc
> +      (MSG_NOTE, vect_location,
> +       "combining two runtime checks for data references ");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
> +  dump_printf (MSG_NOTE, " and ");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
> +  dump_printf (MSG_NOTE, "\n");
> + }
> +
> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
> +  dr_b2.seg_len, size_int (diff));
> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
> +      --i;
> +    }
> + }
> +      else if (dr_b1 == dr_b2)
> + {
> +  if (dr_a1.basic_addr != dr_a2.basic_addr
> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
> +    continue;
> +
> +  int diff = int_cst_value (dr_a2.offset) -
> +     int_cst_value (dr_a1.offset);
> +
> +  gcc_assert (diff > 0);
> +
> +  if (diff <= vect_factor
> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
> +  && diff - int_cst_value (dr_a1.seg_len) <
> +     int_cst_value (dr_b1.seg_len)))
> +    {
> +      if (dump_enabled_p ())
> + {
> +  dump_printf_loc
> +      (MSG_NOTE, vect_location,
> +       "combining two runtime checks for data references ");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
> +  dump_printf (MSG_NOTE, " and ");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
> +  dump_printf (MSG_NOTE, "\n");
> + }
> +
> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
> +  dr_a2.seg_len, size_int (diff));
> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
> +      --i;
> +    }
> + }
> +    }
> +
> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
> +    {
> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
> +      tree segment_length_a = dr_a.seg_len;
> +      tree segment_length_b = dr_b.seg_len;
> +
> +      tree addr_base_a
> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
> +      tree addr_base_b
> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
> +
>       if (dump_enabled_p ())
>  {
>   dump_printf_loc (MSG_NOTE, vect_location,
> -                           "create runtime check for data references ");
> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
> +   "create runtime check for data references ");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>   dump_printf (MSG_NOTE, " and ");
> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
> -          dump_printf (MSG_NOTE, "\n");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
> +  dump_printf (MSG_NOTE, "\n");
>  }
> 
> -      seg_a_min = addr_base_a;
> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
> +      tree seg_a_min = addr_base_a;
> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>  seg_a_min = seg_a_max, seg_a_max = addr_base_a;
> 
> -      seg_b_min = addr_base_b;
> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
> +      tree seg_b_min = addr_base_b;
> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>  seg_b_min = seg_b_max, seg_b_max = addr_base_b;
> 
>       part_cond_expr =
> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>       adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>     }
> 
> +  /* Extract load and store statements on pointers with zero-stride
> +     accesses.  */
> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
> +    {
> +
> +      /* In the loop body, we iterate each statement to check if it is a load
> + or store. Then we check the DR_STEP of the data reference.  If
> + DR_STEP is zero, then we will hoist the load statement to the loop
> + preheader, and move the store statement to the loop exit.  */
> +
> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
> +   !gsi_end_p (si); )
> + {
> +  gimple stmt = gsi_stmt (si);
> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> +
> +
> +  if (dr && integer_zerop (DR_STEP (dr)))
> +    {
> +      if (DR_IS_READ (dr))
> + {
> +  if (dump_file)
> +    {
> +      fprintf (dump_file,
> +       "Hoist the load to outside of the loop:\n");
> +      print_gimple_stmt (dump_file, stmt, 0,
> + TDF_VOPS|TDF_MEMSYMS);
> +    }
> +
> +  basic_block preheader = loop_preheader_edge (loop)->src;
> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
> +  gsi_move_after (&si, &si_dst);
> + }
> +      else
> + {
> +  gimple_stmt_iterator si_dst =
> +      gsi_last_bb (single_exit (loop)->dest);
> +  gsi_move_after (&si, &si_dst);
> + }
> +              continue;
> +    }
> +  else if (!dr)
> +          {
> +            bool hoist = true;
> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
> +            {
> +              tree op = gimple_op (stmt, i);
> +              if (TREE_CODE (op) == INTEGER_CST
> +                  || TREE_CODE (op) == REAL_CST)
> +                continue;
> +              if (TREE_CODE (op) == SSA_NAME)
> +              {
> +                gimple def = SSA_NAME_DEF_STMT (op);
> +                if (def == stmt
> +                    || gimple_nop_p (def)
> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
> +                  continue;
> +              }
> +              hoist = false;
> +              break;
> +            }
> +
> +            if (hoist)
> +            {
> +              basic_block preheader = loop_preheader_edge (loop)->src;
> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
> +              gsi_move_after (&si, &si_dst);
> +              continue;
> +            }
> +          }
> +          gsi_next (&si);
> + }
> +    }
> +
>   /* End loop-exit-fixes after versioning.  */
> 
>   if (cond_expr_stmt_list)
> Index: gcc/ChangeLog
> ===================================================================
> --- gcc/ChangeLog (revision 202663)
> +++ gcc/ChangeLog (working copy)
> @@ -1,3 +1,8 @@
> +2013-10-01  Cong Hou  <congh@google.com>
> +
> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
> + alias checks if it is possible to amortize the runtime overhead.
> +
Jakub Jelinek Oct. 2, 2013, 6:35 a.m. UTC | #2
On Tue, Oct 01, 2013 at 07:12:54PM -0700, Cong Hou wrote:
> --- gcc/tree-vect-loop-manip.c (revision 202662)
> +++ gcc/tree-vect-loop-manip.c (working copy)

Your mailer ate all the tabs, so the formatting of the whole patch
can't be checked.

> @@ -19,6 +19,10 @@ You should have received a copy of the G
>  along with GCC; see the file COPYING3.  If not see
>  <http://www.gnu.org/licenses/>.  */
> 
> +#include <vector>
> +#include <utility>
> +#include <algorithm>

Why?  GCC has its own vec.h vectors, so why don't you use those?
There is even a qsort method for you in there.  And for pairs, you can
easily just use structs with two members as the elements of the
vector.
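
For reference, here is a rough, untested sketch of what that could look
like, reusing the dr_addr_with_seg_len_pair_t type from your patch:

  static int
  comp_dr_addr_with_seg_len_pair (const void *p1_, const void *p2_)
  {
    const dr_addr_with_seg_len_pair_t *p1
      = (const dr_addr_with_seg_len_pair_t *) p1_;
    const dr_addr_with_seg_len_pair_t *p2
      = (const dr_addr_with_seg_len_pair_t *) p2_;
    /* ... same ordering as your operator <, returning -1/0/1 ...  */
    return 0;
  }

    vec<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
    ddrs_with_seg_len.create (may_alias_ddrs.length ());
    /* ... safe_push () each pair instead of push_back () ...  */
    ddrs_with_seg_len.qsort (comp_dr_addr_with_seg_len_pair);
    /* ... merge neighbouring checks, emit them, then ...  */
    ddrs_with_seg_len.release ();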

> +struct dr_addr_with_seg_len
> +{
> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
> +
> +  data_reference* dr;

Space should be before *, not after it.

> +  if (TREE_CODE (p11.offset) != INTEGER_CST
> +      || TREE_CODE (p21.offset) != INTEGER_CST)
> +    return p11.offset < p21.offset;

If the offset isn't an INTEGER_CST, you are comparing the pointer values?
That is never a good idea: compilation would then depend on how, say,
address space randomization lays out the virtual address space.  GCC
needs to have reproducible compilations.

> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);

This is going to ICE whenever the offsets don't fit into a
HOST_WIDE_INT.

I'd say you just shouldn't put entries into the vector whose offset
isn't host_integerp; those would never be merged with other checks
anyway, or something similar.
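
Something along these lines (an untested sketch; offset_a and offset_b
stand for the size_binop (PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr))
values your patch computes):

  /* Pairs whose offsets are not host integers can never be merged with
     anything, so emit their checks directly instead of collecting them
     for merging.  */
  bool can_merge_p = host_integerp (offset_a, 0)
                     && host_integerp (offset_b, 0);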

	Jakub
Richard Biener Oct. 2, 2013, 11:24 a.m. UTC | #3
On Tue, 1 Oct 2013, Cong Hou wrote:

> When alias exists between data refs in a loop, to vectorize it GCC
> does loop versioning and adds runtime alias checks. Basically for each
> pair of data refs with possible data dependence, there will be two
> comparisons generated to make sure there is no aliasing between them
> in each iteration of the vectorized loop. If there are many such data
> refs pairs, the number of comparisons can be very large, which is a
> big overhead.
> 
> However, in some cases it is possible to reduce the number of those
> comparisons. For example, for the following loop, we can detect that
> b[0] and b[1] are two consecutive member accesses so that we can
> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
> checking a[0:100]&b[0:2]:
> 
> void foo(int*a, int* b)
> {
>    for (int i = 0; i < 100; ++i)
>     a[i] = b[0] + b[1];
> }
> 
> Actually, the requirement of consecutive memory accesses is too
> strict. For the following loop, we can still combine the alias checks
> between a[0:100]&b[0] and a[0:100]&b[100]:
> 
> void foo(int*a, int* b)
> {
>    for (int i = 0; i < 100; ++i)
>     a[i] = b[0] + b[100];
> }
> 
> This is because if b[0] is not in a[0:100] and b[100] is not in
> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
> to check a[0:100] and b[0:101] don't overlap.
> 
> More generally, consider two pairs of data refs (a, b1) and (a, b2).
> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
> segment length of a, b1, and b2. Then we can combine the two
> comparisons into one if the following condition is satisfied:
> 
> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
> 
> 
> This patch detects those combination opportunities to reduce the
> number of alias checks. It is tested on an x86-64 machine.

Apart from the other comments you got (to which I agree) the patch
seems to do two things, namely also:

+  /* Extract load and store statements on pointers with zero-stride
+     accesses.  */
+  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
+    {

which I'd rather see in a separate patch (and done also when
the loop doesn't require versioning for alias).

Also, combining the alias checks in vect_create_cond_for_alias_checks
is nice, but it doesn't properly fix the use of the
vect-max-version-for-alias-checks param, which currently inhibits
vectorization of the HIMENO benchmark by default (and makes us look bad
compared to LLVM).

So I believe this merging should be done incrementally when
we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.

Thanks for working on this,
Richard.

> 
> thanks,
> Cong
> 
> 
> 
> Index: gcc/tree-vect-loop-manip.c
> ===================================================================
> --- gcc/tree-vect-loop-manip.c (revision 202662)
> +++ gcc/tree-vect-loop-manip.c (working copy)
> @@ -19,6 +19,10 @@ You should have received a copy of the G
>  along with GCC; see the file COPYING3.  If not see
>  <http://www.gnu.org/licenses/>.  */
> 
> +#include <vector>
> +#include <utility>
> +#include <algorithm>
> +
>  #include "config.h"
>  #include "system.h"
>  #include "coretypes.h"
> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>    return segment_length;
>  }
> 
> +namespace
> +{
> +
> +/* struct dr_addr_with_seg_len
> +
> +   A struct storing information of a data reference, including the data
> +   ref itself, its basic address, the access offset and the segment length
> +   for aliasing checks.  */
> +
> +struct dr_addr_with_seg_len
> +{
> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
> +
> +  data_reference* dr;
> +  tree basic_addr;
> +  tree offset;
> +  tree seg_len;
> +};
> +
> +/* Operator == between two dr_addr_with_seg_len objects.
> +
> +   This equality operator is used to make sure two data refs
> +   are the same one so that we will consider to combine the
> +   aliasing checks of those two pairs of data dependent data
> +   refs.  */
> +
> +bool operator == (const dr_addr_with_seg_len& d1,
> +  const dr_addr_with_seg_len& d2)
> +{
> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
> + && operand_equal_p (d1.offset, d2.offset, 0)
> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
> +}
> +
> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
> + dr_addr_with_seg_len_pair_t;
> +
> +
> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
> +
> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
> +   so that we can combine aliasing checks during one scan.  */
> +
> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
> + const dr_addr_with_seg_len_pair_t& p2)
> +{
> +  const dr_addr_with_seg_len& p11 = p1.first;
> +  const dr_addr_with_seg_len& p12 = p1.second;
> +  const dr_addr_with_seg_len& p21 = p2.first;
> +  const dr_addr_with_seg_len& p22 = p2.second;
> +
> +  if (p11.basic_addr != p21.basic_addr)
> +    return p11.basic_addr < p21.basic_addr;
> +  if (p12.basic_addr != p22.basic_addr)
> +    return p12.basic_addr < p22.basic_addr;
> +  if (TREE_CODE (p11.offset) != INTEGER_CST
> +      || TREE_CODE (p21.offset) != INTEGER_CST)
> +    return p11.offset < p21.offset;
> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
> +  if (TREE_CODE (p12.offset) != INTEGER_CST
> +      || TREE_CODE (p22.offset) != INTEGER_CST)
> +    return p12.offset < p22.offset;
> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
> +}
> +
> +}
> 
>  /* Function vect_create_cond_for_alias_checks.
> 
> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>    if (may_alias_ddrs.is_empty ())
>      return;
> 
> +
> +  /* Basically, for each pair of dependent data refs store_ptr_0
> +     and load_ptr_0, we create an expression:
> +
> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
> +
> +     for aliasing checks. However, in some cases we can decrease
> +     the number of checks by combining two checks into one. For
> +     example, suppose we have another pair of data refs store_ptr_0
> +     and load_ptr_1, and if the following condition is satisfied:
> +
> +     load_ptr_0 < load_ptr_1  &&
> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
> +
> +     (this condition means, in each iteration of vectorized loop,
> +     the accessed memory of store_ptr_0 cannot be between the memory
> +     of load_ptr_0 and load_ptr_1.)
> +
> +     we then can use only the following expression to finish the
> +     alising checks between store_ptr_0 & load_ptr_0 and
> +     store_ptr_0 & load_ptr_1:
> +
> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
> +
> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
> +     same basic address.  */
> +
> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
> +
> +  /* First, we collect all data ref pairs for aliasing checks.  */
> +
>    FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>      {
>        struct data_reference *dr_a, *dr_b;
>        gimple dr_group_first_a, dr_group_first_b;
> -      tree addr_base_a, addr_base_b;
>        tree segment_length_a, segment_length_b;
>        gimple stmt_a, stmt_b;
> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
> 
>        dr_a = DDR_A (ddr);
>        stmt_a = DR_STMT (DDR_A (ddr));
>        dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>        if (dr_group_first_a)
> -        {
> + {
>    stmt_a = dr_group_first_a;
>    dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>   }
> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>        stmt_b = DR_STMT (DDR_B (ddr));
>        dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>        if (dr_group_first_b)
> -        {
> + {
>    stmt_b = dr_group_first_b;
>    dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>   }
> 
> -      addr_base_a
> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
> -       DR_INIT (dr_a)));
> -      addr_base_b
> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
> -       DR_INIT (dr_b)));
> -
>        if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>   length_factor = scalar_loop_iters;
>        else
> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>        segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>        segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
> 
> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
> +  (dr_addr_with_seg_len
> +       (dr_a, DR_BASE_ADDRESS (dr_a),
> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
> + segment_length_a),
> +   dr_addr_with_seg_len
> +       (dr_b, DR_BASE_ADDRESS (dr_b),
> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
> + segment_length_b));
> +
> +      if (dr_with_seg_len_pair.first.basic_addr >
> +  dr_with_seg_len_pair.second.basic_addr)
> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
> +
> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
> +    }
> +
> +  /* Second, we sort the collected data ref pairs so that we can scan
> +     them once to combine all possible aliasing checks.  */
> +
> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
> +
> +  /* Remove duplicate data ref pairs.  */
> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
> + ddrs_with_seg_len.end()),
> +   ddrs_with_seg_len.end());
> +
> +  /* We then scan the sorted dr pairs and check if we can combine
> +     alias checks of two neighbouring dr pairs.  */
> +
> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
> +    {
> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
> +
> +      if (dr_a1 == dr_a2)
> + {
> +  if (dr_b1.basic_addr != dr_b2.basic_addr
> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
> +    continue;
> +
> +  int diff = int_cst_value (dr_b2.offset) -
> +     int_cst_value (dr_b1.offset);
> +
> +  gcc_assert (diff > 0);
> +
> +  if (diff <= vect_factor
> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
> +  && diff - int_cst_value (dr_b1.seg_len) <
> +     int_cst_value (dr_a1.seg_len)))
> +    {
> +      if (dump_enabled_p ())
> + {
> +  dump_printf_loc
> +      (MSG_NOTE, vect_location,
> +       "combining two runtime checks for data references ");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
> +  dump_printf (MSG_NOTE, " and ");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
> +  dump_printf (MSG_NOTE, "\n");
> + }
> +
> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
> +  dr_b2.seg_len, size_int (diff));
> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
> +      --i;
> +    }
> + }
> +      else if (dr_b1 == dr_b2)
> + {
> +  if (dr_a1.basic_addr != dr_a2.basic_addr
> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
> +    continue;
> +
> +  int diff = int_cst_value (dr_a2.offset) -
> +     int_cst_value (dr_a1.offset);
> +
> +  gcc_assert (diff > 0);
> +
> +  if (diff <= vect_factor
> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
> +  && diff - int_cst_value (dr_a1.seg_len) <
> +     int_cst_value (dr_b1.seg_len)))
> +    {
> +      if (dump_enabled_p ())
> + {
> +  dump_printf_loc
> +      (MSG_NOTE, vect_location,
> +       "combining two runtime checks for data references ");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
> +  dump_printf (MSG_NOTE, " and ");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
> +  dump_printf (MSG_NOTE, "\n");
> + }
> +
> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
> +  dr_a2.seg_len, size_int (diff));
> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
> +      --i;
> +    }
> + }
> +    }
> +
> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
> +    {
> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
> +      tree segment_length_a = dr_a.seg_len;
> +      tree segment_length_b = dr_b.seg_len;
> +
> +      tree addr_base_a
> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
> +      tree addr_base_b
> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
> +
>        if (dump_enabled_p ())
>   {
>    dump_printf_loc (MSG_NOTE, vect_location,
> -                           "create runtime check for data references ");
> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
> +   "create runtime check for data references ");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>    dump_printf (MSG_NOTE, " and ");
> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
> -          dump_printf (MSG_NOTE, "\n");
> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
> +  dump_printf (MSG_NOTE, "\n");
>   }
> 
> -      seg_a_min = addr_base_a;
> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
> +      tree seg_a_min = addr_base_a;
> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
> 
> -      seg_b_min = addr_base_b;
> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
> +      tree seg_b_min = addr_base_b;
> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>   seg_b_min = seg_b_max, seg_b_max = addr_base_b;
> 
>        part_cond_expr =
> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>        adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>      }
> 
> +  /* Extract load and store statements on pointers with zero-stride
> +     accesses.  */
> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
> +    {
> +
> +      /* In the loop body, we iterate each statement to check if it is a load
> + or store. Then we check the DR_STEP of the data reference.  If
> + DR_STEP is zero, then we will hoist the load statement to the loop
> + preheader, and move the store statement to the loop exit.  */
> +
> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
> +   !gsi_end_p (si); )
> + {
> +  gimple stmt = gsi_stmt (si);
> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> +
> +
> +  if (dr && integer_zerop (DR_STEP (dr)))
> +    {
> +      if (DR_IS_READ (dr))
> + {
> +  if (dump_file)
> +    {
> +      fprintf (dump_file,
> +       "Hoist the load to outside of the loop:\n");
> +      print_gimple_stmt (dump_file, stmt, 0,
> + TDF_VOPS|TDF_MEMSYMS);
> +    }
> +
> +  basic_block preheader = loop_preheader_edge (loop)->src;
> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
> +  gsi_move_after (&si, &si_dst);
> + }
> +      else
> + {
> +  gimple_stmt_iterator si_dst =
> +      gsi_last_bb (single_exit (loop)->dest);
> +  gsi_move_after (&si, &si_dst);
> + }
> +              continue;
> +    }
> +  else if (!dr)
> +          {
> +            bool hoist = true;
> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
> +            {
> +              tree op = gimple_op (stmt, i);
> +              if (TREE_CODE (op) == INTEGER_CST
> +                  || TREE_CODE (op) == REAL_CST)
> +                continue;
> +              if (TREE_CODE (op) == SSA_NAME)
> +              {
> +                gimple def = SSA_NAME_DEF_STMT (op);
> +                if (def == stmt
> +                    || gimple_nop_p (def)
> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
> +                  continue;
> +              }
> +              hoist = false;
> +              break;
> +            }
> +
> +            if (hoist)
> +            {
> +              basic_block preheader = loop_preheader_edge (loop)->src;
> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
> +              gsi_move_after (&si, &si_dst);
> +              continue;
> +            }
> +          }
> +          gsi_next (&si);
> + }
> +    }
> +
>    /* End loop-exit-fixes after versioning.  */
> 
>    if (cond_expr_stmt_list)
> Index: gcc/ChangeLog
> ===================================================================
> --- gcc/ChangeLog (revision 202663)
> +++ gcc/ChangeLog (working copy)
> @@ -1,3 +1,8 @@
> +2013-10-01  Cong Hou  <congh@google.com>
> +
> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
> + alias checks if it is possible to amortize the runtime overhead.
> +
> 
>
Cong Hou Oct. 2, 2013, 5:50 p.m. UTC | #4
On Tue, Oct 1, 2013 at 11:35 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Tue, Oct 01, 2013 at 07:12:54PM -0700, Cong Hou wrote:
>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>> +++ gcc/tree-vect-loop-manip.c (working copy)
>
> Your mailer ate all the tabs, so the formatting of the whole patch
> can't be checked.
>


I'll pay attention to this problem in my later patch submission.


>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>  along with GCC; see the file COPYING3.  If not see
>>  <http://www.gnu.org/licenses/>.  */
>>
>> +#include <vector>
>> +#include <utility>
>> +#include <algorithm>
>
> Why?  GCC has it's vec.h vectors, why don't you use those?
> There is even qsort method for you in there.  And for pairs, you can
> easily just use structs with two members as structure elements in the
> vector.
>


GCC is now restructured using C++, and the STL is one of the most
important parts of C++. I am new to the GCC community and more familiar
with the STL (and I think allowing the STL in GCC could attract more
new developers to GCC). I agree that using GCC's vec maintains a
uniform style, but the STL is just so powerful and easy to use...

I just did a search in the GCC source tree and found that <vector> is
not used yet. I will change std::vector to GCC's vec for now (and
std::sort to its qsort), but I am still wondering whether GCC will
accept the STL one day.


>> +struct dr_addr_with_seg_len
>> +{
>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>> +
>> +  data_reference* dr;
>
> Space should be before *, not after it.
>
>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>> +    return p11.offset < p21.offset;
>
> If offset isn't INTEGER_CST, you are comparing the pointer values?
> That is never a good idea, then compilation will depend on how say address
> space randomization randomizes virtual address space.  GCC needs to have
> reproduceable compilations.


In this scenario comparing pointers is safe. The sort is only used to
put next to each other any two pairs of data refs which can be merged.
For example, if we have (a, b), (a, c), (a, b+1), then after sorting
them we should have either (a, b), (a, b+1), (a, c) or (a, c), (a, b),
(a, b+1). We don't care about the relative order of "non-mergeable" dr
pairs here. So although the sorting result may vary, the final result
we get should not change.


>
>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>
> This is going to ICE whenever the offsets wouldn't fit into a
> HOST_WIDE_INT.
>
> I'd say you just shouldn't put into the vector entries where offset isn't
> host_integerp, those would never be merged with other checks, or something
> similar.

Do you mean I should use widest_int_cst_value()? Then I will replace
all of the int_cst_value() calls here with it. I have also changed the
type of the "diff" variable to HOST_WIDEST_INT.



Thank you very much for your comments!

Cong



>
>         Jakub
Dehao Chen Oct. 2, 2013, 6:26 p.m. UTC | #5
On Wed, Oct 2, 2013 at 10:50 AM, Cong Hou <congh@google.com> wrote:
> On Tue, Oct 1, 2013 at 11:35 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>> On Tue, Oct 01, 2013 at 07:12:54PM -0700, Cong Hou wrote:
>>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>>> +++ gcc/tree-vect-loop-manip.c (working copy)
>>
>> Your mailer ate all the tabs, so the formatting of the whole patch
>> can't be checked.
>>
>
>
> I'll pay attention to this problem in my later patch submission.
>
>
>>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>>  along with GCC; see the file COPYING3.  If not see
>>>  <http://www.gnu.org/licenses/>.  */
>>>
>>> +#include <vector>
>>> +#include <utility>
>>> +#include <algorithm>
>>
>> Why?  GCC has it's vec.h vectors, why don't you use those?
>> There is even qsort method for you in there.  And for pairs, you can
>> easily just use structs with two members as structure elements in the
>> vector.
>>
>
>
> GCC is now restructured using C++ and STL is one of the most important
> part of C++. I am new to GCC community and more familiar to STL (and I
> think allowing STL in GCC could attract more new developers for GCC).
> I agree using GCC's vec can maintain a uniform style but STL is just
> so powerful and easy to use...
>
> I just did a search in GCC source tree and found <vector> is not used
> yet. I will change std::vector to GCC's vec for now (and also qsort),
> but am still wondering if one day GCC would accept STL.

I talked with Ian and Diego before; they are both OK with having the
STL in GCC code as long as it is only used for local data structures
that do not use the GCC garbage collector.

The STL can greatly simplify source code (e.g. in
http://gcc.gnu.org/viewcvs/gcc?view=revision&revision=201615 it
helped reduce auto-profile.c from 1721 LOC to 1371 LOC). The STL could
also attract more C++ developers to the GCC community.

Any comments?
Dehao

>
>
>>> +struct dr_addr_with_seg_len
>>> +{
>>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>>> +
>>> +  data_reference* dr;
>>
>> Space should be before *, not after it.
>>
>>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>>> +    return p11.offset < p21.offset;
>>
>> If offset isn't INTEGER_CST, you are comparing the pointer values?
>> That is never a good idea, then compilation will depend on how say address
>> space randomization randomizes virtual address space.  GCC needs to have
>> reproduceable compilations.
>
>
> I this scenario comparing pointers is safe. The sort is used to put
> together any two pairs of data refs which can be merged. For example,
> if we have (a, b) (a, c), (a, b+1), then after sorting them we should
> have either (a, b), (a, b+1), (a, c) or (a, c), (a, b), (a, b+1). We
> don't care the relative order of "non-mergable" dr pairs here. So
> although the sorting result may vary the final result we get should
> not change.
>
>
>>
>>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>>
>> This is going to ICE whenever the offsets wouldn't fit into a
>> HOST_WIDE_INT.
>>
>> I'd say you just shouldn't put into the vector entries where offset isn't
>> host_integerp, those would never be merged with other checks, or something
>> similar.
>
> Do you mean I should use widest_int_cst_value()? Then I will replace
> all int_cst_value() here with it. I also changed the type of "diff"
> variable into HOST_WIDEST_INT.
>
>
>
> Thank you very much for your comments!
>
> Cong
>
>
>
>>
>>         Jakub
Jakub Jelinek Oct. 2, 2013, 6:26 p.m. UTC | #6
On Wed, Oct 02, 2013 at 10:50:21AM -0700, Cong Hou wrote:
> >> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
> >> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
> >
> > This is going to ICE whenever the offsets wouldn't fit into a
> > HOST_WIDE_INT.
> >
> > I'd say you just shouldn't put into the vector entries where offset isn't
> > host_integerp, those would never be merged with other checks, or something
> > similar.
> 
> Do you mean I should use widest_int_cst_value()? Then I will replace
> all int_cst_value() here with it. I also changed the type of "diff"
> variable into HOST_WIDEST_INT.

Actually, it would be best just to use
tree_int_cst_compare (p11.offset, p21.offset),
which will handle any INTEGER_CSTs, not just those that fit into a HWI.
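
E.g. (untested), in the comparison function:

  /* tree_int_cst_compare returns -1/0/1 and copes with offsets wider
     than a HOST_WIDE_INT.  */
  int cmp = tree_int_cst_compare (p11.offset, p21.offset);
  if (cmp != 0)
    return cmp < 0;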

	Jakub
Xinliang David Li Oct. 2, 2013, 9:18 p.m. UTC | #7
On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
> On Tue, 1 Oct 2013, Cong Hou wrote:
>
>> When alias exists between data refs in a loop, to vectorize it GCC
>> does loop versioning and adds runtime alias checks. Basically for each
>> pair of data refs with possible data dependence, there will be two
>> comparisons generated to make sure there is no aliasing between them
>> in each iteration of the vectorized loop. If there are many such data
>> refs pairs, the number of comparisons can be very large, which is a
>> big overhead.
>>
>> However, in some cases it is possible to reduce the number of those
>> comparisons. For example, for the following loop, we can detect that
>> b[0] and b[1] are two consecutive member accesses so that we can
>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>> checking a[0:100]&b[0:2]:
>>
>> void foo(int*a, int* b)
>> {
>>    for (int i = 0; i < 100; ++i)
>>     a[i] = b[0] + b[1];
>> }
>>
>> Actually, the requirement of consecutive memory accesses is too
>> strict. For the following loop, we can still combine the alias checks
>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>
>> void foo(int*a, int* b)
>> {
>>    for (int i = 0; i < 100; ++i)
>>     a[i] = b[0] + b[100];
>> }
>>
>> This is because if b[0] is not in a[0:100] and b[100] is not in
>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>> to check a[0:100] and b[0:101] don't overlap.
>>
>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>> segment length of a, b1, and b2. Then we can combine the two
>> comparisons into one if the following condition is satisfied:
>>
>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>
>>
>> This patch detects those combination opportunities to reduce the
>> number of alias checks. It is tested on an x86-64 machine.
>
> Apart from the other comments you got (to which I agree) the patch
> seems to do two things, namely also:
>
> +  /* Extract load and store statements on pointers with zero-stride
> +     accesses.  */
> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
> +    {
>
> which I'd rather see in a separate patch (and done also when
> the loop doesn't require versioning for alias).

yes.

>
> Also combining the alias checks in vect_create_cond_for_alias_checks
> is nice but doesn't properly fix the use of the
> vect-max-version-for-alias-checks param

Yes. The handling of this should be moved to
'vect_prune_runtime_alias_test_list' to avoid premature decisions.



>which currently inhibits
> vectorization of the HIMENO benchmark by default (and make us look bad
> compared to LLVM).

Here is a small reproducer:

struct  A {
  int *base;
  int offset;
  int offset2;
  int offset3;
  int offset4;
  int offset5;
  int offset6;
  int offset7;
  int offset8;
};

void foo (struct A * ar1, struct A* ar2)
{
      int i;
      for (i = 0; i < 10000; i++)
        {
           ar1->base[i] = 2*ar2->base[i] + ar2->offset + ar2->offset2
                          + ar2->offset3 + ar2->offset4 + ar2->offset5
                          + ar2->offset6; /* + ar2->offset7 + ar2->offset8; */
        }
}

GCC trunk won't vectorize it at -O2 due to the limit.


There is another problem we should be tracking: GCC no longer
vectorizes the loop (even with a large
--param=vect-max-version-for-alias-checks=40) when -fno-strict-aliasing
is specified.  However, with an additional runtime alias check, the
loop should be vectorizable.

David


>
> So I believe this merging should be done incrementally when
> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>
> Thanks for working on this,
> Richard.
>
>>
>> thanks,
>> Cong
>>
>>
>>
>> Index: gcc/tree-vect-loop-manip.c
>> ===================================================================
>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>> +++ gcc/tree-vect-loop-manip.c (working copy)
>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>  along with GCC; see the file COPYING3.  If not see
>>  <http://www.gnu.org/licenses/>.  */
>>
>> +#include <vector>
>> +#include <utility>
>> +#include <algorithm>
>> +
>>  #include "config.h"
>>  #include "system.h"
>>  #include "coretypes.h"
>> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>>    return segment_length;
>>  }
>>
>> +namespace
>> +{
>> +
>> +/* struct dr_addr_with_seg_len
>> +
>> +   A struct storing information of a data reference, including the data
>> +   ref itself, its basic address, the access offset and the segment length
>> +   for aliasing checks.  */
>> +
>> +struct dr_addr_with_seg_len
>> +{
>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>> +
>> +  data_reference* dr;
>> +  tree basic_addr;
>> +  tree offset;
>> +  tree seg_len;
>> +};
>> +
>> +/* Operator == between two dr_addr_with_seg_len objects.
>> +
>> +   This equality operator is used to make sure two data refs
>> +   are the same one so that we will consider to combine the
>> +   aliasing checks of those two pairs of data dependent data
>> +   refs.  */
>> +
>> +bool operator == (const dr_addr_with_seg_len& d1,
>> +  const dr_addr_with_seg_len& d2)
>> +{
>> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
>> + && operand_equal_p (d1.offset, d2.offset, 0)
>> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
>> +}
>> +
>> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
>> + dr_addr_with_seg_len_pair_t;
>> +
>> +
>> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
>> +
>> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
>> +   so that we can combine aliasing checks during one scan.  */
>> +
>> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
>> + const dr_addr_with_seg_len_pair_t& p2)
>> +{
>> +  const dr_addr_with_seg_len& p11 = p1.first;
>> +  const dr_addr_with_seg_len& p12 = p1.second;
>> +  const dr_addr_with_seg_len& p21 = p2.first;
>> +  const dr_addr_with_seg_len& p22 = p2.second;
>> +
>> +  if (p11.basic_addr != p21.basic_addr)
>> +    return p11.basic_addr < p21.basic_addr;
>> +  if (p12.basic_addr != p22.basic_addr)
>> +    return p12.basic_addr < p22.basic_addr;
>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>> +    return p11.offset < p21.offset;
>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>> +  if (TREE_CODE (p12.offset) != INTEGER_CST
>> +      || TREE_CODE (p22.offset) != INTEGER_CST)
>> +    return p12.offset < p22.offset;
>> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
>> +}
>> +
>> +}
>>
>>  /* Function vect_create_cond_for_alias_checks.
>>
>> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>>    if (may_alias_ddrs.is_empty ())
>>      return;
>>
>> +
>> +  /* Basically, for each pair of dependent data refs store_ptr_0
>> +     and load_ptr_0, we create an expression:
>> +
>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
>> +
>> +     for aliasing checks. However, in some cases we can decrease
>> +     the number of checks by combining two checks into one. For
>> +     example, suppose we have another pair of data refs store_ptr_0
>> +     and load_ptr_1, and if the following condition is satisfied:
>> +
>> +     load_ptr_0 < load_ptr_1  &&
>> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
>> +
>> +     (this condition means, in each iteration of vectorized loop,
>> +     the accessed memory of store_ptr_0 cannot be between the memory
>> +     of load_ptr_0 and load_ptr_1.)
>> +
>> +     we then can use only the following expression to finish the
>> +     alising checks between store_ptr_0 & load_ptr_0 and
>> +     store_ptr_0 & load_ptr_1:
>> +
>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
>> +
>> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
>> +     same basic address.  */
>> +
>> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
>> +
>> +  /* First, we collect all data ref pairs for aliasing checks.  */
>> +
>>    FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>>      {
>>        struct data_reference *dr_a, *dr_b;
>>        gimple dr_group_first_a, dr_group_first_b;
>> -      tree addr_base_a, addr_base_b;
>>        tree segment_length_a, segment_length_b;
>>        gimple stmt_a, stmt_b;
>> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
>>
>>        dr_a = DDR_A (ddr);
>>        stmt_a = DR_STMT (DDR_A (ddr));
>>        dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>>        if (dr_group_first_a)
>> -        {
>> + {
>>    stmt_a = dr_group_first_a;
>>    dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>>   }
>> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>>        stmt_b = DR_STMT (DDR_B (ddr));
>>        dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>>        if (dr_group_first_b)
>> -        {
>> + {
>>    stmt_b = dr_group_first_b;
>>    dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>>   }
>>
>> -      addr_base_a
>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
>> -       DR_INIT (dr_a)));
>> -      addr_base_b
>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
>> -       DR_INIT (dr_b)));
>> -
>>        if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>>   length_factor = scalar_loop_iters;
>>        else
>> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>>        segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>>        segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
>>
>> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
>> +  (dr_addr_with_seg_len
>> +       (dr_a, DR_BASE_ADDRESS (dr_a),
>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
>> + segment_length_a),
>> +   dr_addr_with_seg_len
>> +       (dr_b, DR_BASE_ADDRESS (dr_b),
>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
>> + segment_length_b));
>> +
>> +      if (dr_with_seg_len_pair.first.basic_addr >
>> +  dr_with_seg_len_pair.second.basic_addr)
>> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
>> +
>> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
>> +    }
>> +
>> +  /* Second, we sort the collected data ref pairs so that we can scan
>> +     them once to combine all possible aliasing checks.  */
>> +
>> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
>> +
>> +  /* Remove duplicate data ref pairs.  */
>> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
>> + ddrs_with_seg_len.end()),
>> +   ddrs_with_seg_len.end());
>> +
>> +  /* We then scan the sorted dr pairs and check if we can combine
>> +     alias checks of two neighbouring dr pairs.  */
>> +
>> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
>> +    {
>> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
>> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
>> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
>> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
>> +
>> +      if (dr_a1 == dr_a2)
>> + {
>> +  if (dr_b1.basic_addr != dr_b2.basic_addr
>> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
>> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
>> +    continue;
>> +
>> +  int diff = int_cst_value (dr_b2.offset) -
>> +     int_cst_value (dr_b1.offset);
>> +
>> +  gcc_assert (diff > 0);
>> +
>> +  if (diff <= vect_factor
>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>> +  && diff - int_cst_value (dr_b1.seg_len) <
>> +     int_cst_value (dr_a1.seg_len)))
>> +    {
>> +      if (dump_enabled_p ())
>> + {
>> +  dump_printf_loc
>> +      (MSG_NOTE, vect_location,
>> +       "combining two runtime checks for data references ");
>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
>> +  dump_printf (MSG_NOTE, " and ");
>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
>> +  dump_printf (MSG_NOTE, "\n");
>> + }
>> +
>> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
>> +  dr_b2.seg_len, size_int (diff));
>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>> +      --i;
>> +    }
>> + }
>> +      else if (dr_b1 == dr_b2)
>> + {
>> +  if (dr_a1.basic_addr != dr_a2.basic_addr
>> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
>> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
>> +    continue;
>> +
>> +  int diff = int_cst_value (dr_a2.offset) -
>> +     int_cst_value (dr_a1.offset);
>> +
>> +  gcc_assert (diff > 0);
>> +
>> +  if (diff <= vect_factor
>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>> +  && diff - int_cst_value (dr_a1.seg_len) <
>> +     int_cst_value (dr_b1.seg_len)))
>> +    {
>> +      if (dump_enabled_p ())
>> + {
>> +  dump_printf_loc
>> +      (MSG_NOTE, vect_location,
>> +       "combining two runtime checks for data references ");
>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
>> +  dump_printf (MSG_NOTE, " and ");
>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
>> +  dump_printf (MSG_NOTE, "\n");
>> + }
>> +
>> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
>> +  dr_a2.seg_len, size_int (diff));
>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>> +      --i;
>> +    }
>> + }
>> +    }
>> +
>> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
>> +    {
>> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
>> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
>> +      tree segment_length_a = dr_a.seg_len;
>> +      tree segment_length_b = dr_b.seg_len;
>> +
>> +      tree addr_base_a
>> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
>> +      tree addr_base_b
>> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
>> +
>>        if (dump_enabled_p ())
>>   {
>>    dump_printf_loc (MSG_NOTE, vect_location,
>> -                           "create runtime check for data references ");
>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
>> +   "create runtime check for data references ");
>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>>    dump_printf (MSG_NOTE, " and ");
>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
>> -          dump_printf (MSG_NOTE, "\n");
>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
>> +  dump_printf (MSG_NOTE, "\n");
>>   }
>>
>> -      seg_a_min = addr_base_a;
>> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
>> +      tree seg_a_min = addr_base_a;
>> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>>   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
>>
>> -      seg_b_min = addr_base_b;
>> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
>> +      tree seg_b_min = addr_base_b;
>> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>>   seg_b_min = seg_b_max, seg_b_max = addr_base_b;
>>
>>        part_cond_expr =
>> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>>        adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>>      }
>>
>> +  /* Extract load and store statements on pointers with zero-stride
>> +     accesses.  */
>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>> +    {
>> +
>> +      /* In the loop body, we iterate each statement to check if it is a load
>> + or store. Then we check the DR_STEP of the data reference.  If
>> + DR_STEP is zero, then we will hoist the load statement to the loop
>> + preheader, and move the store statement to the loop exit.  */
>> +
>> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
>> +   !gsi_end_p (si); )
>> + {
>> +  gimple stmt = gsi_stmt (si);
>> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
>> +
>> +
>> +  if (dr && integer_zerop (DR_STEP (dr)))
>> +    {
>> +      if (DR_IS_READ (dr))
>> + {
>> +  if (dump_file)
>> +    {
>> +      fprintf (dump_file,
>> +       "Hoist the load to outside of the loop:\n");
>> +      print_gimple_stmt (dump_file, stmt, 0,
>> + TDF_VOPS|TDF_MEMSYMS);
>> +    }
>> +
>> +  basic_block preheader = loop_preheader_edge (loop)->src;
>> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>> +  gsi_move_after (&si, &si_dst);
>> + }
>> +      else
>> + {
>> +  gimple_stmt_iterator si_dst =
>> +      gsi_last_bb (single_exit (loop)->dest);
>> +  gsi_move_after (&si, &si_dst);
>> + }
>> +              continue;
>> +    }
>> +  else if (!dr)
>> +          {
>> +            bool hoist = true;
>> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
>> +            {
>> +              tree op = gimple_op (stmt, i);
>> +              if (TREE_CODE (op) == INTEGER_CST
>> +                  || TREE_CODE (op) == REAL_CST)
>> +                continue;
>> +              if (TREE_CODE (op) == SSA_NAME)
>> +              {
>> +                gimple def = SSA_NAME_DEF_STMT (op);
>> +                if (def == stmt
>> +                    || gimple_nop_p (def)
>> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
>> +                  continue;
>> +              }
>> +              hoist = false;
>> +              break;
>> +            }
>> +
>> +            if (hoist)
>> +            {
>> +              basic_block preheader = loop_preheader_edge (loop)->src;
>> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>> +              gsi_move_after (&si, &si_dst);
>> +              continue;
>> +            }
>> +          }
>> +          gsi_next (&si);
>> + }
>> +    }
>> +
>>    /* End loop-exit-fixes after versioning.  */
>>
>>    if (cond_expr_stmt_list)
>> Index: gcc/ChangeLog
>> ===================================================================
>> --- gcc/ChangeLog (revision 202663)
>> +++ gcc/ChangeLog (working copy)
>> @@ -1,3 +1,8 @@
>> +2013-10-01  Cong Hou  <congh@google.com>
>> +
>> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
>> + alias checks if it is possible to amortize the runtime overhead.
>> +
>>
>>
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE / SUSE Labs
> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
Cong Hou Oct. 2, 2013, 9:34 p.m. UTC | #8
On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
> On Tue, 1 Oct 2013, Cong Hou wrote:
>
>> When alias exists between data refs in a loop, to vectorize it GCC
>> does loop versioning and adds runtime alias checks. Basically for each
>> pair of data refs with possible data dependence, there will be two
>> comparisons generated to make sure there is no aliasing between them
>> in each iteration of the vectorized loop. If there are many such data
>> refs pairs, the number of comparisons can be very large, which is a
>> big overhead.
>>
>> However, in some cases it is possible to reduce the number of those
>> comparisons. For example, for the following loop, we can detect that
>> b[0] and b[1] are two consecutive member accesses so that we can
>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>> checking a[0:100]&b[0:2]:
>>
>> void foo(int*a, int* b)
>> {
>>    for (int i = 0; i < 100; ++i)
>>     a[i] = b[0] + b[1];
>> }
>>
>> Actually, the requirement of consecutive memory accesses is too
>> strict. For the following loop, we can still combine the alias checks
>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>
>> void foo(int*a, int* b)
>> {
>>    for (int i = 0; i < 100; ++i)
>>     a[i] = b[0] + b[100];
>> }
>>
>> This is because if b[0] is not in a[0:100] and b[100] is not in
>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>> to check a[0:100] and b[0:101] don't overlap.
>>
>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>> segment length of a, b1, and b2. Then we can combine the two
>> comparisons into one if the following condition is satisfied:
>>
>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>
>>
>> This patch detects those combination opportunities to reduce the
>> number of alias checks. It is tested on an x86-64 machine.
>
> Apart from the other comments you got (to which I agree) the patch
> seems to do two things, namely also:
>
> +  /* Extract load and store statements on pointers with zero-stride
> +     accesses.  */
> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
> +    {
>
> which I'd rather see in a separate patch (and done also when
> the loop doesn't require versioning for alias).
>


My mistake: I am working on those two patches at the same time and
pasted that one in here accidentally. I will send a separate patch for
the "hoist" change.
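
For reference, a minimal example of the kind of loop the hoisting
change targets (illustrative only, not taken from the patch; the
function and variable names are made up): the load through b has zero
stride, so once the runtime alias check has ruled out overlap with a[]
it can be hoisted to the loop preheader.

void bar (int *a, int *b, int n)
{
  /* *b is loop-invariant (DR_STEP == 0); in the versioned,
     alias-checked loop it can be loaded once before the loop
     instead of on every iteration.  */
  for (int i = 0; i < n; ++i)
    a[i] = *b + 1;
}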


> Also combining the alias checks in vect_create_cond_for_alias_checks
> is nice but doesn't properly fix the use of the
> vect-max-version-for-alias-checks param which currently inhibits
> vectorization of the HIMENO benchmark by default (and make us look bad
> compared to LLVM).
>
> So I believe this merging should be done incrementally when
> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>


I agree that the vect-max-version-for-alias-checks param should count
the number of checks after the merge. However, struct
data_dependence_relation cannot record the new information produced by
the merge, namely the new segment length used in the comparisons. This
length is currently calculated in the
vect_create_cond_for_alias_checks() function. Since
vect-max-version-for-alias-checks is consulted during the analysis
phase, shall we move all of this (getting the segment length for each
data ref and merging alias checks) from the transformation phase to
the analysis phase? And if we cannot store the result properly
(data_dependence_relation is not enough), shall we do it twice, once
in each phase?
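
As a rough sketch, one way to carry that information would be a small
wrapper record filled in during analysis and reused at transform time
(the type and field names below are hypothetical, not existing GCC
structures):

/* Hypothetical record: keeps the DDR together with the (possibly
   widened) segment lengths decided when alias checks are merged
   during analysis, so vect_create_cond_for_alias_checks does not
   have to redo the merging.  */
struct ddr_with_merged_seg_len
{
  struct data_dependence_relation *ddr;
  tree seg_len_a;   /* segment length to use for DDR_A (ddr)  */
  tree seg_len_b;   /* segment length to use for DDR_B (ddr)  */
};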

I also noticed a possible bug in the function vect_same_range_drs()
called by vect_prune_runtime_alias_test_list(). For the following code
I get two pairs of data refs after
vect_prune_runtime_alias_test_list(), but in
vect_create_cond_for_alias_checks(), after detecting grouped accesses,
I get two identical pairs of data refs. The consequence is that two
identical alias checks are produced.


void yuv2yuyv_ref (int *d, int *src, int n)
{
  char *dest = (char *)d;
  int i;

  for(i=0;i<n/2;i++){
    dest[i*4 + 0] = (src[i*2 + 0])>>16;
    dest[i*4 + 1] = (src[i*2 + 1])>>8;
    dest[i*4 + 2] = (src[i*2 + 0])>>16;
    dest[i*4 + 3] = (src[i*2 + 0])>>0;
  }
}


I think the solution to this problem is changing

GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
== GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))

into

STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
(vinfo_for_stmt (stmt_i))))
== STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
(vinfo_for_stmt (stmt_j))))


in function vect_same_range_drs(). What do you think about it?
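
Roughly, the comparison would then read like this inside
vect_same_range_drs() (sketch only; the surrounding function body and
the stmt_i/stmt_j names are assumed, and only the comparison itself
comes from the suggestion above):

/* Compare the data refs attached to the two group leaders instead of
   the group-leader statements themselves, so two drs that share the
   same leader dr are treated as the same range.  */
gimple first_i = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i));
gimple first_j = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j));
if (first_i && first_j
    && (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_i))
        == STMT_VINFO_DATA_REF (vinfo_for_stmt (first_j))))
  return true;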


thanks,
Cong



> Thanks for working on this,
> Richard.
>
>>
>> thanks,
>> Cong
>>
>>
>>
Xinliang David Li Oct. 2, 2013, 9:47 p.m. UTC | #9
I think you need to augment (using a wrapper class) the DDR to capture
more information about aliased memory pairs. It should be flexible
enough to handle the following cases (you don't have to handle all
cases in your first patch, but keep those in mind).

1) All accesses in the same group have constant offsets:

    b[i], b[i+1], b[i+2] etc

2) Accesses in the same group may have an offset which is specified by
an unsigned value:

   unsigned N = ...

   b[i], b[i+N]

3) Accesses have an offset whose value range is > 0:

   for (j = 0; j < 10000; j++)
           for (i = 0; i < ...; i++)
             {
                  .... b[i] ....
                  .... b[i + j ] ....               // j > 0
              }


4) base addresses are assigned from the same buffer:

    b1  = &buffer[0];
    b2 = &buffer[10000];
    b3 = &buffer[20000];

for (...)
  {
         ..b1[i]..
         ..b2[i]..
         ..
   }

5) More elaborate case:

   for (i = 0; i< 3; i++)
      base[i] = &buffer[i*N];

 b1 = base[0];
 b2 = base[1];
 ...
for ()
{
   .. b1[i]..
    ..
}
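
For instance, a self-contained loop hitting case 2 above could look
like this (illustrative only; the function and parameter names are
made up): the two loads differ by a run-time but non-negative offset
N, so the two checks against a[] can be covered by a single check over
b[0 : n + N].

void baz (int *a, int *b, unsigned N, int n)
{
  for (int i = 0; i < n; ++i)
    a[i] = b[i] + b[i + N];
}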

David


On Wed, Oct 2, 2013 at 2:34 PM, Cong Hou <congh@google.com> wrote:
> On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
>> On Tue, 1 Oct 2013, Cong Hou wrote:
>>
>>> When alias exists between data refs in a loop, to vectorize it GCC
>>> does loop versioning and adds runtime alias checks. Basically for each
>>> pair of data refs with possible data dependence, there will be two
>>> comparisons generated to make sure there is no aliasing between them
>>> in each iteration of the vectorized loop. If there are many such data
>>> refs pairs, the number of comparisons can be very large, which is a
>>> big overhead.
>>>
>>> However, in some cases it is possible to reduce the number of those
>>> comparisons. For example, for the following loop, we can detect that
>>> b[0] and b[1] are two consecutive member accesses so that we can
>>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>>> checking a[0:100]&b[0:2]:
>>>
>>> void foo(int*a, int* b)
>>> {
>>>    for (int i = 0; i < 100; ++i)
>>>     a[i] = b[0] + b[1];
>>> }
>>>
>>> Actually, the requirement of consecutive memory accesses is too
>>> strict. For the following loop, we can still combine the alias checks
>>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>>
>>> void foo(int*a, int* b)
>>> {
>>>    for (int i = 0; i < 100; ++i)
>>>     a[i] = b[0] + b[100];
>>> }
>>>
>>> This is because if b[0] is not in a[0:100] and b[100] is not in
>>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>>> to check a[0:100] and b[0:101] don't overlap.
>>>
>>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>>> segment length of a, b1, and b2. Then we can combine the two
>>> comparisons into one if the following condition is satisfied:
>>>
>>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>>
>>>
>>> This patch detects those combination opportunities to reduce the
>>> number of alias checks. It is tested on an x86-64 machine.
>>
>> Apart from the other comments you got (to which I agree) the patch
>> seems to do two things, namely also:
>>
>> +  /* Extract load and store statements on pointers with zero-stride
>> +     accesses.  */
>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>> +    {
>>
>> which I'd rather see in a separate patch (and done also when
>> the loop doesn't require versioning for alias).
>>
>
>
> My mistake.. I am working on those two patches at the same time and
> pasted that one also here by mistake. I will send another patch about
> the "hoist" topic.
>
>
>> Also combining the alias checks in vect_create_cond_for_alias_checks
>> is nice but doesn't properly fix the use of the
>> vect-max-version-for-alias-checks param which currently inhibits
>> vectorization of the HIMENO benchmark by default (and make us look bad
>> compared to LLVM).
>>
>> So I believe this merging should be done incrementally when
>> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>>
>
>
> I agree that vect-max-version-for-alias-checks param should count the
> number of checks after the merge. However, the struct
> data_dependence_relation could not record the new information produced
> by the merge. The new information I mentioned contains the new segment
> length for comparisons. This length is calculated right in
> vect_create_cond_for_alias_checks() function. Since
> vect-max-version-for-alias-checks is used during analysis phase, shall
> we move all those (get segment length for each data ref and merge
> alias checks) from transformation to analysis phase? If we cannot
> store the result properly (data_dependence_relation is not enough),
> shall we do it twice in both phases?
>
> I also noticed a possible bug in the function vect_same_range_drs()
> called by vect_prune_runtime_alias_test_list(). For the following code
> I get two pairs of data refs after
> vect_prune_runtime_alias_test_list(), but in
> vect_create_cond_for_alias_checks() after detecting grouped accesses I
> got two identical pairs of data refs. The consequence is two identical
> alias checks are produced.
>
>
> void yuv2yuyv_ref (int *d, int *src, int n)
> {
>   char *dest = (char *)d;
>   int i;
>
>   for(i=0;i<n/2;i++){
>     dest[i*4 + 0] = (src[i*2 + 0])>>16;
>     dest[i*4 + 1] = (src[i*2 + 1])>>8;
>     dest[i*4 + 2] = (src[i*2 + 0])>>16;
>     dest[i*4 + 3] = (src[i*2 + 0])>>0;
>   }
> }
>
>
> I think the solution to this problem is changing
>
> GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
> == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)
>
> into
>
> STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
> (vinfo_for_stmt (stmt_i))))
> == STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
> (vinfo_for_stmt (stmt_j)))
>
>
> in function vect_same_range_drs(). What do you think about it?
>
>
> thanks,
> Cong
>
>
>
>> Thanks for working on this,
>> Richard.
>>
>>>
>>> thanks,
>>> Cong
>>>
>>>
>>>
Cong Hou Oct. 2, 2013, 10:51 p.m. UTC | #10
On Wed, Oct 2, 2013 at 2:18 PM, Xinliang David Li <davidxl@google.com> wrote:
> On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
>> On Tue, 1 Oct 2013, Cong Hou wrote:
>>
>>> When alias exists between data refs in a loop, to vectorize it GCC
>>> does loop versioning and adds runtime alias checks. Basically for each
>>> pair of data refs with possible data dependence, there will be two
>>> comparisons generated to make sure there is no aliasing between them
>>> in each iteration of the vectorized loop. If there are many such data
>>> refs pairs, the number of comparisons can be very large, which is a
>>> big overhead.
>>>
>>> However, in some cases it is possible to reduce the number of those
>>> comparisons. For example, for the following loop, we can detect that
>>> b[0] and b[1] are two consecutive member accesses so that we can
>>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>>> checking a[0:100]&b[0:2]:
>>>
>>> void foo(int*a, int* b)
>>> {
>>>    for (int i = 0; i < 100; ++i)
>>>     a[i] = b[0] + b[1];
>>> }
>>>
>>> Actually, the requirement of consecutive memory accesses is too
>>> strict. For the following loop, we can still combine the alias checks
>>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>>
>>> void foo(int*a, int* b)
>>> {
>>>    for (int i = 0; i < 100; ++i)
>>>     a[i] = b[0] + b[100];
>>> }
>>>
>>> This is because if b[0] is not in a[0:100] and b[100] is not in
>>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>>> to check a[0:100] and b[0:101] don't overlap.
>>>
>>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>>> segment length of a, b1, and b2. Then we can combine the two
>>> comparisons into one if the following condition is satisfied:
>>>
>>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>>
>>>
>>> This patch detects those combination opportunities to reduce the
>>> number of alias checks. It is tested on an x86-64 machine.
>>
>> Apart from the other comments you got (to which I agree) the patch
>> seems to do two things, namely also:
>>
>> +  /* Extract load and store statements on pointers with zero-stride
>> +     accesses.  */
>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>> +    {
>>
>> which I'd rather see in a separate patch (and done also when
>> the loop doesn't require versioning for alias).
>
> yes.
>
>>
>> Also combining the alias checks in vect_create_cond_for_alias_checks
>> is nice but doesn't properly fix the use of the
>> vect-max-version-for-alias-checks param
>
> Yes. The handling of this should be moved to
> 'vect_prune_runtime_alias_test_list' to avoid premature decisions.
>
>
>
>> which currently inhibits
>> vectorization of the HIMENO benchmark by default (and make us look bad
>> compared to LLVM).
>
> Here is a small reproducible:
>
> struct  A {
>   int *base;
>   int offset;
>   int offset2;
>   int offset3;
>   int offset4;
>   int offset5;
>   int offset6;
>   int offset7;
>   int offset8;
> };
>
> void foo (struct A * ar1, struct A* ar2)
> {
>       int i;
>       for (i = 0; i < 10000; i++)
>         {
>            ar1->base[i]  = 2*ar2->base[i] + ar2->offset + ar2->offset2
> + ar2->offset3 + ar2->offset4 + ar2->offset5 + ar2->offset6; /* +
> ar2->offset7 + ar2->offset8;*/
>         }
> }
>
> GCC trunk won't vectorize it at O2 due to the limit.
>
>
> There is another problem we should be tracking: GCC no longer
> vectorizes the loop (with a large
> --param=vect-max-version-for-alias-checks=40) when -fno-strict-aliasing
> is specified.  However, with an additional runtime alias check, the loop
> should be vectorizable.


The problem can be reproduced by the following loop:


void foo (int* a, int** b)
{
  int i;
  for (i = 0; i < 1000; ++i)
    a[i] = (*b)[i];
}


When -fno-strict-aliasing is specified, the base address of (*b)[i],
which is *b, could be modified by the store to a[i] if the two alias.
This prevents GCC from treating the base address of (*b)[i] as a loop
invariant, and hence the loop cannot be vectorized. Although we can
still check the aliasing between a[] and *b using a and b (b < a or
b >= a+1000), that check must be done before vectorization, and after
it we still have to do one more, nested aliasing check (between a and
*b this time):


void foo (int* a, int** b)
{
  int i;
  if ((int*)b < a || (int*)b >= a+1000)
  {
    int* c = *b;
    for (i = 0; i < 1000; ++i)
      a[i] = c[i];
  }
  else
  {
    for (i = 0; i < 1000; ++i)
      a[i] = (*b)[i];
  }
}

I think this is too complicated for us to get any benefit from it.


thanks,
Cong



>
> David
>
>
>>
>> So I believe this merging should be done incrementally when
>> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>>
>> Thanks for working on this,
>> Richard.
>>
>>>
>>> thanks,
>>> Cong
>>>
>>>
>>>
>>> Index: gcc/tree-vect-loop-manip.c
>>> ===================================================================
>>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>>> +++ gcc/tree-vect-loop-manip.c (working copy)
>>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>>  along with GCC; see the file COPYING3.  If not see
>>>  <http://www.gnu.org/licenses/>.  */
>>>
>>> +#include <vector>
>>> +#include <utility>
>>> +#include <algorithm>
>>> +
>>>  #include "config.h"
>>>  #include "system.h"
>>>  #include "coretypes.h"
>>> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>>>    return segment_length;
>>>  }
>>>
>>> +namespace
>>> +{
>>> +
>>> +/* struct dr_addr_with_seg_len
>>> +
>>> +   A struct storing information of a data reference, including the data
>>> +   ref itself, its basic address, the access offset and the segment length
>>> +   for aliasing checks.  */
>>> +
>>> +struct dr_addr_with_seg_len
>>> +{
>>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>>> +
>>> +  data_reference* dr;
>>> +  tree basic_addr;
>>> +  tree offset;
>>> +  tree seg_len;
>>> +};
>>> +
>>> +/* Operator == between two dr_addr_with_seg_len objects.
>>> +
>>> +   This equality operator is used to make sure two data refs
>>> +   are the same one so that we will consider to combine the
>>> +   aliasing checks of those two pairs of data dependent data
>>> +   refs.  */
>>> +
>>> +bool operator == (const dr_addr_with_seg_len& d1,
>>> +  const dr_addr_with_seg_len& d2)
>>> +{
>>> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
>>> + && operand_equal_p (d1.offset, d2.offset, 0)
>>> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
>>> +}
>>> +
>>> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
>>> + dr_addr_with_seg_len_pair_t;
>>> +
>>> +
>>> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
>>> +
>>> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
>>> +   so that we can combine aliasing checks during one scan.  */
>>> +
>>> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
>>> + const dr_addr_with_seg_len_pair_t& p2)
>>> +{
>>> +  const dr_addr_with_seg_len& p11 = p1.first;
>>> +  const dr_addr_with_seg_len& p12 = p1.second;
>>> +  const dr_addr_with_seg_len& p21 = p2.first;
>>> +  const dr_addr_with_seg_len& p22 = p2.second;
>>> +
>>> +  if (p11.basic_addr != p21.basic_addr)
>>> +    return p11.basic_addr < p21.basic_addr;
>>> +  if (p12.basic_addr != p22.basic_addr)
>>> +    return p12.basic_addr < p22.basic_addr;
>>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>>> +    return p11.offset < p21.offset;
>>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>>> +  if (TREE_CODE (p12.offset) != INTEGER_CST
>>> +      || TREE_CODE (p22.offset) != INTEGER_CST)
>>> +    return p12.offset < p22.offset;
>>> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
>>> +}
>>> +
>>> +}
>>>
>>>  /* Function vect_create_cond_for_alias_checks.
>>>
>>> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>>>    if (may_alias_ddrs.is_empty ())
>>>      return;
>>>
>>> +
>>> +  /* Basically, for each pair of dependent data refs store_ptr_0
>>> +     and load_ptr_0, we create an expression:
>>> +
>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
>>> +
>>> +     for aliasing checks. However, in some cases we can decrease
>>> +     the number of checks by combining two checks into one. For
>>> +     example, suppose we have another pair of data refs store_ptr_0
>>> +     and load_ptr_1, and if the following condition is satisfied:
>>> +
>>> +     load_ptr_0 < load_ptr_1  &&
>>> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
>>> +
>>> +     (this condition means, in each iteration of vectorized loop,
>>> +     the accessed memory of store_ptr_0 cannot be between the memory
>>> +     of load_ptr_0 and load_ptr_1.)
>>> +
>>> +     we then can use only the following expression to finish the
>>> +     alising checks between store_ptr_0 & load_ptr_0 and
>>> +     store_ptr_0 & load_ptr_1:
>>> +
>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
>>> +
>>> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
>>> +     same basic address.  */
>>> +
>>> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
>>> +
>>> +  /* First, we collect all data ref pairs for aliasing checks.  */
>>> +
>>>    FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>>>      {
>>>        struct data_reference *dr_a, *dr_b;
>>>        gimple dr_group_first_a, dr_group_first_b;
>>> -      tree addr_base_a, addr_base_b;
>>>        tree segment_length_a, segment_length_b;
>>>        gimple stmt_a, stmt_b;
>>> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
>>>
>>>        dr_a = DDR_A (ddr);
>>>        stmt_a = DR_STMT (DDR_A (ddr));
>>>        dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>>>        if (dr_group_first_a)
>>> -        {
>>> + {
>>>    stmt_a = dr_group_first_a;
>>>    dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>>>   }
>>> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>>>        stmt_b = DR_STMT (DDR_B (ddr));
>>>        dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>>>        if (dr_group_first_b)
>>> -        {
>>> + {
>>>    stmt_b = dr_group_first_b;
>>>    dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>>>   }
>>>
>>> -      addr_base_a
>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
>>> -       DR_INIT (dr_a)));
>>> -      addr_base_b
>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
>>> -       DR_INIT (dr_b)));
>>> -
>>>        if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>>>   length_factor = scalar_loop_iters;
>>>        else
>>> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>>>        segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>>>        segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
>>>
>>> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
>>> +  (dr_addr_with_seg_len
>>> +       (dr_a, DR_BASE_ADDRESS (dr_a),
>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
>>> + segment_length_a),
>>> +   dr_addr_with_seg_len
>>> +       (dr_b, DR_BASE_ADDRESS (dr_b),
>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
>>> + segment_length_b));
>>> +
>>> +      if (dr_with_seg_len_pair.first.basic_addr >
>>> +  dr_with_seg_len_pair.second.basic_addr)
>>> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
>>> +
>>> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
>>> +    }
>>> +
>>> +  /* Second, we sort the collected data ref pairs so that we can scan
>>> +     them once to combine all possible aliasing checks.  */
>>> +
>>> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
>>> +
>>> +  /* Remove duplicate data ref pairs.  */
>>> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
>>> + ddrs_with_seg_len.end()),
>>> +   ddrs_with_seg_len.end());
>>> +
>>> +  /* We then scan the sorted dr pairs and check if we can combine
>>> +     alias checks of two neighbouring dr pairs.  */
>>> +
>>> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
>>> +    {
>>> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
>>> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
>>> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
>>> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
>>> +
>>> +      if (dr_a1 == dr_a2)
>>> + {
>>> +  if (dr_b1.basic_addr != dr_b2.basic_addr
>>> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
>>> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
>>> +    continue;
>>> +
>>> +  int diff = int_cst_value (dr_b2.offset) -
>>> +     int_cst_value (dr_b1.offset);
>>> +
>>> +  gcc_assert (diff > 0);
>>> +
>>> +  if (diff <= vect_factor
>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_b1.seg_len) <
>>> +     int_cst_value (dr_a1.seg_len)))
>>> +    {
>>> +      if (dump_enabled_p ())
>>> + {
>>> +  dump_printf_loc
>>> +      (MSG_NOTE, vect_location,
>>> +       "combining two runtime checks for data references ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
>>> +  dump_printf (MSG_NOTE, " and ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
>>> +  dump_printf (MSG_NOTE, "\n");
>>> + }
>>> +
>>> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
>>> +  dr_b2.seg_len, size_int (diff));
>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>> +      --i;
>>> +    }
>>> + }
>>> +      else if (dr_b1 == dr_b2)
>>> + {
>>> +  if (dr_a1.basic_addr != dr_a2.basic_addr
>>> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
>>> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
>>> +    continue;
>>> +
>>> +  int diff = int_cst_value (dr_a2.offset) -
>>> +     int_cst_value (dr_a1.offset);
>>> +
>>> +  gcc_assert (diff > 0);
>>> +
>>> +  if (diff <= vect_factor
>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_a1.seg_len) <
>>> +     int_cst_value (dr_b1.seg_len)))
>>> +    {
>>> +      if (dump_enabled_p ())
>>> + {
>>> +  dump_printf_loc
>>> +      (MSG_NOTE, vect_location,
>>> +       "combining two runtime checks for data references ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
>>> +  dump_printf (MSG_NOTE, " and ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
>>> +  dump_printf (MSG_NOTE, "\n");
>>> + }
>>> +
>>> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
>>> +  dr_a2.seg_len, size_int (diff));
>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>> +      --i;
>>> +    }
>>> + }
>>> +    }
>>> +
>>> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
>>> +    {
>>> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
>>> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
>>> +      tree segment_length_a = dr_a.seg_len;
>>> +      tree segment_length_b = dr_b.seg_len;
>>> +
>>> +      tree addr_base_a
>>> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
>>> +      tree addr_base_b
>>> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
>>> +
>>>        if (dump_enabled_p ())
>>>   {
>>>    dump_printf_loc (MSG_NOTE, vect_location,
>>> -                           "create runtime check for data references ");
>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
>>> +   "create runtime check for data references ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>>>    dump_printf (MSG_NOTE, " and ");
>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
>>> -          dump_printf (MSG_NOTE, "\n");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
>>> +  dump_printf (MSG_NOTE, "\n");
>>>   }
>>>
>>> -      seg_a_min = addr_base_a;
>>> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
>>> +      tree seg_a_min = addr_base_a;
>>> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>>>   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
>>>
>>> -      seg_b_min = addr_base_b;
>>> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
>>> +      tree seg_b_min = addr_base_b;
>>> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>>>   seg_b_min = seg_b_max, seg_b_max = addr_base_b;
>>>
>>>        part_cond_expr =
>>> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>>>        adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>>>      }
>>>
>>> +  /* Extract load and store statements on pointers with zero-stride
>>> +     accesses.  */
>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>> +    {
>>> +
>>> +      /* In the loop body, we iterate each statement to check if it is a load
>>> + or store. Then we check the DR_STEP of the data reference.  If
>>> + DR_STEP is zero, then we will hoist the load statement to the loop
>>> + preheader, and move the store statement to the loop exit.  */
>>> +
>>> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
>>> +   !gsi_end_p (si); )
>>> + {
>>> +  gimple stmt = gsi_stmt (si);
>>> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>>> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
>>> +
>>> +
>>> +  if (dr && integer_zerop (DR_STEP (dr)))
>>> +    {
>>> +      if (DR_IS_READ (dr))
>>> + {
>>> +  if (dump_file)
>>> +    {
>>> +      fprintf (dump_file,
>>> +       "Hoist the load to outside of the loop:\n");
>>> +      print_gimple_stmt (dump_file, stmt, 0,
>>> + TDF_VOPS|TDF_MEMSYMS);
>>> +    }
>>> +
>>> +  basic_block preheader = loop_preheader_edge (loop)->src;
>>> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>> +  gsi_move_after (&si, &si_dst);
>>> + }
>>> +      else
>>> + {
>>> +  gimple_stmt_iterator si_dst =
>>> +      gsi_last_bb (single_exit (loop)->dest);
>>> +  gsi_move_after (&si, &si_dst);
>>> + }
>>> +              continue;
>>> +    }
>>> +  else if (!dr)
>>> +          {
>>> +            bool hoist = true;
>>> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
>>> +            {
>>> +              tree op = gimple_op (stmt, i);
>>> +              if (TREE_CODE (op) == INTEGER_CST
>>> +                  || TREE_CODE (op) == REAL_CST)
>>> +                continue;
>>> +              if (TREE_CODE (op) == SSA_NAME)
>>> +              {
>>> +                gimple def = SSA_NAME_DEF_STMT (op);
>>> +                if (def == stmt
>>> +                    || gimple_nop_p (def)
>>> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
>>> +                  continue;
>>> +              }
>>> +              hoist = false;
>>> +              break;
>>> +            }
>>> +
>>> +            if (hoist)
>>> +            {
>>> +              basic_block preheader = loop_preheader_edge (loop)->src;
>>> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>> +              gsi_move_after (&si, &si_dst);
>>> +              continue;
>>> +            }
>>> +          }
>>> +          gsi_next (&si);
>>> + }
>>> +    }
>>> +
>>>    /* End loop-exit-fixes after versioning.  */
>>>
>>>    if (cond_expr_stmt_list)
>>> Index: gcc/ChangeLog
>>> ===================================================================
>>> --- gcc/ChangeLog (revision 202663)
>>> +++ gcc/ChangeLog (working copy)
>>> @@ -1,3 +1,8 @@
>>> +2013-10-01  Cong Hou  <congh@google.com>
>>> +
>>> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
>>> + alias checks if it is possible to amortize the runtime overhead.
>>> +
>>>
>>>
>>
>> --
>> Richard Biener <rguenther@suse.de>
>> SUSE / SUSE Labs
>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> GF: Jeff Hawn, Jennifer Guild, Felix Imend
Cong Hou Oct. 2, 2013, 10:54 p.m. UTC | #11
I forgot to mention that the alias check merger can reduce the number
of alias checks from 7 to 2 for this example:

struct  A {
  int *base;
  int offset;
  int offset2;
  int offset3;
  int offset4;
  int offset5;
  int offset6;
  int offset7;
  int offset8;
};

void foo (struct A * ar1, struct A* ar2)
{
      int i;
      for (i = 0; i < 10000; i++)
        {
           ar1->base[i] = 2*ar2->base[i] + ar2->offset + ar2->offset2
                          + ar2->offset3 + ar2->offset4 + ar2->offset5
                          + ar2->offset6; /* + ar2->offset7 + ar2->offset8; */
        }
}
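
For reference, here is a minimal hand-written sketch (my own, not code
generated by the patch) of roughly what the remaining two checks amount
to, assuming the seven original pairs are the vector store against the
vector load ar2->base[0:10000] plus against each of the six consecutive
scalar members ar2->offset .. ar2->offset6, and simplifying each
segment length to the whole trip count (it reuses struct A from above):

#include <stdint.h>

static int
no_alias_after_merging (struct A *ar1, struct A *ar2)
{
  uintptr_t st_lo = (uintptr_t) &ar1->base[0];
  uintptr_t st_hi = (uintptr_t) &ar1->base[10000];   /* store segment */

  /* Check 1: store segment vs. the vector load ar2->base[0:10000].  */
  int ok_base = st_hi <= (uintptr_t) &ar2->base[0]
                || (uintptr_t) &ar2->base[10000] <= st_lo;

  /* Check 2: store segment vs. one merged segment covering the six
     consecutive members, replacing six separate tests.  */
  int ok_members = st_hi <= (uintptr_t) &ar2->offset
                   || (uintptr_t) (&ar2->offset6 + 1) <= st_lo;

  return ok_base && ok_members;
}

If this condition is true at run time we take the vectorized version;
the six per-member tests have collapsed into the single ok_members
test.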


thanks,
Cong


On Wed, Oct 2, 2013 at 2:18 PM, Xinliang David Li <davidxl@google.com> wrote:
> On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
>> On Tue, 1 Oct 2013, Cong Hou wrote:
>>
>>> When alias exists between data refs in a loop, to vectorize it GCC
>>> does loop versioning and adds runtime alias checks. Basically for each
>>> pair of data refs with possible data dependence, there will be two
>>> comparisons generated to make sure there is no aliasing between them
>>> in each iteration of the vectorized loop. If there are many such data
>>> refs pairs, the number of comparisons can be very large, which is a
>>> big overhead.
>>>
>>> However, in some cases it is possible to reduce the number of those
>>> comparisons. For example, for the following loop, we can detect that
>>> b[0] and b[1] are two consecutive member accesses so that we can
>>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>>> checking a[0:100]&b[0:2]:
>>>
>>> void foo(int*a, int* b)
>>> {
>>>    for (int i = 0; i < 100; ++i)
>>>     a[i] = b[0] + b[1];
>>> }
>>>
>>> Actually, the requirement of consecutive memory accesses is too
>>> strict. For the following loop, we can still combine the alias checks
>>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>>
>>> void foo(int*a, int* b)
>>> {
>>>    for (int i = 0; i < 100; ++i)
>>>     a[i] = b[0] + b[100];
>>> }
>>>
>>> This is because if b[0] is not in a[0:100] and b[100] is not in
>>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>>> to check a[0:100] and b[0:101] don't overlap.
>>>
>>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>>> segment length of a, b1, and b2. Then we can combine the two
>>> comparisons into one if the following condition is satisfied:
>>>
>>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>>
>>>
>>> This patch detects those combination opportunities to reduce the
>>> number of alias checks. It is tested on an x86-64 machine.
>>
>> Apart from the other comments you got (to which I agree) the patch
>> seems to do two things, namely also:
>>
>> +  /* Extract load and store statements on pointers with zero-stride
>> +     accesses.  */
>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>> +    {
>>
>> which I'd rather see in a separate patch (and done also when
>> the loop doesn't require versioning for alias).
>
> yes.
>
>>
>> Also combining the alias checks in vect_create_cond_for_alias_checks
>> is nice but doesn't properly fix the use of the
>> vect-max-version-for-alias-checks param
>
> Yes. The handling of this should be moved to
> 'vect_prune_runtime_alias_test_list' to avoid premature decisions.
>
>
>
>>which currently inhibits
>> vectorization of the HIMENO benchmark by default (and make us look bad
>> compared to LLVM).
>
> Here is a small reproducible:
>
> struct  A {
>   int *base;
>   int offset;
>   int offset2;
>   int offset3;
>   int offset4;
>   int offset5;
>   int offset6;
>   int offset7;
>   int offset8;
> };
>
> void foo (struct A * ar1, struct A* ar2)
> {
>       int i;
>       for (i = 0; i < 10000; i++)
>         {
>            ar1->base[i]  = 2*ar2->base[i] + ar2->offset + ar2->offset2
> + ar2->offset3 + ar2->offset4 + ar2->offset5 + ar2->offset6; /* +
> ar2->offset7 + ar2->offset8;*/
>         }
> }
>
> GCC trunk won't vectorize it at O2 due to the limit.
>
>
> There is another problem we should be tracking: GCC no longer
> vectorize the loop (with large
> --param=vect-max-version-for-alias-checks=40) when -fno-strict-alias
> is specified.   However with additional runtime alias check, the loop
> should be vectorizable.
>
> David
>
>
>>
>> So I believe this merging should be done incrementally when
>> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>>
>> Thanks for working on this,
>> Richard.
>>
>>>
>>> thanks,
>>> Cong
>>>
>>>
>>>
>>> Index: gcc/tree-vect-loop-manip.c
>>> ===================================================================
>>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>>> +++ gcc/tree-vect-loop-manip.c (working copy)
>>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>>  along with GCC; see the file COPYING3.  If not see
>>>  <http://www.gnu.org/licenses/>.  */
>>>
>>> +#include <vector>
>>> +#include <utility>
>>> +#include <algorithm>
>>> +
>>>  #include "config.h"
>>>  #include "system.h"
>>>  #include "coretypes.h"
>>> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>>>    return segment_length;
>>>  }
>>>
>>> +namespace
>>> +{
>>> +
>>> +/* struct dr_addr_with_seg_len
>>> +
>>> +   A struct storing information of a data reference, including the data
>>> +   ref itself, its basic address, the access offset and the segment length
>>> +   for aliasing checks.  */
>>> +
>>> +struct dr_addr_with_seg_len
>>> +{
>>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>>> +
>>> +  data_reference* dr;
>>> +  tree basic_addr;
>>> +  tree offset;
>>> +  tree seg_len;
>>> +};
>>> +
>>> +/* Operator == between two dr_addr_with_seg_len objects.
>>> +
>>> +   This equality operator is used to make sure two data refs
>>> +   are the same one so that we will consider to combine the
>>> +   aliasing checks of those two pairs of data dependent data
>>> +   refs.  */
>>> +
>>> +bool operator == (const dr_addr_with_seg_len& d1,
>>> +  const dr_addr_with_seg_len& d2)
>>> +{
>>> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
>>> + && operand_equal_p (d1.offset, d2.offset, 0)
>>> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
>>> +}
>>> +
>>> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
>>> + dr_addr_with_seg_len_pair_t;
>>> +
>>> +
>>> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
>>> +
>>> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
>>> +   so that we can combine aliasing checks during one scan.  */
>>> +
>>> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
>>> + const dr_addr_with_seg_len_pair_t& p2)
>>> +{
>>> +  const dr_addr_with_seg_len& p11 = p1.first;
>>> +  const dr_addr_with_seg_len& p12 = p1.second;
>>> +  const dr_addr_with_seg_len& p21 = p2.first;
>>> +  const dr_addr_with_seg_len& p22 = p2.second;
>>> +
>>> +  if (p11.basic_addr != p21.basic_addr)
>>> +    return p11.basic_addr < p21.basic_addr;
>>> +  if (p12.basic_addr != p22.basic_addr)
>>> +    return p12.basic_addr < p22.basic_addr;
>>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>>> +    return p11.offset < p21.offset;
>>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>>> +  if (TREE_CODE (p12.offset) != INTEGER_CST
>>> +      || TREE_CODE (p22.offset) != INTEGER_CST)
>>> +    return p12.offset < p22.offset;
>>> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
>>> +}
>>> +
>>> +}
>>>
>>>  /* Function vect_create_cond_for_alias_checks.
>>>
>>> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>>>    if (may_alias_ddrs.is_empty ())
>>>      return;
>>>
>>> +
>>> +  /* Basically, for each pair of dependent data refs store_ptr_0
>>> +     and load_ptr_0, we create an expression:
>>> +
>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
>>> +
>>> +     for aliasing checks. However, in some cases we can decrease
>>> +     the number of checks by combining two checks into one. For
>>> +     example, suppose we have another pair of data refs store_ptr_0
>>> +     and load_ptr_1, and if the following condition is satisfied:
>>> +
>>> +     load_ptr_0 < load_ptr_1  &&
>>> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
>>> +
>>> +     (this condition means, in each iteration of vectorized loop,
>>> +     the accessed memory of store_ptr_0 cannot be between the memory
>>> +     of load_ptr_0 and load_ptr_1.)
>>> +
>>> +     we then can use only the following expression to finish the
>>> +     alising checks between store_ptr_0 & load_ptr_0 and
>>> +     store_ptr_0 & load_ptr_1:
>>> +
>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
>>> +
>>> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
>>> +     same basic address.  */
>>> +
>>> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
>>> +
>>> +  /* First, we collect all data ref pairs for aliasing checks.  */
>>> +
>>>    FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>>>      {
>>>        struct data_reference *dr_a, *dr_b;
>>>        gimple dr_group_first_a, dr_group_first_b;
>>> -      tree addr_base_a, addr_base_b;
>>>        tree segment_length_a, segment_length_b;
>>>        gimple stmt_a, stmt_b;
>>> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
>>>
>>>        dr_a = DDR_A (ddr);
>>>        stmt_a = DR_STMT (DDR_A (ddr));
>>>        dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>>>        if (dr_group_first_a)
>>> -        {
>>> + {
>>>    stmt_a = dr_group_first_a;
>>>    dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>>>   }
>>> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>>>        stmt_b = DR_STMT (DDR_B (ddr));
>>>        dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>>>        if (dr_group_first_b)
>>> -        {
>>> + {
>>>    stmt_b = dr_group_first_b;
>>>    dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>>>   }
>>>
>>> -      addr_base_a
>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
>>> -       DR_INIT (dr_a)));
>>> -      addr_base_b
>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
>>> -       DR_INIT (dr_b)));
>>> -
>>>        if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>>>   length_factor = scalar_loop_iters;
>>>        else
>>> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>>>        segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>>>        segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
>>>
>>> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
>>> +  (dr_addr_with_seg_len
>>> +       (dr_a, DR_BASE_ADDRESS (dr_a),
>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
>>> + segment_length_a),
>>> +   dr_addr_with_seg_len
>>> +       (dr_b, DR_BASE_ADDRESS (dr_b),
>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
>>> + segment_length_b));
>>> +
>>> +      if (dr_with_seg_len_pair.first.basic_addr >
>>> +  dr_with_seg_len_pair.second.basic_addr)
>>> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
>>> +
>>> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
>>> +    }
>>> +
>>> +  /* Second, we sort the collected data ref pairs so that we can scan
>>> +     them once to combine all possible aliasing checks.  */
>>> +
>>> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
>>> +
>>> +  /* Remove duplicate data ref pairs.  */
>>> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
>>> + ddrs_with_seg_len.end()),
>>> +   ddrs_with_seg_len.end());
>>> +
>>> +  /* We then scan the sorted dr pairs and check if we can combine
>>> +     alias checks of two neighbouring dr pairs.  */
>>> +
>>> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
>>> +    {
>>> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
>>> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
>>> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
>>> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
>>> +
>>> +      if (dr_a1 == dr_a2)
>>> + {
>>> +  if (dr_b1.basic_addr != dr_b2.basic_addr
>>> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
>>> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
>>> +    continue;
>>> +
>>> +  int diff = int_cst_value (dr_b2.offset) -
>>> +     int_cst_value (dr_b1.offset);
>>> +
>>> +  gcc_assert (diff > 0);
>>> +
>>> +  if (diff <= vect_factor
>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_b1.seg_len) <
>>> +     int_cst_value (dr_a1.seg_len)))
>>> +    {
>>> +      if (dump_enabled_p ())
>>> + {
>>> +  dump_printf_loc
>>> +      (MSG_NOTE, vect_location,
>>> +       "combining two runtime checks for data references ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
>>> +  dump_printf (MSG_NOTE, " and ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
>>> +  dump_printf (MSG_NOTE, "\n");
>>> + }
>>> +
>>> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
>>> +  dr_b2.seg_len, size_int (diff));
>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>> +      --i;
>>> +    }
>>> + }
>>> +      else if (dr_b1 == dr_b2)
>>> + {
>>> +  if (dr_a1.basic_addr != dr_a2.basic_addr
>>> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
>>> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
>>> +    continue;
>>> +
>>> +  int diff = int_cst_value (dr_a2.offset) -
>>> +     int_cst_value (dr_a1.offset);
>>> +
>>> +  gcc_assert (diff > 0);
>>> +
>>> +  if (diff <= vect_factor
>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_a1.seg_len) <
>>> +     int_cst_value (dr_b1.seg_len)))
>>> +    {
>>> +      if (dump_enabled_p ())
>>> + {
>>> +  dump_printf_loc
>>> +      (MSG_NOTE, vect_location,
>>> +       "combining two runtime checks for data references ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
>>> +  dump_printf (MSG_NOTE, " and ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
>>> +  dump_printf (MSG_NOTE, "\n");
>>> + }
>>> +
>>> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
>>> +  dr_a2.seg_len, size_int (diff));
>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>> +      --i;
>>> +    }
>>> + }
>>> +    }
>>> +
>>> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
>>> +    {
>>> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
>>> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
>>> +      tree segment_length_a = dr_a.seg_len;
>>> +      tree segment_length_b = dr_b.seg_len;
>>> +
>>> +      tree addr_base_a
>>> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
>>> +      tree addr_base_b
>>> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
>>> +
>>>        if (dump_enabled_p ())
>>>   {
>>>    dump_printf_loc (MSG_NOTE, vect_location,
>>> -                           "create runtime check for data references ");
>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
>>> +   "create runtime check for data references ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>>>    dump_printf (MSG_NOTE, " and ");
>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
>>> -          dump_printf (MSG_NOTE, "\n");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
>>> +  dump_printf (MSG_NOTE, "\n");
>>>   }
>>>
>>> -      seg_a_min = addr_base_a;
>>> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
>>> +      tree seg_a_min = addr_base_a;
>>> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>>>   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
>>>
>>> -      seg_b_min = addr_base_b;
>>> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
>>> +      tree seg_b_min = addr_base_b;
>>> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>>>   seg_b_min = seg_b_max, seg_b_max = addr_base_b;
>>>
>>>        part_cond_expr =
>>> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>>>        adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>>>      }
>>>
>>> +  /* Extract load and store statements on pointers with zero-stride
>>> +     accesses.  */
>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>> +    {
>>> +
>>> +      /* In the loop body, we iterate each statement to check if it is a load
>>> + or store. Then we check the DR_STEP of the data reference.  If
>>> + DR_STEP is zero, then we will hoist the load statement to the loop
>>> + preheader, and move the store statement to the loop exit.  */
>>> +
>>> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
>>> +   !gsi_end_p (si); )
>>> + {
>>> +  gimple stmt = gsi_stmt (si);
>>> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>>> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
>>> +
>>> +
>>> +  if (dr && integer_zerop (DR_STEP (dr)))
>>> +    {
>>> +      if (DR_IS_READ (dr))
>>> + {
>>> +  if (dump_file)
>>> +    {
>>> +      fprintf (dump_file,
>>> +       "Hoist the load to outside of the loop:\n");
>>> +      print_gimple_stmt (dump_file, stmt, 0,
>>> + TDF_VOPS|TDF_MEMSYMS);
>>> +    }
>>> +
>>> +  basic_block preheader = loop_preheader_edge (loop)->src;
>>> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>> +  gsi_move_after (&si, &si_dst);
>>> + }
>>> +      else
>>> + {
>>> +  gimple_stmt_iterator si_dst =
>>> +      gsi_last_bb (single_exit (loop)->dest);
>>> +  gsi_move_after (&si, &si_dst);
>>> + }
>>> +              continue;
>>> +    }
>>> +  else if (!dr)
>>> +          {
>>> +            bool hoist = true;
>>> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
>>> +            {
>>> +              tree op = gimple_op (stmt, i);
>>> +              if (TREE_CODE (op) == INTEGER_CST
>>> +                  || TREE_CODE (op) == REAL_CST)
>>> +                continue;
>>> +              if (TREE_CODE (op) == SSA_NAME)
>>> +              {
>>> +                gimple def = SSA_NAME_DEF_STMT (op);
>>> +                if (def == stmt
>>> +                    || gimple_nop_p (def)
>>> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
>>> +                  continue;
>>> +              }
>>> +              hoist = false;
>>> +              break;
>>> +            }
>>> +
>>> +            if (hoist)
>>> +            {
>>> +              basic_block preheader = loop_preheader_edge (loop)->src;
>>> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>> +              gsi_move_after (&si, &si_dst);
>>> +              continue;
>>> +            }
>>> +          }
>>> +          gsi_next (&si);
>>> + }
>>> +    }
>>> +
>>>    /* End loop-exit-fixes after versioning.  */
>>>
>>>    if (cond_expr_stmt_list)
>>> Index: gcc/ChangeLog
>>> ===================================================================
>>> --- gcc/ChangeLog (revision 202663)
>>> +++ gcc/ChangeLog (working copy)
>>> @@ -1,3 +1,8 @@
>>> +2013-10-01  Cong Hou  <congh@google.com>
>>> +
>>> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
>>> + alias checks if it is possible to amortize the runtime overhead.
>>> +
>>>
>>>
>>
>> --
>> Richard Biener <rguenther@suse.de>
>> SUSE / SUSE Labs
>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> GF: Jeff Hawn, Jennifer Guild, Felix Imend
Cong Hou Oct. 3, 2013, 12:30 a.m. UTC | #12
On Wed, Oct 2, 2013 at 2:47 PM, Xinliang David Li <davidxl@google.com> wrote:
> I think you need to augment (using a wrapper class) the DDR to capture
> more information about aliased memory pairs. It should be flexible
> enough to handle the following cases (you don't have to handle all
> cases in your first patch, but keep those in mind).


In order to carry the information in this augmented structure from the
analysis phase to the transformation phase, should we add one more
member to loop_vec_info? Note that currently almost all
vectorization-related information is contained in that struct.
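
A rough sketch of what that could look like (the field name, the
accessor macro and the exact type below are my assumptions for
illustration, not existing definitions; the pair type would also need
to move out of the anonymous namespace so that it can be named in
tree-vectorizer.h):

  /* Hypothetical addition to the loop_vec_info structure: data ref
     pairs (with basic address, offset and segment length) that still
     need a runtime alias check after merging, filled in during the
     analysis phase and read back when generating the versioning
     condition.  */
  vec<dr_addr_with_seg_len_pair_t> comp_alias_ddrs;

  /* Hypothetical accessor, following the existing LOOP_VINFO_* style.  */
  #define LOOP_VINFO_COMP_ALIAS_DDRS(L) (L)->comp_alias_ddrs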


>
> 1) All accesses in the same group have constant offsets:
>
>     b[i], b[i+1], b[i+2] etc

This is the easy case.

>
> 2) Accesses in the same group may have offset which is specified by a
> unsigned value:
>
>    unsigned N = ...
>
>    b[i], b[i+N]

If the value of N or its upper bound (see the next case) is unknown at
compile time, we cannot merge the alias checks for a & b[i] and a &
b[i+N]. This is because the segment of a may lie between b[i] and
b[i+N], so the merged check would not be equivalent to the two
separate ones.
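
To illustrate (this is my own made-up example, not from the patch):

void baz (int *a, int *b, unsigned N, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] = b[i] + b[i + N];
}

If, say, a == b + n and N >= 2*n, then neither b[0:n] nor b[N:N+n]
overlaps a[0:n], but a[0:n] lies entirely inside the merged segment
b[0:N+n]. A combined check would therefore fail at run time and we
would fall back to the scalar loop even though the separate checks
would have allowed the vectorized version.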

>
> 3) Accesses have offset with value range > 0:
>
>    for (j = 0; j < 10000; j++)
>            for (i = 0; i < ...; i++)
>              {
>                   .... b[i] ....
>                   .... b[i + j ] ....               // j > 0
>               }
>

If we know j is greater than 0 and has a constant upper bound, we can
use this information when merging alias checks. For an induction
variable j, its upper bound can be queried easily. But what if j is
not an induction variable, as in:

unsigned j = ...;
if (j < 1000)
{
 for (i = 0; i < ...; i++)
  {
    .... b[i] ....
    .... b[i + j ] ....
  }
}

In the current GCC implementation, how can we get the upper bound of j
here? Should we search the control-dependent predicates of the loop to
see whether we can derive an upper bound for j?


>
> 4) base addresses are assigned from the same buffer:
>
>     b1  = &buffer[0];
>     b2 = &buffer[10000];
>     b3 = &buffer[20000];
>
> for (...)
>   {
>          ..b1[i]..
>          ..b2[i]..
>          ..
>    }

This case helped me find a bug in my patch. Here the basic address of
b1 is an ADDR_EXPR, &buffer, instead of buffer. I should not compare
the pointer values of two basic addresses any more but should use
operand_equal_p() instead. So Jakub is right: I should not sort all
ddr pairs by comparing pointer values. I once wrote a comparison
function and will consider using it for sorting.


>
> 5) More elaborate case:
>
>    for (i = 0; i< 3; i++)
>       base[i] = &buffer[i*N];
>
>  b1 = base[0];
>  b2 = base[1];
>  ...
> for ()
> {
>    .. b1[i]..
>     ..
> }

After loop unrolling, this case becomes the same as the previous one.


thanks,
Cong


>
> David
>
>
> On Wed, Oct 2, 2013 at 2:34 PM, Cong Hou <congh@google.com> wrote:
>> On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
>>> On Tue, 1 Oct 2013, Cong Hou wrote:
>>>
>>>> When alias exists between data refs in a loop, to vectorize it GCC
>>>> does loop versioning and adds runtime alias checks. Basically for each
>>>> pair of data refs with possible data dependence, there will be two
>>>> comparisons generated to make sure there is no aliasing between them
>>>> in each iteration of the vectorized loop. If there are many such data
>>>> refs pairs, the number of comparisons can be very large, which is a
>>>> big overhead.
>>>>
>>>> However, in some cases it is possible to reduce the number of those
>>>> comparisons. For example, for the following loop, we can detect that
>>>> b[0] and b[1] are two consecutive member accesses so that we can
>>>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>>>> checking a[0:100]&b[0:2]:
>>>>
>>>> void foo(int*a, int* b)
>>>> {
>>>>    for (int i = 0; i < 100; ++i)
>>>>     a[i] = b[0] + b[1];
>>>> }
>>>>
>>>> Actually, the requirement of consecutive memory accesses is too
>>>> strict. For the following loop, we can still combine the alias checks
>>>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>>>
>>>> void foo(int*a, int* b)
>>>> {
>>>>    for (int i = 0; i < 100; ++i)
>>>>     a[i] = b[0] + b[100];
>>>> }
>>>>
>>>> This is because if b[0] is not in a[0:100] and b[100] is not in
>>>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>>>> to check a[0:100] and b[0:101] don't overlap.
>>>>
>>>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>>>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>>>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>>>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>>>> segment length of a, b1, and b2. Then we can combine the two
>>>> comparisons into one if the following condition is satisfied:
>>>>
>>>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>>>
>>>>
>>>> This patch detects those combination opportunities to reduce the
>>>> number of alias checks. It is tested on an x86-64 machine.
>>>
>>> Apart from the other comments you got (to which I agree) the patch
>>> seems to do two things, namely also:
>>>
>>> +  /* Extract load and store statements on pointers with zero-stride
>>> +     accesses.  */
>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>> +    {
>>>
>>> which I'd rather see in a separate patch (and done also when
>>> the loop doesn't require versioning for alias).
>>>
>>
>>
>> My mistake.. I am working on those two patches at the same time and
>> pasted that one also here by mistake. I will send another patch about
>> the "hoist" topic.
>>
>>
>>> Also combining the alias checks in vect_create_cond_for_alias_checks
>>> is nice but doesn't properly fix the use of the
>>> vect-max-version-for-alias-checks param which currently inhibits
>>> vectorization of the HIMENO benchmark by default (and make us look bad
>>> compared to LLVM).
>>>
>>> So I believe this merging should be done incrementally when
>>> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>>>
>>
>>
>> I agree that vect-max-version-for-alias-checks param should count the
>> number of checks after the merge. However, the struct
>> data_dependence_relation could not record the new information produced
>> by the merge. The new information I mentioned contains the new segment
>> length for comparisons. This length is calculated right in
>> vect_create_cond_for_alias_checks() function. Since
>> vect-max-version-for-alias-checks is used during analysis phase, shall
>> we move all those (get segment length for each data ref and merge
>> alias checks) from transformation to analysis phase? If we cannot
>> store the result properly (data_dependence_relation is not enough),
>> shall we do it twice in both phases?
>>
>> I also noticed a possible bug in the function vect_same_range_drs()
>> called by vect_prune_runtime_alias_test_list(). For the following code
>> I get two pairs of data refs after
>> vect_prune_runtime_alias_test_list(), but in
>> vect_create_cond_for_alias_checks() after detecting grouped accesses I
>> got two identical pairs of data refs. The consequence is two identical
>> alias checks are produced.
>>
>>
>> void yuv2yuyv_ref (int *d, int *src, int n)
>> {
>>   char *dest = (char *)d;
>>   int i;
>>
>>   for(i=0;i<n/2;i++){
>>     dest[i*4 + 0] = (src[i*2 + 0])>>16;
>>     dest[i*4 + 1] = (src[i*2 + 1])>>8;
>>     dest[i*4 + 2] = (src[i*2 + 0])>>16;
>>     dest[i*4 + 3] = (src[i*2 + 0])>>0;
>>   }
>> }
>>
>>
>> I think the solution to this problem is changing
>>
>> GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
>> == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)
>>
>> into
>>
>> STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
>> (vinfo_for_stmt (stmt_i))))
>> == STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
>> (vinfo_for_stmt (stmt_j)))
>>
>>
>> in function vect_same_range_drs(). What do you think about it?
>>
>>
>> thanks,
>> Cong
>>
>>
>>
>>> Thanks for working on this,
>>> Richard.
>>>
>>>>
>>>> thanks,
>>>> Cong
>>>>
>>>>
>>>>
>>>> Index: gcc/tree-vect-loop-manip.c
>>>> ===================================================================
>>>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>>>> +++ gcc/tree-vect-loop-manip.c (working copy)
>>>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>>>  along with GCC; see the file COPYING3.  If not see
>>>>  <http://www.gnu.org/licenses/>.  */
>>>>
>>>> +#include <vector>
>>>> +#include <utility>
>>>> +#include <algorithm>
>>>> +
>>>>  #include "config.h"
>>>>  #include "system.h"
>>>>  #include "coretypes.h"
>>>> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>>>>    return segment_length;
>>>>  }
>>>>
>>>> +namespace
>>>> +{
>>>> +
>>>> +/* struct dr_addr_with_seg_len
>>>> +
>>>> +   A struct storing information of a data reference, including the data
>>>> +   ref itself, its basic address, the access offset and the segment length
>>>> +   for aliasing checks.  */
>>>> +
>>>> +struct dr_addr_with_seg_len
>>>> +{
>>>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>>>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>>>> +
>>>> +  data_reference* dr;
>>>> +  tree basic_addr;
>>>> +  tree offset;
>>>> +  tree seg_len;
>>>> +};
>>>> +
>>>> +/* Operator == between two dr_addr_with_seg_len objects.
>>>> +
>>>> +   This equality operator is used to make sure two data refs
>>>> +   are the same one so that we will consider to combine the
>>>> +   aliasing checks of those two pairs of data dependent data
>>>> +   refs.  */
>>>> +
>>>> +bool operator == (const dr_addr_with_seg_len& d1,
>>>> +  const dr_addr_with_seg_len& d2)
>>>> +{
>>>> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
>>>> + && operand_equal_p (d1.offset, d2.offset, 0)
>>>> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
>>>> +}
>>>> +
>>>> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
>>>> + dr_addr_with_seg_len_pair_t;
>>>> +
>>>> +
>>>> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
>>>> +
>>>> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
>>>> +   so that we can combine aliasing checks during one scan.  */
>>>> +
>>>> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
>>>> + const dr_addr_with_seg_len_pair_t& p2)
>>>> +{
>>>> +  const dr_addr_with_seg_len& p11 = p1.first;
>>>> +  const dr_addr_with_seg_len& p12 = p1.second;
>>>> +  const dr_addr_with_seg_len& p21 = p2.first;
>>>> +  const dr_addr_with_seg_len& p22 = p2.second;
>>>> +
>>>> +  if (p11.basic_addr != p21.basic_addr)
>>>> +    return p11.basic_addr < p21.basic_addr;
>>>> +  if (p12.basic_addr != p22.basic_addr)
>>>> +    return p12.basic_addr < p22.basic_addr;
>>>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>>>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>>>> +    return p11.offset < p21.offset;
>>>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>>>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>>>> +  if (TREE_CODE (p12.offset) != INTEGER_CST
>>>> +      || TREE_CODE (p22.offset) != INTEGER_CST)
>>>> +    return p12.offset < p22.offset;
>>>> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
>>>> +}
>>>> +
>>>> +}
>>>>
>>>>  /* Function vect_create_cond_for_alias_checks.
>>>>
>>>> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>>>>    if (may_alias_ddrs.is_empty ())
>>>>      return;
>>>>
>>>> +
>>>> +  /* Basically, for each pair of dependent data refs store_ptr_0
>>>> +     and load_ptr_0, we create an expression:
>>>> +
>>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>>> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
>>>> +
>>>> +     for aliasing checks. However, in some cases we can decrease
>>>> +     the number of checks by combining two checks into one. For
>>>> +     example, suppose we have another pair of data refs store_ptr_0
>>>> +     and load_ptr_1, and if the following condition is satisfied:
>>>> +
>>>> +     load_ptr_0 < load_ptr_1  &&
>>>> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
>>>> +
>>>> +     (this condition means, in each iteration of vectorized loop,
>>>> +     the accessed memory of store_ptr_0 cannot be between the memory
>>>> +     of load_ptr_0 and load_ptr_1.)
>>>> +
>>>> +     we then can use only the following expression to finish the
>>>> +     alising checks between store_ptr_0 & load_ptr_0 and
>>>> +     store_ptr_0 & load_ptr_1:
>>>> +
>>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>>> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
>>>> +
>>>> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
>>>> +     same basic address.  */
>>>> +
>>>> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
>>>> +
>>>> +  /* First, we collect all data ref pairs for aliasing checks.  */
>>>> +
>>>>    FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>>>>      {
>>>>        struct data_reference *dr_a, *dr_b;
>>>>        gimple dr_group_first_a, dr_group_first_b;
>>>> -      tree addr_base_a, addr_base_b;
>>>>        tree segment_length_a, segment_length_b;
>>>>        gimple stmt_a, stmt_b;
>>>> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
>>>>
>>>>        dr_a = DDR_A (ddr);
>>>>        stmt_a = DR_STMT (DDR_A (ddr));
>>>>        dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>>>>        if (dr_group_first_a)
>>>> -        {
>>>> + {
>>>>    stmt_a = dr_group_first_a;
>>>>    dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>>>>   }
>>>> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>>>>        stmt_b = DR_STMT (DDR_B (ddr));
>>>>        dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>>>>        if (dr_group_first_b)
>>>> -        {
>>>> + {
>>>>    stmt_b = dr_group_first_b;
>>>>    dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>>>>   }
>>>>
>>>> -      addr_base_a
>>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
>>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
>>>> -       DR_INIT (dr_a)));
>>>> -      addr_base_b
>>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
>>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
>>>> -       DR_INIT (dr_b)));
>>>> -
>>>>        if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>>>>   length_factor = scalar_loop_iters;
>>>>        else
>>>> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>>>>        segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>>>>        segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
>>>>
>>>> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
>>>> +  (dr_addr_with_seg_len
>>>> +       (dr_a, DR_BASE_ADDRESS (dr_a),
>>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
>>>> + segment_length_a),
>>>> +   dr_addr_with_seg_len
>>>> +       (dr_b, DR_BASE_ADDRESS (dr_b),
>>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
>>>> + segment_length_b));
>>>> +
>>>> +      if (dr_with_seg_len_pair.first.basic_addr >
>>>> +  dr_with_seg_len_pair.second.basic_addr)
>>>> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
>>>> +
>>>> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
>>>> +    }
>>>> +
>>>> +  /* Second, we sort the collected data ref pairs so that we can scan
>>>> +     them once to combine all possible aliasing checks.  */
>>>> +
>>>> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
>>>> +
>>>> +  /* Remove duplicate data ref pairs.  */
>>>> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
>>>> + ddrs_with_seg_len.end()),
>>>> +   ddrs_with_seg_len.end());
>>>> +
>>>> +  /* We then scan the sorted dr pairs and check if we can combine
>>>> +     alias checks of two neighbouring dr pairs.  */
>>>> +
>>>> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
>>>> +    {
>>>> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
>>>> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
>>>> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
>>>> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
>>>> +
>>>> +      if (dr_a1 == dr_a2)
>>>> + {
>>>> +  if (dr_b1.basic_addr != dr_b2.basic_addr
>>>> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
>>>> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
>>>> +    continue;
>>>> +
>>>> +  int diff = int_cst_value (dr_b2.offset) -
>>>> +     int_cst_value (dr_b1.offset);
>>>> +
>>>> +  gcc_assert (diff > 0);
>>>> +
>>>> +  if (diff <= vect_factor
>>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
>>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_b1.seg_len) <
>>>> +     int_cst_value (dr_a1.seg_len)))
>>>> +    {
>>>> +      if (dump_enabled_p ())
>>>> + {
>>>> +  dump_printf_loc
>>>> +      (MSG_NOTE, vect_location,
>>>> +       "combining two runtime checks for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
>>>> +  dump_printf (MSG_NOTE, " and ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>> + }
>>>> +
>>>> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
>>>> +  dr_b2.seg_len, size_int (diff));
>>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>>> +      --i;
>>>> +    }
>>>> + }
>>>> +      else if (dr_b1 == dr_b2)
>>>> + {
>>>> +  if (dr_a1.basic_addr != dr_a2.basic_addr
>>>> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
>>>> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
>>>> +    continue;
>>>> +
>>>> +  int diff = int_cst_value (dr_a2.offset) -
>>>> +     int_cst_value (dr_a1.offset);
>>>> +
>>>> +  gcc_assert (diff > 0);
>>>> +
>>>> +  if (diff <= vect_factor
>>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
>>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_a1.seg_len) <
>>>> +     int_cst_value (dr_b1.seg_len)))
>>>> +    {
>>>> +      if (dump_enabled_p ())
>>>> + {
>>>> +  dump_printf_loc
>>>> +      (MSG_NOTE, vect_location,
>>>> +       "combining two runtime checks for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
>>>> +  dump_printf (MSG_NOTE, " and ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>> + }
>>>> +
>>>> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
>>>> +  dr_a2.seg_len, size_int (diff));
>>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>>> +      --i;
>>>> +    }
>>>> + }
>>>> +    }
>>>> +
>>>> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
>>>> +    {
>>>> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
>>>> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
>>>> +      tree segment_length_a = dr_a.seg_len;
>>>> +      tree segment_length_b = dr_b.seg_len;
>>>> +
>>>> +      tree addr_base_a
>>>> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
>>>> +      tree addr_base_b
>>>> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
>>>> +
>>>>        if (dump_enabled_p ())
>>>>   {
>>>>    dump_printf_loc (MSG_NOTE, vect_location,
>>>> -                           "create runtime check for data references ");
>>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
>>>> +   "create runtime check for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>>>>    dump_printf (MSG_NOTE, " and ");
>>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
>>>> -          dump_printf (MSG_NOTE, "\n");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>>   }
>>>>
>>>> -      seg_a_min = addr_base_a;
>>>> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>>> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
>>>> +      tree seg_a_min = addr_base_a;
>>>> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>>> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>>>>   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
>>>>
>>>> -      seg_b_min = addr_base_b;
>>>> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>>> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
>>>> +      tree seg_b_min = addr_base_b;
>>>> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>>> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>>>>   seg_b_min = seg_b_max, seg_b_max = addr_base_b;
>>>>
>>>>        part_cond_expr =
>>>> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>>>>        adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>>>>      }
>>>>
>>>> +  /* Extract load and store statements on pointers with zero-stride
>>>> +     accesses.  */
>>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>>> +    {
>>>> +
>>>> +      /* In the loop body, we iterate each statement to check if it is a load
>>>> + or store. Then we check the DR_STEP of the data reference.  If
>>>> + DR_STEP is zero, then we will hoist the load statement to the loop
>>>> + preheader, and move the store statement to the loop exit.  */
>>>> +
>>>> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
>>>> +   !gsi_end_p (si); )
>>>> + {
>>>> +  gimple stmt = gsi_stmt (si);
>>>> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>>>> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
>>>> +
>>>> +
>>>> +  if (dr && integer_zerop (DR_STEP (dr)))
>>>> +    {
>>>> +      if (DR_IS_READ (dr))
>>>> + {
>>>> +  if (dump_file)
>>>> +    {
>>>> +      fprintf (dump_file,
>>>> +       "Hoist the load to outside of the loop:\n");
>>>> +      print_gimple_stmt (dump_file, stmt, 0,
>>>> + TDF_VOPS|TDF_MEMSYMS);
>>>> +    }
>>>> +
>>>> +  basic_block preheader = loop_preheader_edge (loop)->src;
>>>> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>>> +  gsi_move_after (&si, &si_dst);
>>>> + }
>>>> +      else
>>>> + {
>>>> +  gimple_stmt_iterator si_dst =
>>>> +      gsi_last_bb (single_exit (loop)->dest);
>>>> +  gsi_move_after (&si, &si_dst);
>>>> + }
>>>> +              continue;
>>>> +    }
>>>> +  else if (!dr)
>>>> +          {
>>>> +            bool hoist = true;
>>>> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
>>>> +            {
>>>> +              tree op = gimple_op (stmt, i);
>>>> +              if (TREE_CODE (op) == INTEGER_CST
>>>> +                  || TREE_CODE (op) == REAL_CST)
>>>> +                continue;
>>>> +              if (TREE_CODE (op) == SSA_NAME)
>>>> +              {
>>>> +                gimple def = SSA_NAME_DEF_STMT (op);
>>>> +                if (def == stmt
>>>> +                    || gimple_nop_p (def)
>>>> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
>>>> +                  continue;
>>>> +              }
>>>> +              hoist = false;
>>>> +              break;
>>>> +            }
>>>> +
>>>> +            if (hoist)
>>>> +            {
>>>> +              basic_block preheader = loop_preheader_edge (loop)->src;
>>>> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>>> +              gsi_move_after (&si, &si_dst);
>>>> +              continue;
>>>> +            }
>>>> +          }
>>>> +          gsi_next (&si);
>>>> + }
>>>> +    }
>>>> +
>>>>    /* End loop-exit-fixes after versioning.  */
>>>>
>>>>    if (cond_expr_stmt_list)
>>>> Index: gcc/ChangeLog
>>>> ===================================================================
>>>> --- gcc/ChangeLog (revision 202663)
>>>> +++ gcc/ChangeLog (working copy)
>>>> @@ -1,3 +1,8 @@
>>>> +2013-10-01  Cong Hou  <congh@google.com>
>>>> +
>>>> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
>>>> + alias checks if it is possible to amortize the runtime overhead.
>>>> +
>>>>
>>>>
>>>
>>> --
>>> Richard Biener <rguenther@suse.de>
>>> SUSE / SUSE Labs
>>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>>> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
Xinliang David Li Oct. 3, 2013, 3:07 p.m. UTC | #13
General loop multi-versioning (as a pass) can be a very powerful enabler,
especially when combined with FDO. Loop MV can be done based on
many different conditions:
1) loop trip count (e.g., when we do value profiling on the trip count,
not based on the average count);
2) memory aliasing;
3) target options -- though we do not yet support region-specific
target options, the MVed loop can be outlined if that is believed to be
profitable.
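
To make 1) concrete, here is a source-level sketch of what trip-count
based versioning could produce (the threshold of 64 is made up; in
practice it would be derived from the value profile):

void foo (int *a, int *b, int n)
{
  if (n >= 64)
    {
      /* Hot version: the trip count is known to be large most of the
         time, so this copy is worth vectorizing/unrolling aggressively.  */
      for (int i = 0; i < n; i++)
        a[i] = b[i] + 1;
    }
  else
    {
      /* Cold version: short trips, keep the plain scalar loop.  */
      for (int i = 0; i < n; i++)
        a[i] = b[i] + 1;
    }
}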

David

On Wed, Oct 2, 2013 at 3:51 PM, Cong Hou <congh@google.com> wrote:
> On Wed, Oct 2, 2013 at 2:18 PM, Xinliang David Li <davidxl@google.com> wrote:
>> On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
>>> On Tue, 1 Oct 2013, Cong Hou wrote:
>>>
>>>> When alias exists between data refs in a loop, to vectorize it GCC
>>>> does loop versioning and adds runtime alias checks. Basically for each
>>>> pair of data refs with possible data dependence, there will be two
>>>> comparisons generated to make sure there is no aliasing between them
>>>> in each iteration of the vectorized loop. If there are many such data
>>>> refs pairs, the number of comparisons can be very large, which is a
>>>> big overhead.
>>>>
>>>> However, in some cases it is possible to reduce the number of those
>>>> comparisons. For example, for the following loop, we can detect that
>>>> b[0] and b[1] are two consecutive member accesses so that we can
>>>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>>>> checking a[0:100]&b[0:2]:
>>>>
>>>> void foo(int*a, int* b)
>>>> {
>>>>    for (int i = 0; i < 100; ++i)
>>>>     a[i] = b[0] + b[1];
>>>> }
>>>>
>>>> Actually, the requirement of consecutive memory accesses is too
>>>> strict. For the following loop, we can still combine the alias checks
>>>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>>>
>>>> void foo(int*a, int* b)
>>>> {
>>>>    for (int i = 0; i < 100; ++i)
>>>>     a[i] = b[0] + b[100];
>>>> }
>>>>
>>>> This is because if b[0] is not in a[0:100] and b[100] is not in
>>>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>>>> to check a[0:100] and b[0:101] don't overlap.
>>>>
>>>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>>>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>>>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>>>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>>>> segment length of a, b1, and b2. Then we can combine the two
>>>> comparisons into one if the following condition is satisfied:
>>>>
>>>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>>>
>>>>
>>>> This patch detects those combination opportunities to reduce the
>>>> number of alias checks. It is tested on an x86-64 machine.
>>>
>>> Apart from the other comments you got (to which I agree) the patch
>>> seems to do two things, namely also:
>>>
>>> +  /* Extract load and store statements on pointers with zero-stride
>>> +     accesses.  */
>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>> +    {
>>>
>>> which I'd rather see in a separate patch (and done also when
>>> the loop doesn't require versioning for alias).
>>
>> yes.
>>
>>>
>>> Also combining the alias checks in vect_create_cond_for_alias_checks
>>> is nice but doesn't properly fix the use of the
>>> vect-max-version-for-alias-checks param
>>
>> Yes. The handling of this should be moved to
>> 'vect_prune_runtime_alias_test_list' to avoid premature decisions.
>>
>>
>>
>>>which currently inhibits
>>> vectorization of the HIMENO benchmark by default (and make us look bad
>>> compared to LLVM).
>>
>> Here is a small reproducible:
>>
>> struct  A {
>>   int *base;
>>   int offset;
>>   int offset2;
>>   int offset3;
>>   int offset4;
>>   int offset5;
>>   int offset6;
>>   int offset7;
>>   int offset8;
>> };
>>
>> void foo (struct A * ar1, struct A* ar2)
>> {
>>       int i;
>>       for (i = 0; i < 10000; i++)
>>         {
>>            ar1->base[i]  = 2*ar2->base[i] + ar2->offset + ar2->offset2
>> + ar2->offset3 + ar2->offset4 + ar2->offset5 + ar2->offset6; /* +
>> ar2->offset7 + ar2->offset8;*/
>>         }
>> }
>>
>> GCC trunk won't vectorize it at O2 due to the limit.
>>
>>
>> There is another problem we should be tracking: GCC no longer
>> vectorize the loop (with large
>> --param=vect-max-version-for-alias-checks=40) when -fno-strict-alias
>> is specified.   However with additional runtime alias check, the loop
>> should be vectorizable.
>
>
> The problem can be reproduced by the following loop:
>
>
> void foo (int* a, int** b)
> {
>   int i;
>   for (i = 0; i < 1000; ++i)
>     a[i] = (*b)[i];
> }
>
>
> When -fno-strict-aliasing is specified, the basic address of (*b)[i]
> which is *b could be modified by a[i] if alias exists between them.
> This forbids GCC from making the basic address of (*b)[i] a loop
> invariant, and hence could not do vectorization. Although we can still
> check the aliasing between a[i] and *b from a and b (b < a or b >=
> a+1000), it must be done before vectorization and after this we have
> to do one more nested aliasing check (a and *b now):
>
>
> void foo (int* a, int** b)
> {
>   int i;
>   if ((int*)b < a || (int*)b >= a+1000)
>   {
>     int* c = *b;
>     for (i = 0; i < 1000; ++i)
>       a[i] = c[i];
>   }
>   else
>   {
>     for (i = 0; i < 1000; ++i)
>       a[i] = (*b)[i];
>   }
> }
>
> I think this is too complicated to be able to get any benefit for us.
>
>
> thanks,
> Cong
>
>
>
>>
>> David
>>
>>
>>>
>>> So I believe this merging should be done incrementally when
>>> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>>>
>>> Thanks for working on this,
>>> Richard.
>>>
>>>>
>>>> thanks,
>>>> Cong
>>>>
>>>>
>>>>
>>>> Index: gcc/tree-vect-loop-manip.c
>>>> ===================================================================
>>>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>>>> +++ gcc/tree-vect-loop-manip.c (working copy)
>>>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>>>  along with GCC; see the file COPYING3.  If not see
>>>>  <http://www.gnu.org/licenses/>.  */
>>>>
>>>> +#include <vector>
>>>> +#include <utility>
>>>> +#include <algorithm>
>>>> +
>>>>  #include "config.h"
>>>>  #include "system.h"
>>>>  #include "coretypes.h"
>>>> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>>>>    return segment_length;
>>>>  }
>>>>
>>>> +namespace
>>>> +{
>>>> +
>>>> +/* struct dr_addr_with_seg_len
>>>> +
>>>> +   A struct storing information of a data reference, including the data
>>>> +   ref itself, its basic address, the access offset and the segment length
>>>> +   for aliasing checks.  */
>>>> +
>>>> +struct dr_addr_with_seg_len
>>>> +{
>>>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>>>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>>>> +
>>>> +  data_reference* dr;
>>>> +  tree basic_addr;
>>>> +  tree offset;
>>>> +  tree seg_len;
>>>> +};
>>>> +
>>>> +/* Operator == between two dr_addr_with_seg_len objects.
>>>> +
>>>> +   This equality operator is used to make sure two data refs
>>>> +   are the same one so that we will consider to combine the
>>>> +   aliasing checks of those two pairs of data dependent data
>>>> +   refs.  */
>>>> +
>>>> +bool operator == (const dr_addr_with_seg_len& d1,
>>>> +  const dr_addr_with_seg_len& d2)
>>>> +{
>>>> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
>>>> + && operand_equal_p (d1.offset, d2.offset, 0)
>>>> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
>>>> +}
>>>> +
>>>> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
>>>> + dr_addr_with_seg_len_pair_t;
>>>> +
>>>> +
>>>> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
>>>> +
>>>> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
>>>> +   so that we can combine aliasing checks during one scan.  */
>>>> +
>>>> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
>>>> + const dr_addr_with_seg_len_pair_t& p2)
>>>> +{
>>>> +  const dr_addr_with_seg_len& p11 = p1.first;
>>>> +  const dr_addr_with_seg_len& p12 = p1.second;
>>>> +  const dr_addr_with_seg_len& p21 = p2.first;
>>>> +  const dr_addr_with_seg_len& p22 = p2.second;
>>>> +
>>>> +  if (p11.basic_addr != p21.basic_addr)
>>>> +    return p11.basic_addr < p21.basic_addr;
>>>> +  if (p12.basic_addr != p22.basic_addr)
>>>> +    return p12.basic_addr < p22.basic_addr;
>>>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>>>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>>>> +    return p11.offset < p21.offset;
>>>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>>>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>>>> +  if (TREE_CODE (p12.offset) != INTEGER_CST
>>>> +      || TREE_CODE (p22.offset) != INTEGER_CST)
>>>> +    return p12.offset < p22.offset;
>>>> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
>>>> +}
>>>> +
>>>> +}
>>>>
>>>>  /* Function vect_create_cond_for_alias_checks.
>>>>
>>>> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>>>>    if (may_alias_ddrs.is_empty ())
>>>>      return;
>>>>
>>>> +
>>>> +  /* Basically, for each pair of dependent data refs store_ptr_0
>>>> +     and load_ptr_0, we create an expression:
>>>> +
>>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>>> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
>>>> +
>>>> +     for aliasing checks. However, in some cases we can decrease
>>>> +     the number of checks by combining two checks into one. For
>>>> +     example, suppose we have another pair of data refs store_ptr_0
>>>> +     and load_ptr_1, and if the following condition is satisfied:
>>>> +
>>>> +     load_ptr_0 < load_ptr_1  &&
>>>> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
>>>> +
>>>> +     (this condition means, in each iteration of vectorized loop,
>>>> +     the accessed memory of store_ptr_0 cannot be between the memory
>>>> +     of load_ptr_0 and load_ptr_1.)
>>>> +
>>>> +     we then can use only the following expression to finish the
>>>> +     aliasing checks between store_ptr_0 & load_ptr_0 and
>>>> +     store_ptr_0 & load_ptr_1:
>>>> +
>>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>>> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
>>>> +
>>>> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
>>>> +     same basic address.  */
>>>> +
>>>> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
>>>> +
>>>> +  /* First, we collect all data ref pairs for aliasing checks.  */
>>>> +
>>>>    FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>>>>      {
>>>>        struct data_reference *dr_a, *dr_b;
>>>>        gimple dr_group_first_a, dr_group_first_b;
>>>> -      tree addr_base_a, addr_base_b;
>>>>        tree segment_length_a, segment_length_b;
>>>>        gimple stmt_a, stmt_b;
>>>> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
>>>>
>>>>        dr_a = DDR_A (ddr);
>>>>        stmt_a = DR_STMT (DDR_A (ddr));
>>>>        dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>>>>        if (dr_group_first_a)
>>>> -        {
>>>> + {
>>>>    stmt_a = dr_group_first_a;
>>>>    dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>>>>   }
>>>> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>>>>        stmt_b = DR_STMT (DDR_B (ddr));
>>>>        dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>>>>        if (dr_group_first_b)
>>>> -        {
>>>> + {
>>>>    stmt_b = dr_group_first_b;
>>>>    dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>>>>   }
>>>>
>>>> -      addr_base_a
>>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
>>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
>>>> -       DR_INIT (dr_a)));
>>>> -      addr_base_b
>>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
>>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
>>>> -       DR_INIT (dr_b)));
>>>> -
>>>>        if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>>>>   length_factor = scalar_loop_iters;
>>>>        else
>>>> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>>>>        segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>>>>        segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
>>>>
>>>> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
>>>> +  (dr_addr_with_seg_len
>>>> +       (dr_a, DR_BASE_ADDRESS (dr_a),
>>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
>>>> + segment_length_a),
>>>> +   dr_addr_with_seg_len
>>>> +       (dr_b, DR_BASE_ADDRESS (dr_b),
>>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
>>>> + segment_length_b));
>>>> +
>>>> +      if (dr_with_seg_len_pair.first.basic_addr >
>>>> +  dr_with_seg_len_pair.second.basic_addr)
>>>> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
>>>> +
>>>> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
>>>> +    }
>>>> +
>>>> +  /* Second, we sort the collected data ref pairs so that we can scan
>>>> +     them once to combine all possible aliasing checks.  */
>>>> +
>>>> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
>>>> +
>>>> +  /* Remove duplicate data ref pairs.  */
>>>> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
>>>> + ddrs_with_seg_len.end()),
>>>> +   ddrs_with_seg_len.end());
>>>> +
>>>> +  /* We then scan the sorted dr pairs and check if we can combine
>>>> +     alias checks of two neighbouring dr pairs.  */
>>>> +
>>>> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
>>>> +    {
>>>> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
>>>> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
>>>> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
>>>> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
>>>> +
>>>> +      if (dr_a1 == dr_a2)
>>>> + {
>>>> +  if (dr_b1.basic_addr != dr_b2.basic_addr
>>>> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
>>>> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
>>>> +    continue;
>>>> +
>>>> +  int diff = int_cst_value (dr_b2.offset) -
>>>> +     int_cst_value (dr_b1.offset);
>>>> +
>>>> +  gcc_assert (diff > 0);
>>>> +
>>>> +  if (diff <= vect_factor
>>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
>>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_b1.seg_len) <
>>>> +     int_cst_value (dr_a1.seg_len)))
>>>> +    {
>>>> +      if (dump_enabled_p ())
>>>> + {
>>>> +  dump_printf_loc
>>>> +      (MSG_NOTE, vect_location,
>>>> +       "combining two runtime checks for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
>>>> +  dump_printf (MSG_NOTE, " and ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>> + }
>>>> +
>>>> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
>>>> +  dr_b2.seg_len, size_int (diff));
>>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>>> +      --i;
>>>> +    }
>>>> + }
>>>> +      else if (dr_b1 == dr_b2)
>>>> + {
>>>> +  if (dr_a1.basic_addr != dr_a2.basic_addr
>>>> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
>>>> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
>>>> +    continue;
>>>> +
>>>> +  int diff = int_cst_value (dr_a2.offset) -
>>>> +     int_cst_value (dr_a1.offset);
>>>> +
>>>> +  gcc_assert (diff > 0);
>>>> +
>>>> +  if (diff <= vect_factor
>>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
>>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_a1.seg_len) <
>>>> +     int_cst_value (dr_b1.seg_len)))
>>>> +    {
>>>> +      if (dump_enabled_p ())
>>>> + {
>>>> +  dump_printf_loc
>>>> +      (MSG_NOTE, vect_location,
>>>> +       "combining two runtime checks for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
>>>> +  dump_printf (MSG_NOTE, " and ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>> + }
>>>> +
>>>> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
>>>> +  dr_a2.seg_len, size_int (diff));
>>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>>> +      --i;
>>>> +    }
>>>> + }
>>>> +    }
>>>> +
>>>> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
>>>> +    {
>>>> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
>>>> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
>>>> +      tree segment_length_a = dr_a.seg_len;
>>>> +      tree segment_length_b = dr_b.seg_len;
>>>> +
>>>> +      tree addr_base_a
>>>> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
>>>> +      tree addr_base_b
>>>> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
>>>> +
>>>>        if (dump_enabled_p ())
>>>>   {
>>>>    dump_printf_loc (MSG_NOTE, vect_location,
>>>> -                           "create runtime check for data references ");
>>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
>>>> +   "create runtime check for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>>>>    dump_printf (MSG_NOTE, " and ");
>>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
>>>> -          dump_printf (MSG_NOTE, "\n");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>>   }
>>>>
>>>> -      seg_a_min = addr_base_a;
>>>> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>>> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
>>>> +      tree seg_a_min = addr_base_a;
>>>> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>>> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>>>>   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
>>>>
>>>> -      seg_b_min = addr_base_b;
>>>> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>>> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
>>>> +      tree seg_b_min = addr_base_b;
>>>> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>>> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>>>>   seg_b_min = seg_b_max, seg_b_max = addr_base_b;
>>>>
>>>>        part_cond_expr =
>>>> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>>>>        adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>>>>      }
>>>>
>>>> +  /* Extract load and store statements on pointers with zero-stride
>>>> +     accesses.  */
>>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>>> +    {
>>>> +
>>>> +      /* In the loop body, we iterate each statement to check if it is a load
>>>> + or store. Then we check the DR_STEP of the data reference.  If
>>>> + DR_STEP is zero, then we will hoist the load statement to the loop
>>>> + preheader, and move the store statement to the loop exit.  */
>>>> +
>>>> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
>>>> +   !gsi_end_p (si); )
>>>> + {
>>>> +  gimple stmt = gsi_stmt (si);
>>>> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>>>> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
>>>> +
>>>> +
>>>> +  if (dr && integer_zerop (DR_STEP (dr)))
>>>> +    {
>>>> +      if (DR_IS_READ (dr))
>>>> + {
>>>> +  if (dump_file)
>>>> +    {
>>>> +      fprintf (dump_file,
>>>> +       "Hoist the load to outside of the loop:\n");
>>>> +      print_gimple_stmt (dump_file, stmt, 0,
>>>> + TDF_VOPS|TDF_MEMSYMS);
>>>> +    }
>>>> +
>>>> +  basic_block preheader = loop_preheader_edge (loop)->src;
>>>> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>>> +  gsi_move_after (&si, &si_dst);
>>>> + }
>>>> +      else
>>>> + {
>>>> +  gimple_stmt_iterator si_dst =
>>>> +      gsi_last_bb (single_exit (loop)->dest);
>>>> +  gsi_move_after (&si, &si_dst);
>>>> + }
>>>> +              continue;
>>>> +    }
>>>> +  else if (!dr)
>>>> +          {
>>>> +            bool hoist = true;
>>>> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
>>>> +            {
>>>> +              tree op = gimple_op (stmt, i);
>>>> +              if (TREE_CODE (op) == INTEGER_CST
>>>> +                  || TREE_CODE (op) == REAL_CST)
>>>> +                continue;
>>>> +              if (TREE_CODE (op) == SSA_NAME)
>>>> +              {
>>>> +                gimple def = SSA_NAME_DEF_STMT (op);
>>>> +                if (def == stmt
>>>> +                    || gimple_nop_p (def)
>>>> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
>>>> +                  continue;
>>>> +              }
>>>> +              hoist = false;
>>>> +              break;
>>>> +            }
>>>> +
>>>> +            if (hoist)
>>>> +            {
>>>> +              basic_block preheader = loop_preheader_edge (loop)->src;
>>>> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>>> +              gsi_move_after (&si, &si_dst);
>>>> +              continue;
>>>> +            }
>>>> +          }
>>>> +          gsi_next (&si);
>>>> + }
>>>> +    }
>>>> +
>>>>    /* End loop-exit-fixes after versioning.  */
>>>>
>>>>    if (cond_expr_stmt_list)
>>>> Index: gcc/ChangeLog
>>>> ===================================================================
>>>> --- gcc/ChangeLog (revision 202663)
>>>> +++ gcc/ChangeLog (working copy)
>>>> @@ -1,3 +1,8 @@
>>>> +2013-10-01  Cong Hou  <congh@google.com>
>>>> +
>>>> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
>>>> + alias checks if it is possible to amortize the runtime overhead.
>>>> +
>>>>
>>>>
>>>
>>> --
>>> Richard Biener <rguenther@suse.de>
>>> SUSE / SUSE Labs
>>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>>> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
Xinliang David Li Oct. 3, 2013, 3:21 p.m. UTC | #14
On Wed, Oct 2, 2013 at 5:30 PM, Cong Hou <congh@google.com> wrote:
> On Wed, Oct 2, 2013 at 2:47 PM, Xinliang David Li <davidxl@google.com> wrote:
>> I think you need to augment (using a wrapper class) the DDR to capture
>> more information about aliased memory pairs. It should be flexible
>> enough to handle the following cases (you don't have to handle all
>> cases in your first patch, but keep those in mind).
>
>
> In order to bring the information in this augmented structure from the
> analysis phase to the transformation phase, should we add one more member
> to loop_vec_info? Note that currently almost all vectorization-related
> information is contained in that struct.
>
>
>>
>> 1) All accesses in the same group have constant offsets:
>>
>>     b[i], b[i+1], b[i+2] etc
>
> This is the easy case.
>
>>
>> 2) Accesses in the same group may have offset which is specified by a
>> unsigned value:
>>
>>    unsigned N = ...
>>
>>    b[i], b[i+N]
>
> If the value of N or its upper bound (see the next case) is unknown at
> compile time, we cannot merge the alias checks for a & b[i] and a &
> b[i+N]. This is because the segment of a may lie between b[i] and
> b[i+N].

It can be done, but more conservatively (one should also consider
possible overflow and wrap-around). More generally,
for a group: a[i + c_1*N + o_1], a[i + c_2*N + o_2] ...

where N is an unsigned runtime constant and c_1, c_2, ..., o_1, o_2, ...
are constant values,

you can determine the address range that covers all accesses across all
iterations.

Generate the range check between the write group and the read group based
on the above range.
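
As a rough source-level illustration (not the generated GIMPLE; the
function and variable names below are made up), the single combined check
for such a read group against the write group could look like:

/* Reads are b[i + c_k*N + o_k] for 0 <= i < niters.  With
   lo = min over k of (c_k*N + o_k) and hi = max over k of (c_k*N + o_k),
   and assuming the offsets do not overflow, the reads cover
   [b + lo, b + hi + niters) and the writes a[i] cover [a, a + niters),
   so one disjointness test is enough.  */
static int
group_alias_check (int *a, int *b, long niters, long lo, long hi)
{
  return a + niters <= b + lo || b + hi + niters <= a;
}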

>
>>
>> 3) Accesses have offset with value range > 0:
>>
>>    for (j = 0; j < 10000; j++)
>>            for (i = 0; i < ...; i++)
>>              {
>>                   .... b[i] ....
>>                   .... b[i + j ] ....               // j > 0
>>               }
>>
>
> If we know j is greater than 0 and has a constant upper bound, we can
> utilize this information during alias checks merging. For an induction
> variable j, its upper bound can be queried easily. What if j is not an
> induction variable:
>
> unsigned j = ...;
> if (j < 1000)
> {
>  for (i = 0; i < ...; i++)
>   {
>     .... b[i] ....
>     .... b[i + j ] ....
>   }
> }
>
> In the current GCC implementation, how do we get the upper bound of j here?
> Should we search the control-dependent predicate of the loop to see
> whether we are lucky enough to get the upper bound of j?
>

get_range_info (..).
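
Something along these lines (just a sketch -- I am assuming the
tree-ssanames.h interface that returns double_int bounds, so the exact
spelling may need adjusting):

  double_int minv, maxv;
  if (TREE_CODE (j) == SSA_NAME
      && get_range_info (j, &minv, &maxv) == VR_RANGE)
    {
      /* maxv is a compile-time upper bound on j that the alias-check
         merging could rely on.  */
    }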

David


>
>>
>> 4) base addresses are assigned from the same buffer:
>>
>>     b1  = &buffer[0];
>>     b2 = &buffer[10000];
>>     b3 = &buffer[20000];
>>
>> for (...)
>>   {
>>          ..b1[i]..
>>          ..b2[i]..
>>          ..
>>    }
>
> This case helped to find a bug in my patch. Here the basic address of
> b1 is an addr_expr &buffer instead of buffer. I should not compare the
> pointer values of two basic addresses any more but should use
> operand_equal_p(). Then Jakub is right: I should not sort all ddr
> pairs by comparing pointer values. I once wrote a comparison function
> and will consider using that for sorting.
>
>
>>
>> 5) More elaborate case:
>>
>>    for (i = 0; i< 3; i++)
>>       base[i] = &buffer[i*N];
>>
>>  b1 = base[0];
>>  b2 = base[1];
>>  ...
>> for ()
>> {
>>    .. b1[i]..
>>     ..
>> }
>
> After loop unrolling this case becomes the same as the last one.
>
>
> thanks,
> Cong
>
>
>>
>> David
>>
>>
>> On Wed, Oct 2, 2013 at 2:34 PM, Cong Hou <congh@google.com> wrote:
>>> On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
>>>> On Tue, 1 Oct 2013, Cong Hou wrote:
>>>>
>>>>> When alias exists between data refs in a loop, to vectorize it GCC
>>>>> does loop versioning and adds runtime alias checks. Basically for each
>>>>> pair of data refs with possible data dependence, there will be two
>>>>> comparisons generated to make sure there is no aliasing between them
>>>>> in each iteration of the vectorized loop. If there are many such data
>>>>> refs pairs, the number of comparisons can be very large, which is a
>>>>> big overhead.
>>>>>
>>>>> However, in some cases it is possible to reduce the number of those
>>>>> comparisons. For example, for the following loop, we can detect that
>>>>> b[0] and b[1] are two consecutive member accesses so that we can
>>>>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>>>>> checking a[0:100]&b[0:2]:
>>>>>
>>>>> void foo(int*a, int* b)
>>>>> {
>>>>>    for (int i = 0; i < 100; ++i)
>>>>>     a[i] = b[0] + b[1];
>>>>> }
>>>>>
>>>>> Actually, the requirement of consecutive memory accesses is too
>>>>> strict. For the following loop, we can still combine the alias checks
>>>>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>>>>
>>>>> void foo(int*a, int* b)
>>>>> {
>>>>>    for (int i = 0; i < 100; ++i)
>>>>>     a[i] = b[0] + b[100];
>>>>> }
>>>>>
>>>>> This is because if b[0] is not in a[0:100] and b[100] is not in
>>>>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>>>>> to check a[0:100] and b[0:101] don't overlap.
>>>>>
>>>>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>>>>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>>>>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>>>>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>>>>> segment length of a, b1, and b2. Then we can combine the two
>>>>> comparisons into one if the following condition is satisfied:
>>>>>
>>>>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>>>>
>>>>>
>>>>> This patch detects those combination opportunities to reduce the
>>>>> number of alias checks. It is tested on an x86-64 machine.
>>>>
>>>> Apart from the other comments you got (to which I agree) the patch
>>>> seems to do two things, namely also:
>>>>
>>>> +  /* Extract load and store statements on pointers with zero-stride
>>>> +     accesses.  */
>>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>>> +    {
>>>>
>>>> which I'd rather see in a separate patch (and done also when
>>>> the loop doesn't require versioning for alias).
>>>>
>>>
>>>
>>> My mistake.. I am working on those two patches at the same time and
>>> pasted that one also here by mistake. I will send another patch about
>>> the "hoist" topic.
>>>
>>>
>>>> Also combining the alias checks in vect_create_cond_for_alias_checks
>>>> is nice but doesn't properly fix the use of the
>>>> vect-max-version-for-alias-checks param which currently inhibits
>>>> vectorization of the HIMENO benchmark by default (and make us look bad
>>>> compared to LLVM).
>>>>
>>>> So I believe this merging should be done incrementally when
>>>> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>>>>
>>>
>>>
>>> I agree that vect-max-version-for-alias-checks param should count the
>>> number of checks after the merge. However, the struct
>>> data_dependence_relation could not record the new information produced
>>> by the merge. The new information I mentioned contains the new segment
>>> length for comparisons. This length is calculated right in
>>> vect_create_cond_for_alias_checks() function. Since
>>> vect-max-version-for-alias-checks is used during analysis phase, shall
>>> we move all those (get segment length for each data ref and merge
>>> alias checks) from transformation to analysis phase? If we cannot
>>> store the result properly (data_dependence_relation is not enough),
>>> shall we do it twice in both phases?
>>>
>>> I also noticed a possible bug in the function vect_same_range_drs()
>>> called by vect_prune_runtime_alias_test_list(). For the following code
>>> I get two pairs of data refs after
>>> vect_prune_runtime_alias_test_list(), but in
>>> vect_create_cond_for_alias_checks() after detecting grouped accesses I
>>> got two identical pairs of data refs. The consequence is two identical
>>> alias checks are produced.
>>>
>>>
>>> void yuv2yuyv_ref (int *d, int *src, int n)
>>> {
>>>   char *dest = (char *)d;
>>>   int i;
>>>
>>>   for(i=0;i<n/2;i++){
>>>     dest[i*4 + 0] = (src[i*2 + 0])>>16;
>>>     dest[i*4 + 1] = (src[i*2 + 1])>>8;
>>>     dest[i*4 + 2] = (src[i*2 + 0])>>16;
>>>     dest[i*4 + 3] = (src[i*2 + 0])>>0;
>>>   }
>>> }
>>>
>>>
>>> I think the solution to this problem is changing
>>>
>>> GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
>>> == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)
>>>
>>> into
>>>
>>> STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
>>> (vinfo_for_stmt (stmt_i))))
>>> == STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
>>> (vinfo_for_stmt (stmt_j)))
>>>
>>>
>>> in function vect_same_range_drs(). What do you think about it?
>>>
>>>
>>> thanks,
>>> Cong
>>>
>>>
>>>
>>>> Thanks for working on this,
>>>> Richard.
>>>>
>>>>>
>>>>> thanks,
>>>>> Cong
>>>>>
>>>>>
>>>>>
>>>>> Index: gcc/tree-vect-loop-manip.c
>>>>> ===================================================================
>>>>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>>>>> +++ gcc/tree-vect-loop-manip.c (working copy)
>>>>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>>>>  along with GCC; see the file COPYING3.  If not see
>>>>>  <http://www.gnu.org/licenses/>.  */
>>>>>
>>>>> +#include <vector>
>>>>> +#include <utility>
>>>>> +#include <algorithm>
>>>>> +
>>>>>  #include "config.h"
>>>>>  #include "system.h"
>>>>>  #include "coretypes.h"
>>>>> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>>>>>    return segment_length;
>>>>>  }
>>>>>
>>>>> +namespace
>>>>> +{
>>>>> +
>>>>> +/* struct dr_addr_with_seg_len
>>>>> +
>>>>> +   A struct storing information of a data reference, including the data
>>>>> +   ref itself, its basic address, the access offset and the segment length
>>>>> +   for aliasing checks.  */
>>>>> +
>>>>> +struct dr_addr_with_seg_len
>>>>> +{
>>>>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>>>>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>>>>> +
>>>>> +  data_reference* dr;
>>>>> +  tree basic_addr;
>>>>> +  tree offset;
>>>>> +  tree seg_len;
>>>>> +};
>>>>> +
>>>>> +/* Operator == between two dr_addr_with_seg_len objects.
>>>>> +
>>>>> +   This equality operator is used to make sure two data refs
>>>>> +   are the same one so that we will consider to combine the
>>>>> +   aliasing checks of those two pairs of data dependent data
>>>>> +   refs.  */
>>>>> +
>>>>> +bool operator == (const dr_addr_with_seg_len& d1,
>>>>> +  const dr_addr_with_seg_len& d2)
>>>>> +{
>>>>> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
>>>>> + && operand_equal_p (d1.offset, d2.offset, 0)
>>>>> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
>>>>> +}
>>>>> +
>>>>> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
>>>>> + dr_addr_with_seg_len_pair_t;
>>>>> +
>>>>> +
>>>>> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
>>>>> +
>>>>> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
>>>>> +   so that we can combine aliasing checks during one scan.  */
>>>>> +
>>>>> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
>>>>> + const dr_addr_with_seg_len_pair_t& p2)
>>>>> +{
>>>>> +  const dr_addr_with_seg_len& p11 = p1.first;
>>>>> +  const dr_addr_with_seg_len& p12 = p1.second;
>>>>> +  const dr_addr_with_seg_len& p21 = p2.first;
>>>>> +  const dr_addr_with_seg_len& p22 = p2.second;
>>>>> +
>>>>> +  if (p11.basic_addr != p21.basic_addr)
>>>>> +    return p11.basic_addr < p21.basic_addr;
>>>>> +  if (p12.basic_addr != p22.basic_addr)
>>>>> +    return p12.basic_addr < p22.basic_addr;
>>>>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>>>>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>>>>> +    return p11.offset < p21.offset;
>>>>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>>>>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>>>>> +  if (TREE_CODE (p12.offset) != INTEGER_CST
>>>>> +      || TREE_CODE (p22.offset) != INTEGER_CST)
>>>>> +    return p12.offset < p22.offset;
>>>>> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
>>>>> +}
>>>>> +
>>>>> +}
>>>>>
>>>>>  /* Function vect_create_cond_for_alias_checks.
>>>>>
>>>>> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>>>>>    if (may_alias_ddrs.is_empty ())
>>>>>      return;
>>>>>
>>>>> +
>>>>> +  /* Basically, for each pair of dependent data refs store_ptr_0
>>>>> +     and load_ptr_0, we create an expression:
>>>>> +
>>>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>>>> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
>>>>> +
>>>>> +     for aliasing checks. However, in some cases we can decrease
>>>>> +     the number of checks by combining two checks into one. For
>>>>> +     example, suppose we have another pair of data refs store_ptr_0
>>>>> +     and load_ptr_1, and if the following condition is satisfied:
>>>>> +
>>>>> +     load_ptr_0 < load_ptr_1  &&
>>>>> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
>>>>> +
>>>>> +     (this condition means, in each iteration of vectorized loop,
>>>>> +     the accessed memory of store_ptr_0 cannot be between the memory
>>>>> +     of load_ptr_0 and load_ptr_1.)
>>>>> +
>>>>> +     we then can use only the following expression to finish the
>>>>> +     aliasing checks between store_ptr_0 & load_ptr_0 and
>>>>> +     store_ptr_0 & load_ptr_1:
>>>>> +
>>>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>>>> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
>>>>> +
>>>>> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
>>>>> +     same basic address.  */
>>>>> +
>>>>> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
>>>>> +
>>>>> +  /* First, we collect all data ref pairs for aliasing checks.  */
>>>>> +
>>>>>    FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>>>>>      {
>>>>>        struct data_reference *dr_a, *dr_b;
>>>>>        gimple dr_group_first_a, dr_group_first_b;
>>>>> -      tree addr_base_a, addr_base_b;
>>>>>        tree segment_length_a, segment_length_b;
>>>>>        gimple stmt_a, stmt_b;
>>>>> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
>>>>>
>>>>>        dr_a = DDR_A (ddr);
>>>>>        stmt_a = DR_STMT (DDR_A (ddr));
>>>>>        dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>>>>>        if (dr_group_first_a)
>>>>> -        {
>>>>> + {
>>>>>    stmt_a = dr_group_first_a;
>>>>>    dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>>>>>   }
>>>>> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>>>>>        stmt_b = DR_STMT (DDR_B (ddr));
>>>>>        dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>>>>>        if (dr_group_first_b)
>>>>> -        {
>>>>> + {
>>>>>    stmt_b = dr_group_first_b;
>>>>>    dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>>>>>   }
>>>>>
>>>>> -      addr_base_a
>>>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
>>>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
>>>>> -       DR_INIT (dr_a)));
>>>>> -      addr_base_b
>>>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
>>>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
>>>>> -       DR_INIT (dr_b)));
>>>>> -
>>>>>        if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>>>>>   length_factor = scalar_loop_iters;
>>>>>        else
>>>>> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>>>>>        segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>>>>>        segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
>>>>>
>>>>> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
>>>>> +  (dr_addr_with_seg_len
>>>>> +       (dr_a, DR_BASE_ADDRESS (dr_a),
>>>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
>>>>> + segment_length_a),
>>>>> +   dr_addr_with_seg_len
>>>>> +       (dr_b, DR_BASE_ADDRESS (dr_b),
>>>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
>>>>> + segment_length_b));
>>>>> +
>>>>> +      if (dr_with_seg_len_pair.first.basic_addr >
>>>>> +  dr_with_seg_len_pair.second.basic_addr)
>>>>> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
>>>>> +
>>>>> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
>>>>> +    }
>>>>> +
>>>>> +  /* Second, we sort the collected data ref pairs so that we can scan
>>>>> +     them once to combine all possible aliasing checks.  */
>>>>> +
>>>>> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
>>>>> +
>>>>> +  /* Remove duplicate data ref pairs.  */
>>>>> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
>>>>> + ddrs_with_seg_len.end()),
>>>>> +   ddrs_with_seg_len.end());
>>>>> +
>>>>> +  /* We then scan the sorted dr pairs and check if we can combine
>>>>> +     alias checks of two neighbouring dr pairs.  */
>>>>> +
>>>>> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
>>>>> +    {
>>>>> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
>>>>> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
>>>>> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
>>>>> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
>>>>> +
>>>>> +      if (dr_a1 == dr_a2)
>>>>> + {
>>>>> +  if (dr_b1.basic_addr != dr_b2.basic_addr
>>>>> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
>>>>> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
>>>>> +    continue;
>>>>> +
>>>>> +  int diff = int_cst_value (dr_b2.offset) -
>>>>> +     int_cst_value (dr_b1.offset);
>>>>> +
>>>>> +  gcc_assert (diff > 0);
>>>>> +
>>>>> +  if (diff <= vect_factor
>>>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>>> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
>>>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>>> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>>> +  && diff - int_cst_value (dr_b1.seg_len) <
>>>>> +     int_cst_value (dr_a1.seg_len)))
>>>>> +    {
>>>>> +      if (dump_enabled_p ())
>>>>> + {
>>>>> +  dump_printf_loc
>>>>> +      (MSG_NOTE, vect_location,
>>>>> +       "combining two runtime checks for data references ");
>>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
>>>>> +  dump_printf (MSG_NOTE, " and ");
>>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
>>>>> +  dump_printf (MSG_NOTE, "\n");
>>>>> + }
>>>>> +
>>>>> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
>>>>> +  dr_b2.seg_len, size_int (diff));
>>>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>>>> +      --i;
>>>>> +    }
>>>>> + }
>>>>> +      else if (dr_b1 == dr_b2)
>>>>> + {
>>>>> +  if (dr_a1.basic_addr != dr_a2.basic_addr
>>>>> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
>>>>> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
>>>>> +    continue;
>>>>> +
>>>>> +  int diff = int_cst_value (dr_a2.offset) -
>>>>> +     int_cst_value (dr_a1.offset);
>>>>> +
>>>>> +  gcc_assert (diff > 0);
>>>>> +
>>>>> +  if (diff <= vect_factor
>>>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>>> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
>>>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>>> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>>> +  && diff - int_cst_value (dr_a1.seg_len) <
>>>>> +     int_cst_value (dr_b1.seg_len)))
>>>>> +    {
>>>>> +      if (dump_enabled_p ())
>>>>> + {
>>>>> +  dump_printf_loc
>>>>> +      (MSG_NOTE, vect_location,
>>>>> +       "combining two runtime checks for data references ");
>>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
>>>>> +  dump_printf (MSG_NOTE, " and ");
>>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
>>>>> +  dump_printf (MSG_NOTE, "\n");
>>>>> + }
>>>>> +
>>>>> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
>>>>> +  dr_a2.seg_len, size_int (diff));
>>>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>>>> +      --i;
>>>>> +    }
>>>>> + }
>>>>> +    }
>>>>> +
>>>>> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
>>>>> +    {
>>>>> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
>>>>> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
>>>>> +      tree segment_length_a = dr_a.seg_len;
>>>>> +      tree segment_length_b = dr_b.seg_len;
>>>>> +
>>>>> +      tree addr_base_a
>>>>> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
>>>>> +      tree addr_base_b
>>>>> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
>>>>> +
>>>>>        if (dump_enabled_p ())
>>>>>   {
>>>>>    dump_printf_loc (MSG_NOTE, vect_location,
>>>>> -                           "create runtime check for data references ");
>>>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
>>>>> +   "create runtime check for data references ");
>>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>>>>>    dump_printf (MSG_NOTE, " and ");
>>>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
>>>>> -          dump_printf (MSG_NOTE, "\n");
>>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
>>>>> +  dump_printf (MSG_NOTE, "\n");
>>>>>   }
>>>>>
>>>>> -      seg_a_min = addr_base_a;
>>>>> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>>>> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
>>>>> +      tree seg_a_min = addr_base_a;
>>>>> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>>>> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>>>>>   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
>>>>>
>>>>> -      seg_b_min = addr_base_b;
>>>>> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>>>> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
>>>>> +      tree seg_b_min = addr_base_b;
>>>>> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>>>> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>>>>>   seg_b_min = seg_b_max, seg_b_max = addr_base_b;
>>>>>
>>>>>        part_cond_expr =
>>>>> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>>>>>        adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>>>>>      }
>>>>>
>>>>> +  /* Extract load and store statements on pointers with zero-stride
>>>>> +     accesses.  */
>>>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>>>> +    {
>>>>> +
>>>>> +      /* In the loop body, we iterate each statement to check if it is a load
>>>>> + or store. Then we check the DR_STEP of the data reference.  If
>>>>> + DR_STEP is zero, then we will hoist the load statement to the loop
>>>>> + preheader, and move the store statement to the loop exit.  */
>>>>> +
>>>>> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
>>>>> +   !gsi_end_p (si); )
>>>>> + {
>>>>> +  gimple stmt = gsi_stmt (si);
>>>>> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>>>>> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
>>>>> +
>>>>> +
>>>>> +  if (dr && integer_zerop (DR_STEP (dr)))
>>>>> +    {
>>>>> +      if (DR_IS_READ (dr))
>>>>> + {
>>>>> +  if (dump_file)
>>>>> +    {
>>>>> +      fprintf (dump_file,
>>>>> +       "Hoist the load to outside of the loop:\n");
>>>>> +      print_gimple_stmt (dump_file, stmt, 0,
>>>>> + TDF_VOPS|TDF_MEMSYMS);
>>>>> +    }
>>>>> +
>>>>> +  basic_block preheader = loop_preheader_edge (loop)->src;
>>>>> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>>>> +  gsi_move_after (&si, &si_dst);
>>>>> + }
>>>>> +      else
>>>>> + {
>>>>> +  gimple_stmt_iterator si_dst =
>>>>> +      gsi_last_bb (single_exit (loop)->dest);
>>>>> +  gsi_move_after (&si, &si_dst);
>>>>> + }
>>>>> +              continue;
>>>>> +    }
>>>>> +  else if (!dr)
>>>>> +          {
>>>>> +            bool hoist = true;
>>>>> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
>>>>> +            {
>>>>> +              tree op = gimple_op (stmt, i);
>>>>> +              if (TREE_CODE (op) == INTEGER_CST
>>>>> +                  || TREE_CODE (op) == REAL_CST)
>>>>> +                continue;
>>>>> +              if (TREE_CODE (op) == SSA_NAME)
>>>>> +              {
>>>>> +                gimple def = SSA_NAME_DEF_STMT (op);
>>>>> +                if (def == stmt
>>>>> +                    || gimple_nop_p (def)
>>>>> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
>>>>> +                  continue;
>>>>> +              }
>>>>> +              hoist = false;
>>>>> +              break;
>>>>> +            }
>>>>> +
>>>>> +            if (hoist)
>>>>> +            {
>>>>> +              basic_block preheader = loop_preheader_edge (loop)->src;
>>>>> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>>>> +              gsi_move_after (&si, &si_dst);
>>>>> +              continue;
>>>>> +            }
>>>>> +          }
>>>>> +          gsi_next (&si);
>>>>> + }
>>>>> +    }
>>>>> +
>>>>>    /* End loop-exit-fixes after versioning.  */
>>>>>
>>>>>    if (cond_expr_stmt_list)
>>>>> Index: gcc/ChangeLog
>>>>> ===================================================================
>>>>> --- gcc/ChangeLog (revision 202663)
>>>>> +++ gcc/ChangeLog (working copy)
>>>>> @@ -1,3 +1,8 @@
>>>>> +2013-10-01  Cong Hou  <congh@google.com>
>>>>> +
>>>>> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
>>>>> + alias checks if it is possible to amortize the runtime overhead.
>>>>> +
>>>>>
>>>>>
>>>>
>>>> --
>>>> Richard Biener <rguenther@suse.de>
>>>> SUSE / SUSE Labs
>>>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>>>> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
Cong Hou Oct. 3, 2013, 5:57 p.m. UTC | #15
I noticed that there is a "struct dataref_aux" defined in
tree-vectorizer.h which is specific to the vectorizer pass and is
stored in (void*)aux in "struct data_reference". Can we add one more
field "segment_length" to dataref_aux so that we can pass this
information for merging alias checks? Then we can avoid modifying or
creating other structures.
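
Roughly like this (just a sketch; the existing members are elided and
left unchanged):

/* tree-vectorizer.h: keep the segment length computed during the alias
   check analysis in the vectorizer-private aux data hanging off dr->aux,
   so the transform phase can reuse it.  */
struct dataref_aux
{
  /* ... existing members ... */
  tree segment_length;   /* new: segment length used for alias checks */
};

/* The analysis phase would store it, e.g.
     ((dataref_aux *) dr->aux)->segment_length = segment_length_a;
   and vect_create_cond_for_alias_checks would read it back.  */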


thanks,
Cong


On Wed, Oct 2, 2013 at 2:34 PM, Cong Hou <congh@google.com> wrote:
> On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
>> On Tue, 1 Oct 2013, Cong Hou wrote:
>>
>>> When alias exists between data refs in a loop, to vectorize it GCC
>>> does loop versioning and adds runtime alias checks. Basically for each
>>> pair of data refs with possible data dependence, there will be two
>>> comparisons generated to make sure there is no aliasing between them
>>> in each iteration of the vectorized loop. If there are many such data
>>> refs pairs, the number of comparisons can be very large, which is a
>>> big overhead.
>>>
>>> However, in some cases it is possible to reduce the number of those
>>> comparisons. For example, for the following loop, we can detect that
>>> b[0] and b[1] are two consecutive member accesses so that we can
>>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>>> checking a[0:100]&b[0:2]:
>>>
>>> void foo(int*a, int* b)
>>> {
>>>    for (int i = 0; i < 100; ++i)
>>>     a[i] = b[0] + b[1];
>>> }
>>>
>>> Actually, the requirement of consecutive memory accesses is too
>>> strict. For the following loop, we can still combine the alias checks
>>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>>
>>> void foo(int*a, int* b)
>>> {
>>>    for (int i = 0; i < 100; ++i)
>>>     a[i] = b[0] + b[100];
>>> }
>>>
>>> This is because if b[0] is not in a[0:100] and b[100] is not in
>>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>>> to check a[0:100] and b[0:101] don't overlap.
>>>
>>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>>> segment length of a, b1, and b2. Then we can combine the two
>>> comparisons into one if the following condition is satisfied:
>>>
>>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>>
>>>
>>> This patch detects those combination opportunities to reduce the
>>> number of alias checks. It is tested on an x86-64 machine.
>>
>> Apart from the other comments you got (to which I agree) the patch
>> seems to do two things, namely also:
>>
>> +  /* Extract load and store statements on pointers with zero-stride
>> +     accesses.  */
>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>> +    {
>>
>> which I'd rather see in a separate patch (and done also when
>> the loop doesn't require versioning for alias).
>>
>
>
> My mistake.. I am working on those two patches at the same time and
> pasted that one also here by mistake. I will send another patch about
> the "hoist" topic.
>
>
>> Also combining the alias checks in vect_create_cond_for_alias_checks
>> is nice but doesn't properly fix the use of the
>> vect-max-version-for-alias-checks param which currently inhibits
>> vectorization of the HIMENO benchmark by default (and make us look bad
>> compared to LLVM).
>>
>> So I believe this merging should be done incrementally when
>> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>>
>
>
> I agree that vect-max-version-for-alias-checks param should count the
> number of checks after the merge. However, the struct
> data_dependence_relation could not record the new information produced
> by the merge. The new information I mentioned contains the new segment
> length for comparisons. This length is calculated right in
> vect_create_cond_for_alias_checks() function. Since
> vect-max-version-for-alias-checks is used during analysis phase, shall
> we move all those (get segment length for each data ref and merge
> alias checks) from transformation to analysis phase? If we cannot
> store the result properly (data_dependence_relation is not enough),
> shall we do it twice in both phases?
>
> I also noticed a possible bug in the function vect_same_range_drs()
> called by vect_prune_runtime_alias_test_list(). For the following code
> I get two pairs of data refs after
> vect_prune_runtime_alias_test_list(), but in
> vect_create_cond_for_alias_checks() after detecting grouped accesses I
> got two identical pairs of data refs. The consequence is two identical
> alias checks are produced.
>
>
> void yuv2yuyv_ref (int *d, int *src, int n)
> {
>   char *dest = (char *)d;
>   int i;
>
>   for(i=0;i<n/2;i++){
>     dest[i*4 + 0] = (src[i*2 + 0])>>16;
>     dest[i*4 + 1] = (src[i*2 + 1])>>8;
>     dest[i*4 + 2] = (src[i*2 + 0])>>16;
>     dest[i*4 + 3] = (src[i*2 + 0])>>0;
>   }
> }
>
>
> I think the solution to this problem is changing
>
> GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
> == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)
>
> into
>
> STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
> (vinfo_for_stmt (stmt_i))))
> == STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
> (vinfo_for_stmt (stmt_j)))
>
>
> in function vect_same_range_drs(). What do you think about it?
>
>
> thanks,
> Cong
>
>
>
>> Thanks for working on this,
>> Richard.
>>
>>>
>>> thanks,
>>> Cong
>>>
>>>
>>>
>>> Index: gcc/tree-vect-loop-manip.c
>>> ===================================================================
>>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>>> +++ gcc/tree-vect-loop-manip.c (working copy)
>>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>>  along with GCC; see the file COPYING3.  If not see
>>>  <http://www.gnu.org/licenses/>.  */
>>>
>>> +#include <vector>
>>> +#include <utility>
>>> +#include <algorithm>
>>> +
>>>  #include "config.h"
>>>  #include "system.h"
>>>  #include "coretypes.h"
>>> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>>>    return segment_length;
>>>  }
>>>
>>> +namespace
>>> +{
>>> +
>>> +/* struct dr_addr_with_seg_len
>>> +
>>> +   A struct storing information of a data reference, including the data
>>> +   ref itself, its basic address, the access offset and the segment length
>>> +   for aliasing checks.  */
>>> +
>>> +struct dr_addr_with_seg_len
>>> +{
>>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>>> +
>>> +  data_reference* dr;
>>> +  tree basic_addr;
>>> +  tree offset;
>>> +  tree seg_len;
>>> +};
>>> +
>>> +/* Operator == between two dr_addr_with_seg_len objects.
>>> +
>>> +   This equality operator is used to make sure two data refs
>>> +   are the same one so that we will consider to combine the
>>> +   aliasing checks of those two pairs of data dependent data
>>> +   refs.  */
>>> +
>>> +bool operator == (const dr_addr_with_seg_len& d1,
>>> +  const dr_addr_with_seg_len& d2)
>>> +{
>>> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
>>> + && operand_equal_p (d1.offset, d2.offset, 0)
>>> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
>>> +}
>>> +
>>> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
>>> + dr_addr_with_seg_len_pair_t;
>>> +
>>> +
>>> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
>>> +
>>> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
>>> +   so that we can combine aliasing checks during one scan.  */
>>> +
>>> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
>>> + const dr_addr_with_seg_len_pair_t& p2)
>>> +{
>>> +  const dr_addr_with_seg_len& p11 = p1.first;
>>> +  const dr_addr_with_seg_len& p12 = p1.second;
>>> +  const dr_addr_with_seg_len& p21 = p2.first;
>>> +  const dr_addr_with_seg_len& p22 = p2.second;
>>> +
>>> +  if (p11.basic_addr != p21.basic_addr)
>>> +    return p11.basic_addr < p21.basic_addr;
>>> +  if (p12.basic_addr != p22.basic_addr)
>>> +    return p12.basic_addr < p22.basic_addr;
>>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>>> +    return p11.offset < p21.offset;
>>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>>> +  if (TREE_CODE (p12.offset) != INTEGER_CST
>>> +      || TREE_CODE (p22.offset) != INTEGER_CST)
>>> +    return p12.offset < p22.offset;
>>> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
>>> +}
>>> +
>>> +}
>>>
>>>  /* Function vect_create_cond_for_alias_checks.
>>>
>>> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>>>    if (may_alias_ddrs.is_empty ())
>>>      return;
>>>
>>> +
>>> +  /* Basically, for each pair of dependent data refs store_ptr_0
>>> +     and load_ptr_0, we create an expression:
>>> +
>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
>>> +
>>> +     for aliasing checks. However, in some cases we can decrease
>>> +     the number of checks by combining two checks into one. For
>>> +     example, suppose we have another pair of data refs store_ptr_0
>>> +     and load_ptr_1, and if the following condition is satisfied:
>>> +
>>> +     load_ptr_0 < load_ptr_1  &&
>>> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
>>> +
>>> +     (this condition means, in each iteration of vectorized loop,
>>> +     the accessed memory of store_ptr_0 cannot be between the memory
>>> +     of load_ptr_0 and load_ptr_1.)
>>> +
>>> +     we then can use only the following expression to finish the
>>> +     alising checks between store_ptr_0 & load_ptr_0 and
>>> +     store_ptr_0 & load_ptr_1:
>>> +
>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
>>> +
>>> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
>>> +     same basic address.  */
>>> +
>>> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
>>> +
>>> +  /* First, we collect all data ref pairs for aliasing checks.  */
>>> +
>>>    FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>>>      {
>>>        struct data_reference *dr_a, *dr_b;
>>>        gimple dr_group_first_a, dr_group_first_b;
>>> -      tree addr_base_a, addr_base_b;
>>>        tree segment_length_a, segment_length_b;
>>>        gimple stmt_a, stmt_b;
>>> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
>>>
>>>        dr_a = DDR_A (ddr);
>>>        stmt_a = DR_STMT (DDR_A (ddr));
>>>        dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>>>        if (dr_group_first_a)
>>> -        {
>>> + {
>>>    stmt_a = dr_group_first_a;
>>>    dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>>>   }
>>> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>>>        stmt_b = DR_STMT (DDR_B (ddr));
>>>        dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>>>        if (dr_group_first_b)
>>> -        {
>>> + {
>>>    stmt_b = dr_group_first_b;
>>>    dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>>>   }
>>>
>>> -      addr_base_a
>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
>>> -       DR_INIT (dr_a)));
>>> -      addr_base_b
>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
>>> -       DR_INIT (dr_b)));
>>> -
>>>        if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>>>   length_factor = scalar_loop_iters;
>>>        else
>>> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>>>        segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>>>        segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
>>>
>>> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
>>> +  (dr_addr_with_seg_len
>>> +       (dr_a, DR_BASE_ADDRESS (dr_a),
>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
>>> + segment_length_a),
>>> +   dr_addr_with_seg_len
>>> +       (dr_b, DR_BASE_ADDRESS (dr_b),
>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
>>> + segment_length_b));
>>> +
>>> +      if (dr_with_seg_len_pair.first.basic_addr >
>>> +  dr_with_seg_len_pair.second.basic_addr)
>>> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
>>> +
>>> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
>>> +    }
>>> +
>>> +  /* Second, we sort the collected data ref pairs so that we can scan
>>> +     them once to combine all possible aliasing checks.  */
>>> +
>>> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
>>> +
>>> +  /* Remove duplicate data ref pairs.  */
>>> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
>>> + ddrs_with_seg_len.end()),
>>> +   ddrs_with_seg_len.end());
>>> +
>>> +  /* We then scan the sorted dr pairs and check if we can combine
>>> +     alias checks of two neighbouring dr pairs.  */
>>> +
>>> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
>>> +    {
>>> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
>>> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
>>> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
>>> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
>>> +
>>> +      if (dr_a1 == dr_a2)
>>> + {
>>> +  if (dr_b1.basic_addr != dr_b2.basic_addr
>>> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
>>> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
>>> +    continue;
>>> +
>>> +  int diff = int_cst_value (dr_b2.offset) -
>>> +     int_cst_value (dr_b1.offset);
>>> +
>>> +  gcc_assert (diff > 0);
>>> +
>>> +  if (diff <= vect_factor
>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_b1.seg_len) <
>>> +     int_cst_value (dr_a1.seg_len)))
>>> +    {
>>> +      if (dump_enabled_p ())
>>> + {
>>> +  dump_printf_loc
>>> +      (MSG_NOTE, vect_location,
>>> +       "combining two runtime checks for data references ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
>>> +  dump_printf (MSG_NOTE, " and ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
>>> +  dump_printf (MSG_NOTE, "\n");
>>> + }
>>> +
>>> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
>>> +  dr_b2.seg_len, size_int (diff));
>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>> +      --i;
>>> +    }
>>> + }
>>> +      else if (dr_b1 == dr_b2)
>>> + {
>>> +  if (dr_a1.basic_addr != dr_a2.basic_addr
>>> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
>>> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
>>> +    continue;
>>> +
>>> +  int diff = int_cst_value (dr_a2.offset) -
>>> +     int_cst_value (dr_a1.offset);
>>> +
>>> +  gcc_assert (diff > 0);
>>> +
>>> +  if (diff <= vect_factor
>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>> +  && diff - int_cst_value (dr_a1.seg_len) <
>>> +     int_cst_value (dr_b1.seg_len)))
>>> +    {
>>> +      if (dump_enabled_p ())
>>> + {
>>> +  dump_printf_loc
>>> +      (MSG_NOTE, vect_location,
>>> +       "combining two runtime checks for data references ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
>>> +  dump_printf (MSG_NOTE, " and ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
>>> +  dump_printf (MSG_NOTE, "\n");
>>> + }
>>> +
>>> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
>>> +  dr_a2.seg_len, size_int (diff));
>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>> +      --i;
>>> +    }
>>> + }
>>> +    }
>>> +
>>> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
>>> +    {
>>> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
>>> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
>>> +      tree segment_length_a = dr_a.seg_len;
>>> +      tree segment_length_b = dr_b.seg_len;
>>> +
>>> +      tree addr_base_a
>>> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
>>> +      tree addr_base_b
>>> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
>>> +
>>>        if (dump_enabled_p ())
>>>   {
>>>    dump_printf_loc (MSG_NOTE, vect_location,
>>> -                           "create runtime check for data references ");
>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
>>> +   "create runtime check for data references ");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>>>    dump_printf (MSG_NOTE, " and ");
>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
>>> -          dump_printf (MSG_NOTE, "\n");
>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
>>> +  dump_printf (MSG_NOTE, "\n");
>>>   }
>>>
>>> -      seg_a_min = addr_base_a;
>>> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
>>> +      tree seg_a_min = addr_base_a;
>>> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>>>   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
>>>
>>> -      seg_b_min = addr_base_b;
>>> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
>>> +      tree seg_b_min = addr_base_b;
>>> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>>>   seg_b_min = seg_b_max, seg_b_max = addr_base_b;
>>>
>>>        part_cond_expr =
>>> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>>>        adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>>>      }
>>>
>>> +  /* Extract load and store statements on pointers with zero-stride
>>> +     accesses.  */
>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>> +    {
>>> +
>>> +      /* In the loop body, we iterate each statement to check if it is a load
>>> + or store. Then we check the DR_STEP of the data reference.  If
>>> + DR_STEP is zero, then we will hoist the load statement to the loop
>>> + preheader, and move the store statement to the loop exit.  */
>>> +
>>> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
>>> +   !gsi_end_p (si); )
>>> + {
>>> +  gimple stmt = gsi_stmt (si);
>>> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>>> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
>>> +
>>> +
>>> +  if (dr && integer_zerop (DR_STEP (dr)))
>>> +    {
>>> +      if (DR_IS_READ (dr))
>>> + {
>>> +  if (dump_file)
>>> +    {
>>> +      fprintf (dump_file,
>>> +       "Hoist the load to outside of the loop:\n");
>>> +      print_gimple_stmt (dump_file, stmt, 0,
>>> + TDF_VOPS|TDF_MEMSYMS);
>>> +    }
>>> +
>>> +  basic_block preheader = loop_preheader_edge (loop)->src;
>>> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>> +  gsi_move_after (&si, &si_dst);
>>> + }
>>> +      else
>>> + {
>>> +  gimple_stmt_iterator si_dst =
>>> +      gsi_last_bb (single_exit (loop)->dest);
>>> +  gsi_move_after (&si, &si_dst);
>>> + }
>>> +              continue;
>>> +    }
>>> +  else if (!dr)
>>> +          {
>>> +            bool hoist = true;
>>> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
>>> +            {
>>> +              tree op = gimple_op (stmt, i);
>>> +              if (TREE_CODE (op) == INTEGER_CST
>>> +                  || TREE_CODE (op) == REAL_CST)
>>> +                continue;
>>> +              if (TREE_CODE (op) == SSA_NAME)
>>> +              {
>>> +                gimple def = SSA_NAME_DEF_STMT (op);
>>> +                if (def == stmt
>>> +                    || gimple_nop_p (def)
>>> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
>>> +                  continue;
>>> +              }
>>> +              hoist = false;
>>> +              break;
>>> +            }
>>> +
>>> +            if (hoist)
>>> +            {
>>> +              basic_block preheader = loop_preheader_edge (loop)->src;
>>> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>> +              gsi_move_after (&si, &si_dst);
>>> +              continue;
>>> +            }
>>> +          }
>>> +          gsi_next (&si);
>>> + }
>>> +    }
>>> +
>>>    /* End loop-exit-fixes after versioning.  */
>>>
>>>    if (cond_expr_stmt_list)
>>> Index: gcc/ChangeLog
>>> ===================================================================
>>> --- gcc/ChangeLog (revision 202663)
>>> +++ gcc/ChangeLog (working copy)
>>> @@ -1,3 +1,8 @@
>>> +2013-10-01  Cong Hou  <congh@google.com>
>>> +
>>> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
>>> + alias checks if it is possible to amortize the runtime overhead.
>>> +
>>>
>>>
>>
>> --
>> Richard Biener <rguenther@suse.de>
>> SUSE / SUSE Labs
>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
Xinliang David Li Oct. 3, 2013, 5:59 p.m. UTC | #16
Looks reasonable to me.

David

On Thu, Oct 3, 2013 at 10:57 AM, Cong Hou <congh@google.com> wrote:
> I noticed that there is a "struct dataref_aux" defined in
> tree-vectorizer.h which is specific to the vectorizer pass and is
> stored in (void*)aux in "struct data_reference". Can we add one more
> field "segment_length" to dataref_aux so that we can pass this
> information for merging alias checks? Then we can avoid to modify or
> create other structures.
>
>
> thanks,
> Cong
>
>
> On Wed, Oct 2, 2013 at 2:34 PM, Cong Hou <congh@google.com> wrote:
>> On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
>>> On Tue, 1 Oct 2013, Cong Hou wrote:
>>>
>>>> When alias exists between data refs in a loop, to vectorize it GCC
>>>> does loop versioning and adds runtime alias checks. Basically for each
>>>> pair of data refs with possible data dependence, there will be two
>>>> comparisons generated to make sure there is no aliasing between them
>>>> in each iteration of the vectorized loop. If there are many such data
>>>> refs pairs, the number of comparisons can be very large, which is a
>>>> big overhead.
>>>>
>>>> However, in some cases it is possible to reduce the number of those
>>>> comparisons. For example, for the following loop, we can detect that
>>>> b[0] and b[1] are two consecutive member accesses so that we can
>>>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>>>> checking a[0:100]&b[0:2]:
>>>>
>>>> void foo(int*a, int* b)
>>>> {
>>>>    for (int i = 0; i < 100; ++i)
>>>>     a[i] = b[0] + b[1];
>>>> }
>>>>
>>>> Actually, the requirement of consecutive memory accesses is too
>>>> strict. For the following loop, we can still combine the alias checks
>>>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>>>
>>>> void foo(int*a, int* b)
>>>> {
>>>>    for (int i = 0; i < 100; ++i)
>>>>     a[i] = b[0] + b[100];
>>>> }
>>>>
>>>> This is because if b[0] is not in a[0:100] and b[100] is not in
>>>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>>>> to check a[0:100] and b[0:101] don't overlap.
>>>>
>>>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>>>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>>>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>>>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>>>> segment length of a, b1, and b2. Then we can combine the two
>>>> comparisons into one if the following condition is satisfied:
>>>>
>>>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>>>
>>>>
>>>> This patch detects those combination opportunities to reduce the
>>>> number of alias checks. It is tested on an x86-64 machine.
>>>
>>> Apart from the other comments you got (to which I agree) the patch
>>> seems to do two things, namely also:
>>>
>>> +  /* Extract load and store statements on pointers with zero-stride
>>> +     accesses.  */
>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>> +    {
>>>
>>> which I'd rather see in a separate patch (and done also when
>>> the loop doesn't require versioning for alias).
>>>
>>
>>
>> My mistake.. I am working on those two patches at the same time and
>> pasted that one also here by mistake. I will send another patch about
>> the "hoist" topic.
>>
>>
>>> Also combining the alias checks in vect_create_cond_for_alias_checks
>>> is nice but doesn't properly fix the use of the
>>> vect-max-version-for-alias-checks param which currently inhibits
>>> vectorization of the HIMENO benchmark by default (and make us look bad
>>> compared to LLVM).
>>>
>>> So I believe this merging should be done incrementally when
>>> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>>>
>>
>>
>> I agree that vect-max-version-for-alias-checks param should count the
>> number of checks after the merge. However, the struct
>> data_dependence_relation could not record the new information produced
>> by the merge. The new information I mentioned contains the new segment
>> length for comparisons. This length is calculated right in
>> vect_create_cond_for_alias_checks() function. Since
>> vect-max-version-for-alias-checks is used during analysis phase, shall
>> we move all those (get segment length for each data ref and merge
>> alias checks) from transformation to analysis phase? If we cannot
>> store the result properly (data_dependence_relation is not enough),
>> shall we do it twice in both phases?
>>
>> I also noticed a possible bug in the function vect_same_range_drs()
>> called by vect_prune_runtime_alias_test_list(). For the following code
>> I get two pairs of data refs after
>> vect_prune_runtime_alias_test_list(), but in
>> vect_create_cond_for_alias_checks() after detecting grouped accesses I
>> got two identical pairs of data refs. The consequence is two identical
>> alias checks are produced.
>>
>>
>> void yuv2yuyv_ref (int *d, int *src, int n)
>> {
>>   char *dest = (char *)d;
>>   int i;
>>
>>   for(i=0;i<n/2;i++){
>>     dest[i*4 + 0] = (src[i*2 + 0])>>16;
>>     dest[i*4 + 1] = (src[i*2 + 1])>>8;
>>     dest[i*4 + 2] = (src[i*2 + 0])>>16;
>>     dest[i*4 + 3] = (src[i*2 + 0])>>0;
>>   }
>> }
>>
>>
>> I think the solution to this problem is changing
>>
>> GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
>> == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)
>>
>> into
>>
>> STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
>> (vinfo_for_stmt (stmt_i))))
>> == STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
>> (vinfo_for_stmt (stmt_j)))
>>
>>
>> in function vect_same_range_drs(). What do you think about it?
>>
>>
>> thanks,
>> Cong
>>
>>
>>
>>> Thanks for working on this,
>>> Richard.
>>>
>>>>
>>>> thanks,
>>>> Cong
>>>>
>>>>
>>>>
>>>> Index: gcc/tree-vect-loop-manip.c
>>>> ===================================================================
>>>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>>>> +++ gcc/tree-vect-loop-manip.c (working copy)
>>>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>>>  along with GCC; see the file COPYING3.  If not see
>>>>  <http://www.gnu.org/licenses/>.  */
>>>>
>>>> +#include <vector>
>>>> +#include <utility>
>>>> +#include <algorithm>
>>>> +
>>>>  #include "config.h"
>>>>  #include "system.h"
>>>>  #include "coretypes.h"
>>>> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>>>>    return segment_length;
>>>>  }
>>>>
>>>> +namespace
>>>> +{
>>>> +
>>>> +/* struct dr_addr_with_seg_len
>>>> +
>>>> +   A struct storing information of a data reference, including the data
>>>> +   ref itself, its basic address, the access offset and the segment length
>>>> +   for aliasing checks.  */
>>>> +
>>>> +struct dr_addr_with_seg_len
>>>> +{
>>>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>>>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>>>> +
>>>> +  data_reference* dr;
>>>> +  tree basic_addr;
>>>> +  tree offset;
>>>> +  tree seg_len;
>>>> +};
>>>> +
>>>> +/* Operator == between two dr_addr_with_seg_len objects.
>>>> +
>>>> +   This equality operator is used to make sure two data refs
>>>> +   are the same one so that we will consider to combine the
>>>> +   aliasing checks of those two pairs of data dependent data
>>>> +   refs.  */
>>>> +
>>>> +bool operator == (const dr_addr_with_seg_len& d1,
>>>> +  const dr_addr_with_seg_len& d2)
>>>> +{
>>>> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
>>>> + && operand_equal_p (d1.offset, d2.offset, 0)
>>>> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
>>>> +}
>>>> +
>>>> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
>>>> + dr_addr_with_seg_len_pair_t;
>>>> +
>>>> +
>>>> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
>>>> +
>>>> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
>>>> +   so that we can combine aliasing checks during one scan.  */
>>>> +
>>>> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
>>>> + const dr_addr_with_seg_len_pair_t& p2)
>>>> +{
>>>> +  const dr_addr_with_seg_len& p11 = p1.first;
>>>> +  const dr_addr_with_seg_len& p12 = p1.second;
>>>> +  const dr_addr_with_seg_len& p21 = p2.first;
>>>> +  const dr_addr_with_seg_len& p22 = p2.second;
>>>> +
>>>> +  if (p11.basic_addr != p21.basic_addr)
>>>> +    return p11.basic_addr < p21.basic_addr;
>>>> +  if (p12.basic_addr != p22.basic_addr)
>>>> +    return p12.basic_addr < p22.basic_addr;
>>>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>>>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>>>> +    return p11.offset < p21.offset;
>>>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>>>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>>>> +  if (TREE_CODE (p12.offset) != INTEGER_CST
>>>> +      || TREE_CODE (p22.offset) != INTEGER_CST)
>>>> +    return p12.offset < p22.offset;
>>>> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
>>>> +}
>>>> +
>>>> +}
>>>>
>>>>  /* Function vect_create_cond_for_alias_checks.
>>>>
>>>> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>>>>    if (may_alias_ddrs.is_empty ())
>>>>      return;
>>>>
>>>> +
>>>> +  /* Basically, for each pair of dependent data refs store_ptr_0
>>>> +     and load_ptr_0, we create an expression:
>>>> +
>>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>>> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
>>>> +
>>>> +     for aliasing checks. However, in some cases we can decrease
>>>> +     the number of checks by combining two checks into one. For
>>>> +     example, suppose we have another pair of data refs store_ptr_0
>>>> +     and load_ptr_1, and if the following condition is satisfied:
>>>> +
>>>> +     load_ptr_0 < load_ptr_1  &&
>>>> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
>>>> +
>>>> +     (this condition means, in each iteration of vectorized loop,
>>>> +     the accessed memory of store_ptr_0 cannot be between the memory
>>>> +     of load_ptr_0 and load_ptr_1.)
>>>> +
>>>> +     we then can use only the following expression to finish the
>>>> +     alising checks between store_ptr_0 & load_ptr_0 and
>>>> +     store_ptr_0 & load_ptr_1:
>>>> +
>>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>>> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
>>>> +
>>>> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
>>>> +     same basic address.  */
>>>> +
>>>> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
>>>> +
>>>> +  /* First, we collect all data ref pairs for aliasing checks.  */
>>>> +
>>>>    FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>>>>      {
>>>>        struct data_reference *dr_a, *dr_b;
>>>>        gimple dr_group_first_a, dr_group_first_b;
>>>> -      tree addr_base_a, addr_base_b;
>>>>        tree segment_length_a, segment_length_b;
>>>>        gimple stmt_a, stmt_b;
>>>> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
>>>>
>>>>        dr_a = DDR_A (ddr);
>>>>        stmt_a = DR_STMT (DDR_A (ddr));
>>>>        dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>>>>        if (dr_group_first_a)
>>>> -        {
>>>> + {
>>>>    stmt_a = dr_group_first_a;
>>>>    dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>>>>   }
>>>> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>>>>        stmt_b = DR_STMT (DDR_B (ddr));
>>>>        dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>>>>        if (dr_group_first_b)
>>>> -        {
>>>> + {
>>>>    stmt_b = dr_group_first_b;
>>>>    dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>>>>   }
>>>>
>>>> -      addr_base_a
>>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
>>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
>>>> -       DR_INIT (dr_a)));
>>>> -      addr_base_b
>>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
>>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
>>>> -       DR_INIT (dr_b)));
>>>> -
>>>>        if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>>>>   length_factor = scalar_loop_iters;
>>>>        else
>>>> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>>>>        segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>>>>        segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
>>>>
>>>> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
>>>> +  (dr_addr_with_seg_len
>>>> +       (dr_a, DR_BASE_ADDRESS (dr_a),
>>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
>>>> + segment_length_a),
>>>> +   dr_addr_with_seg_len
>>>> +       (dr_b, DR_BASE_ADDRESS (dr_b),
>>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
>>>> + segment_length_b));
>>>> +
>>>> +      if (dr_with_seg_len_pair.first.basic_addr >
>>>> +  dr_with_seg_len_pair.second.basic_addr)
>>>> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
>>>> +
>>>> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
>>>> +    }
>>>> +
>>>> +  /* Second, we sort the collected data ref pairs so that we can scan
>>>> +     them once to combine all possible aliasing checks.  */
>>>> +
>>>> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
>>>> +
>>>> +  /* Remove duplicate data ref pairs.  */
>>>> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
>>>> + ddrs_with_seg_len.end()),
>>>> +   ddrs_with_seg_len.end());
>>>> +
>>>> +  /* We then scan the sorted dr pairs and check if we can combine
>>>> +     alias checks of two neighbouring dr pairs.  */
>>>> +
>>>> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
>>>> +    {
>>>> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
>>>> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
>>>> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
>>>> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
>>>> +
>>>> +      if (dr_a1 == dr_a2)
>>>> + {
>>>> +  if (dr_b1.basic_addr != dr_b2.basic_addr
>>>> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
>>>> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
>>>> +    continue;
>>>> +
>>>> +  int diff = int_cst_value (dr_b2.offset) -
>>>> +     int_cst_value (dr_b1.offset);
>>>> +
>>>> +  gcc_assert (diff > 0);
>>>> +
>>>> +  if (diff <= vect_factor
>>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
>>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_b1.seg_len) <
>>>> +     int_cst_value (dr_a1.seg_len)))
>>>> +    {
>>>> +      if (dump_enabled_p ())
>>>> + {
>>>> +  dump_printf_loc
>>>> +      (MSG_NOTE, vect_location,
>>>> +       "combining two runtime checks for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
>>>> +  dump_printf (MSG_NOTE, " and ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>> + }
>>>> +
>>>> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
>>>> +  dr_b2.seg_len, size_int (diff));
>>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>>> +      --i;
>>>> +    }
>>>> + }
>>>> +      else if (dr_b1 == dr_b2)
>>>> + {
>>>> +  if (dr_a1.basic_addr != dr_a2.basic_addr
>>>> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
>>>> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
>>>> +    continue;
>>>> +
>>>> +  int diff = int_cst_value (dr_a2.offset) -
>>>> +     int_cst_value (dr_a1.offset);
>>>> +
>>>> +  gcc_assert (diff > 0);
>>>> +
>>>> +  if (diff <= vect_factor
>>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
>>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_a1.seg_len) <
>>>> +     int_cst_value (dr_b1.seg_len)))
>>>> +    {
>>>> +      if (dump_enabled_p ())
>>>> + {
>>>> +  dump_printf_loc
>>>> +      (MSG_NOTE, vect_location,
>>>> +       "combining two runtime checks for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
>>>> +  dump_printf (MSG_NOTE, " and ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>> + }
>>>> +
>>>> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
>>>> +  dr_a2.seg_len, size_int (diff));
>>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>>> +      --i;
>>>> +    }
>>>> + }
>>>> +    }
>>>> +
>>>> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
>>>> +    {
>>>> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
>>>> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
>>>> +      tree segment_length_a = dr_a.seg_len;
>>>> +      tree segment_length_b = dr_b.seg_len;
>>>> +
>>>> +      tree addr_base_a
>>>> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
>>>> +      tree addr_base_b
>>>> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
>>>> +
>>>>        if (dump_enabled_p ())
>>>>   {
>>>>    dump_printf_loc (MSG_NOTE, vect_location,
>>>> -                           "create runtime check for data references ");
>>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
>>>> +   "create runtime check for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>>>>    dump_printf (MSG_NOTE, " and ");
>>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
>>>> -          dump_printf (MSG_NOTE, "\n");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>>   }
>>>>
>>>> -      seg_a_min = addr_base_a;
>>>> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>>> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
>>>> +      tree seg_a_min = addr_base_a;
>>>> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>>> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>>>>   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
>>>>
>>>> -      seg_b_min = addr_base_b;
>>>> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>>> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
>>>> +      tree seg_b_min = addr_base_b;
>>>> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>>> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>>>>   seg_b_min = seg_b_max, seg_b_max = addr_base_b;
>>>>
>>>>        part_cond_expr =
>>>> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>>>>        adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>>>>      }
>>>>
>>>> +  /* Extract load and store statements on pointers with zero-stride
>>>> +     accesses.  */
>>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>>> +    {
>>>> +
>>>> +      /* In the loop body, we iterate each statement to check if it is a load
>>>> + or store. Then we check the DR_STEP of the data reference.  If
>>>> + DR_STEP is zero, then we will hoist the load statement to the loop
>>>> + preheader, and move the store statement to the loop exit.  */
>>>> +
>>>> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
>>>> +   !gsi_end_p (si); )
>>>> + {
>>>> +  gimple stmt = gsi_stmt (si);
>>>> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>>>> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
>>>> +
>>>> +
>>>> +  if (dr && integer_zerop (DR_STEP (dr)))
>>>> +    {
>>>> +      if (DR_IS_READ (dr))
>>>> + {
>>>> +  if (dump_file)
>>>> +    {
>>>> +      fprintf (dump_file,
>>>> +       "Hoist the load to outside of the loop:\n");
>>>> +      print_gimple_stmt (dump_file, stmt, 0,
>>>> + TDF_VOPS|TDF_MEMSYMS);
>>>> +    }
>>>> +
>>>> +  basic_block preheader = loop_preheader_edge (loop)->src;
>>>> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>>> +  gsi_move_after (&si, &si_dst);
>>>> + }
>>>> +      else
>>>> + {
>>>> +  gimple_stmt_iterator si_dst =
>>>> +      gsi_last_bb (single_exit (loop)->dest);
>>>> +  gsi_move_after (&si, &si_dst);
>>>> + }
>>>> +              continue;
>>>> +    }
>>>> +  else if (!dr)
>>>> +          {
>>>> +            bool hoist = true;
>>>> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
>>>> +            {
>>>> +              tree op = gimple_op (stmt, i);
>>>> +              if (TREE_CODE (op) == INTEGER_CST
>>>> +                  || TREE_CODE (op) == REAL_CST)
>>>> +                continue;
>>>> +              if (TREE_CODE (op) == SSA_NAME)
>>>> +              {
>>>> +                gimple def = SSA_NAME_DEF_STMT (op);
>>>> +                if (def == stmt
>>>> +                    || gimple_nop_p (def)
>>>> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
>>>> +                  continue;
>>>> +              }
>>>> +              hoist = false;
>>>> +              break;
>>>> +            }
>>>> +
>>>> +            if (hoist)
>>>> +            {
>>>> +              basic_block preheader = loop_preheader_edge (loop)->src;
>>>> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>>> +              gsi_move_after (&si, &si_dst);
>>>> +              continue;
>>>> +            }
>>>> +          }
>>>> +          gsi_next (&si);
>>>> + }
>>>> +    }
>>>> +
>>>>    /* End loop-exit-fixes after versioning.  */
>>>>
>>>>    if (cond_expr_stmt_list)
>>>> Index: gcc/ChangeLog
>>>> ===================================================================
>>>> --- gcc/ChangeLog (revision 202663)
>>>> +++ gcc/ChangeLog (working copy)
>>>> @@ -1,3 +1,8 @@
>>>> +2013-10-01  Cong Hou  <congh@google.com>
>>>> +
>>>> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
>>>> + alias checks if it is possible to amortize the runtime overhead.
>>>> +
>>>>
>>>>
>>>
>>> --
>>> Richard Biener <rguenther@suse.de>
>>> SUSE / SUSE Labs
>>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>>> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
Joseph Myers Oct. 3, 2013, 9:06 p.m. UTC | #17
On Tue, 1 Oct 2013, Cong Hou wrote:

> +#include <vector>
> +#include <utility>
> +#include <algorithm>
> +
>  #include "config.h"

Whatever the other issues about including these headers at all, any system 
header (C or C++) must always be included *after* config.h, as config.h 
may define feature test macros that are only properly effective if defined 
before any system headers are included, and these macros (affecting such 
things as the size of off_t) need to be consistent throughout GCC.
Cong Hou Oct. 3, 2013, 9:18 p.m. UTC | #18
On Thu, Oct 3, 2013 at 2:06 PM, Joseph S. Myers <joseph@codesourcery.com> wrote:
> On Tue, 1 Oct 2013, Cong Hou wrote:
>
>> +#include <vector>
>> +#include <utility>
>> +#include <algorithm>
>> +
>>  #include "config.h"
>
> Whatever the other issues about including these headers at all, any system
> header (C or C++) must always be included *after* config.h, as config.h
> may define feature test macros that are only properly effective if defined
> before any system headers are included, and these macros (affecting such
> things as the size of off_t) need to be consistent throughout GCC.
>

OK. Actually I did meet some conflicts when I put those three C++
headers after all other includes.
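
(For reference, the ordering Joseph is asking for would look roughly
like this - just a sketch, and only one possible placement; whether it
actually avoids the conflicts mentioned above is untested:)

#include "config.h"

/* System headers (C and C++) only after config.h, so that the
   feature-test macros config.h may define are in effect first.  */
#include <vector>
#include <utility>
#include <algorithm>

#include "system.h"
#include "coretypes.h"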

Thank you for the comments.


Cong


> --
> Joseph S. Myers
> joseph@codesourcery.com
Cong Hou Oct. 3, 2013, 9:35 p.m. UTC | #19
Forget about this "aux" idea: the segment length for one data ref can
be different in different dr pairs (the length factor, and hence the
segment length, depends on whether the two DR_STEPs in the pair are
equal), so a single per-dr field is not enough.

In my patch I created a struct as shown below:

struct dr_addr_with_seg_len
{
  data_reference *dr;
  tree basic_addr;
  tree offset;
  tree seg_len;
};


Note that basic_addr and offset can always be obtained from dr, but we
need to store two segment lengths for each dr pair. It is improper to
add a field to data_dependence_relation, as it is defined outside of
the vectorizer. We could change the type of may_alias_ddrs in
loop_vec_info to a new one combining data_dependence_relation and the
segment lengths, but that means adding a new type to tree-vectorizer.h
which is only used in two places - still too much.

One possible solution is to create a local struct as shown above
and a new function that returns the merged alias check information.
This function would be called twice: once during the analysis phase and
once during the transformation phase. Then we don't have to store the
merged alias check information between those two phases. The additional
time cost is minimal, as there will not be too many data-dependent dr
pairs in a loop.
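
(A rough sketch, with made-up names, of the shape such a function
could take - not a working patch:)

/* Hypothetical helper: collect a (dr, segment length) pair for every
   ddr in LOOP_VINFO_MAY_ALIAS_DDRS (LOOP_VINFO), merge neighbouring
   checks where possible, and return the merged pairs in *CHECK_PAIRS.
   It would be called once from the analysis phase (so that
   vect-max-version-for-alias-checks counts checks after merging) and
   once from vect_create_cond_for_alias_checks.  */

static void
vect_get_merged_alias_checks (loop_vec_info loop_vinfo,
                              std::vector<dr_addr_with_seg_len_pair_t>
                                *check_pairs)
{
  /* 1. For each ddr, compute the segment lengths with
        vect_vfa_segment_size and push a dr_addr_with_seg_len_pair_t,
        as the current patch already does.  */
  /* 2. std::sort and std::unique the collected pairs.  */
  /* 3. Scan adjacent pairs and merge two checks whenever
        offset_2 - offset_1 - seg_len_1 < seg_len of the common ref,
        extending the surviving pair's segment length.  */
}

/* The analysis phase would then compare check_pairs->size () against
   the vect-max-version-for-alias-checks param, and the transformation
   phase would generate one comparison per remaining pair.  */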

Any comment?


thanks,
Cong


On Thu, Oct 3, 2013 at 10:57 AM, Cong Hou <congh@google.com> wrote:
> I noticed that there is a "struct dataref_aux" defined in
> tree-vectorizer.h which is specific to the vectorizer pass and is
> stored in (void*)aux in "struct data_reference". Can we add one more
> field "segment_length" to dataref_aux so that we can pass this
> information for merging alias checks? Then we can avoid to modify or
> create other structures.
>
>
> thanks,
> Cong
>
>
> On Wed, Oct 2, 2013 at 2:34 PM, Cong Hou <congh@google.com> wrote:
>> On Wed, Oct 2, 2013 at 4:24 AM, Richard Biener <rguenther@suse.de> wrote:
>>> On Tue, 1 Oct 2013, Cong Hou wrote:
>>>
>>>> When alias exists between data refs in a loop, to vectorize it GCC
>>>> does loop versioning and adds runtime alias checks. Basically for each
>>>> pair of data refs with possible data dependence, there will be two
>>>> comparisons generated to make sure there is no aliasing between them
>>>> in each iteration of the vectorized loop. If there are many such data
>>>> refs pairs, the number of comparisons can be very large, which is a
>>>> big overhead.
>>>>
>>>> However, in some cases it is possible to reduce the number of those
>>>> comparisons. For example, for the following loop, we can detect that
>>>> b[0] and b[1] are two consecutive member accesses so that we can
>>>> combine the alias check between a[0:100]&b[0] and a[0:100]&b[1] into
>>>> checking a[0:100]&b[0:2]:
>>>>
>>>> void foo(int*a, int* b)
>>>> {
>>>>    for (int i = 0; i < 100; ++i)
>>>>     a[i] = b[0] + b[1];
>>>> }
>>>>
>>>> Actually, the requirement of consecutive memory accesses is too
>>>> strict. For the following loop, we can still combine the alias checks
>>>> between a[0:100]&b[0] and a[0:100]&b[100]:
>>>>
>>>> void foo(int*a, int* b)
>>>> {
>>>>    for (int i = 0; i < 100; ++i)
>>>>     a[i] = b[0] + b[100];
>>>> }
>>>>
>>>> This is because if b[0] is not in a[0:100] and b[100] is not in
>>>> a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
>>>> to check a[0:100] and b[0:101] don't overlap.
>>>>
>>>> More generally, consider two pairs of data refs (a, b1) and (a, b2).
>>>> Suppose addr_b1 and addr_b2 are basic addresses of data ref b1 and b2;
>>>> offset_b1 and offset_b2 (offset_b1 < offset_b2) are offsets of b1 and
>>>> b2, and segment_length_a, segment_length_b1, and segment_length_b2 are
>>>> segment length of a, b1, and b2. Then we can combine the two
>>>> comparisons into one if the following condition is satisfied:
>>>>
>>>> offset_b2- offset_b1 - segment_length_b1 < segment_length_a
>>>>
>>>>
>>>> This patch detects those combination opportunities to reduce the
>>>> number of alias checks. It is tested on an x86-64 machine.
>>>
>>> Apart from the other comments you got (to which I agree) the patch
>>> seems to do two things, namely also:
>>>
>>> +  /* Extract load and store statements on pointers with zero-stride
>>> +     accesses.  */
>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>> +    {
>>>
>>> which I'd rather see in a separate patch (and done also when
>>> the loop doesn't require versioning for alias).
>>>
>>
>>
>> My mistake.. I am working on those two patches at the same time and
>> pasted that one also here by mistake. I will send another patch about
>> the "hoist" topic.
>>
>>
>>> Also combining the alias checks in vect_create_cond_for_alias_checks
>>> is nice but doesn't properly fix the use of the
>>> vect-max-version-for-alias-checks param which currently inhibits
>>> vectorization of the HIMENO benchmark by default (and make us look bad
>>> compared to LLVM).
>>>
>>> So I believe this merging should be done incrementally when
>>> we collect the DDRs we need to test in vect_mark_for_runtime_alias_test.
>>>
>>
>>
>> I agree that vect-max-version-for-alias-checks param should count the
>> number of checks after the merge. However, the struct
>> data_dependence_relation could not record the new information produced
>> by the merge. The new information I mentioned contains the new segment
>> length for comparisons. This length is calculated right in
>> vect_create_cond_for_alias_checks() function. Since
>> vect-max-version-for-alias-checks is used during analysis phase, shall
>> we move all those (get segment length for each data ref and merge
>> alias checks) from transformation to analysis phase? If we cannot
>> store the result properly (data_dependence_relation is not enough),
>> shall we do it twice in both phases?
>>
>> I also noticed a possible bug in the function vect_same_range_drs()
>> called by vect_prune_runtime_alias_test_list(). For the following code
>> I get two pairs of data refs after
>> vect_prune_runtime_alias_test_list(), but in
>> vect_create_cond_for_alias_checks() after detecting grouped accesses I
>> got two identical pairs of data refs. The consequence is two identical
>> alias checks are produced.
>>
>>
>> void yuv2yuyv_ref (int *d, int *src, int n)
>> {
>>   char *dest = (char *)d;
>>   int i;
>>
>>   for(i=0;i<n/2;i++){
>>     dest[i*4 + 0] = (src[i*2 + 0])>>16;
>>     dest[i*4 + 1] = (src[i*2 + 1])>>8;
>>     dest[i*4 + 2] = (src[i*2 + 0])>>16;
>>     dest[i*4 + 3] = (src[i*2 + 0])>>0;
>>   }
>> }
>>
>>
>> I think the solution to this problem is changing
>>
>> GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
>> == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)
>>
>> into
>>
>> STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
>> (vinfo_for_stmt (stmt_i))))
>> == STMT_VINFO_DATA_REF (vinfo_for_stmt (GROUP_FIRST_ELEMENT
>> (vinfo_for_stmt (stmt_j)))
>>
>>
>> in function vect_same_range_drs(). What do you think about it?
>>
>>
>> thanks,
>> Cong
>>
>>
>>
>>> Thanks for working on this,
>>> Richard.
>>>
>>>>
>>>> thanks,
>>>> Cong
>>>>
>>>>
>>>>
>>>> Index: gcc/tree-vect-loop-manip.c
>>>> ===================================================================
>>>> --- gcc/tree-vect-loop-manip.c (revision 202662)
>>>> +++ gcc/tree-vect-loop-manip.c (working copy)
>>>> @@ -19,6 +19,10 @@ You should have received a copy of the G
>>>>  along with GCC; see the file COPYING3.  If not see
>>>>  <http://www.gnu.org/licenses/>.  */
>>>>
>>>> +#include <vector>
>>>> +#include <utility>
>>>> +#include <algorithm>
>>>> +
>>>>  #include "config.h"
>>>>  #include "system.h"
>>>>  #include "coretypes.h"
>>>> @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
>>>>    return segment_length;
>>>>  }
>>>>
>>>> +namespace
>>>> +{
>>>> +
>>>> +/* struct dr_addr_with_seg_len
>>>> +
>>>> +   A struct storing information of a data reference, including the data
>>>> +   ref itself, its basic address, the access offset and the segment length
>>>> +   for aliasing checks.  */
>>>> +
>>>> +struct dr_addr_with_seg_len
>>>> +{
>>>> +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
>>>> +    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
>>>> +
>>>> +  data_reference* dr;
>>>> +  tree basic_addr;
>>>> +  tree offset;
>>>> +  tree seg_len;
>>>> +};
>>>> +
>>>> +/* Operator == between two dr_addr_with_seg_len objects.
>>>> +
>>>> +   This equality operator is used to make sure two data refs
>>>> +   are the same one so that we will consider to combine the
>>>> +   aliasing checks of those two pairs of data dependent data
>>>> +   refs.  */
>>>> +
>>>> +bool operator == (const dr_addr_with_seg_len& d1,
>>>> +  const dr_addr_with_seg_len& d2)
>>>> +{
>>>> +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
>>>> + && operand_equal_p (d1.offset, d2.offset, 0)
>>>> + && operand_equal_p (d1.seg_len, d2.seg_len, 0);
>>>> +}
>>>> +
>>>> +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
>>>> + dr_addr_with_seg_len_pair_t;
>>>> +
>>>> +
>>>> +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
>>>> +
>>>> +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
>>>> +   so that we can combine aliasing checks during one scan.  */
>>>> +
>>>> +bool operator < (const dr_addr_with_seg_len_pair_t& p1,
>>>> + const dr_addr_with_seg_len_pair_t& p2)
>>>> +{
>>>> +  const dr_addr_with_seg_len& p11 = p1.first;
>>>> +  const dr_addr_with_seg_len& p12 = p1.second;
>>>> +  const dr_addr_with_seg_len& p21 = p2.first;
>>>> +  const dr_addr_with_seg_len& p22 = p2.second;
>>>> +
>>>> +  if (p11.basic_addr != p21.basic_addr)
>>>> +    return p11.basic_addr < p21.basic_addr;
>>>> +  if (p12.basic_addr != p22.basic_addr)
>>>> +    return p12.basic_addr < p22.basic_addr;
>>>> +  if (TREE_CODE (p11.offset) != INTEGER_CST
>>>> +      || TREE_CODE (p21.offset) != INTEGER_CST)
>>>> +    return p11.offset < p21.offset;
>>>> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>>>> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>>>> +  if (TREE_CODE (p12.offset) != INTEGER_CST
>>>> +      || TREE_CODE (p22.offset) != INTEGER_CST)
>>>> +    return p12.offset < p22.offset;
>>>> +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
>>>> +}
>>>> +
>>>> +}
>>>>
>>>>  /* Function vect_create_cond_for_alias_checks.
>>>>
>>>> @@ -2292,20 +2364,51 @@ vect_create_cond_for_alias_checks (loop_
>>>>    if (may_alias_ddrs.is_empty ())
>>>>      return;
>>>>
>>>> +
>>>> +  /* Basically, for each pair of dependent data refs store_ptr_0
>>>> +     and load_ptr_0, we create an expression:
>>>> +
>>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>>> +     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
>>>> +
>>>> +     for aliasing checks. However, in some cases we can decrease
>>>> +     the number of checks by combining two checks into one. For
>>>> +     example, suppose we have another pair of data refs store_ptr_0
>>>> +     and load_ptr_1, and if the following condition is satisfied:
>>>> +
>>>> +     load_ptr_0 < load_ptr_1  &&
>>>> +     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
>>>> +
>>>> +     (this condition means, in each iteration of vectorized loop,
>>>> +     the accessed memory of store_ptr_0 cannot be between the memory
>>>> +     of load_ptr_0 and load_ptr_1.)
>>>> +
>>>> +     we then can use only the following expression to finish the
>>>> +     alising checks between store_ptr_0 & load_ptr_0 and
>>>> +     store_ptr_0 & load_ptr_1:
>>>> +
>>>> +     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
>>>> +     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
>>>> +
>>>> +     Note that we only consider that load_ptr_0 and load_ptr_1 have the
>>>> +     same basic address.  */
>>>> +
>>>> +  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
>>>> +
>>>> +  /* First, we collect all data ref pairs for aliasing checks.  */
>>>> +
>>>>    FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
>>>>      {
>>>>        struct data_reference *dr_a, *dr_b;
>>>>        gimple dr_group_first_a, dr_group_first_b;
>>>> -      tree addr_base_a, addr_base_b;
>>>>        tree segment_length_a, segment_length_b;
>>>>        gimple stmt_a, stmt_b;
>>>> -      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;
>>>>
>>>>        dr_a = DDR_A (ddr);
>>>>        stmt_a = DR_STMT (DDR_A (ddr));
>>>>        dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
>>>>        if (dr_group_first_a)
>>>> -        {
>>>> + {
>>>>    stmt_a = dr_group_first_a;
>>>>    dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
>>>>   }
>>>> @@ -2314,20 +2417,11 @@ vect_create_cond_for_alias_checks (loop_
>>>>        stmt_b = DR_STMT (DDR_B (ddr));
>>>>        dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
>>>>        if (dr_group_first_b)
>>>> -        {
>>>> + {
>>>>    stmt_b = dr_group_first_b;
>>>>    dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
>>>>   }
>>>>
>>>> -      addr_base_a
>>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
>>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
>>>> -       DR_INIT (dr_a)));
>>>> -      addr_base_b
>>>> - = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
>>>> -   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
>>>> -       DR_INIT (dr_b)));
>>>> -
>>>>        if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
>>>>   length_factor = scalar_loop_iters;
>>>>        else
>>>> @@ -2335,24 +2429,149 @@ vect_create_cond_for_alias_checks (loop_
>>>>        segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
>>>>        segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
>>>>
>>>> +      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
>>>> +  (dr_addr_with_seg_len
>>>> +       (dr_a, DR_BASE_ADDRESS (dr_a),
>>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
>>>> + segment_length_a),
>>>> +   dr_addr_with_seg_len
>>>> +       (dr_b, DR_BASE_ADDRESS (dr_b),
>>>> + size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
>>>> + segment_length_b));
>>>> +
>>>> +      if (dr_with_seg_len_pair.first.basic_addr >
>>>> +  dr_with_seg_len_pair.second.basic_addr)
>>>> + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
>>>> +
>>>> +      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
>>>> +    }
>>>> +
>>>> +  /* Second, we sort the collected data ref pairs so that we can scan
>>>> +     them once to combine all possible aliasing checks.  */
>>>> +
>>>> +  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
>>>> +
>>>> +  /* Remove duplicate data ref pairs.  */
>>>> +  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
>>>> + ddrs_with_seg_len.end()),
>>>> +   ddrs_with_seg_len.end());
>>>> +
>>>> +  /* We then scan the sorted dr pairs and check if we can combine
>>>> +     alias checks of two neighbouring dr pairs.  */
>>>> +
>>>> +  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
>>>> +    {
>>>> +      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
>>>> +      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
>>>> +      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
>>>> +      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
>>>> +
>>>> +      if (dr_a1 == dr_a2)
>>>> + {
>>>> +  if (dr_b1.basic_addr != dr_b2.basic_addr
>>>> +      || TREE_CODE (dr_b1.offset) != INTEGER_CST
>>>> +      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
>>>> +    continue;
>>>> +
>>>> +  int diff = int_cst_value (dr_b2.offset) -
>>>> +     int_cst_value (dr_b1.offset);
>>>> +
>>>> +  gcc_assert (diff > 0);
>>>> +
>>>> +  if (diff <= vect_factor
>>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
>>>> +      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_b1.seg_len) <
>>>> +     int_cst_value (dr_a1.seg_len)))
>>>> +    {
>>>> +      if (dump_enabled_p ())
>>>> + {
>>>> +  dump_printf_loc
>>>> +      (MSG_NOTE, vect_location,
>>>> +       "combining two runtime checks for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
>>>> +  dump_printf (MSG_NOTE, " and ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>> + }
>>>> +
>>>> +      dr_b1.seg_len = size_binop (PLUS_EXPR,
>>>> +  dr_b2.seg_len, size_int (diff));
>>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>>> +      --i;
>>>> +    }
>>>> + }
>>>> +      else if (dr_b1 == dr_b2)
>>>> + {
>>>> +  if (dr_a1.basic_addr != dr_a2.basic_addr
>>>> +      || TREE_CODE (dr_a1.offset) != INTEGER_CST
>>>> +      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
>>>> +    continue;
>>>> +
>>>> +  int diff = int_cst_value (dr_a2.offset) -
>>>> +     int_cst_value (dr_a1.offset);
>>>> +
>>>> +  gcc_assert (diff > 0);
>>>> +
>>>> +  if (diff <= vect_factor
>>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
>>>> +      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
>>>> +  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
>>>> +  && diff - int_cst_value (dr_a1.seg_len) <
>>>> +     int_cst_value (dr_b1.seg_len)))
>>>> +    {
>>>> +      if (dump_enabled_p ())
>>>> + {
>>>> +  dump_printf_loc
>>>> +      (MSG_NOTE, vect_location,
>>>> +       "combining two runtime checks for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
>>>> +  dump_printf (MSG_NOTE, " and ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>> + }
>>>> +
>>>> +      dr_a1.seg_len = size_binop (PLUS_EXPR,
>>>> +  dr_a2.seg_len, size_int (diff));
>>>> +      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
>>>> +      --i;
>>>> +    }
>>>> + }
>>>> +    }
>>>> +
>>>> +  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
>>>> +    {
>>>> +      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
>>>> +      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
>>>> +      tree segment_length_a = dr_a.seg_len;
>>>> +      tree segment_length_b = dr_b.seg_len;
>>>> +
>>>> +      tree addr_base_a
>>>> + = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
>>>> +      tree addr_base_b
>>>> + = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
>>>> +
>>>>        if (dump_enabled_p ())
>>>>   {
>>>>    dump_printf_loc (MSG_NOTE, vect_location,
>>>> -                           "create runtime check for data references ");
>>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
>>>> +   "create runtime check for data references ");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
>>>>    dump_printf (MSG_NOTE, " and ");
>>>> -  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
>>>> -          dump_printf (MSG_NOTE, "\n");
>>>> +  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
>>>> +  dump_printf (MSG_NOTE, "\n");
>>>>   }
>>>>
>>>> -      seg_a_min = addr_base_a;
>>>> -      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>>> -      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
>>>> +      tree seg_a_min = addr_base_a;
>>>> +      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
>>>> +      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
>>>>   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
>>>>
>>>> -      seg_b_min = addr_base_b;
>>>> -      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>>> -      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
>>>> +      tree seg_b_min = addr_base_b;
>>>> +      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
>>>> +      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
>>>>   seg_b_min = seg_b_max, seg_b_max = addr_base_b;
>>>>
>>>>        part_cond_expr =
>>>> @@ -2477,6 +2696,81 @@ vect_loop_versioning (loop_vec_info loop
>>>>        adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
>>>>      }
>>>>
>>>> +  /* Extract load and store statements on pointers with zero-stride
>>>> +     accesses.  */
>>>> +  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
>>>> +    {
>>>> +
>>>> +      /* In the loop body, we iterate over each statement to check whether
>>>> + it is a load or a store.  Then we check the DR_STEP of its data
>>>> + reference.  If DR_STEP is zero, we hoist a load statement to the
>>>> + loop preheader and move a store statement to the loop exit.  */
>>>> +
>>>> +      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
>>>> +   !gsi_end_p (si); )
>>>> + {
>>>> +  gimple stmt = gsi_stmt (si);
>>>> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>>>> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
>>>> +
>>>> +
>>>> +  if (dr && integer_zerop (DR_STEP (dr)))
>>>> +    {
>>>> +      if (DR_IS_READ (dr))
>>>> + {
>>>> +  if (dump_file)
>>>> +    {
>>>> +      fprintf (dump_file,
>>>> +       "Hoist the load to outside of the loop:\n");
>>>> +      print_gimple_stmt (dump_file, stmt, 0,
>>>> + TDF_VOPS|TDF_MEMSYMS);
>>>> +    }
>>>> +
>>>> +  basic_block preheader = loop_preheader_edge (loop)->src;
>>>> +  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>>> +  gsi_move_after (&si, &si_dst);
>>>> + }
>>>> +      else
>>>> + {
>>>> +  gimple_stmt_iterator si_dst =
>>>> +      gsi_last_bb (single_exit (loop)->dest);
>>>> +  gsi_move_after (&si, &si_dst);
>>>> + }
>>>> +              continue;
>>>> +    }
>>>> +  else if (!dr)
>>>> +          {
>>>> +            bool hoist = true;
>>>> +            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
>>>> +            {
>>>> +              tree op = gimple_op (stmt, i);
>>>> +              if (TREE_CODE (op) == INTEGER_CST
>>>> +                  || TREE_CODE (op) == REAL_CST)
>>>> +                continue;
>>>> +              if (TREE_CODE (op) == SSA_NAME)
>>>> +              {
>>>> +                gimple def = SSA_NAME_DEF_STMT (op);
>>>> +                if (def == stmt
>>>> +                    || gimple_nop_p (def)
>>>> +                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
>>>> +                  continue;
>>>> +              }
>>>> +              hoist = false;
>>>> +              break;
>>>> +            }
>>>> +
>>>> +            if (hoist)
>>>> +            {
>>>> +              basic_block preheader = loop_preheader_edge (loop)->src;
>>>> +              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
>>>> +              gsi_move_after (&si, &si_dst);
>>>> +              continue;
>>>> +            }
>>>> +          }
>>>> +          gsi_next (&si);
>>>> + }
>>>> +    }
>>>> +
>>>>    /* End loop-exit-fixes after versioning.  */
>>>>
>>>>    if (cond_expr_stmt_list)
>>>> Index: gcc/ChangeLog
>>>> ===================================================================
>>>> --- gcc/ChangeLog (revision 202663)
>>>> +++ gcc/ChangeLog (working copy)
>>>> @@ -1,3 +1,8 @@
>>>> +2013-10-01  Cong Hou  <congh@google.com>
>>>> +
>>>> + * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
>>>> + alias checks if it is possible to amortize the runtime overhead.
>>>> +
>>>>
>>>>
>>>
>>> --
>>> Richard Biener <rguenther@suse.de>
>>> SUSE / SUSE Labs
>>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>>> GF: Jeff Hawn, Jennifer Guild, Felix Imend
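
Illustrative sketch of the combined check described in the quoted comment:
with a 400-byte store segment starting at store_ptr_0 and two 4-byte loads at
load_ptr_0 and load_ptr_1 = load_ptr_0 + 400, a single pair of comparisons
suffices.  The byte counts and the function name below are made up; the exact
expression GCC emits depends on the target and the vectorization factor.

/* Hypothetical combined runtime alias check: one comparison pair
   instead of two.  The second disjunct covers both loads because
   load_ptr_1 + 4 is the end of the second load.  */
static bool
no_alias_p (const char *store_ptr_0, const char *load_ptr_0)
{
  const char *load_ptr_1 = load_ptr_0 + 400;
  return (store_ptr_0 + 400 <= load_ptr_0)
         || (load_ptr_1 + 4 <= store_ptr_0);
}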
Richard Biener Oct. 18, 2013, 12:41 p.m. UTC | #20
On Wed, Oct 2, 2013 at 8:26 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Wed, Oct 02, 2013 at 10:50:21AM -0700, Cong Hou wrote:
>> >> +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
>> >> +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
>> >
>> > This is going to ICE whenever the offsets wouldn't fit into a
>> > HOST_WIDE_INT.
>> >
>> > I'd say you just shouldn't put into the vector entries where offset isn't
>> > host_integerp, those would never be merged with other checks, or something
>> > similar.
>>
>> Do you mean I should use widest_int_cst_value()? Then I will replace
>> all int_cst_value() here with it. I also changed the type of the "diff"
>> variable to HOST_WIDEST_INT.
>
> Actually, best would be just to use
> tree_int_cst_compare (p11.offset, p21.offset)
> that will handle any INTEGER_CSTs, not just those that fit into HWI.

Note that this is not equivalent because int_cst_value () sign-extends
even unsigned values (yes, a very bad name for such a function).  But
I believe that in this case - comparing pointer offsets(?) - int_cst_value
is the correct function to use.

Richard.

>         Jakub
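
A minimal standalone sketch of the point about int_cst_value (plain C++, no
GCC internals; the constants are arbitrary): a sign-extending conversion and a
type-respecting unsigned comparison can disagree, and the sign-extending one
is what matches pointer-offset semantics.

#include <cstdint>
#include <cstdio>

int
main ()
{
  /* Two offsets stored as unsigned 64-bit values; on a two's-complement
     target the first one encodes the pointer offset -8.  */
  std::uint64_t off_a = 0xfffffffffffffff8ULL;
  std::uint64_t off_b = 16;

  /* Type-respecting unsigned comparison: off_a is a huge value, so it
     does not sort before off_b.  */
  std::printf ("unsigned: %d\n", (int) (off_a < off_b));              /* 0 */

  /* Sign-extended comparison, analogous to comparing the results of an
     int_cst_value-style conversion: -8 < 16, so off_a sorts first.  */
  std::printf ("signed:   %d\n",
               (int) ((std::int64_t) off_a < (std::int64_t) off_b));  /* 1 */
  return 0;
}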
diff mbox

Patch

Index: gcc/tree-vect-loop-manip.c
===================================================================
--- gcc/tree-vect-loop-manip.c (revision 202662)
+++ gcc/tree-vect-loop-manip.c (working copy)
@@ -19,6 +19,10 @@  You should have received a copy of the G
 along with GCC; see the file COPYING3.  If not see
 <http://www.gnu.org/licenses/>.  */

+#include <vector>
+#include <utility>
+#include <algorithm>
+
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -2248,6 +2252,74 @@  vect_vfa_segment_size (struct data_refer
   return segment_length;
 }

+namespace
+{
+
+/* struct dr_addr_with_seg_len
+
+   A struct storing information about a data reference, including the data
+   ref itself, its basic address, the access offset, and the segment length
+   used for aliasing checks.  */
+
+struct dr_addr_with_seg_len
+{
+  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
+    : dr (d), basic_addr (addr), offset (off), seg_len (len) {}
+
+  data_reference* dr;
+  tree basic_addr;
+  tree offset;
+  tree seg_len;
+};
+
+/* Operator == between two dr_addr_with_seg_len objects.
+
+   This equality operator is used to check whether two data refs
+   are the same, so that we can consider combining the aliasing
+   checks of the two pairs of data-dependent data refs they
+   belong to.  */
+
+bool operator == (const dr_addr_with_seg_len& d1,
+  const dr_addr_with_seg_len& d2)
+{
+  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
+ && operand_equal_p (d1.offset, d2.offset, 0)
+ && operand_equal_p (d1.seg_len, d2.seg_len, 0);
+}
+
+typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
+ dr_addr_with_seg_len_pair_t;
+
+
+/* Operator < between two dr_addr_with_seg_len_pair_t objects.
+
+   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
+   so that we can combine aliasing checks during one scan.  */
+
+bool operator < (const dr_addr_with_seg_len_pair_t& p1,
+ const dr_addr_with_seg_len_pair_t& p2)
+{
+  const dr_addr_with_seg_len& p11 = p1.first;
+  const dr_addr_with_seg_len& p12 = p1.second;
+  const dr_addr_with_seg_len& p21 = p2.first;
+  const dr_addr_with_seg_len& p22 = p2.second;
+
+  if (p11.basic_addr != p21.basic_addr)
+    return p11.basic_addr < p21.basic_addr;
+  if (p12.basic_addr != p22.basic_addr)
+    return p12.basic_addr < p22.basic_addr;
+  if (TREE_CODE (p11.offset) != INTEGER_CST
+      || TREE_CODE (p21.offset) != INTEGER_CST)
+    return p11.offset < p21.offset;
+  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
+    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
+  if (TREE_CODE (p12.offset) != INTEGER_CST
+      || TREE_CODE (p22.offset) != INTEGER_CST)
+    return p12.offset < p22.offset;
+  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
+}
+
+}

 /* Function vect_create_cond_for_alias_checks.

@@ -2292,20 +2364,51 @@  vect_create_cond_for_alias_checks (loop_
   if (may_alias_ddrs.is_empty ())
     return;

+
+  /* Basically, for each pair of dependent data refs store_ptr_0
+     and load_ptr_0, we create an expression:
+
+     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
+     || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
+
+     for aliasing checks. However, in some cases we can decrease
+     the number of checks by combining two checks into one. For
+     example, suppose we have another pair of data refs store_ptr_0
+     and load_ptr_1, and if the following condition is satisfied:
+
+     load_ptr_0 < load_ptr_1  &&
+     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
+
+     (this condition means that, in each iteration of the vectorized
+     loop, the memory accessed through store_ptr_0 cannot lie between
+     the memory accessed through load_ptr_0 and load_ptr_1.)
+
+     we can then use a single expression to finish the aliasing
+     checks between store_ptr_0 & load_ptr_0 and
+     store_ptr_0 & load_ptr_1:
+
+     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
+     || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
+
+     Note that we only consider the case in which load_ptr_0 and
+     load_ptr_1 have the same basic address.  */
+
+  std::vector<dr_addr_with_seg_len_pair_t> ddrs_with_seg_len;
+
+  /* First, we collect all data ref pairs for aliasing checks.  */
+
   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
     {
       struct data_reference *dr_a, *dr_b;
       gimple dr_group_first_a, dr_group_first_b;
-      tree addr_base_a, addr_base_b;
       tree segment_length_a, segment_length_b;
       gimple stmt_a, stmt_b;
-      tree seg_a_min, seg_a_max, seg_b_min, seg_b_max;

       dr_a = DDR_A (ddr);
       stmt_a = DR_STMT (DDR_A (ddr));
       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
       if (dr_group_first_a)
-        {
+ {
   stmt_a = dr_group_first_a;
   dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
  }
@@ -2314,20 +2417,11 @@  vect_create_cond_for_alias_checks (loop_
       stmt_b = DR_STMT (DDR_B (ddr));
       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
       if (dr_group_first_b)
-        {
+ {
   stmt_b = dr_group_first_b;
   dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
  }

-      addr_base_a
- = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a),
-   size_binop (PLUS_EXPR, DR_OFFSET (dr_a),
-       DR_INIT (dr_a)));
-      addr_base_b
- = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b),
-   size_binop (PLUS_EXPR, DR_OFFSET (dr_b),
-       DR_INIT (dr_b)));
-
       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
  length_factor = scalar_loop_iters;
       else
@@ -2335,24 +2429,149 @@  vect_create_cond_for_alias_checks (loop_
       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);

+      dr_addr_with_seg_len_pair_t dr_with_seg_len_pair
+  (dr_addr_with_seg_len
+       (dr_a, DR_BASE_ADDRESS (dr_a),
+ size_binop (PLUS_EXPR, DR_OFFSET (dr_a), DR_INIT (dr_a)),
+ segment_length_a),
+   dr_addr_with_seg_len
+       (dr_b, DR_BASE_ADDRESS (dr_b),
+ size_binop (PLUS_EXPR, DR_OFFSET (dr_b), DR_INIT (dr_b)),
+ segment_length_b));
+
+      if (dr_with_seg_len_pair.first.basic_addr >
+  dr_with_seg_len_pair.second.basic_addr)
+ std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
+
+      ddrs_with_seg_len.push_back (dr_with_seg_len_pair);
+    }
+
+  /* Second, we sort the collected data ref pairs so that we can scan
+     them once to combine all possible aliasing checks.  */
+
+  std::sort (ddrs_with_seg_len.begin(), ddrs_with_seg_len.end());
+
+  /* Remove duplicate data ref pairs.  */
+  ddrs_with_seg_len.erase (std::unique (ddrs_with_seg_len.begin(),
+ ddrs_with_seg_len.end()),
+   ddrs_with_seg_len.end());
+
+  /* We then scan the sorted dr pairs and check if we can combine
+     alias checks of two neighbouring dr pairs.  */
+
+  for (size_t i = 1; i < ddrs_with_seg_len.size (); ++i)
+    {
+      dr_addr_with_seg_len& dr_a1 = ddrs_with_seg_len[i-1].first;
+      dr_addr_with_seg_len& dr_b1 = ddrs_with_seg_len[i-1].second;
+      dr_addr_with_seg_len& dr_a2 = ddrs_with_seg_len[i].first;
+      dr_addr_with_seg_len& dr_b2 = ddrs_with_seg_len[i].second;
+
+      if (dr_a1 == dr_a2)
+ {
+  if (dr_b1.basic_addr != dr_b2.basic_addr
+      || TREE_CODE (dr_b1.offset) != INTEGER_CST
+      || TREE_CODE (dr_b2.offset) != INTEGER_CST)
+    continue;
+
+  int diff = int_cst_value (dr_b2.offset) -
+     int_cst_value (dr_b1.offset);
+
+  gcc_assert (diff > 0);
+
+  if (diff <= vect_factor
+      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
+  && diff - int_cst_value (dr_b1.seg_len) < vect_factor)
+      || (TREE_CODE (dr_b1.seg_len) == INTEGER_CST
+  && TREE_CODE (dr_a1.seg_len) == INTEGER_CST
+  && diff - int_cst_value (dr_b1.seg_len) <
+     int_cst_value (dr_a1.seg_len)))
+    {
+      if (dump_enabled_p ())
+ {
+  dump_printf_loc
+      (MSG_NOTE, vect_location,
+       "combining two runtime checks for data references ");
+  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1.dr));
+  dump_printf (MSG_NOTE, " and ");
+  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2.dr));
+  dump_printf (MSG_NOTE, "\n");
+ }
+
+      dr_b1.seg_len = size_binop (PLUS_EXPR,
+  dr_b2.seg_len, size_int (diff));
+      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
+      --i;
+    }
+ }
+      else if (dr_b1 == dr_b2)
+ {
+  if (dr_a1.basic_addr != dr_a2.basic_addr
+      || TREE_CODE (dr_a1.offset) != INTEGER_CST
+      || TREE_CODE (dr_a2.offset) != INTEGER_CST)
+    continue;
+
+  int diff = int_cst_value (dr_a2.offset) -
+     int_cst_value (dr_a1.offset);
+
+  gcc_assert (diff > 0);
+
+  if (diff <= vect_factor
+      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
+  && diff - int_cst_value (dr_a1.seg_len) < vect_factor)
+      || (TREE_CODE (dr_a1.seg_len) == INTEGER_CST
+  && TREE_CODE (dr_b1.seg_len) == INTEGER_CST
+  && diff - int_cst_value (dr_a1.seg_len) <
+     int_cst_value (dr_b1.seg_len)))
+    {
+      if (dump_enabled_p ())
+ {
+  dump_printf_loc
+      (MSG_NOTE, vect_location,
+       "combining two runtime checks for data references ");
+  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1.dr));
+  dump_printf (MSG_NOTE, " and ");
+  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2.dr));
+  dump_printf (MSG_NOTE, "\n");
+ }
+
+      dr_a1.seg_len = size_binop (PLUS_EXPR,
+  dr_a2.seg_len, size_int (diff));
+      ddrs_with_seg_len.erase (ddrs_with_seg_len.begin () + i);
+      --i;
+    }
+ }
+    }
+
+  for (size_t i = 0, s = ddrs_with_seg_len.size (); i < s; ++i)
+    {
+      const dr_addr_with_seg_len& dr_a = ddrs_with_seg_len[i].first;
+      const dr_addr_with_seg_len& dr_b = ddrs_with_seg_len[i].second;
+      tree segment_length_a = dr_a.seg_len;
+      tree segment_length_b = dr_b.seg_len;
+
+      tree addr_base_a
+ = fold_build_pointer_plus (dr_a.basic_addr, dr_a.offset);
+      tree addr_base_b
+ = fold_build_pointer_plus (dr_b.basic_addr, dr_b.offset);
+
       if (dump_enabled_p ())
  {
   dump_printf_loc (MSG_NOTE, vect_location,
-                           "create runtime check for data references ");
-  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
+   "create runtime check for data references ");
+  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
   dump_printf (MSG_NOTE, " and ");
-  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
-          dump_printf (MSG_NOTE, "\n");
+  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
+  dump_printf (MSG_NOTE, "\n");
  }

-      seg_a_min = addr_base_a;
-      seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
-      if (tree_int_cst_compare (DR_STEP (dr_a), size_zero_node) < 0)
+      tree seg_a_min = addr_base_a;
+      tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
+      if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
  seg_a_min = seg_a_max, seg_a_max = addr_base_a;

-      seg_b_min = addr_base_b;
-      seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
-      if (tree_int_cst_compare (DR_STEP (dr_b), size_zero_node) < 0)
+      tree seg_b_min = addr_base_b;
+      tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
+      if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
  seg_b_min = seg_b_max, seg_b_max = addr_base_b;

       part_cond_expr =
@@ -2477,6 +2696,81 @@  vect_loop_versioning (loop_vec_info loop
       adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
     }

+  /* Extract load and store statements on pointers with zero-stride
+     accesses.  */
+  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
+    {
+
+      /* In the loop body, we iterate over each statement to check whether
+ it is a load or a store.  Then we check the DR_STEP of its data
+ reference.  If DR_STEP is zero, we hoist a load statement to the
+ loop preheader and move a store statement to the loop exit.  */
+
+      for (gimple_stmt_iterator si = gsi_start_bb (loop->header);
+   !gsi_end_p (si); )
+ {
+  gimple stmt = gsi_stmt (si);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+
+
+  if (dr && integer_zerop (DR_STEP (dr)))
+    {
+      if (DR_IS_READ (dr))
+ {
+  if (dump_file)
+    {
+      fprintf (dump_file,
+       "Hoist the load to outside of the loop:\n");
+      print_gimple_stmt (dump_file, stmt, 0,
+ TDF_VOPS|TDF_MEMSYMS);
+    }
+
+  basic_block preheader = loop_preheader_edge (loop)->src;
+  gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
+  gsi_move_after (&si, &si_dst);
+ }
+      else
+ {
+  gimple_stmt_iterator si_dst =
+      gsi_last_bb (single_exit (loop)->dest);
+  gsi_move_after (&si, &si_dst);
+ }
+              continue;
+    }
+  else if (!dr)
+          {
+            bool hoist = true;
+            for (size_t i = 0; i < gimple_num_ops (stmt); i++)
+            {
+              tree op = gimple_op (stmt, i);
+              if (TREE_CODE (op) == INTEGER_CST
+                  || TREE_CODE (op) == REAL_CST)
+                continue;
+              if (TREE_CODE (op) == SSA_NAME)
+              {
+                gimple def = SSA_NAME_DEF_STMT (op);
+                if (def == stmt
+                    || gimple_nop_p (def)
+                    || !flow_bb_inside_loop_p (loop, gimple_bb (def)))
+                  continue;
+              }
+              hoist = false;
+              break;
+            }
+
+            if (hoist)
+            {
+              basic_block preheader = loop_preheader_edge (loop)->src;
+              gimple_stmt_iterator si_dst = gsi_last_bb (preheader);
+              gsi_move_after (&si, &si_dst);
+              continue;
+            }
+          }
+          gsi_next (&si);
+ }
+    }
+
   /* End loop-exit-fixes after versioning.  */

   if (cond_expr_stmt_list)
Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog (revision 202663)
+++ gcc/ChangeLog (working copy)
@@ -1,3 +1,8 @@ 
+2013-10-01  Cong Hou  <congh@google.com>
+
+ * tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Combine
+ alias checks if it is possible to amortize the runtime overhead.
+
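
To follow the merging step in the patch more easily, here is a minimal
standalone C++ sketch of the sort-then-merge idea.  All types and names are
made up, none of GCC's data-reference or tree machinery is used, and only the
case in which two neighbouring pairs share their first reference is shown
(the patch also handles the symmetric case).

// Standalone sketch of sorting check pairs and merging neighbours whose
// second references are close enough together; not GCC code.
#include <algorithm>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

struct ref_with_seg_len          // stand-in for dr_addr_with_seg_len
{
  std::string base;              // symbolic base address
  long offset;                   // constant offset from the base
  long seg_len;                  // bytes accessed by the vectorized loop
};

typedef std::pair<ref_with_seg_len, ref_with_seg_len> check_pair;

static bool
pair_less (const check_pair &p1, const check_pair &p2)
{
  if (p1.first.base != p2.first.base)   return p1.first.base < p2.first.base;
  if (p1.second.base != p2.second.base) return p1.second.base < p2.second.base;
  if (p1.first.offset != p2.first.offset)
    return p1.first.offset < p2.first.offset;
  return p1.second.offset < p2.second.offset;
}

// Merge neighbouring pairs that share their first reference: two checks
// collapse into one when the first reference cannot fit into the gap
// between the two accesses of the second reference.
static void
merge_checks (std::vector<check_pair> &checks)
{
  std::sort (checks.begin (), checks.end (), pair_less);
  for (size_t i = 1; i < checks.size (); ++i)
    {
      ref_with_seg_len &a1 = checks[i - 1].first, &b1 = checks[i - 1].second;
      ref_with_seg_len &a2 = checks[i].first,     &b2 = checks[i].second;
      if (a1.base != a2.base || a1.offset != a2.offset || b1.base != b2.base)
        continue;
      long diff = b2.offset - b1.offset;    /* >= 0 because of the sort */
      if (diff - b1.seg_len < a1.seg_len)   /* combining condition */
        {
          b1.seg_len = b2.seg_len + diff;   /* widen b1's segment over b2 */
          checks.erase (checks.begin () + i);
          --i;
        }
    }
}

int
main ()
{
  /* a[0:400] vs b[0] and a[0:400] vs b[400], in bytes (4-byte elements).  */
  ref_with_seg_len a = { "a", 0, 400 }, b0 = { "b", 0, 4 }, b1 = { "b", 400, 4 };
  std::vector<check_pair> checks = { { a, b0 }, { a, b1 } };
  merge_checks (checks);
  std::printf ("checks left: %zu, merged b segment: [%ld, %ld)\n",
               checks.size (), checks[0].second.offset,
               checks[0].second.offset + checks[0].second.seg_len);
  return 0;
}

Running this leaves a single pair with a widened second segment of [0, 404)
bytes, i.e. one check of the whole 101-element range of b against a instead
of two separate checks.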