diff mbox

[1/3] Fix PR46029: reimplement if-convert stores.

Message ID AANLkTi=nbd_Doyw9+gr3_ed5cZMEMqf+j4XxMUG=qP-i@mail.gmail.com
State New
Headers show

Commit Message

Sebastian Pop Nov. 15, 2010, 10:27 p.m. UTC
Hi Richi,

fixes to your review are posted separately, see below for the details.
See 0001-Fix-PR46029-reimplement-if-convert-stores.patch for the
combined patch.

On Fri, Nov 5, 2010 at 07:06, Richard Guenther <rguenther@suse.de> wrote:
> On Wed, 3 Nov 2010, Sebastian Pop wrote:
>
>> 2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>
>>
>>       PR tree-optimization/46029
>>       * doc/invoke.texi (-ftree-loop-if-convert-stores): Update description.
>>       * tree-if-conv.c (has_unaligned_memory_refs): New.
>>       (if_convertible_gimple_assign_stmt_p): Call has_unaligned_memory_refs.
>>       (create_scratchpad): New.
>>       (create_indirect_cond_expr): New.
>>       (predicate_mem_writes): Call create_indirect_cond_expr.  Take an extra
>>       parameter for scratch_pad.
>>       (combine_blocks): Same.
>>       (tree_if_conversion): Same.
>>       (main_tree_if_conversion): Pass to tree_if_conversion a pointer to
>>       scratch_pad.
>>       (struct ifc_dr): Removed.
>>       (IFC_DR): Removed.
>>       (DR_WRITTEN_AT_LEAST_ONCE): Removed.
>>       (DR_RW_UNCONDITIONALLY): Removed.
>>       (memrefs_read_or_written_unconditionally): Removed.
>>       (write_memrefs_written_at_least_once): Removed.
>>       (ifcvt_memrefs_wont_trap): Removed.
>>       (ifcvt_could_trap_p): Does not take refs parameter anymore.
>>       (if_convertible_gimple_assign_stmt_p): Same.
>>       (if_convertible_stmt_p): Same.
>>       (if_convertible_loop_p_1): Remove initialization of dr->aux,
>>       DR_WRITTEN_AT_LEAST_ONCE, and DR_RW_UNCONDITIONALLY.
>>       (if_convertible_loop_p): Remove deallocation of the same.
>
> Comments in-line
>
>> testsuite/
>>       * g++.dg/tree-ssa/ifc-pr46029.C: New.
>>       * gcc.dg/tree-ssa/ifc-8.c: New.
>>       * gcc.dg/tree-ssa/ifc-5.c: Make it exactly like the FFmpeg kernel.
>> ---
>>  gcc/ChangeLog                               |   28 ++
>>  gcc/doc/invoke.texi                         |   18 +-
>>  gcc/testsuite/ChangeLog                     |    7 +
>>  gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C |   76 ++++++
>>  gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c       |   17 +-
>>  gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c       |   29 ++
>>  gcc/tree-if-conv.c                          |  379 ++++++++++++---------------
>>  7 files changed, 336 insertions(+), 218 deletions(-)
>>  create mode 100644 gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c
>>
>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> index beed454..0f58882 100644
>> --- a/gcc/ChangeLog
>> +++ b/gcc/ChangeLog
>> @@ -1,3 +1,31 @@
>> +2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>
>> +
>> +     PR tree-optimization/46029
>> +     * doc/invoke.texi (-ftree-loop-if-convert-stores): Update description.
>> +     * tree-if-conv.c (has_unaligned_memory_refs): New.
>> +     (if_convertible_gimple_assign_stmt_p): Call has_unaligned_memory_refs.
>> +     (create_scratchpad): New.
>> +     (create_indirect_cond_expr): New.
>> +     (predicate_mem_writes): Call create_indirect_cond_expr.  Take an extra
>> +     parameter for scratch_pad.
>> +     (combine_blocks): Same.
>> +     (tree_if_conversion): Same.
>> +     (main_tree_if_conversion): Pass to tree_if_conversion a pointer to
>> +     scratch_pad.
>> +     (struct ifc_dr): Removed.
>> +     (IFC_DR): Removed.
>> +     (DR_WRITTEN_AT_LEAST_ONCE): Removed.
>> +     (DR_RW_UNCONDITIONALLY): Removed.
>> +     (memrefs_read_or_written_unconditionally): Removed.
>> +     (write_memrefs_written_at_least_once): Removed.
>> +     (ifcvt_memrefs_wont_trap): Removed.
>> +     (ifcvt_could_trap_p): Does not take refs parameter anymore.
>> +     (if_convertible_gimple_assign_stmt_p): Same.
>> +     (if_convertible_stmt_p): Same.
>> +     (if_convertible_loop_p_1): Remove initialization of dr->aux,
>> +     DR_WRITTEN_AT_LEAST_ONCE, and DR_RW_UNCONDITIONALLY.
>> +     (if_convertible_loop_p): Remove deallocation of the same.
>> +
>>  2010-10-20  Nathan Froyd  <froydnj@codesourcery.com>
>>
>>       * ifcvt.c (noce_emit_cmove): If both of the values are SUBREGs, try
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index ee68454..28b0cbb 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -6935,20 +6935,26 @@ if vectorization is enabled.
>>
>>  @item -ftree-loop-if-convert-stores
>>  Attempt to also if-convert conditional jumps containing memory writes.
>> -This transformation can be unsafe for multi-threaded programs as it
>> -transforms conditional memory writes into unconditional memory writes.
>>  For example,
>>  @smallexample
>>  for (i = 0; i < N; i++)
>>    if (cond)
>> -    A[i] = expr;
>> +    A[i] = B[i] + 2;
>>  @end smallexample
>>  would be transformed to
>>  @smallexample
>> -for (i = 0; i < N; i++)
>> -  A[i] = cond ? expr : A[i];
>> +void *scratchpad = alloca (64);
>> +for (i = 0; i < N; i++) @{
>> +  a = cond ? &A[i] : scratchpad;
>> +  b = cond ? &B[i] : scratchpad;
>> +  *a = *b + 2;
>> +@}
>>  @end smallexample
>> -potentially producing data races.
>> +The compiler allocates a scratchpad memory on the stack for each
>> +function in which the if-conversion of memory stores or reads
>> +happened.  This scratchpad memory is used during the part of the
>> +computation that is discarded, i.e., when the condition is evaluated
>> +to false.
>>
>>  @item -ftree-loop-distribution
>>  Perform loop distribution.  This flag can improve cache performance on
>> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
>> index 9d9c543..4233f86 100644
>> --- a/gcc/testsuite/ChangeLog
>> +++ b/gcc/testsuite/ChangeLog
>> @@ -1,3 +1,10 @@
>> +2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>
>> +
>> +     PR tree-optimization/46029
>> +     * g++.dg/tree-ssa/ifc-pr46029.C: New.
>> +     * gcc.dg/tree-ssa/ifc-8.c: New.
>> +     * gcc.dg/tree-ssa/ifc-5.c: Make it exactly like the FFmpeg kernel.
>> +
>>  2010-10-20  Rainer Orth  <ro@CeBiTec.Uni-Bielefeld.DE>
>>
>>       PR c++/46024
>> diff --git a/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C b/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C
>> new file mode 100644
>> index 0000000..2a54bdb
>> --- /dev/null
>> +++ b/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C
>> @@ -0,0 +1,76 @@
>> +// { dg-do run }
>> +/* { dg-options "-O -ftree-loop-if-convert-stores" } */
>> +
>> +namespace
>> +{
>> +  struct rb_tree_node_
>> +  {
>> +    rb_tree_node_ ():m_p_left (0), m_p_parent (0), m_metadata (0)
>> +    {
>> +    }
>> +    unsigned &get_metadata ()
>> +    {
>> +      return m_metadata;
>> +    }
>> +    rb_tree_node_ *m_p_left;
>> +    rb_tree_node_ *m_p_parent;
>> +    unsigned m_metadata;
>> +  };
>> +
>> +  struct bin_search_tree_const_node_it_
>> +  {
>> +    bin_search_tree_const_node_it_ (rb_tree_node_ * p_nd):m_p_nd (p_nd)
>> +    {
>> +    }
>> +    unsigned &get_metadata ()
>> +    {
>> +      return m_p_nd->get_metadata ();
>> +    }
>> +    bin_search_tree_const_node_it_ get_l_child ()
>> +    {
>> +      return bin_search_tree_const_node_it_ (m_p_nd->m_p_left);
>> +    }
>> +
>> +    rb_tree_node_ *m_p_nd;
>> +  };
>> +
>> +  struct bin_search_tree_no_data_
>> +  {
>> +    typedef rb_tree_node_ *node_pointer;
>> +      bin_search_tree_no_data_ ():m_p_head (new rb_tree_node_ ())
>> +    {
>> +    }
>> +    void insert_imp_empty (int r_value)
>> +    {
>> +      rb_tree_node_ *p_new_node = new rb_tree_node_ ();
>> +      m_p_head->m_p_parent = p_new_node;
>> +      p_new_node->m_p_parent = m_p_head;
>> +      update_to_top (m_p_head->m_p_parent);
>> +    }
>> +    void apply_update (bin_search_tree_const_node_it_ nd_it)
>> +    {
>> +      unsigned
>> +     l_max_endpoint
>> +     =
>> +     (nd_it.get_l_child ().m_p_nd ==
>> +      0) ? 0 : nd_it.get_l_child ().get_metadata ();
>> +      nd_it.get_metadata () = l_max_endpoint;
>> +    }
>> +    void update_to_top (node_pointer p_nd)
>> +    {
>> +      while (p_nd != m_p_head)
>> +     {
>> +       apply_update (p_nd);
>> +       p_nd = p_nd->m_p_parent;
>> +     }
>> +    }
>> +
>> +    rb_tree_node_ * m_p_head;
>> +  };
>> +}
>> +
>> +int main ()
>> +{
>> +  bin_search_tree_no_data_ ().insert_imp_empty (0);
>> +  return 0;
>> +}
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c
>> index a9cc816..d88c4a2 100644
>> --- a/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c
>> @@ -12,11 +12,18 @@ dct_unquantize_h263_inter_c (short *block, int n, int qscale, int nCoeffs)
>>    for (i = 0; i <= nCoeffs; i++)
>>      {
>>        level = block[i];
>> -      if (level < 0)
>> -     level = level * qmul - qadd;
>> -      else
>> -     level = level * qmul + qadd;
>> -      block[i] = level;
>> +      if (level)
>> +        {
>> +          if (level < 0)
>> +            {
>> +              level = level * qmul - qadd;
>> +            }
>> +          else
>> +            {
>> +              level = level * qmul + qadd;
>> +            }
>> +          block[i] = level;
>> +        }
>>      }
>>  }
>>
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c
>> new file mode 100644
>> index 0000000..d7cf279
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c
>> @@ -0,0 +1,29 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-c -O2 -ftree-vectorize" { target *-*-* } } */
>> +
>> +typedef union tree_node *tree;
>> +struct tree_common
>> +{
>> +  unsigned volatile_flag : 1;
>> +  unsigned unsigned_flag : 1;
>> +};
>> +struct tree_type
>> +{
>> +  tree next_variant;
>> +  tree main_variant;
>> +};
>> +union tree_node
>> +{
>> +  struct tree_common common;
>> +  struct tree_type type;
>> +};
>> +void finish_enum (tree enumtype)
>> +{
>> +  tree tem;
>> +  for (tem = ((enumtype)->type.main_variant); tem; tem = ((tem)->type.next_variant))
>> +    {
>> +      if (tem == enumtype)
>> +     continue;
>> +      ((tem)->common.unsigned_flag) = ((enumtype)->common.unsigned_flag);
>> +    }
>> +}
>> diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
>> index 642dbda..9fc6190 100644
>> --- a/gcc/tree-if-conv.c
>> +++ b/gcc/tree-if-conv.c
>> @@ -446,171 +446,47 @@ if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
>>    return true;
>>  }
>>
>> -/* Records the status of a data reference.  This struct is attached to
>> -   each DR->aux field.  */
>> -
>> -struct ifc_dr {
>> -  /* -1 when not initialized, 0 when false, 1 when true.  */
>> -  int written_at_least_once;
>> -
>> -  /* -1 when not initialized, 0 when false, 1 when true.  */
>> -  int rw_unconditionally;
>> -};
>> -
>> -#define IFC_DR(DR) ((struct ifc_dr *) (DR)->aux)
>> -#define DR_WRITTEN_AT_LEAST_ONCE(DR) (IFC_DR (DR)->written_at_least_once)
>> -#define DR_RW_UNCONDITIONALLY(DR) (IFC_DR (DR)->rw_unconditionally)
>> -
>> -/* Returns true when the memory references of STMT are read or written
>> -   unconditionally.  In other words, this function returns true when
>> -   for every data reference A in STMT there exist other accesses to
>> -   the same data reference with predicates that add up (OR-up) to the
>> -   true predicate: this ensures that the data reference A is touched
>> -   (read or written) on every iteration of the if-converted loop.  */
>> -
>> -static bool
>> -memrefs_read_or_written_unconditionally (gimple stmt,
>> -                                      VEC (data_reference_p, heap) *drs)
>> -{
>> -  int i, j;
>> -  data_reference_p a, b;
>> -  tree ca = bb_predicate (gimple_bb (stmt));
>> -
>> -  for (i = 0; VEC_iterate (data_reference_p, drs, i, a); i++)
>> -    if (DR_STMT (a) == stmt)
>> -      {
>> -     bool found = false;
>> -     int x = DR_RW_UNCONDITIONALLY (a);
>> -
>> -     if (x == 0)
>> -       return false;
>> -
>> -     if (x == 1)
>> -       continue;
>> -
>> -     for (j = 0; VEC_iterate (data_reference_p, drs, j, b); j++)
>> -       if (DR_STMT (b) != stmt
>> -           && same_data_refs (a, b))
>> -         {
>> -           tree cb = bb_predicate (gimple_bb (DR_STMT (b)));
>> -
>> -           if (DR_RW_UNCONDITIONALLY (b) == 1
>> -               || is_true_predicate (cb)
>> -               || is_true_predicate (ca = fold_or_predicates (EXPR_LOCATION (cb),
>> -                                                              ca, cb)))
>> -             {
>> -               DR_RW_UNCONDITIONALLY (a) = 1;
>> -               DR_RW_UNCONDITIONALLY (b) = 1;
>> -               found = true;
>> -               break;
>> -             }
>> -         }
>> -
>> -     if (!found)
>> -       {
>> -         DR_RW_UNCONDITIONALLY (a) = 0;
>> -         return false;
>> -       }
>> -      }
>> -
>> -  return true;
>> -}
>> -
>> -/* Returns true when the memory references of STMT are unconditionally
>> -   written.  In other words, this function returns true when for every
>> -   data reference A written in STMT, there exist other writes to the
>> -   same data reference with predicates that add up (OR-up) to the true
>> -   predicate: this ensures that the data reference A is written on
>> -   every iteration of the if-converted loop.  */
>> +/* Wrapper around gimple_could_trap_p refined for the needs of the
>> +   if-conversion.  */
>>
>>  static bool
>> -write_memrefs_written_at_least_once (gimple stmt,
>> -                                  VEC (data_reference_p, heap) *drs)
>> +ifcvt_could_trap_p (gimple stmt)
>>  {
>> -  int i, j;
>> -  data_reference_p a, b;
>> -  tree ca = bb_predicate (gimple_bb (stmt));
>> -
>> -  for (i = 0; VEC_iterate (data_reference_p, drs, i, a); i++)
>> -    if (DR_STMT (a) == stmt
>> -     && DR_IS_WRITE (a))
>> -      {
>> -     bool found = false;
>> -     int x = DR_WRITTEN_AT_LEAST_ONCE (a);
>> -
>> -     if (x == 0)
>> -       return false;
>> -
>> -     if (x == 1)
>> -       continue;
>> -
>> -     for (j = 0; VEC_iterate (data_reference_p, drs, j, b); j++)
>> -       if (DR_STMT (b) != stmt
>> -           && DR_IS_WRITE (b)
>> -           && same_data_refs_base_objects (a, b))
>> -         {
>> -           tree cb = bb_predicate (gimple_bb (DR_STMT (b)));
>> -
>> -           if (DR_WRITTEN_AT_LEAST_ONCE (b) == 1
>> -               || is_true_predicate (cb)
>> -               || is_true_predicate (ca = fold_or_predicates (EXPR_LOCATION (cb),
>> -                                                              ca, cb)))
>> -             {
>> -               DR_WRITTEN_AT_LEAST_ONCE (a) = 1;
>> -               DR_WRITTEN_AT_LEAST_ONCE (b) = 1;
>> -               found = true;
>> -               break;
>> -             }
>> -         }
>> -
>> -     if (!found)
>> -       {
>> -         DR_WRITTEN_AT_LEAST_ONCE (a) = 0;
>> -         return false;
>> -       }
>> -      }
>> +  if (gimple_vuse (stmt)
>> +      && !gimple_could_trap_p_1 (stmt, false, false))
>> +    return false;
>>
>> -  return true;
>> +  return gimple_could_trap_p (stmt);
>>  }
>>
>> -/* Return true when the memory references of STMT won't trap in the
>> -   if-converted code.  There are two things that we have to check for:
>> -
>> -   - writes to memory occur to writable memory: if-conversion of
>> -   memory writes transforms the conditional memory writes into
>> -   unconditional writes, i.e. "if (cond) A[i] = foo" is transformed
>> -   into "A[i] = cond ? foo : A[i]", and as the write to memory may not
>> -   be executed at all in the original code, it may be a readonly
>> -   memory.  To check that A is not const-qualified, we check that
>> -   there exists at least an unconditional write to A in the current
>> -   function.
>> -
>> -   - reads or writes to memory are valid memory accesses for every
>> -   iteration.  To check that the memory accesses are correctly formed
>> -   and that we are allowed to read and write in these locations, we
>> -   check that the memory accesses to be if-converted occur at every
>> -   iteration unconditionally.  */
>> +/* Returns true when stmt contains a data reference.  */
>>
>>  static bool
>> -ifcvt_memrefs_wont_trap (gimple stmt, VEC (data_reference_p, heap) *refs)
>> +has_unaligned_memory_refs (gimple stmt)
>
> Ick - unified diffs are hard to read (sometimes).  The comment
> doesn't match the function name -- unaligned data reference or not?
>
>>  {
>> -  return write_memrefs_written_at_least_once (stmt, refs)
>> -    && memrefs_read_or_written_unconditionally (stmt, refs);
>> -}
>> -
>> -/* Wrapper around gimple_could_trap_p refined for the needs of the
>> -   if-conversion.  Try to prove that the memory accesses of STMT could
>> -   not trap in the innermost loop containing STMT.  */
>> +  int unsignedp, volatilep;
>> +  HOST_WIDE_INT bitsize, bitpos;
>> +  tree toffset;
>> +  enum machine_mode mode;
>> +  VEC (data_ref_loc, heap) *refs = VEC_alloc (data_ref_loc, heap, 3);
>> +  bool res = get_references_in_stmt (stmt, &refs);
>> +  unsigned i;
>> +  data_ref_loc *ref;
>> +
>> +  FOR_EACH_VEC_ELT (data_ref_loc, refs, i, ref)
>> +    {
>> +      get_inner_reference (*ref->pos, &bitsize, &bitpos, &toffset,
>> +                        &mode, &unsignedp, &volatilep, true);
>>
>> -static bool
>> -ifcvt_could_trap_p (gimple stmt, VEC (data_reference_p, heap) *refs)
>> -{
>> -  if (gimple_vuse (stmt)
>> -      && !gimple_could_trap_p_1 (stmt, false, false)
>> -      && ifcvt_memrefs_wont_trap (stmt, refs))
>> -    return false;
>> +      if ((bitpos % BITS_PER_UNIT) != 0)
>
> Hmm, that's not actually unaligned but not addressable, right?
> I guess you want to re-use ivopts may_be_nonaddressable_p instead.
>
>> +     {
>> +       res = true;
>> +       break;
>> +     }
>> +    }
>>
>> -  return gimple_could_trap_p (stmt);
>> +  VEC_free (data_ref_loc, heap, refs);
>> +  return res;
>>  }
>>
>>  /* Return true when STMT is if-convertible.
>> @@ -621,8 +497,7 @@ ifcvt_could_trap_p (gimple stmt, VEC (data_reference_p, heap) *refs)
>>     - LHS is not var decl.  */
>>
>>  static bool
>> -if_convertible_gimple_assign_stmt_p (gimple stmt,
>> -                                  VEC (data_reference_p, heap) *refs)
>> +if_convertible_gimple_assign_stmt_p (gimple stmt)
>>  {
>>    tree lhs = gimple_assign_lhs (stmt);
>>    basic_block bb;
>> @@ -650,12 +525,20 @@ if_convertible_gimple_assign_stmt_p (gimple stmt,
>>
>>    if (flag_tree_loop_if_convert_stores)
>>      {
>> -      if (ifcvt_could_trap_p (stmt, refs))
>> +      if (ifcvt_could_trap_p (stmt))
>>       {
>>         if (dump_file && (dump_flags & TDF_DETAILS))
>>           fprintf (dump_file, "tree could trap...\n");
>>         return false;
>>       }
>> +
>> +      if (has_unaligned_memory_refs (stmt))
>> +     {
>> +       if (dump_file && (dump_flags & TDF_DETAILS))
>> +         fprintf (dump_file, "uses misaligned memory...\n");
>
> But here it suggests misaligned again (why'd we care for misalignment?)
>

All the above issues are fixed by
0002-Reuse-ivopts-may_be_nonaddressable_p.patch.

>> +       return false;
>> +     }
>> +
>>        return true;
>>      }
>>
>> @@ -690,7 +573,7 @@ if_convertible_gimple_assign_stmt_p (gimple stmt,
>>     - it is a GIMPLE_LABEL or a GIMPLE_COND.  */
>>
>>  static bool
>> -if_convertible_stmt_p (gimple stmt, VEC (data_reference_p, heap) *refs)
>> +if_convertible_stmt_p (gimple stmt)
>>  {
>>    switch (gimple_code (stmt))
>>      {
>> @@ -700,7 +583,7 @@ if_convertible_stmt_p (gimple stmt, VEC (data_reference_p, heap) *refs)
>>        return true;
>>
>>      case GIMPLE_ASSIGN:
>> -      return if_convertible_gimple_assign_stmt_p (stmt, refs);
>> +      return if_convertible_gimple_assign_stmt_p (stmt);
>>
>>      default:
>>        /* Don't know what to do with 'em so don't do anything.  */
>> @@ -1016,18 +899,6 @@ if_convertible_loop_p_1 (struct loop *loop,
>>    if (!res)
>>      return false;
>>
>> -  if (flag_tree_loop_if_convert_stores)
>> -    {
>> -      data_reference_p dr;
>> -
>> -      for (i = 0; VEC_iterate (data_reference_p, *refs, i, dr); i++)
>> -     {
>> -       dr->aux = XNEW (struct ifc_dr);
>> -       DR_WRITTEN_AT_LEAST_ONCE (dr) = -1;
>> -       DR_RW_UNCONDITIONALLY (dr) = -1;
>> -     }
>> -    }
>> -
>>    for (i = 0; i < loop->num_nodes; i++)
>>      {
>>        basic_block bb = ifc_bbs[i];
>> @@ -1040,7 +911,7 @@ if_convertible_loop_p_1 (struct loop *loop,
>>        /* Check the if-convertibility of statements in predicated BBs.  */
>>        if (is_predicated (bb))
>>       for (itr = gsi_start_bb (bb); !gsi_end_p (itr); gsi_next (&itr))
>> -       if (!if_convertible_stmt_p (gsi_stmt (itr), *refs))
>> +       if (!if_convertible_stmt_p (gsi_stmt (itr)))
>>           return false;
>>      }
>>
>> @@ -1101,15 +972,6 @@ if_convertible_loop_p (struct loop *loop)
>>    ddrs = VEC_alloc (ddr_p, heap, 25);
>>    res = if_convertible_loop_p_1 (loop, &refs, &ddrs);
>>
>> -  if (flag_tree_loop_if_convert_stores)
>> -    {
>> -      data_reference_p dr;
>> -      unsigned int i;
>> -
>> -      for (i = 0; VEC_iterate (data_reference_p, refs, i, dr); i++)
>> -     free (dr->aux);
>> -    }
>> -
>>    free_data_refs (refs);
>>    free_dependence_relations (ddrs);
>>    return res;
>> @@ -1366,6 +1228,78 @@ insert_gimplified_predicates (loop_p loop)
>>      }
>>  }
>>
>> +/* Insert at the beginning of the first basic block of the current
>> +   function the allocation on the stack of N bytes of memory and
>> +   return a pointer to this scratchpad memory.  */
>> +
>> +static tree
>> +create_scratchpad (void)
>> +{
>> +  basic_block bb = single_succ (ENTRY_BLOCK_PTR);
>> +  gimple_stmt_iterator gsi = gsi_after_labels (bb);
>> +
>> +  /* void *tmp = __builtin_alloca */
>> +  const char *name = "scratch_pad";
>> +  tree x = build_int_cst (integer_type_node, 64);
>> +  gimple stmt = gimple_build_call (built_in_decls[BUILT_IN_ALLOCA], 1, x);
>> +  tree var = create_tmp_var (ptr_type_node, name);
>> +  tree tmp = make_ssa_name (var, stmt);
>
> It would be better to use an automatic variable than using alloca
> which is expensive.  Why was your choice that way?  (Are we ever
> if-converting aggregate stores?  I hope not.)
>
> Also you are unconditionally allocating 64 bytes instead of N.
>
> Note that if you want to make vectorization happy you would need
> to ensure that for
>
>  if (x)
>    a[i] = ...;
>
> the scratchpad you'll end up using will have the _same_ alignment
> as a[i] (same or larger for all offsets).  Using a local array
> of chars should make it possible for the vectorizer to adjust
> its alignment if needed.
>

This is addressed in 0003-Don-t-use-alloca.patch.

>> +  add_referenced_var (var);
>> +  gimple_call_set_lhs (stmt, tmp);
>> +  SSA_NAME_DEF_STMT (tmp) = stmt;
>> +  update_stmt (stmt);
>> +
>> +  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
>> +  return tmp;
>> +}
>> +
>> +/* Returns a memory reference to the pointer defined by the
>> +   conditional expression: pointer = cond ? &A[i] : scratch_pad; and
>> +   inserts this code at GSI.  */
>> +
>> +static tree
>> +create_indirect_cond_expr (tree ai, tree cond, tree *scratch_pad,
>> +                        gimple_stmt_iterator *gsi)
>> +{
>> +  tree type = TREE_TYPE (ai);
>> +
>> +  tree pointer_to_type, address_of_ai, addr_expr, cond_expr;
>> +  tree pointer, star_pointer;
>> +  gimple addr_stmt, pointer_stmt;
>> +
>> +  /* address_of_ai = &A[i];  */
>> +  pointer_to_type = build_pointer_type (type);
>> +  address_of_ai = create_tmp_var (pointer_to_type, "_ifc_");
>
> Use create_tmp_reg (everywhere)

This is in 0004-Use-create_tmp_reg-instead-of-create_tmp_var.patch.

>
>> +  add_referenced_var (address_of_ai);
>> +  addr_expr = build_fold_addr_expr (ai);
>
> If you build that before create_tmp_reg you can use TREE_TYPE of
> it and avoid creating pointer_to_type.
>

Fixed in 0005-Avoid-call-to-build_pointer_type.patch.

>> +  addr_stmt = gimple_build_assign (address_of_ai, addr_expr);
>> +  address_of_ai = make_ssa_name (address_of_ai, addr_stmt);
>> +  gimple_assign_set_lhs (addr_stmt, address_of_ai);
>> +  SSA_NAME_DEF_STMT (address_of_ai) = addr_stmt;
>> +  update_stmt (addr_stmt);
>> +  gsi_insert_before (gsi, addr_stmt, GSI_SAME_STMT);
>> +
>> +  /* Allocate the scratch pad only once per function.  */
>> +  if (!*scratch_pad)
>> +    *scratch_pad = create_scratchpad ();
>> +
>> +  /* pointer = cond ? address_of_ai : scratch_pad;  */
>> +  pointer = create_tmp_var (pointer_to_type, "_ifc_");
>> +  add_referenced_var (pointer);
>> +  cond_expr = build3 (COND_EXPR, pointer_to_type, unshare_expr (cond),
>> +                   address_of_ai, *scratch_pad);
>> +  pointer_stmt = gimple_build_assign (pointer, cond_expr);
>> +  pointer = make_ssa_name (pointer, pointer_stmt);
>> +  gimple_assign_set_lhs (pointer_stmt, pointer);
>> +  SSA_NAME_DEF_STMT (pointer) = pointer_stmt;
>> +  update_stmt (pointer_stmt);
>> +  gsi_insert_before (gsi, pointer_stmt, GSI_SAME_STMT);
>> +
>> +  star_pointer = build_simple_mem_ref (pointer);
>
> build2 (MEM_REF, TREE_TYPE (ai), pointer,
>        build_int_cst (reference_alias_ptr_type (ai), 0));
>
> as you need to preserve TBAA info.
>

Fixed in 0006-Preserve-TBAA.patch.

>> +  return star_pointer;
>> +}
>> +
>>  /* Predicate each write to memory in LOOP.
>>
>>     This function transforms control flow constructs containing memory
>> @@ -1377,10 +1311,19 @@ insert_gimplified_predicates (loop_p loop)
>>
>>     into the following form that does not contain control flow:
>>
>> -   | for (i = 0; i < N; i++)
>> -   |   A[i] = cond ? expr : A[i];
>> +   | void *scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES);
>> +   |
>> +   | for (i = 0; i < N; i++) {
>> +   |   p = cond ? &A[i] : scratch_pad;
>> +   |   *p = expr;
>> +   | }
>> +
>> +   SCRATCH_PAD is allocated on the stack for each function once and it is
>> +   large enough to contain any kind of scalar assignment or read.  All
>> +   values read or written to SCRATCH_PAD are not used in the computation.
>>
>> -   The original CFG looks like this:
>> +   In a more detailed way, the if-conversion of memory writes works
>> +   like this, supposing that the original CFG looks like this:
>>
>>     | bb_0
>>     |   i = 0
>> @@ -1430,10 +1373,12 @@ insert_gimplified_predicates (loop_p loop)
>>     |   goto bb_1
>>     | end_bb_4
>>
>> -   predicate_mem_writes is then predicating the memory write as follows:
>> +   predicate_mem_writes is then allocating SCRATCH_PAD in the basic block
>> +   preceding the loop header, and is predicating the memory write:
>>
>>     | bb_0
>>     |   i = 0
>> +   |   scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES);
>>     | end_bb_0
>>     |
>>     | bb_1
>> @@ -1441,12 +1386,14 @@ insert_gimplified_predicates (loop_p loop)
>>     | end_bb_1
>>     |
>>     | bb_2
>> +   |   cond = some_computation;
>>     |   if (cond) goto bb_3 else goto bb_4
>>     | end_bb_2
>>     |
>>     | bb_3
>>     |   cond = some_computation;
>> -   |   A[i] = cond ? expr : A[i];
>> +   |   p = cond ? &A[i] : scratch_pad;
>> +   |   *p = expr;
>>     |   goto bb_4
>>     | end_bb_3
>>     |
>> @@ -1459,12 +1406,14 @@ insert_gimplified_predicates (loop_p loop)
>>
>>     | bb_0
>>     |   i = 0
>> +   |   scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES);
>>     |   if (i < N) goto bb_5 else goto bb_1
>>     | end_bb_0
>>     |
>>     | bb_1
>>     |   cond = some_computation;
>> -   |   A[i] = cond ? expr : A[i];
>> +   |   p = cond ? &A[i] : scratch_pad;
>> +   |   *p = expr;
>>     |   if (i < N) goto bb_5 else goto bb_4
>>     | end_bb_1
>>     |
>> @@ -1474,7 +1423,7 @@ insert_gimplified_predicates (loop_p loop)
>>  */
>>
>>  static void
>> -predicate_mem_writes (loop_p loop)
>> +predicate_mem_writes (loop_p loop, tree *scratch_pad)
>>  {
>>    unsigned int i, orig_loop_num_nodes = loop->num_nodes;
>>
>> @@ -1489,20 +1438,35 @@ predicate_mem_writes (loop_p loop)
>>       continue;
>>
>>        for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>> -     if ((stmt = gsi_stmt (gsi))
>> -         && gimple_assign_single_p (stmt)
>> -         && gimple_vdef (stmt))
>> -       {
>> -         tree lhs = gimple_assign_lhs (stmt);
>> -         tree rhs = gimple_assign_rhs1 (stmt);
>> -         tree type = TREE_TYPE (lhs);
>> -
>> -         lhs = ifc_temp_var (type, unshare_expr (lhs), &gsi);
>> -         rhs = ifc_temp_var (type, unshare_expr (rhs), &gsi);
>> -         rhs = build3 (COND_EXPR, type, unshare_expr (cond), rhs, lhs);
>> -         gimple_assign_set_rhs1 (stmt, ifc_temp_var (type, rhs, &gsi));
>> -         update_stmt (stmt);
>> -       }
>> +     {
>> +       stmt = gsi_stmt (gsi);
>> +       if (gimple_assign_single_p (stmt)
>> +           && gimple_vdef (stmt))
>> +         {
>> +           /* A[i] = x;  */
>> +           tree ai = gimple_assign_lhs (stmt);
>> +
>> +           /* pointer = cond ? &A[i] : scratch_pad;  */
>> +           tree star_pointer = create_indirect_cond_expr (ai, cond,
>> +                                                          scratch_pad, &gsi);
>> +           /* *pointer = x;  */
>> +           gimple_assign_set_lhs (stmt, star_pointer);
>> +           update_stmt (stmt);
>> +         }
>> +       else if (gimple_assign_single_p (stmt)
>> +                && gimple_vuse (stmt))
>> +         {
>> +           /* x = A[i];  */
>> +           tree ai = gimple_assign_rhs1 (stmt);
>> +
>> +           /* pointer = cond ? &A[i] : scratch_pad;  */
>> +           tree star_pointer = create_indirect_cond_expr (ai, cond,
>> +                                                          scratch_pad, &gsi);
>> +           /* x = *pointer;  */
>> +           gimple_assign_set_rhs1 (stmt, star_pointer);
>> +           update_stmt (stmt);
>> +         }
>> +     }
>>      }
>>  }
>>
>> @@ -1552,7 +1516,7 @@ remove_conditions_and_labels (loop_p loop)
>>     blocks.  Replace PHI nodes with conditional modify expressions.  */
>>
>>  static void
>> -combine_blocks (struct loop *loop)
>> +combine_blocks (struct loop *loop, tree *scratch_pad)
>>  {
>>    basic_block bb, exit_bb, merge_target_bb;
>>    unsigned int orig_loop_num_nodes = loop->num_nodes;
>> @@ -1565,7 +1529,7 @@ combine_blocks (struct loop *loop)
>>    predicate_all_scalar_phis (loop);
>>
>>    if (flag_tree_loop_if_convert_stores)
>> -    predicate_mem_writes (loop);
>> +    predicate_mem_writes (loop, scratch_pad);
>>
>>    /* Merge basic blocks: first remove all the edges in the loop,
>>       except for those from the exit block.  */
>> @@ -1654,7 +1618,7 @@ combine_blocks (struct loop *loop)
>>     profitability analysis.  Returns true when something changed.  */
>>
>>  static bool
>> -tree_if_conversion (struct loop *loop)
>> +tree_if_conversion (struct loop *loop, tree *scratch_pad)
>>  {
>>    bool changed = false;
>>    ifc_bbs = NULL;
>> @@ -1666,7 +1630,7 @@ tree_if_conversion (struct loop *loop)
>>    /* Now all statements are if-convertible.  Combine all the basic
>>       blocks into one huge basic block doing the if-conversion
>>       on-the-fly.  */
>> -  combine_blocks (loop);
>> +  combine_blocks (loop, scratch_pad);
>>
>>    if (flag_tree_loop_if_convert_stores)
>>      mark_sym_for_renaming (gimple_vop (cfun));
>> @@ -1697,12 +1661,13 @@ main_tree_if_conversion (void)
>>    struct loop *loop;
>>    bool changed = false;
>>    unsigned todo = 0;
>> +  tree scratch_pad = NULL_TREE;
>>
>>    if (number_of_loops () <= 1)
>>      return 0;
>>
>>    FOR_EACH_LOOP (li, loop, 0)
>> -    changed |= tree_if_conversion (loop);
>> +    changed |= tree_if_conversion (loop, &scratch_pad);
>>
>>    if (changed)
>>      todo |= TODO_cleanup_cfg;
>
> Overall I like the new way much more.  Please update and repost.
>

All the patches are combined on top of the previous patch.  The
updated patch is 0001-Fix-PR46029-reimplement-if-convert-stores.patch.
I am bootstrapping and testing this combined patch on amd64-linux.
Let me know if I can improve this patch in some other way.

Thanks,
Sebastian

Comments

Richard Biener Nov. 16, 2010, 2:05 p.m. UTC | #1
On Mon, 15 Nov 2010, Sebastian Pop wrote:

> Hi Richi,
> 
> fixes to your review are posted separately, see below for the details.
> See 0001-Fix-PR46029-reimplement-if-convert-stores.patch for the
> combined patch.

+  tree base = create_tmp_var (array_type, "scratch_pad");
+  tree a0 = build4 (ARRAY_REF, elt_type, base, integer_zero_node, 
NULL_TREE,
+                   NULL_TREE);

you can drop creating the ARRAY_REF and do

+  return insert_address_of (base, build_pointer_type (elt_type), &gsi);


The patches are ok with the above change.

Thanks,
Richard.
Richard Biener Nov. 16, 2010, 2:09 p.m. UTC | #2
On Tue, 16 Nov 2010, Richard Guenther wrote:

> On Mon, 15 Nov 2010, Sebastian Pop wrote:
> 
> > Hi Richi,
> > 
> > fixes to your review are posted separately, see below for the details.
> > See 0001-Fix-PR46029-reimplement-if-convert-stores.patch for the
> > combined patch.
> 
> +  tree base = create_tmp_var (array_type, "scratch_pad");
> +  tree a0 = build4 (ARRAY_REF, elt_type, base, integer_zero_node, 
> NULL_TREE,
> +                   NULL_TREE);
> 
> you can drop creating the ARRAY_REF and do
> 
> +  return insert_address_of (base, build_pointer_type (elt_type), &gsi);
> 
> 
> The patches are ok with the above change.

Btw, in insert_address_of you might want to add

  struct ptr_info_def *pi = get_ptr_info (address_of_ai);
  pt_solution_set_var (&pi->pt, SSA_NAME_VAR (address_of_ai));

Richard.
Sebastian Pop Jan. 3, 2011, 8:26 p.m. UTC | #3
Hi Richi,

On Tue, Nov 16, 2010 at 08:09, Richard Guenther <rguenther@suse.de> wrote:
> On Tue, 16 Nov 2010, Richard Guenther wrote:
>
>> On Mon, 15 Nov 2010, Sebastian Pop wrote:
>>
>> > Hi Richi,
>> >
>> > fixes to your review are posted separately, see below for the details.
>> > See 0001-Fix-PR46029-reimplement-if-convert-stores.patch for the
>> > combined patch.
>>
>> +  tree base = create_tmp_var (array_type, "scratch_pad");
>> +  tree a0 = build4 (ARRAY_REF, elt_type, base, integer_zero_node,
>> NULL_TREE,
>> +                   NULL_TREE);
>>
>> you can drop creating the ARRAY_REF and do
>>
>> +  return insert_address_of (base, build_pointer_type (elt_type), &gsi);
>>
>>
>> The patches are ok with the above change.
>
> Btw, in insert_address_of you might want to add
>
>  struct ptr_info_def *pi = get_ptr_info (address_of_ai);
>  pt_solution_set_var (&pi->pt, SSA_NAME_VAR (address_of_ai));
>

In four tests of the SPEC benchmarks I am getting this kind of errors:

results.f:19:0: error: address taken, but ADDRESSABLE bit not set
scratch_pad.1438
f951: note: in statement
_ifc_.1454_733 = [cond_expr] D.2525_407 != 0 ? _ifc_.1453_797 :
&scratch_pad.1438;

results.f:19:0: error: address taken, but ADDRESSABLE bit not set
vkl
f951: note: in statement
_ifc_.1450_7837 = [cond_expr] prephitmp.1376_7987 > 1 ? &vkl[11] :
&scratch_pad.1438;

If I'm watching the addressable field of the vkl or scratch_pad decls,
I see that TREE_ADDRESSABLE is cleared in maybe_optimize_var

  if (TREE_ADDRESSABLE (var)
      /* Do not change TREE_ADDRESSABLE if we need to preserve var as
	 a non-register.  Otherwise we are confused and forget to
	 add virtual operands for it.  */
      && (!is_gimple_reg_type (TREE_TYPE (var))
	  || !bitmap_bit_p (not_reg_needs, DECL_UID (var))))
    {
      TREE_ADDRESSABLE (var) = 0;

this is in pass_fold_builtins executed after the loop optimizer is done.
If I'm disabling this optimization, the above ICE disappears.
Any idea how to fix this issue?

Thanks,
Sebastian

PS: With the previous code building an alloca call to allocate the scratch_pad,
I do not see the ICE on the scratch_pad.1438 but I still see the other error
on the vkl array.
Richard Biener Jan. 3, 2011, 9:39 p.m. UTC | #4
On Mon, Jan 3, 2011 at 9:26 PM, Sebastian Pop <sebpop@gmail.com> wrote:
> Hi Richi,
>
> On Tue, Nov 16, 2010 at 08:09, Richard Guenther <rguenther@suse.de> wrote:
>> On Tue, 16 Nov 2010, Richard Guenther wrote:
>>
>>> On Mon, 15 Nov 2010, Sebastian Pop wrote:
>>>
>>> > Hi Richi,
>>> >
>>> > fixes to your review are posted separately, see below for the details.
>>> > See 0001-Fix-PR46029-reimplement-if-convert-stores.patch for the
>>> > combined patch.
>>>
>>> +  tree base = create_tmp_var (array_type, "scratch_pad");
>>> +  tree a0 = build4 (ARRAY_REF, elt_type, base, integer_zero_node,
>>> NULL_TREE,
>>> +                   NULL_TREE);
>>>
>>> you can drop creating the ARRAY_REF and do
>>>
>>> +  return insert_address_of (base, build_pointer_type (elt_type), &gsi);
>>>
>>>
>>> The patches are ok with the above change.
>>
>> Btw, in insert_address_of you might want to add
>>
>>  struct ptr_info_def *pi = get_ptr_info (address_of_ai);
>>  pt_solution_set_var (&pi->pt, SSA_NAME_VAR (address_of_ai));
>>
>
> In four tests of the SPEC benchmarks I am getting this kind of errors:
>
> results.f:19:0: error: address taken, but ADDRESSABLE bit not set
> scratch_pad.1438
> f951: note: in statement
> _ifc_.1454_733 = [cond_expr] D.2525_407 != 0 ? _ifc_.1453_797 :
> &scratch_pad.1438;
>
> results.f:19:0: error: address taken, but ADDRESSABLE bit not set
> vkl
> f951: note: in statement
> _ifc_.1450_7837 = [cond_expr] prephitmp.1376_7987 > 1 ? &vkl[11] :
> &scratch_pad.1438;
>
> If I'm watching the addressable field of the vkl or scratch_pad decls,
> I see that TREE_ADDRESSABLE is cleared in maybe_optimize_var
>
>  if (TREE_ADDRESSABLE (var)
>      /* Do not change TREE_ADDRESSABLE if we need to preserve var as
>         a non-register.  Otherwise we are confused and forget to
>         add virtual operands for it.  */
>      && (!is_gimple_reg_type (TREE_TYPE (var))
>          || !bitmap_bit_p (not_reg_needs, DECL_UID (var))))
>    {
>      TREE_ADDRESSABLE (var) = 0;
>
> this is in pass_fold_builtins executed after the loop optimizer is done.
> If I'm disabling this optimization, the above ICE disappears.
> Any idea how to fix this issue?

walk_stmt_load_store_addr_ops needs to handle COND_EXPRs in
the gimple_assign_single_p () visit_addr case.

Of course COND_EXPRs should be moved to non-single RHS at some point.

Richard.

> Thanks,
> Sebastian
>
> PS: With the previous code building an alloca call to allocate the scratch_pad,
> I do not see the ICE on the scratch_pad.1438 but I still see the other error
> on the vkl array.
>
diff mbox

Patch

From 3dd7298ecc74c49df46e05913efeb8395bb11c62 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <sebpop@gmail.com>
Date: Tue, 26 Oct 2010 16:34:29 -0500
Subject: [PATCH] Fix PR46029: reimplement if-convert stores.

2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>

	PR tree-optimization/46029
	* doc/invoke.texi (-ftree-loop-if-convert-stores): Update description.
	* tree-if-conv.c (has_unaligned_memory_refs): New.
	(if_convertible_gimple_assign_stmt_p): Call has_unaligned_memory_refs.
	(create_scratchpad): New.
	(create_indirect_cond_expr): New.
	(predicate_mem_writes): Call create_indirect_cond_expr.  Take an extra
	parameter for scratch_pad.
	(combine_blocks): Same.
	(tree_if_conversion): Same.
	(main_tree_if_conversion): Pass to tree_if_conversion a pointer to
	scratch_pad.
	(struct ifc_dr): Removed.
	(IFC_DR): Removed.
	(DR_WRITTEN_AT_LEAST_ONCE): Removed.
	(DR_RW_UNCONDITIONALLY): Removed.
	(memrefs_read_or_written_unconditionally): Removed.
	(write_memrefs_written_at_least_once): Removed.
	(ifcvt_memrefs_wont_trap): Removed.
	(ifcvt_could_trap_p): Does not take refs parameter anymore.
	(if_convertible_gimple_assign_stmt_p): Same.
	(if_convertible_stmt_p): Same.
	(if_convertible_loop_p_1): Remove initialization of dr->aux,
	DR_WRITTEN_AT_LEAST_ONCE, and DR_RW_UNCONDITIONALLY.
	(if_convertible_loop_p): Remove deallocation of the same.

testsuite/
	* g++.dg/tree-ssa/ifc-pr46029.C: New.
	* gcc.dg/tree-ssa/ifc-8.c: New.
	* gcc.dg/tree-ssa/ifc-5.c: Make it exactly like the FFmpeg kernel.
---
 gcc/ChangeLog                               |   28 ++
 gcc/doc/invoke.texi                         |   18 +-
 gcc/testsuite/ChangeLog                     |    7 +
 gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C |   76 ++++++
 gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c       |   19 +-
 gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c       |   29 ++
 gcc/tree-if-conv.c                          |  375 ++++++++++++---------------
 7 files changed, 332 insertions(+), 220 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index df496e3..0623d2a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,31 @@ 
+2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>
+
+	PR tree-optimization/46029
+	* doc/invoke.texi (-ftree-loop-if-convert-stores): Update description.
+	* tree-if-conv.c (has_unaligned_memory_refs): New.
+	(if_convertible_gimple_assign_stmt_p): Call has_unaligned_memory_refs.
+	(create_scratchpad): New.
+	(create_indirect_cond_expr): New.
+	(predicate_mem_writes): Call create_indirect_cond_expr.  Take an extra
+	parameter for scratch_pad.
+	(combine_blocks): Same.
+	(tree_if_conversion): Same.
+	(main_tree_if_conversion): Pass to tree_if_conversion a pointer to
+	scratch_pad.
+	(struct ifc_dr): Removed.
+	(IFC_DR): Removed.
+	(DR_WRITTEN_AT_LEAST_ONCE): Removed.
+	(DR_RW_UNCONDITIONALLY): Removed.
+	(memrefs_read_or_written_unconditionally): Removed.
+	(write_memrefs_written_at_least_once): Removed.
+	(ifcvt_memrefs_wont_trap): Removed.
+	(ifcvt_could_trap_p): Does not take refs parameter anymore.
+	(if_convertible_gimple_assign_stmt_p): Same.
+	(if_convertible_stmt_p): Same.
+	(if_convertible_loop_p_1): Remove initialization of dr->aux,
+	DR_WRITTEN_AT_LEAST_ONCE, and DR_RW_UNCONDITIONALLY.
+	(if_convertible_loop_p): Remove deallocation of the same.
+
 2010-11-11  Joern Rennecke  <amylaar@spamcop.net>
 
 	PR target/44749
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index f197483..d322123 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -6959,20 +6959,26 @@  if vectorization is enabled.
 
 @item -ftree-loop-if-convert-stores
 Attempt to also if-convert conditional jumps containing memory writes.
-This transformation can be unsafe for multi-threaded programs as it
-transforms conditional memory writes into unconditional memory writes.
 For example,
 @smallexample
 for (i = 0; i < N; i++)
   if (cond)
-    A[i] = expr;
+    A[i] = B[i] + 2;
 @end smallexample
 would be transformed to
 @smallexample
-for (i = 0; i < N; i++)
-  A[i] = cond ? expr : A[i];
+void *scratchpad = alloca (64);
+for (i = 0; i < N; i++) @{
+  a = cond ? &A[i] : scratchpad;
+  b = cond ? &B[i] : scratchpad;
+  *a = *b + 2;
+@}
 @end smallexample
-potentially producing data races.
+The compiler allocates a scratchpad memory on the stack for each
+function in which the if-conversion of memory stores or reads
+happened.  This scratchpad memory is used during the part of the
+computation that is discarded, i.e., when the condition is evaluated
+to false.
 
 @item -ftree-loop-distribution
 Perform loop distribution.  This flag can improve cache performance on
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 3156601..c5c2473 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@ 
+2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>
+
+	PR tree-optimization/46029
+	* g++.dg/tree-ssa/ifc-pr46029.C: New.
+	* gcc.dg/tree-ssa/ifc-8.c: New.
+	* gcc.dg/tree-ssa/ifc-5.c: Make it exactly like the FFmpeg kernel.
+
 2010-11-11  Nicola Pero  <nicola.pero@meta-innovation.com>
 
 	* objc.dg/property/at-property-20.m: New.
diff --git a/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C b/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C
new file mode 100644
index 0000000..2a54bdb
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C
@@ -0,0 +1,76 @@ 
+// { dg-do run }
+/* { dg-options "-O -ftree-loop-if-convert-stores" } */
+
+namespace
+{
+  struct rb_tree_node_
+  {
+    rb_tree_node_ ():m_p_left (0), m_p_parent (0), m_metadata (0)
+    {
+    }
+    unsigned &get_metadata ()
+    {
+      return m_metadata;
+    }
+    rb_tree_node_ *m_p_left;
+    rb_tree_node_ *m_p_parent;
+    unsigned m_metadata;
+  };
+
+  struct bin_search_tree_const_node_it_
+  {
+    bin_search_tree_const_node_it_ (rb_tree_node_ * p_nd):m_p_nd (p_nd)
+    {
+    }
+    unsigned &get_metadata ()
+    {
+      return m_p_nd->get_metadata ();
+    }
+    bin_search_tree_const_node_it_ get_l_child ()
+    {
+      return bin_search_tree_const_node_it_ (m_p_nd->m_p_left);
+    }
+
+    rb_tree_node_ *m_p_nd;
+  };
+
+  struct bin_search_tree_no_data_
+  {
+    typedef rb_tree_node_ *node_pointer;
+      bin_search_tree_no_data_ ():m_p_head (new rb_tree_node_ ())
+    {
+    }
+    void insert_imp_empty (int r_value)
+    {
+      rb_tree_node_ *p_new_node = new rb_tree_node_ ();
+      m_p_head->m_p_parent = p_new_node;
+      p_new_node->m_p_parent = m_p_head;
+      update_to_top (m_p_head->m_p_parent);
+    }
+    void apply_update (bin_search_tree_const_node_it_ nd_it)
+    {
+      unsigned
+	l_max_endpoint
+	=
+	(nd_it.get_l_child ().m_p_nd ==
+	 0) ? 0 : nd_it.get_l_child ().get_metadata ();
+      nd_it.get_metadata () = l_max_endpoint;
+    }
+    void update_to_top (node_pointer p_nd)
+    {
+      while (p_nd != m_p_head)
+	{
+	  apply_update (p_nd);
+	  p_nd = p_nd->m_p_parent;
+	}
+    }
+
+    rb_tree_node_ * m_p_head;
+  };
+}
+
+int main ()
+{
+  bin_search_tree_no_data_ ().insert_imp_empty (0);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c
index a9cc816..4be2cdb 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-c -O2 -ftree-vectorize -fdump-tree-ifcvt-stats" { target *-*-* } } */
+/* { dg-options "-c -O2 -ftree-vectorize -ftree-loop-if-convert-stores -fdump-tree-ifcvt-stats" { target *-*-* } } */
 
 void
 dct_unquantize_h263_inter_c (short *block, int n, int qscale, int nCoeffs)
@@ -12,11 +12,18 @@  dct_unquantize_h263_inter_c (short *block, int n, int qscale, int nCoeffs)
   for (i = 0; i <= nCoeffs; i++)
     {
       level = block[i];
-      if (level < 0)
-	level = level * qmul - qadd;
-      else
-	level = level * qmul + qadd;
-      block[i] = level;
+      if (level)
+        {
+          if (level < 0)
+            {
+              level = level * qmul - qadd;
+            }
+          else
+            {
+              level = level * qmul + qadd;
+            }
+          block[i] = level;
+        }
     }
 }
 
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c
new file mode 100644
index 0000000..d7cf279
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c
@@ -0,0 +1,29 @@ 
+/* { dg-do compile } */
+/* { dg-options "-c -O2 -ftree-vectorize" { target *-*-* } } */
+
+typedef union tree_node *tree;
+struct tree_common
+{
+  unsigned volatile_flag : 1;
+  unsigned unsigned_flag : 1;
+};
+struct tree_type
+{
+  tree next_variant;
+  tree main_variant;
+};
+union tree_node
+{
+  struct tree_common common;
+  struct tree_type type;
+};
+void finish_enum (tree enumtype)
+{
+  tree tem;
+  for (tem = ((enumtype)->type.main_variant); tem; tem = ((tem)->type.next_variant))
+    {
+      if (tem == enumtype)
+	continue;
+      ((tem)->common.unsigned_flag) = ((enumtype)->common.unsigned_flag);
+    }
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index fc65845..f4732ad 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -226,7 +226,7 @@  ifc_temp_var (tree type, tree expr, gimple_stmt_iterator *gsi)
   gimple stmt;
 
   /* Create new temporary variable.  */
-  var = create_tmp_var (type, name);
+  var = create_tmp_reg (type, name);
   add_referenced_var (var);
 
   /* Build new statement to assign EXPR to new variable.  */
@@ -446,171 +446,38 @@  if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
   return true;
 }
 
-/* Records the status of a data reference.  This struct is attached to
-   each DR->aux field.  */
-
-struct ifc_dr {
-  /* -1 when not initialized, 0 when false, 1 when true.  */
-  int written_at_least_once;
-
-  /* -1 when not initialized, 0 when false, 1 when true.  */
-  int rw_unconditionally;
-};
-
-#define IFC_DR(DR) ((struct ifc_dr *) (DR)->aux)
-#define DR_WRITTEN_AT_LEAST_ONCE(DR) (IFC_DR (DR)->written_at_least_once)
-#define DR_RW_UNCONDITIONALLY(DR) (IFC_DR (DR)->rw_unconditionally)
-
-/* Returns true when the memory references of STMT are read or written
-   unconditionally.  In other words, this function returns true when
-   for every data reference A in STMT there exist other accesses to
-   the same data reference with predicates that add up (OR-up) to the
-   true predicate: this ensures that the data reference A is touched
-   (read or written) on every iteration of the if-converted loop.  */
+/* Wrapper around gimple_could_trap_p refined for the needs of the
+   if-conversion.  */
 
 static bool
-memrefs_read_or_written_unconditionally (gimple stmt,
-					 VEC (data_reference_p, heap) *drs)
+ifcvt_could_trap_p (gimple stmt)
 {
-  int i, j;
-  data_reference_p a, b;
-  tree ca = bb_predicate (gimple_bb (stmt));
-
-  for (i = 0; VEC_iterate (data_reference_p, drs, i, a); i++)
-    if (DR_STMT (a) == stmt)
-      {
-	bool found = false;
-	int x = DR_RW_UNCONDITIONALLY (a);
-
-	if (x == 0)
-	  return false;
-
-	if (x == 1)
-	  continue;
-
-	for (j = 0; VEC_iterate (data_reference_p, drs, j, b); j++)
-	  if (DR_STMT (b) != stmt
-	      && same_data_refs (a, b))
-	    {
-	      tree cb = bb_predicate (gimple_bb (DR_STMT (b)));
-
-	      if (DR_RW_UNCONDITIONALLY (b) == 1
-		  || is_true_predicate (cb)
-		  || is_true_predicate (ca = fold_or_predicates (EXPR_LOCATION (cb),
-								 ca, cb)))
-		{
-		  DR_RW_UNCONDITIONALLY (a) = 1;
-		  DR_RW_UNCONDITIONALLY (b) = 1;
-		  found = true;
-		  break;
-		}
-	    }
-
-	if (!found)
-	  {
-	    DR_RW_UNCONDITIONALLY (a) = 0;
-	    return false;
-	  }
-      }
+  if (gimple_vuse (stmt)
+      && !gimple_could_trap_p_1 (stmt, false, false))
+    return false;
 
-  return true;
+  return gimple_could_trap_p (stmt);
 }
 
-/* Returns true when the memory references of STMT are unconditionally
-   written.  In other words, this function returns true when for every
-   data reference A written in STMT, there exist other writes to the
-   same data reference with predicates that add up (OR-up) to the true
-   predicate: this ensures that the data reference A is written on
-   every iteration of the if-converted loop.  */
+/* Returns true when stmt contains a data reference.  */
 
 static bool
-write_memrefs_written_at_least_once (gimple stmt,
-				     VEC (data_reference_p, heap) *drs)
+has_non_addressable_refs (gimple stmt)
 {
-  int i, j;
-  data_reference_p a, b;
-  tree ca = bb_predicate (gimple_bb (stmt));
+  VEC (data_ref_loc, heap) *refs = VEC_alloc (data_ref_loc, heap, 3);
+  bool res = get_references_in_stmt (stmt, &refs);
+  unsigned i;
+  data_ref_loc *ref;
 
-  for (i = 0; VEC_iterate (data_reference_p, drs, i, a); i++)
-    if (DR_STMT (a) == stmt
-	&& DR_IS_WRITE (a))
+  FOR_EACH_VEC_ELT (data_ref_loc, refs, i, ref)
+    if (may_be_nonaddressable_p (*(ref->pos)))
       {
-	bool found = false;
-	int x = DR_WRITTEN_AT_LEAST_ONCE (a);
-
-	if (x == 0)
-	  return false;
-
-	if (x == 1)
-	  continue;
-
-	for (j = 0; VEC_iterate (data_reference_p, drs, j, b); j++)
-	  if (DR_STMT (b) != stmt
-	      && DR_IS_WRITE (b)
-	      && same_data_refs_base_objects (a, b))
-	    {
-	      tree cb = bb_predicate (gimple_bb (DR_STMT (b)));
-
-	      if (DR_WRITTEN_AT_LEAST_ONCE (b) == 1
-		  || is_true_predicate (cb)
-		  || is_true_predicate (ca = fold_or_predicates (EXPR_LOCATION (cb),
-								 ca, cb)))
-		{
-		  DR_WRITTEN_AT_LEAST_ONCE (a) = 1;
-		  DR_WRITTEN_AT_LEAST_ONCE (b) = 1;
-		  found = true;
-		  break;
-		}
-	    }
-
-	if (!found)
-	  {
-	    DR_WRITTEN_AT_LEAST_ONCE (a) = 0;
-	    return false;
-	  }
+	res = true;
+	break;
       }
 
-  return true;
-}
-
-/* Return true when the memory references of STMT won't trap in the
-   if-converted code.  There are two things that we have to check for:
-
-   - writes to memory occur to writable memory: if-conversion of
-   memory writes transforms the conditional memory writes into
-   unconditional writes, i.e. "if (cond) A[i] = foo" is transformed
-   into "A[i] = cond ? foo : A[i]", and as the write to memory may not
-   be executed at all in the original code, it may be a readonly
-   memory.  To check that A is not const-qualified, we check that
-   there exists at least an unconditional write to A in the current
-   function.
-
-   - reads or writes to memory are valid memory accesses for every
-   iteration.  To check that the memory accesses are correctly formed
-   and that we are allowed to read and write in these locations, we
-   check that the memory accesses to be if-converted occur at every
-   iteration unconditionally.  */
-
-static bool
-ifcvt_memrefs_wont_trap (gimple stmt, VEC (data_reference_p, heap) *refs)
-{
-  return write_memrefs_written_at_least_once (stmt, refs)
-    && memrefs_read_or_written_unconditionally (stmt, refs);
-}
-
-/* Wrapper around gimple_could_trap_p refined for the needs of the
-   if-conversion.  Try to prove that the memory accesses of STMT could
-   not trap in the innermost loop containing STMT.  */
-
-static bool
-ifcvt_could_trap_p (gimple stmt, VEC (data_reference_p, heap) *refs)
-{
-  if (gimple_vuse (stmt)
-      && !gimple_could_trap_p_1 (stmt, false, false)
-      && ifcvt_memrefs_wont_trap (stmt, refs))
-    return false;
-
-  return gimple_could_trap_p (stmt);
+  VEC_free (data_ref_loc, heap, refs);
+  return res;
 }
 
 /* Return true when STMT is if-convertible.
@@ -621,8 +488,7 @@  ifcvt_could_trap_p (gimple stmt, VEC (data_reference_p, heap) *refs)
    - LHS is not var decl.  */
 
 static bool
-if_convertible_gimple_assign_stmt_p (gimple stmt,
-				     VEC (data_reference_p, heap) *refs)
+if_convertible_gimple_assign_stmt_p (gimple stmt)
 {
   tree lhs = gimple_assign_lhs (stmt);
   basic_block bb;
@@ -650,19 +516,27 @@  if_convertible_gimple_assign_stmt_p (gimple stmt,
 
   if (flag_tree_loop_if_convert_stores)
     {
-      if (ifcvt_could_trap_p (stmt, refs))
+      if (ifcvt_could_trap_p (stmt))
 	{
 	  if (dump_file && (dump_flags & TDF_DETAILS))
-	    fprintf (dump_file, "tree could trap...\n");
+	    fprintf (dump_file, "tree could trap\n");
 	  return false;
 	}
+
+      if (has_non_addressable_refs (stmt))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "has non-addressable memory references\n");
+	  return false;
+	}
+
       return true;
     }
 
   if (gimple_assign_rhs_could_trap_p (stmt))
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
-	fprintf (dump_file, "tree could trap...\n");
+	fprintf (dump_file, "tree could trap\n");
       return false;
     }
 
@@ -690,7 +564,7 @@  if_convertible_gimple_assign_stmt_p (gimple stmt,
    - it is a GIMPLE_LABEL or a GIMPLE_COND.  */
 
 static bool
-if_convertible_stmt_p (gimple stmt, VEC (data_reference_p, heap) *refs)
+if_convertible_stmt_p (gimple stmt)
 {
   switch (gimple_code (stmt))
     {
@@ -700,7 +574,7 @@  if_convertible_stmt_p (gimple stmt, VEC (data_reference_p, heap) *refs)
       return true;
 
     case GIMPLE_ASSIGN:
-      return if_convertible_gimple_assign_stmt_p (stmt, refs);
+      return if_convertible_gimple_assign_stmt_p (stmt);
 
     default:
       /* Don't know what to do with 'em so don't do anything.  */
@@ -1016,18 +890,6 @@  if_convertible_loop_p_1 (struct loop *loop,
   if (!res)
     return false;
 
-  if (flag_tree_loop_if_convert_stores)
-    {
-      data_reference_p dr;
-
-      for (i = 0; VEC_iterate (data_reference_p, *refs, i, dr); i++)
-	{
-	  dr->aux = XNEW (struct ifc_dr);
-	  DR_WRITTEN_AT_LEAST_ONCE (dr) = -1;
-	  DR_RW_UNCONDITIONALLY (dr) = -1;
-	}
-    }
-
   for (i = 0; i < loop->num_nodes; i++)
     {
       basic_block bb = ifc_bbs[i];
@@ -1040,7 +902,7 @@  if_convertible_loop_p_1 (struct loop *loop,
       /* Check the if-convertibility of statements in predicated BBs.  */
       if (is_predicated (bb))
 	for (itr = gsi_start_bb (bb); !gsi_end_p (itr); gsi_next (&itr))
-	  if (!if_convertible_stmt_p (gsi_stmt (itr), *refs))
+	  if (!if_convertible_stmt_p (gsi_stmt (itr)))
 	    return false;
     }
 
@@ -1101,15 +963,6 @@  if_convertible_loop_p (struct loop *loop)
   ddrs = VEC_alloc (ddr_p, heap, 25);
   res = if_convertible_loop_p_1 (loop, &refs, &ddrs);
 
-  if (flag_tree_loop_if_convert_stores)
-    {
-      data_reference_p dr;
-      unsigned int i;
-
-      for (i = 0; VEC_iterate (data_reference_p, refs, i, dr); i++)
-	free (dr->aux);
-    }
-
   free_data_refs (refs);
   free_dependence_relations (ddrs);
   return res;
@@ -1370,6 +1223,81 @@  insert_gimplified_predicates (loop_p loop)
     }
 }
 
+/* Inserts at position GSI a statement "ADDRESS_OF_AI = &AI;" and
+   returns the ADDRESS_OF_AI.  */
+
+static tree
+insert_address_of (tree ai, gimple_stmt_iterator *gsi)
+{
+  tree addr_expr = build_fold_addr_expr (ai);
+  tree address_of_ai = create_tmp_reg (TREE_TYPE (addr_expr), "_ifc_");
+  gimple addr_stmt = gimple_build_assign (address_of_ai, addr_expr);
+
+  add_referenced_var (address_of_ai);
+  address_of_ai = make_ssa_name (address_of_ai, addr_stmt);
+  gimple_assign_set_lhs (addr_stmt, address_of_ai);
+  SSA_NAME_DEF_STMT (address_of_ai) = addr_stmt;
+  update_stmt (addr_stmt);
+  gsi_insert_before (gsi, addr_stmt, GSI_SAME_STMT);
+
+  return address_of_ai;
+}
+
+/* Insert at the beginning of the first basic block of the current
+   function the allocation on the stack of N_BYTES of memory and
+   return a pointer to this scratchpad memory.  */
+
+static tree
+create_scratchpad (int n_bytes)
+{
+  basic_block bb = single_succ (ENTRY_BLOCK_PTR);
+  gimple_stmt_iterator gsi = gsi_after_labels (bb);
+  tree x = build_int_cst (integer_type_node, n_bytes - 1);
+  tree elt_type = char_type_node;
+  tree array_type = build_array_type (elt_type, build_index_type (x));
+  tree base = create_tmp_reg (array_type, "scratch_pad");
+  tree a0 = build4 (ARRAY_REF, elt_type, base, integer_zero_node, NULL_TREE,
+		    NULL_TREE);
+
+  add_referenced_var (base);
+  return insert_address_of (a0, &gsi);
+}
+
+/* Returns a memory reference to the pointer defined by the
+   conditional expression: pointer = cond ? &A[i] : scratch_pad; and
+   inserts this code at GSI.  */
+
+static tree
+create_indirect_cond_expr (tree ai, tree cond, tree *scratch_pad,
+			   gimple_stmt_iterator *gsi)
+{
+  tree cond_expr;
+  tree pointer;
+  gimple pointer_stmt;
+
+  /* address_of_ai = &A[i];  */
+  tree address_of_ai = insert_address_of (ai, gsi);
+
+  /* Allocate the scratch pad only once per function.  */
+  if (!*scratch_pad)
+    *scratch_pad = create_scratchpad (64);
+
+  /* pointer = cond ? address_of_ai : scratch_pad;  */
+  pointer = create_tmp_reg (TREE_TYPE (address_of_ai), "_ifc_");
+  add_referenced_var (pointer);
+  cond_expr = build3 (COND_EXPR, TREE_TYPE (address_of_ai),
+		      unshare_expr (cond), address_of_ai, *scratch_pad);
+  pointer_stmt = gimple_build_assign (pointer, cond_expr);
+  pointer = make_ssa_name (pointer, pointer_stmt);
+  gimple_assign_set_lhs (pointer_stmt, pointer);
+  SSA_NAME_DEF_STMT (pointer) = pointer_stmt;
+  update_stmt (pointer_stmt);
+  gsi_insert_before (gsi, pointer_stmt, GSI_SAME_STMT);
+
+  return build2 (MEM_REF, TREE_TYPE (ai), pointer,
+		 build_int_cst (reference_alias_ptr_type (ai), 0));
+}
+
 /* Predicate each write to memory in LOOP.
 
    This function transforms control flow constructs containing memory
@@ -1381,10 +1309,19 @@  insert_gimplified_predicates (loop_p loop)
 
    into the following form that does not contain control flow:
 
-   | for (i = 0; i < N; i++)
-   |   A[i] = cond ? expr : A[i];
+   | void *scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES);
+   |
+   | for (i = 0; i < N; i++) {
+   |   p = cond ? &A[i] : scratch_pad;
+   |   *p = expr;
+   | }
+
+   SCRATCH_PAD is allocated on the stack for each function once and it is
+   large enough to contain any kind of scalar assignment or read.  All
+   values read or written to SCRATCH_PAD are not used in the computation.
 
-   The original CFG looks like this:
+   In a more detailed way, the if-conversion of memory writes works
+   like this, supposing that the original CFG looks like this:
 
    | bb_0
    |   i = 0
@@ -1434,10 +1371,12 @@  insert_gimplified_predicates (loop_p loop)
    |   goto bb_1
    | end_bb_4
 
-   predicate_mem_writes is then predicating the memory write as follows:
+   predicate_mem_writes is then allocating SCRATCH_PAD in the basic block
+   preceding the loop header, and is predicating the memory write:
 
    | bb_0
    |   i = 0
+   |   scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES);
    | end_bb_0
    |
    | bb_1
@@ -1445,12 +1384,14 @@  insert_gimplified_predicates (loop_p loop)
    | end_bb_1
    |
    | bb_2
+   |   cond = some_computation;
    |   if (cond) goto bb_3 else goto bb_4
    | end_bb_2
    |
    | bb_3
    |   cond = some_computation;
-   |   A[i] = cond ? expr : A[i];
+   |   p = cond ? &A[i] : scratch_pad;
+   |   *p = expr;
    |   goto bb_4
    | end_bb_3
    |
@@ -1463,12 +1404,14 @@  insert_gimplified_predicates (loop_p loop)
 
    | bb_0
    |   i = 0
+   |   scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES);
    |   if (i < N) goto bb_5 else goto bb_1
    | end_bb_0
    |
    | bb_1
    |   cond = some_computation;
-   |   A[i] = cond ? expr : A[i];
+   |   p = cond ? &A[i] : scratch_pad;
+   |   *p = expr;
    |   if (i < N) goto bb_5 else goto bb_4
    | end_bb_1
    |
@@ -1478,7 +1421,7 @@  insert_gimplified_predicates (loop_p loop)
 */
 
 static void
-predicate_mem_writes (loop_p loop)
+predicate_mem_writes (loop_p loop, tree *scratch_pad)
 {
   unsigned int i, orig_loop_num_nodes = loop->num_nodes;
 
@@ -1493,20 +1436,35 @@  predicate_mem_writes (loop_p loop)
 	continue;
 
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-	if ((stmt = gsi_stmt (gsi))
-	    && gimple_assign_single_p (stmt)
-	    && gimple_vdef (stmt))
-	  {
-	    tree lhs = gimple_assign_lhs (stmt);
-	    tree rhs = gimple_assign_rhs1 (stmt);
-	    tree type = TREE_TYPE (lhs);
-
-	    lhs = ifc_temp_var (type, unshare_expr (lhs), &gsi);
-	    rhs = ifc_temp_var (type, unshare_expr (rhs), &gsi);
-	    rhs = build3 (COND_EXPR, type, unshare_expr (cond), rhs, lhs);
-	    gimple_assign_set_rhs1 (stmt, ifc_temp_var (type, rhs, &gsi));
-	    update_stmt (stmt);
-	  }
+	{
+	  stmt = gsi_stmt (gsi);
+	  if (gimple_assign_single_p (stmt)
+	      && gimple_vdef (stmt))
+	    {
+	      /* A[i] = x;  */
+	      tree ai = gimple_assign_lhs (stmt);
+
+	      /* pointer = cond ? &A[i] : scratch_pad;  */
+	      tree star_pointer = create_indirect_cond_expr (ai, cond,
+							     scratch_pad, &gsi);
+	      /* *pointer = x;  */
+	      gimple_assign_set_lhs (stmt, star_pointer);
+	      update_stmt (stmt);
+	    }
+	  else if (gimple_assign_single_p (stmt)
+		   && gimple_vuse (stmt))
+	    {
+	      /* x = A[i];  */
+	      tree ai = gimple_assign_rhs1 (stmt);
+
+	      /* pointer = cond ? &A[i] : scratch_pad;  */
+	      tree star_pointer = create_indirect_cond_expr (ai, cond,
+							     scratch_pad, &gsi);
+	      /* x = *pointer;  */
+	      gimple_assign_set_rhs1 (stmt, star_pointer);
+	      update_stmt (stmt);
+	    }
+	}
     }
 }
 
@@ -1556,7 +1514,7 @@  remove_conditions_and_labels (loop_p loop)
    blocks.  Replace PHI nodes with conditional modify expressions.  */
 
 static void
-combine_blocks (struct loop *loop)
+combine_blocks (struct loop *loop, tree *scratch_pad)
 {
   basic_block bb, exit_bb, merge_target_bb;
   unsigned int orig_loop_num_nodes = loop->num_nodes;
@@ -1569,7 +1527,7 @@  combine_blocks (struct loop *loop)
   predicate_all_scalar_phis (loop);
 
   if (flag_tree_loop_if_convert_stores)
-    predicate_mem_writes (loop);
+    predicate_mem_writes (loop, scratch_pad);
 
   /* Merge basic blocks: first remove all the edges in the loop,
      except for those from the exit block.  */
@@ -1658,7 +1616,7 @@  combine_blocks (struct loop *loop)
    profitability analysis.  Returns true when something changed.  */
 
 static bool
-tree_if_conversion (struct loop *loop)
+tree_if_conversion (struct loop *loop, tree *scratch_pad)
 {
   bool changed = false;
   ifc_bbs = NULL;
@@ -1670,7 +1628,7 @@  tree_if_conversion (struct loop *loop)
   /* Now all statements are if-convertible.  Combine all the basic
      blocks into one huge basic block doing the if-conversion
      on-the-fly.  */
-  combine_blocks (loop);
+  combine_blocks (loop, scratch_pad);
 
   if (flag_tree_loop_if_convert_stores)
     mark_sym_for_renaming (gimple_vop (cfun));
@@ -1701,12 +1659,13 @@  main_tree_if_conversion (void)
   struct loop *loop;
   bool changed = false;
   unsigned todo = 0;
+  tree scratch_pad = NULL_TREE;
 
   if (number_of_loops () <= 1)
     return 0;
 
   FOR_EACH_LOOP (li, loop, 0)
-    changed |= tree_if_conversion (loop);
+    changed |= tree_if_conversion (loop, &scratch_pad);
 
   if (changed)
     todo |= TODO_cleanup_cfg;
-- 
1.7.0.4