Patchwork Fix PR45948: add ssa defs from builtin partitions to the last partition.

login
register
mail settings
Submitter Sebastian Pop
Date Dec. 10, 2010, 11:50 p.m.
Message ID <1292025035-30771-1-git-send-email-sebpop@gmail.com>
Download mbox | patch
Permalink /patch/75145/
State New
Headers show

Comments

Sebastian Pop - Dec. 10, 2010, 11:50 p.m.
Hi,

This patch fixes the following pattern:

void
foo (int i, int n)
{
  int a[30];
  int b[30];
  for (; i < n; i++)
    a[i] = b[i] = 0;

  while (1)
    if (b[0])
      bar (a[i - 1]);
}

For the first loop, we end up generating two memset zero, and the loop
completely disappears.  SCEV constant propagation does not apply and
the computation of the last value of "i" does not occur anymore.

With this patch, we now walk over all the scalar statements of the
partitions that we know will be code generated with a builtin, and add
all those scalar computations that do not occur in other partitions
(not generated as builtins) to a last partition (that captures all the
computations that have not been loop distributed).

I am testing this patch on amd64-linux.  Ok for trunk after test?

Thanks,
Sebastian

2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>

	PR tree-optimization/45948
	* tree-loop-distribution.c (ssa_name_has_uses_outside_loop_p): New.
	(stmt_has_scalar_dependences_outside_loop): New.
	(stmt_generated_in_another_partition): New.
	(add_scalar_computations_to_partition): New.
	(rdg_build_partitions): Call add_scalar_computations_to_partition.

	* gcc.dg/tree-ssa/ldist-pr45948.c: New.
---
 gcc/ChangeLog                                 |    9 ++
 gcc/testsuite/ChangeLog                       |    5 +
 gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c |   23 ++++++
 gcc/tree-loop-distribution.c                  |   98 +++++++++++++++++++++++++
 4 files changed, 135 insertions(+), 0 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c
Sebastian Pop - Dec. 12, 2010, 12:27 a.m.
On Fri, Dec 10, 2010 at 17:50, Sebastian Pop <sebpop@gmail.com> wrote:
> Hi,
>
> This patch fixes the following pattern:
>
> void
> foo (int i, int n)
> {
>  int a[30];
>  int b[30];
>  for (; i < n; i++)
>    a[i] = b[i] = 0;
>
>  while (1)
>    if (b[0])
>      bar (a[i - 1]);
> }
>
> For the first loop, we end up generating two memset zero, and the loop
> completely disappears.  SCEV constant propagation does not apply and
> the computation of the last value of "i" does not occur anymore.
>
> With this patch, we now walk over all the scalar statements of the
> partitions that we know will be code generated with a builtin, and add
> all those scalar computations that do not occur in other partitions
> (not generated as builtins) to a last partition (that captures all the
> computations that have not been loop distributed).
>
> I am testing this patch on amd64-linux.  Ok for trunk after test?

This passed regstrap on amd64-linux.

>
> Thanks,
> Sebastian
>
> 2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
>
>        PR tree-optimization/45948
>        * tree-loop-distribution.c (ssa_name_has_uses_outside_loop_p): New.
>        (stmt_has_scalar_dependences_outside_loop): New.
>        (stmt_generated_in_another_partition): New.
>        (add_scalar_computations_to_partition): New.
>        (rdg_build_partitions): Call add_scalar_computations_to_partition.
>
>        * gcc.dg/tree-ssa/ldist-pr45948.c: New.
> ---
>  gcc/ChangeLog                                 |    9 ++
>  gcc/testsuite/ChangeLog                       |    5 +
>  gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c |   23 ++++++
>  gcc/tree-loop-distribution.c                  |   98 +++++++++++++++++++++++++
>  4 files changed, 135 insertions(+), 0 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 0389806..4e09ef9 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,5 +1,14 @@
>  2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
>
> +       PR tree-optimization/45948
> +       * tree-loop-distribution.c (ssa_name_has_uses_outside_loop_p): New.
> +       (stmt_has_scalar_dependences_outside_loop): New.
> +       (stmt_generated_in_another_partition): New.
> +       (add_scalar_computations_to_partition): New.
> +       (rdg_build_partitions): Call add_scalar_computations_to_partition.
> +
> +2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
> +
>        PR tree-optimization/43023
>        * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p):
>        Removed.
> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
> index 7bb46f3..af60360 100644
> --- a/gcc/testsuite/ChangeLog
> +++ b/gcc/testsuite/ChangeLog
> @@ -1,5 +1,10 @@
>  2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
>
> +       PR tree-optimization/45948
> +       * gcc.dg/tree-ssa/ldist-pr45948.c: New.
> +
> +2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
> +
>        PR tree-optimization/43023
>        * gfortran.dg/ldist-1.f90: Adjust pattern.
>        * gfortran.dg/ldist-pr43023.f90: New.
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c
> new file mode 100644
> index 0000000..3e467bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
> +
> +extern void bar(int);
> +
> +void
> +foo (int i, int n)
> +{
> +  int a[30];
> +  int b[30];
> +  for (; i < n; i++)
> +    a[i] = b[i] = 0;
> +
> +  while (1)
> +    if (b[0])
> +      bar (a[i - 1]);
> +}
> +
> +/* We should apply loop distribution and generate 2 memset (0).  */
> +
> +/* { dg-final { scan-tree-dump "distributed: split to 3" "ldist" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_memset" 4 "ldist" } } */
> +/* { dg-final { cleanup-tree-dump "ldist" } } */
> diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
> index b603209..a9ee67f 100644
> --- a/gcc/tree-loop-distribution.c
> +++ b/gcc/tree-loop-distribution.c
> @@ -808,6 +808,102 @@ fuse_partitions_with_similar_memory_accesses (struct graph *rdg,
>          }
>  }
>
> +/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
> +   the LOOP.  */
> +
> +static bool
> +ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
> +{
> +  imm_use_iterator imm_iter;
> +  use_operand_p use_p;
> +
> +  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
> +    if (loop != loop_containing_stmt (USE_STMT (use_p)))
> +      return true;
> +
> +  return false;
> +}
> +
> +/* Returns true when STMT defines a scalar variable used after the
> +   loop.  */
> +
> +static bool
> +stmt_has_scalar_dependences_outside_loop (gimple stmt)
> +{
> +  tree name;
> +
> +  switch (gimple_code (stmt))
> +    {
> +    case GIMPLE_ASSIGN:
> +      name = gimple_assign_lhs (stmt);
> +      break;
> +
> +    case GIMPLE_PHI:
> +      name = gimple_phi_result (stmt);
> +      break;
> +
> +    default:
> +      return false;
> +    }
> +
> +  return TREE_CODE (name) == SSA_NAME
> +    && ssa_name_has_uses_outside_loop_p (name, loop_containing_stmt (stmt));
> +}
> +
> +/* Returns true when STMT will be code generated in a partition of RDG
> +   different than PART and that will not be code generated as a
> +   builtin.  */
> +
> +static bool
> +stmt_generated_in_another_partition (struct graph *rdg, gimple stmt, int part,
> +                                    VEC (bitmap, heap) *partitions)
> +{
> +  int p;
> +  bitmap pp;
> +  unsigned i;
> +  bitmap_iterator bi;
> +
> +  FOR_EACH_VEC_ELT (bitmap, partitions, p, pp)
> +    if (p != part
> +       && !can_generate_builtin (rdg, pp))
> +      EXECUTE_IF_SET_IN_BITMAP (pp, 0, i, bi)
> +       if (stmt == RDG_STMT (rdg, i))
> +         return true;
> +
> +  return false;
> +}
> +
> +/* For each partition in PARTITIONS that will be code generated using
> +   a builtin, add its scalar computations used after the loop to
> +   PARTITION.  */
> +
> +static void
> +add_scalar_computations_to_partition (struct graph *rdg,
> +                                     VEC (bitmap, heap) *partitions,
> +                                     bitmap partition)
> +{
> +  int p;
> +  bitmap pp;
> +  unsigned i;
> +  bitmap_iterator bi;
> +  bitmap l = BITMAP_ALLOC (NULL);
> +  bitmap pr = BITMAP_ALLOC (NULL);
> +  bool f = false;
> +
> +  FOR_EACH_VEC_ELT (bitmap, partitions, p, pp)
> +    if (can_generate_builtin (rdg, pp))
> +      EXECUTE_IF_SET_IN_BITMAP (pp, 0, i, bi)
> +       if (stmt_has_scalar_dependences_outside_loop (RDG_STMT (rdg, i))
> +           && !stmt_generated_in_another_partition (rdg, RDG_STMT (rdg, i), p,
> +                                                    partitions))
> +         rdg_flag_vertex_and_dependent (rdg, i, partition, l, pr, &f);
> +
> +  rdg_flag_loop_exits (rdg, l, partition, pr, &f);
> +
> +  BITMAP_FREE (pr);
> +  BITMAP_FREE (l);
> +}
> +
>  /* Aggregate several components into a useful partition that is
>    registered in the PARTITIONS vector.  Partitions will be
>    distributed in different loops.  */
> @@ -871,6 +967,8 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
>       free_rdg_components (comps);
>     }
>
> +  add_scalar_computations_to_partition (rdg, *partitions, partition);
> +
>   /* If there is something left in the last partition, save it.  */
>   if (bitmap_count_bits (partition) > 0)
>     VEC_safe_push (bitmap, heap, *partitions, partition);
> --
> 1.7.1
>
>
Richard Guenther - Dec. 13, 2010, 4:46 p.m.
On Sun, Dec 12, 2010 at 1:27 AM, Sebastian Pop <sebpop@gmail.com> wrote:
> On Fri, Dec 10, 2010 at 17:50, Sebastian Pop <sebpop@gmail.com> wrote:
>> Hi,
>>
>> This patch fixes the following pattern:
>>
>> void
>> foo (int i, int n)
>> {
>>  int a[30];
>>  int b[30];
>>  for (; i < n; i++)
>>    a[i] = b[i] = 0;
>>
>>  while (1)
>>    if (b[0])
>>      bar (a[i - 1]);
>> }
>>
>> For the first loop, we end up generating two memset zero, and the loop
>> completely disappears.  SCEV constant propagation does not apply and
>> the computation of the last value of "i" does not occur anymore.
>>
>> With this patch, we now walk over all the scalar statements of the
>> partitions that we know will be code generated with a builtin, and add
>> all those scalar computations that do not occur in other partitions
>> (not generated as builtins) to a last partition (that captures all the
>> computations that have not been loop distributed).
>>
>> I am testing this patch on amd64-linux.  Ok for trunk after test?
>
> This passed regstrap on amd64-linux.

I think the idea is ok, but as loop distribution already has to track
use-def scalar dependences can't we simply add all loop-closed
PHI nodes (args?) to the last partition?  Because it is those that
we need to preserve (I think).  Your patch might effectively do
that, but the extra code looks odd to me, as the loop distribution
machinery would already need to know most of this, no?

Thanks,
Richard.

>>
>> Thanks,
>> Sebastian
>>
>> 2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
>>
>>        PR tree-optimization/45948
>>        * tree-loop-distribution.c (ssa_name_has_uses_outside_loop_p): New.
>>        (stmt_has_scalar_dependences_outside_loop): New.
>>        (stmt_generated_in_another_partition): New.
>>        (add_scalar_computations_to_partition): New.
>>        (rdg_build_partitions): Call add_scalar_computations_to_partition.
>>
>>        * gcc.dg/tree-ssa/ldist-pr45948.c: New.
>> ---
>>  gcc/ChangeLog                                 |    9 ++
>>  gcc/testsuite/ChangeLog                       |    5 +
>>  gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c |   23 ++++++
>>  gcc/tree-loop-distribution.c                  |   98 +++++++++++++++++++++++++
>>  4 files changed, 135 insertions(+), 0 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c
>>
>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> index 0389806..4e09ef9 100644
>> --- a/gcc/ChangeLog
>> +++ b/gcc/ChangeLog
>> @@ -1,5 +1,14 @@
>>  2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
>>
>> +       PR tree-optimization/45948
>> +       * tree-loop-distribution.c (ssa_name_has_uses_outside_loop_p): New.
>> +       (stmt_has_scalar_dependences_outside_loop): New.
>> +       (stmt_generated_in_another_partition): New.
>> +       (add_scalar_computations_to_partition): New.
>> +       (rdg_build_partitions): Call add_scalar_computations_to_partition.
>> +
>> +2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
>> +
>>        PR tree-optimization/43023
>>        * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p):
>>        Removed.
>> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
>> index 7bb46f3..af60360 100644
>> --- a/gcc/testsuite/ChangeLog
>> +++ b/gcc/testsuite/ChangeLog
>> @@ -1,5 +1,10 @@
>>  2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
>>
>> +       PR tree-optimization/45948
>> +       * gcc.dg/tree-ssa/ldist-pr45948.c: New.
>> +
>> +2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
>> +
>>        PR tree-optimization/43023
>>        * gfortran.dg/ldist-1.f90: Adjust pattern.
>>        * gfortran.dg/ldist-pr43023.f90: New.
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c
>> new file mode 100644
>> index 0000000..3e467bd
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c
>> @@ -0,0 +1,23 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
>> +
>> +extern void bar(int);
>> +
>> +void
>> +foo (int i, int n)
>> +{
>> +  int a[30];
>> +  int b[30];
>> +  for (; i < n; i++)
>> +    a[i] = b[i] = 0;
>> +
>> +  while (1)
>> +    if (b[0])
>> +      bar (a[i - 1]);
>> +}
>> +
>> +/* We should apply loop distribution and generate 2 memset (0).  */
>> +
>> +/* { dg-final { scan-tree-dump "distributed: split to 3" "ldist" } } */
>> +/* { dg-final { scan-tree-dump-times "__builtin_memset" 4 "ldist" } } */
>> +/* { dg-final { cleanup-tree-dump "ldist" } } */
>> diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
>> index b603209..a9ee67f 100644
>> --- a/gcc/tree-loop-distribution.c
>> +++ b/gcc/tree-loop-distribution.c
>> @@ -808,6 +808,102 @@ fuse_partitions_with_similar_memory_accesses (struct graph *rdg,
>>          }
>>  }
>>
>> +/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
>> +   the LOOP.  */
>> +
>> +static bool
>> +ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
>> +{
>> +  imm_use_iterator imm_iter;
>> +  use_operand_p use_p;
>> +
>> +  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
>> +    if (loop != loop_containing_stmt (USE_STMT (use_p)))
>> +      return true;
>> +
>> +  return false;
>> +}
>> +
>> +/* Returns true when STMT defines a scalar variable used after the
>> +   loop.  */
>> +
>> +static bool
>> +stmt_has_scalar_dependences_outside_loop (gimple stmt)
>> +{
>> +  tree name;
>> +
>> +  switch (gimple_code (stmt))
>> +    {
>> +    case GIMPLE_ASSIGN:
>> +      name = gimple_assign_lhs (stmt);
>> +      break;
>> +
>> +    case GIMPLE_PHI:
>> +      name = gimple_phi_result (stmt);
>> +      break;
>> +
>> +    default:
>> +      return false;
>> +    }
>> +
>> +  return TREE_CODE (name) == SSA_NAME
>> +    && ssa_name_has_uses_outside_loop_p (name, loop_containing_stmt (stmt));
>> +}
>> +
>> +/* Returns true when STMT will be code generated in a partition of RDG
>> +   different than PART and that will not be code generated as a
>> +   builtin.  */
>> +
>> +static bool
>> +stmt_generated_in_another_partition (struct graph *rdg, gimple stmt, int part,
>> +                                    VEC (bitmap, heap) *partitions)
>> +{
>> +  int p;
>> +  bitmap pp;
>> +  unsigned i;
>> +  bitmap_iterator bi;
>> +
>> +  FOR_EACH_VEC_ELT (bitmap, partitions, p, pp)
>> +    if (p != part
>> +       && !can_generate_builtin (rdg, pp))
>> +      EXECUTE_IF_SET_IN_BITMAP (pp, 0, i, bi)
>> +       if (stmt == RDG_STMT (rdg, i))
>> +         return true;
>> +
>> +  return false;
>> +}
>> +
>> +/* For each partition in PARTITIONS that will be code generated using
>> +   a builtin, add its scalar computations used after the loop to
>> +   PARTITION.  */
>> +
>> +static void
>> +add_scalar_computations_to_partition (struct graph *rdg,
>> +                                     VEC (bitmap, heap) *partitions,
>> +                                     bitmap partition)
>> +{
>> +  int p;
>> +  bitmap pp;
>> +  unsigned i;
>> +  bitmap_iterator bi;
>> +  bitmap l = BITMAP_ALLOC (NULL);
>> +  bitmap pr = BITMAP_ALLOC (NULL);
>> +  bool f = false;
>> +
>> +  FOR_EACH_VEC_ELT (bitmap, partitions, p, pp)
>> +    if (can_generate_builtin (rdg, pp))
>> +      EXECUTE_IF_SET_IN_BITMAP (pp, 0, i, bi)
>> +       if (stmt_has_scalar_dependences_outside_loop (RDG_STMT (rdg, i))
>> +           && !stmt_generated_in_another_partition (rdg, RDG_STMT (rdg, i), p,
>> +                                                    partitions))
>> +         rdg_flag_vertex_and_dependent (rdg, i, partition, l, pr, &f);
>> +
>> +  rdg_flag_loop_exits (rdg, l, partition, pr, &f);
>> +
>> +  BITMAP_FREE (pr);
>> +  BITMAP_FREE (l);
>> +}
>> +
>>  /* Aggregate several components into a useful partition that is
>>    registered in the PARTITIONS vector.  Partitions will be
>>    distributed in different loops.  */
>> @@ -871,6 +967,8 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
>>       free_rdg_components (comps);
>>     }
>>
>> +  add_scalar_computations_to_partition (rdg, *partitions, partition);
>> +
>>   /* If there is something left in the last partition, save it.  */
>>   if (bitmap_count_bits (partition) > 0)
>>     VEC_safe_push (bitmap, heap, *partitions, partition);
>> --
>> 1.7.1
>>
>>
>
Sebastian Pop - Dec. 13, 2010, 5:05 p.m.
On Mon, Dec 13, 2010 at 10:46, Richard Guenther
<richard.guenther@gmail.com> wrote:
> I think the idea is ok, but as loop distribution already has to track
> use-def scalar dependences can't we simply add all loop-closed
> PHI nodes (args?) to the last partition?  Because it is those that
> we need to preserve (I think).  Your patch might effectively do
> that, but the extra code looks odd to me, as the loop distribution
> machinery would already need to know most of this, no?

Note that this patch does use the same code as the loop distribution
machinery: rdg_flag_vertex_and_dependent

Adding all the close phi nodes to the last partition would duplicate a
lot of the scalar computations that are already computed in other
partitions: for example,

for (i = 0, j = 0; i < n; i++, j+=2)
{
  A[i] = 0;
  B[j] = 0;
}

use (i, j);

in this code we would have a close phi node for both i and j, and it
is possible to distribute the memset zero on array A so the
computation of i should be duplicated on the last partition, but the
loop on j will still remain as B[j] cannot be generated with a memset
zero, and so if you add the close phi of j to the last partition you
would duplicate the scalar computation of j.

So we have to detect all the partitions to find out which scalar
computations were not code generated in these partitions.

Sebastian
Richard Guenther - Dec. 15, 2010, 2:08 a.m.
On Mon, Dec 13, 2010 at 6:05 PM, Sebastian Pop <sebpop@gmail.com> wrote:
> On Mon, Dec 13, 2010 at 10:46, Richard Guenther
> <richard.guenther@gmail.com> wrote:
>> I think the idea is ok, but as loop distribution already has to track
>> use-def scalar dependences can't we simply add all loop-closed
>> PHI nodes (args?) to the last partition?  Because it is those that
>> we need to preserve (I think).  Your patch might effectively do
>> that, but the extra code looks odd to me, as the loop distribution
>> machinery would already need to know most of this, no?
>
> Note that this patch does use the same code as the loop distribution
> machinery: rdg_flag_vertex_and_dependent
>
> Adding all the close phi nodes to the last partition would duplicate a
> lot of the scalar computations that are already computed in other
> partitions: for example,
>
> for (i = 0, j = 0; i < n; i++, j+=2)
> {
>  A[i] = 0;
>  B[j] = 0;
> }
>
> use (i, j);
>
> in this code we would have a close phi node for both i and j, and it
> is possible to distribute the memset zero on array A so the
> computation of i should be duplicated on the last partition, but the
> loop on j will still remain as B[j] cannot be generated with a memset
> zero, and so if you add the close phi of j to the last partition you
> would duplicate the scalar computation of j.
>
> So we have to detect all the partitions to find out which scalar
> computations were not code generated in these partitions.

Hm, that's true.  Your patch is ok then, as is.

Thanks,
Richard.

> Sebastian
>

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 0389806..4e09ef9 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,14 @@ 
 2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
 
+	PR tree-optimization/45948
+	* tree-loop-distribution.c (ssa_name_has_uses_outside_loop_p): New.
+	(stmt_has_scalar_dependences_outside_loop): New.
+	(stmt_generated_in_another_partition): New.
+	(add_scalar_computations_to_partition): New.
+	(rdg_build_partitions): Call add_scalar_computations_to_partition.
+
+2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
+
 	PR tree-optimization/43023
 	* tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p):
 	Removed.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 7bb46f3..af60360 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,10 @@ 
 2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
 
+	PR tree-optimization/45948
+	* gcc.dg/tree-ssa/ldist-pr45948.c: New.
+
+2010-12-10  Sebastian Pop  <sebastian.pop@amd.com>
+
 	PR tree-optimization/43023
 	* gfortran.dg/ldist-1.f90: Adjust pattern.
 	* gfortran.dg/ldist-pr43023.f90: New.
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c
new file mode 100644
index 0000000..3e467bd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-pr45948.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
+
+extern void bar(int);
+
+void
+foo (int i, int n)
+{
+  int a[30];
+  int b[30];
+  for (; i < n; i++)
+    a[i] = b[i] = 0;
+
+  while (1)
+    if (b[0])
+      bar (a[i - 1]);
+}
+
+/* We should apply loop distribution and generate 2 memset (0).  */
+
+/* { dg-final { scan-tree-dump "distributed: split to 3" "ldist" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_memset" 4 "ldist" } } */
+/* { dg-final { cleanup-tree-dump "ldist" } } */
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index b603209..a9ee67f 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -808,6 +808,102 @@  fuse_partitions_with_similar_memory_accesses (struct graph *rdg,
 	  }
 }
 
+/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
+   the LOOP.  */
+
+static bool
+ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
+{
+  imm_use_iterator imm_iter;
+  use_operand_p use_p;
+
+  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
+    if (loop != loop_containing_stmt (USE_STMT (use_p)))
+      return true;
+
+  return false;
+}
+
+/* Returns true when STMT defines a scalar variable used after the
+   loop.  */
+
+static bool
+stmt_has_scalar_dependences_outside_loop (gimple stmt)
+{
+  tree name;
+
+  switch (gimple_code (stmt))
+    {
+    case GIMPLE_ASSIGN:
+      name = gimple_assign_lhs (stmt);
+      break;
+
+    case GIMPLE_PHI:
+      name = gimple_phi_result (stmt);
+      break;
+
+    default:
+      return false;
+    }
+
+  return TREE_CODE (name) == SSA_NAME
+    && ssa_name_has_uses_outside_loop_p (name, loop_containing_stmt (stmt));
+}
+
+/* Returns true when STMT will be code generated in a partition of RDG
+   different than PART and that will not be code generated as a
+   builtin.  */
+
+static bool
+stmt_generated_in_another_partition (struct graph *rdg, gimple stmt, int part,
+				     VEC (bitmap, heap) *partitions)
+{
+  int p;
+  bitmap pp;
+  unsigned i;
+  bitmap_iterator bi;
+
+  FOR_EACH_VEC_ELT (bitmap, partitions, p, pp)
+    if (p != part
+	&& !can_generate_builtin (rdg, pp))
+      EXECUTE_IF_SET_IN_BITMAP (pp, 0, i, bi)
+	if (stmt == RDG_STMT (rdg, i))
+	  return true;
+
+  return false;
+}
+
+/* For each partition in PARTITIONS that will be code generated using
+   a builtin, add its scalar computations used after the loop to
+   PARTITION.  */
+
+static void
+add_scalar_computations_to_partition (struct graph *rdg,
+				      VEC (bitmap, heap) *partitions,
+				      bitmap partition)
+{
+  int p;
+  bitmap pp;
+  unsigned i;
+  bitmap_iterator bi;
+  bitmap l = BITMAP_ALLOC (NULL);
+  bitmap pr = BITMAP_ALLOC (NULL);
+  bool f = false;
+
+  FOR_EACH_VEC_ELT (bitmap, partitions, p, pp)
+    if (can_generate_builtin (rdg, pp))
+      EXECUTE_IF_SET_IN_BITMAP (pp, 0, i, bi)
+	if (stmt_has_scalar_dependences_outside_loop (RDG_STMT (rdg, i))
+	    && !stmt_generated_in_another_partition (rdg, RDG_STMT (rdg, i), p,
+						     partitions))
+	  rdg_flag_vertex_and_dependent (rdg, i, partition, l, pr, &f);
+
+  rdg_flag_loop_exits (rdg, l, partition, pr, &f);
+
+  BITMAP_FREE (pr);
+  BITMAP_FREE (l);
+}
+
 /* Aggregate several components into a useful partition that is
    registered in the PARTITIONS vector.  Partitions will be
    distributed in different loops.  */
@@ -871,6 +967,8 @@  rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
       free_rdg_components (comps);
     }
 
+  add_scalar_computations_to_partition (rdg, *partitions, partition);
+
   /* If there is something left in the last partition, save it.  */
   if (bitmap_count_bits (partition) > 0)
     VEC_safe_push (bitmap, heap, *partitions, partition);