diff mbox series

tree-optimization/109849 - missed code hoisting

Message ID 20230523095533.B1C0713588@imap2.suse-dmz.suse.de
State New
Headers show
Series tree-optimization/109849 - missed code hoisting | expand

Commit Message

Richard Biener May 23, 2023, 9:55 a.m. UTC
The following fixes code hoisting to properly consider ANTIC_OUT instead
of ANTIC_IN.  That's a bit expensive to re-compute but since we no
longer iterate we're doing this only once per BB which should be
acceptable.  This avoids missing hoistings to the end of blocks where
something in the block clobbers the hoisted value.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

	PR tree-optimization/109849
	* tree-ssa-pre.cc (do_hoist_insertion): Compute ANTIC_OUT
	and use that to determine what to hoist.

	* gcc.dg/tree-ssa/ssa-hoist-8.c: New testcase.
---
 gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c | 22 ++++++++++
 gcc/tree-ssa-pre.cc                         | 48 ++++++++++++++++++---
 2 files changed, 64 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c

Comments

Christophe Lyon May 24, 2023, 10:07 a.m. UTC | #1
Hi Richard,

On Tue, 23 May 2023 at 11:55, Richard Biener via Gcc-patches <
gcc-patches@gcc.gnu.org> wrote:

> The following fixes code hoisting to properly consider ANTIC_OUT instead
> of ANTIC_IN.  That's a bit expensive to re-compute but since we no
> longer iterate we're doing this only once per BB which should be
> acceptable.  This avoids missing hoistings to the end of blocks where
> something in the block clobbers the hoisted value.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
>
>         PR tree-optimization/109849
>         * tree-ssa-pre.cc (do_hoist_insertion): Compute ANTIC_OUT
>         and use that to determine what to hoist.
>
>         * gcc.dg/tree-ssa/ssa-hoist-8.c: New testcase.
>

This patch causes a regression on aarch64:
gcc.target/aarch64/sve/fmla_2.c: \\tst1d found 3 times
 FAIL: gcc.target/aarch64/sve/fmla_2.c scan-assembler-times \\tst1d 2


We used to generate:
        mov     x6, 0
        mov     w7, 55
        whilelo p7.d, wzr, w7
        .p2align 3,,7
.L2:
        ld1d    z30.d, p7/z, [x5, x6, lsl 3]
        ld1d    z31.d, p7/z, [x4, x6, lsl 3]
        cmpne   p6.d, p7/z, z30.d, #0
        ld1d    z30.d, p7/z, [x3, x6, lsl 3]
        ld1d    z29.d, p6/z, [x2, x6, lsl 3]
        movprfx z28, z30
        fmla    z28.d, p6/m, z31.d, z29.d
        fmla    z31.d, p6/m, z30.d, z29.d
        st1d    z28.d, p7, [x1, x6, lsl 3]
        st1d    z31.d, p7, [x0, x6, lsl 3]
        incd    x6
        whilelo p7.d, w6, w7
        b.any   .L2


But now:
        mov     x6, 0
        mov     w7, 55
        ptrue   p4.b, all
        whilelo p7.d, wzr, w7
        .p2align 3,,7
.L2:
        ld1d    z30.d, p7/z, [x5, x6, lsl 3]
        ld1d    z31.d, p7/z, [x4, x6, lsl 3]
        cmpne   p6.d, p7/z, z30.d, #0
        cmpeq   p5.d, p7/z, z30.d, #0
        ld1d    z29.d, p6/z, [x2, x6, lsl 3]
        ld1d    z28.d, p6/z, [x3, x6, lsl 3]
        ld1d    z30.d, p5/z, [x3, x6, lsl 3]
        movprfx z27, z31
        fmla    z27.d, p4/m, z29.d, z28.d
        movprfx z30.d, p6/m, z28.d
        fmla    z30.d, p6/m, z31.d, z29.d
        st1d    z27.d, p6, [x0, x6, lsl 3]
        st1d    z30.d, p7, [x1, x6, lsl 3]
        st1d    z31.d, p5, [x0, x6, lsl 3]
        incd    x6
        whilelo p7.d, w6, w7
        b.any   .L2


Christophe

---
>  gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c | 22 ++++++++++
>  gcc/tree-ssa-pre.cc                         | 48 ++++++++++++++++++---
>  2 files changed, 64 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c
> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c
> new file mode 100644
> index 00000000000..66bb48e0dc1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-pre-stats" } */
> +
> +int mem;
> +void foo ();
> +int bar (int flag)
> +{
> +  int res;
> +  foo ();
> +  /* Hoist the load of mem here even though foo () clobbers it.  */
> +  if (flag)
> +    res = mem;
> +  else
> +    {
> +      res = mem;
> +      mem = 2;
> +    }
> +  return res;
> +}
> +
> +/* { dg-final { scan-tree-dump "HOIST inserted: 1" "pre" } } */
> +/* { dg-final { scan-tree-dump-times " = mem;" 1 "pre" } } */
> diff --git a/gcc/tree-ssa-pre.cc b/gcc/tree-ssa-pre.cc
> index 1f7eea93c16..d56431b4145 100644
> --- a/gcc/tree-ssa-pre.cc
> +++ b/gcc/tree-ssa-pre.cc
> @@ -3622,19 +3622,51 @@ do_hoist_insertion (basic_block block)
>        && stmt_ends_bb_p (gsi_stmt (last)))
>      return false;
>
> -  /* Compute the set of hoistable expressions from ANTIC_IN.  First
> compute
> +  /* We have multiple successors, compute ANTIC_OUT by taking the
> intersection
> +     of all of ANTIC_IN translating through PHI nodes.  Note we do not
> have to
> +     worry about iteration stability here so just intersect the
> expression sets
> +     as well.  This is a simplification of what we do in
> compute_antic_aux.  */
> +  bitmap_set_t ANTIC_OUT = bitmap_set_new ();
> +  bool first = true;
> +  FOR_EACH_EDGE (e, ei, block->succs)
> +    {
> +      if (first)
> +       {
> +         phi_translate_set (ANTIC_OUT, ANTIC_IN (e->dest), e);
> +         first = false;
> +       }
> +      else if (!gimple_seq_empty_p (phi_nodes (e->dest)))
> +       {
> +         bitmap_set_t tmp = bitmap_set_new ();
> +         phi_translate_set (tmp, ANTIC_IN (e->dest), e);
> +         bitmap_and_into (&ANTIC_OUT->values, &tmp->values);
> +         bitmap_and_into (&ANTIC_OUT->expressions, &tmp->expressions);
> +         bitmap_set_free (tmp);
> +       }
> +      else
> +       {
> +         bitmap_and_into (&ANTIC_OUT->values, &ANTIC_IN
> (e->dest)->values);
> +         bitmap_and_into (&ANTIC_OUT->expressions,
> +                          &ANTIC_IN (e->dest)->expressions);
> +       }
> +    }
> +
> +  /* Compute the set of hoistable expressions from ANTIC_OUT.  First
> compute
>       hoistable values.  */
>    bitmap_set hoistable_set;
>
> -  /* A hoistable value must be in ANTIC_IN(block)
> +  /* A hoistable value must be in ANTIC_OUT(block)
>       but not in AVAIL_OUT(BLOCK).  */
>    bitmap_initialize (&hoistable_set.values, &grand_bitmap_obstack);
>    bitmap_and_compl (&hoistable_set.values,
> -                   &ANTIC_IN (block)->values, &AVAIL_OUT (block)->values);
> +                   &ANTIC_OUT->values, &AVAIL_OUT (block)->values);
>
>    /* Short-cut for a common case: hoistable_set is empty.  */
>    if (bitmap_empty_p (&hoistable_set.values))
> -    return false;
> +    {
> +      bitmap_set_free (ANTIC_OUT);
> +      return false;
> +    }
>
>    /* Compute which of the hoistable values is in AVAIL_OUT of
>       at least one of the successors of BLOCK.  */
> @@ -3652,11 +3684,14 @@ do_hoist_insertion (basic_block block)
>
>    /* Short-cut for a common case: availout_in_some is empty.  */
>    if (bitmap_empty_p (&availout_in_some))
> -    return false;
> +    {
> +      bitmap_set_free (ANTIC_OUT);
> +      return false;
> +    }
>
>    /* Hack hoitable_set in-place so we can use
> sorted_array_from_bitmap_set.  */
>    bitmap_move (&hoistable_set.values, &availout_in_some);
> -  hoistable_set.expressions = ANTIC_IN (block)->expressions;
> +  hoistable_set.expressions = ANTIC_OUT->expressions;
>
>    /* Now finally construct the topological-ordered expression set.  */
>    vec<pre_expr> exprs = sorted_array_from_bitmap_set (&hoistable_set);
> @@ -3731,6 +3766,7 @@ do_hoist_insertion (basic_block block)
>      }
>
>    exprs.release ();
> +  bitmap_set_free (ANTIC_OUT);
>
>    return new_stuff;
>  }
> --
> 2.35.3
>
Richard Biener May 24, 2023, 10:41 a.m. UTC | #2
On Wed, 24 May 2023, Christophe Lyon wrote:

> Hi Richard,
> 
> On Tue, 23 May 2023 at 11:55, Richard Biener via Gcc-patches <
> gcc-patches@gcc.gnu.org> wrote:
> 
> > The following fixes code hoisting to properly consider ANTIC_OUT instead
> > of ANTIC_IN.  That's a bit expensive to re-compute but since we no
> > longer iterate we're doing this only once per BB which should be
> > acceptable.  This avoids missing hoistings to the end of blocks where
> > something in the block clobbers the hoisted value.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
> >
> >         PR tree-optimization/109849
> >         * tree-ssa-pre.cc (do_hoist_insertion): Compute ANTIC_OUT
> >         and use that to determine what to hoist.
> >
> >         * gcc.dg/tree-ssa/ssa-hoist-8.c: New testcase.
> >
> 
> This patch causes a regression on aarch64:
> gcc.target/aarch64/sve/fmla_2.c: \\tst1d found 3 times
>  FAIL: gcc.target/aarch64/sve/fmla_2.c scan-assembler-times \\tst1d 2
> 
> 
> We used to generate:
>         mov     x6, 0
>         mov     w7, 55
>         whilelo p7.d, wzr, w7
>         .p2align 3,,7
> .L2:
>         ld1d    z30.d, p7/z, [x5, x6, lsl 3]
>         ld1d    z31.d, p7/z, [x4, x6, lsl 3]
>         cmpne   p6.d, p7/z, z30.d, #0
>         ld1d    z30.d, p7/z, [x3, x6, lsl 3]
>         ld1d    z29.d, p6/z, [x2, x6, lsl 3]
>         movprfx z28, z30
>         fmla    z28.d, p6/m, z31.d, z29.d
>         fmla    z31.d, p6/m, z30.d, z29.d
>         st1d    z28.d, p7, [x1, x6, lsl 3]
>         st1d    z31.d, p7, [x0, x6, lsl 3]
>         incd    x6
>         whilelo p7.d, w6, w7
>         b.any   .L2
> 
> 
> But now:
>         mov     x6, 0
>         mov     w7, 55
>         ptrue   p4.b, all
>         whilelo p7.d, wzr, w7
>         .p2align 3,,7
> .L2:
>         ld1d    z30.d, p7/z, [x5, x6, lsl 3]
>         ld1d    z31.d, p7/z, [x4, x6, lsl 3]
>         cmpne   p6.d, p7/z, z30.d, #0
>         cmpeq   p5.d, p7/z, z30.d, #0
>         ld1d    z29.d, p6/z, [x2, x6, lsl 3]
>         ld1d    z28.d, p6/z, [x3, x6, lsl 3]
>         ld1d    z30.d, p5/z, [x3, x6, lsl 3]
>         movprfx z27, z31
>         fmla    z27.d, p4/m, z29.d, z28.d
>         movprfx z30.d, p6/m, z28.d
>         fmla    z30.d, p6/m, z31.d, z29.d
>         st1d    z27.d, p6, [x0, x6, lsl 3]
>         st1d    z30.d, p7, [x1, x6, lsl 3]
>         st1d    z31.d, p5, [x0, x6, lsl 3]
>         incd    x6
>         whilelo p7.d, w6, w7
>         b.any   .L2

Thanks for reporting.  I'm testing the following together with
a testcase that's not architecture specific.

Richard.

From 2340df60dd9192b30b02de5b34f9cb7c16806430 Mon Sep 17 00:00:00 2001
From: Richard Biener <rguenther@suse.de>
Date: Wed, 24 May 2023 12:36:28 +0200
Subject: [PATCH] tree-optimization/109849 - fix fallout of PRE hoisting change
To: gcc-patches@gcc.gnu.org

The PR109849 fix made us no longer hoist some memory loads because
of the expression set intersection.  We can still avoid to compute
the union by simply taking the first sets expressions and leave
the pruning of expressions with values not suitable for hoisting
to sorted_array_from_bitmap_set.

	PR tree-optimization/109849
	* tree-ssa-pre.cc (do_hoist_insertion): Do not intersect
	expressions but take the first sets.

	* gcc.dg/tree-ssa/ssa-hoist-9.c: New testcase.
---
 gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c | 20 ++++++++++++++++++++
 gcc/tree-ssa-pre.cc                         | 12 ++++--------
 2 files changed, 24 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c
new file mode 100644
index 00000000000..388f79fd80f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-pre-stats" } */
+
+int foo (int flag, int * __restrict a, int * __restrict b)
+{
+  int res;
+  if (flag)
+    res = *a + *b;
+  else
+    {
+      res = *a;
+      *a = 1;
+      res += *b;
+    }
+  return res;
+}
+
+/* { dg-final { scan-tree-dump "HOIST inserted: 3" "pre" } } */
+/* { dg-final { scan-tree-dump-times " = \\\*" 2 "pre" } } */
+/* { dg-final { scan-tree-dump-times " = \[^\r\n\]* \\\+ \[^\r\n\]*;" 1 "pre" } } */
diff --git a/gcc/tree-ssa-pre.cc b/gcc/tree-ssa-pre.cc
index b1ceea90a8e..7bbfa5ac43d 100644
--- a/gcc/tree-ssa-pre.cc
+++ b/gcc/tree-ssa-pre.cc
@@ -3625,8 +3625,9 @@ do_hoist_insertion (basic_block block)
 
   /* We have multiple successors, compute ANTIC_OUT by taking the intersection
      of all of ANTIC_IN translating through PHI nodes.  Note we do not have to
-     worry about iteration stability here so just intersect the expression sets
-     as well.  This is a simplification of what we do in compute_antic_aux.  */
+     worry about iteration stability here so just use the expression set
+     from the first set and prune that by sorted_array_from_bitmap_set.
+     This is a simplification of what we do in compute_antic_aux.  */
   bitmap_set_t ANTIC_OUT = bitmap_set_new ();
   bool first = true;
   FOR_EACH_EDGE (e, ei, block->succs)
@@ -3641,15 +3642,10 @@ do_hoist_insertion (basic_block block)
 	  bitmap_set_t tmp = bitmap_set_new ();
 	  phi_translate_set (tmp, ANTIC_IN (e->dest), e);
 	  bitmap_and_into (&ANTIC_OUT->values, &tmp->values);
-	  bitmap_and_into (&ANTIC_OUT->expressions, &tmp->expressions);
 	  bitmap_set_free (tmp);
 	}
       else
-	{
-	  bitmap_and_into (&ANTIC_OUT->values, &ANTIC_IN (e->dest)->values);
-	  bitmap_and_into (&ANTIC_OUT->expressions,
-			   &ANTIC_IN (e->dest)->expressions);
-	}
+	bitmap_and_into (&ANTIC_OUT->values, &ANTIC_IN (e->dest)->values);
     }
 
   /* Compute the set of hoistable expressions from ANTIC_OUT.  First compute
Christophe Lyon May 24, 2023, 12:45 p.m. UTC | #3
On Wed, 24 May 2023 at 12:41, Richard Biener <rguenther@suse.de> wrote:

> On Wed, 24 May 2023, Christophe Lyon wrote:
>
> > Hi Richard,
> >
> > On Tue, 23 May 2023 at 11:55, Richard Biener via Gcc-patches <
> > gcc-patches@gcc.gnu.org> wrote:
> >
> > > The following fixes code hoisting to properly consider ANTIC_OUT
> instead
> > > of ANTIC_IN.  That's a bit expensive to re-compute but since we no
> > > longer iterate we're doing this only once per BB which should be
> > > acceptable.  This avoids missing hoistings to the end of blocks where
> > > something in the block clobbers the hoisted value.
> > >
> > > Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
> > >
> > >         PR tree-optimization/109849
> > >         * tree-ssa-pre.cc (do_hoist_insertion): Compute ANTIC_OUT
> > >         and use that to determine what to hoist.
> > >
> > >         * gcc.dg/tree-ssa/ssa-hoist-8.c: New testcase.
> > >
> >
> > This patch causes a regression on aarch64:
> > gcc.target/aarch64/sve/fmla_2.c: \\tst1d found 3 times
> >  FAIL: gcc.target/aarch64/sve/fmla_2.c scan-assembler-times \\tst1d 2
> >
> >
> > We used to generate:
> >         mov     x6, 0
> >         mov     w7, 55
> >         whilelo p7.d, wzr, w7
> >         .p2align 3,,7
> > .L2:
> >         ld1d    z30.d, p7/z, [x5, x6, lsl 3]
> >         ld1d    z31.d, p7/z, [x4, x6, lsl 3]
> >         cmpne   p6.d, p7/z, z30.d, #0
> >         ld1d    z30.d, p7/z, [x3, x6, lsl 3]
> >         ld1d    z29.d, p6/z, [x2, x6, lsl 3]
> >         movprfx z28, z30
> >         fmla    z28.d, p6/m, z31.d, z29.d
> >         fmla    z31.d, p6/m, z30.d, z29.d
> >         st1d    z28.d, p7, [x1, x6, lsl 3]
> >         st1d    z31.d, p7, [x0, x6, lsl 3]
> >         incd    x6
> >         whilelo p7.d, w6, w7
> >         b.any   .L2
> >
> >
> > But now:
> >         mov     x6, 0
> >         mov     w7, 55
> >         ptrue   p4.b, all
> >         whilelo p7.d, wzr, w7
> >         .p2align 3,,7
> > .L2:
> >         ld1d    z30.d, p7/z, [x5, x6, lsl 3]
> >         ld1d    z31.d, p7/z, [x4, x6, lsl 3]
> >         cmpne   p6.d, p7/z, z30.d, #0
> >         cmpeq   p5.d, p7/z, z30.d, #0
> >         ld1d    z29.d, p6/z, [x2, x6, lsl 3]
> >         ld1d    z28.d, p6/z, [x3, x6, lsl 3]
> >         ld1d    z30.d, p5/z, [x3, x6, lsl 3]
> >         movprfx z27, z31
> >         fmla    z27.d, p4/m, z29.d, z28.d
> >         movprfx z30.d, p6/m, z28.d
> >         fmla    z30.d, p6/m, z31.d, z29.d
> >         st1d    z27.d, p6, [x0, x6, lsl 3]
> >         st1d    z30.d, p7, [x1, x6, lsl 3]
> >         st1d    z31.d, p5, [x0, x6, lsl 3]
> >         incd    x6
> >         whilelo p7.d, w6, w7
> >         b.any   .L2
>
> Thanks for reporting.  I'm testing the following together with
> a testcase that's not architecture specific.
>
> Richard.
>
> From 2340df60dd9192b30b02de5b34f9cb7c16806430 Mon Sep 17 00:00:00 2001
> From: Richard Biener <rguenther@suse.de>
> Date: Wed, 24 May 2023 12:36:28 +0200
> Subject: [PATCH] tree-optimization/109849 - fix fallout of PRE hoisting
> change
> To: gcc-patches@gcc.gnu.org
>
> The PR109849 fix made us no longer hoist some memory loads because
> of the expression set intersection.  We can still avoid to compute
> the union by simply taking the first sets expressions and leave
> the pruning of expressions with values not suitable for hoisting
> to sorted_array_from_bitmap_set.
>
>         PR tree-optimization/109849
>         * tree-ssa-pre.cc (do_hoist_insertion): Do not intersect
>         expressions but take the first sets.
>
>         * gcc.dg/tree-ssa/ssa-hoist-9.c: New testcase.
>

Thanks, I confirm that this fixes the problem I reported on aarch64.

Christophe


> ---
>  gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c | 20 ++++++++++++++++++++
>  gcc/tree-ssa-pre.cc                         | 12 ++++--------
>  2 files changed, 24 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c
> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c
> new file mode 100644
> index 00000000000..388f79fd80f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-pre-stats" } */
> +
> +int foo (int flag, int * __restrict a, int * __restrict b)
> +{
> +  int res;
> +  if (flag)
> +    res = *a + *b;
> +  else
> +    {
> +      res = *a;
> +      *a = 1;
> +      res += *b;
> +    }
> +  return res;
> +}
> +
> +/* { dg-final { scan-tree-dump "HOIST inserted: 3" "pre" } } */
> +/* { dg-final { scan-tree-dump-times " = \\\*" 2 "pre" } } */
> +/* { dg-final { scan-tree-dump-times " = \[^\r\n\]* \\\+ \[^\r\n\]*;" 1
> "pre" } } */
> diff --git a/gcc/tree-ssa-pre.cc b/gcc/tree-ssa-pre.cc
> index b1ceea90a8e..7bbfa5ac43d 100644
> --- a/gcc/tree-ssa-pre.cc
> +++ b/gcc/tree-ssa-pre.cc
> @@ -3625,8 +3625,9 @@ do_hoist_insertion (basic_block block)
>
>    /* We have multiple successors, compute ANTIC_OUT by taking the
> intersection
>       of all of ANTIC_IN translating through PHI nodes.  Note we do not
> have to
> -     worry about iteration stability here so just intersect the
> expression sets
> -     as well.  This is a simplification of what we do in
> compute_antic_aux.  */
> +     worry about iteration stability here so just use the expression set
> +     from the first set and prune that by sorted_array_from_bitmap_set.
> +     This is a simplification of what we do in compute_antic_aux.  */
>    bitmap_set_t ANTIC_OUT = bitmap_set_new ();
>    bool first = true;
>    FOR_EACH_EDGE (e, ei, block->succs)
> @@ -3641,15 +3642,10 @@ do_hoist_insertion (basic_block block)
>           bitmap_set_t tmp = bitmap_set_new ();
>           phi_translate_set (tmp, ANTIC_IN (e->dest), e);
>           bitmap_and_into (&ANTIC_OUT->values, &tmp->values);
> -         bitmap_and_into (&ANTIC_OUT->expressions, &tmp->expressions);
>           bitmap_set_free (tmp);
>         }
>        else
> -       {
> -         bitmap_and_into (&ANTIC_OUT->values, &ANTIC_IN
> (e->dest)->values);
> -         bitmap_and_into (&ANTIC_OUT->expressions,
> -                          &ANTIC_IN (e->dest)->expressions);
> -       }
> +       bitmap_and_into (&ANTIC_OUT->values, &ANTIC_IN (e->dest)->values);
>      }
>
>    /* Compute the set of hoistable expressions from ANTIC_OUT.  First
> compute
> --
> 2.35.3
>
>
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c
new file mode 100644
index 00000000000..66bb48e0dc1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-pre-stats" } */
+
+int mem;
+void foo ();
+int bar (int flag)
+{
+  int res;
+  foo ();
+  /* Hoist the load of mem here even though foo () clobbers it.  */
+  if (flag)
+    res = mem;
+  else
+    {
+      res = mem;
+      mem = 2;
+    }
+  return res;
+}
+
+/* { dg-final { scan-tree-dump "HOIST inserted: 1" "pre" } } */
+/* { dg-final { scan-tree-dump-times " = mem;" 1 "pre" } } */
diff --git a/gcc/tree-ssa-pre.cc b/gcc/tree-ssa-pre.cc
index 1f7eea93c16..d56431b4145 100644
--- a/gcc/tree-ssa-pre.cc
+++ b/gcc/tree-ssa-pre.cc
@@ -3622,19 +3622,51 @@  do_hoist_insertion (basic_block block)
       && stmt_ends_bb_p (gsi_stmt (last)))
     return false;
 
-  /* Compute the set of hoistable expressions from ANTIC_IN.  First compute
+  /* We have multiple successors, compute ANTIC_OUT by taking the intersection
+     of all of ANTIC_IN translating through PHI nodes.  Note we do not have to
+     worry about iteration stability here so just intersect the expression sets
+     as well.  This is a simplification of what we do in compute_antic_aux.  */
+  bitmap_set_t ANTIC_OUT = bitmap_set_new ();
+  bool first = true;
+  FOR_EACH_EDGE (e, ei, block->succs)
+    {
+      if (first)
+	{
+	  phi_translate_set (ANTIC_OUT, ANTIC_IN (e->dest), e);
+	  first = false;
+	}
+      else if (!gimple_seq_empty_p (phi_nodes (e->dest)))
+	{
+	  bitmap_set_t tmp = bitmap_set_new ();
+	  phi_translate_set (tmp, ANTIC_IN (e->dest), e);
+	  bitmap_and_into (&ANTIC_OUT->values, &tmp->values);
+	  bitmap_and_into (&ANTIC_OUT->expressions, &tmp->expressions);
+	  bitmap_set_free (tmp);
+	}
+      else
+	{
+	  bitmap_and_into (&ANTIC_OUT->values, &ANTIC_IN (e->dest)->values);
+	  bitmap_and_into (&ANTIC_OUT->expressions,
+			   &ANTIC_IN (e->dest)->expressions);
+	}
+    }
+
+  /* Compute the set of hoistable expressions from ANTIC_OUT.  First compute
      hoistable values.  */
   bitmap_set hoistable_set;
 
-  /* A hoistable value must be in ANTIC_IN(block)
+  /* A hoistable value must be in ANTIC_OUT(block)
      but not in AVAIL_OUT(BLOCK).  */
   bitmap_initialize (&hoistable_set.values, &grand_bitmap_obstack);
   bitmap_and_compl (&hoistable_set.values,
-		    &ANTIC_IN (block)->values, &AVAIL_OUT (block)->values);
+		    &ANTIC_OUT->values, &AVAIL_OUT (block)->values);
 
   /* Short-cut for a common case: hoistable_set is empty.  */
   if (bitmap_empty_p (&hoistable_set.values))
-    return false;
+    {
+      bitmap_set_free (ANTIC_OUT);
+      return false;
+    }
 
   /* Compute which of the hoistable values is in AVAIL_OUT of
      at least one of the successors of BLOCK.  */
@@ -3652,11 +3684,14 @@  do_hoist_insertion (basic_block block)
 
   /* Short-cut for a common case: availout_in_some is empty.  */
   if (bitmap_empty_p (&availout_in_some))
-    return false;
+    {
+      bitmap_set_free (ANTIC_OUT);
+      return false;
+    }
 
   /* Hack hoitable_set in-place so we can use sorted_array_from_bitmap_set.  */
   bitmap_move (&hoistable_set.values, &availout_in_some);
-  hoistable_set.expressions = ANTIC_IN (block)->expressions;
+  hoistable_set.expressions = ANTIC_OUT->expressions;
 
   /* Now finally construct the topological-ordered expression set.  */
   vec<pre_expr> exprs = sorted_array_from_bitmap_set (&hoistable_set);
@@ -3731,6 +3766,7 @@  do_hoist_insertion (basic_block block)
     }
 
   exprs.release ();
+  bitmap_set_free (ANTIC_OUT);
 
   return new_stuff;
 }