[PR43864] Gimple level duplicate block cleanup.

Message ID 4DEF4408.4040001@codesourcery.com
State New

Commit Message

Tom de Vries June 8, 2011, 9:42 a.m. UTC
Hi Richard,

I have a patch for PR43864. The patch adds a gimple level duplicate block
cleanup. The patch has been bootstrapped and reg-tested on x86_64, and
reg-tested on ARM. The size impact on ARM for spec2000 is shown in the following
table (%, lower is better).

                     none            pic
                thumb1  thumb2  thumb1 thumb2
spec2000          99.9    99.9    99.8   99.8

PR43864 is currently marked as a duplicate of PR20070, but I'm not sure that the
optimizations proposed in PR20070 would fix this PR.

The problem in this PR is that, when compiling the example below with -O2, the
generated code should contain only one call to free. The original problem is
formulated in terms of -Os, and with -Os we currently do generate only one call
to free, although still not the smallest code possible. I'll show the -O2 case
here, since that's similar to the original PR.

#include <stdio.h>
void foo (char*, FILE*);
char* hprofStartupp(char *outputFileName, char *ctx)
{
    char fileName[1000];
    FILE *fp;
    sprintf(fileName, outputFileName);
    if (access(fileName, 1) == 0) {
        free(ctx);
        return 0;
    }

    fp = fopen(fileName, 0);
    if (fp == 0) {
        free(ctx);
        return 0;
    }

    foo(outputFileName, fp);

    return ctx;
}

AFAIU, there are 2 complementary methods of rtl optimization proposed in PR20070:
- Merging 2 blocks which are identical except for input registers, by using a
  conditional move to choose between the different input registers.
- Merging 2 blocks which have different local registers, by ignoring those
  differences.

Blocks .L6 and .L7 have no difference in local registers, but they have a
difference in input registers: r3 and r1. Replacing the move to r5 by a
conditional move would probably be beneficial in terms of size, but it's not
clear what condition the conditional move should be using. Calculating such a
condition would add size and lengthen the execution path (see the sketch
following the assembly below).

gcc -O2 -march=armv7-a -mthumb pr43864.c -S:
...
	push	{r4, r5, lr}
	mov	r4, r0
	sub	sp, sp, #1004
	mov	r5, r1
	mov	r0, sp
	mov	r1, r4
	bl	sprintf
	mov	r0, sp
	movs	r1, #1
	bl	access
	mov	r3, r0
	cbz	r0, .L6
	movs	r1, #0
	mov	r0, sp
	bl	fopen
	mov	r1, r0
	cbz	r0, .L7
	mov	r0, r4
	bl	foo
.L3:
	mov	r0, r5
	add	sp, sp, #1004
	pop	{r4, r5, pc}
.L6:
	mov	r0, r5
	mov	r5, r3
	bl	free
	b	.L3
.L7:
	mov	r0, r5
	mov	r5, r1
	bl	free
	b	.L3
...
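
To illustrate the condition problem: a PR20070-style conditional-move merge
of .L6 and .L7 would have to look roughly like the hand-written sketch below
(not compiler output; <cond> and the mov<c> suffixes stand for a hypothetical
predicate distinguishing the two incoming paths, which is not available in
the flags at this point and would have to be computed and kept live):
...
.L67:
	mov	r0, r5
	ite	<cond>
	mov<c>	r5, r3	@ came here from the access test
	mov<c>	r5, r1	@ came here from the fopen test
	bl	free
	b	.L3
...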

The proposed patch solves the problem by dealing with the 2 blocks at a level
where they are still identical: the gimple level. It detects that the 2 blocks
are identical, and removes one of them.
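
For reference, the 2 duplicate blocks look roughly like this in the gimple
dump (a hand-written sketch; block and SSA version numbers are illustrative):
...
<bb 3>:
  free (ctx_2(D));
  goto <bb 7>;

<bb 5>:
  free (ctx_2(D));
  goto <bb 7>;
...
Note that each free call carries its own virtual definition, so the 2 blocks
are not literally identical; such vdef differences are exactly what the
equivalence tracking described further down takes care of.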

The following table shows the impact of the patch on the example in terms of
size for -march=armv7-a:

          without     with    delta
Os      :     108      104       -4
O2      :     120      104      -16
Os thumb:      68       64       -4
O2 thumb:      76       64      -12

The gain in size for -O2 is that of removing the entire block, plus the
replacement of 2 moves by a constant set, which also shortens the execution
path. The patch ensures optimal code for both -O2 and -Os.
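
For illustration, with the patch the merged tail of the example looks roughly
like this (a hand-written sketch based on the description above, not actual
compiler output):
...
.L6:
	mov	r0, r5
	movs	r5, #0
	bl	free
	b	.L3
...
The 2 moves from r3 and r1 collapse into the single constant set movs r5, #0,
since on both paths the register moved into r5 holds the zero just tested by
the cbz instructions.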


By keeping track of equivalent definitions in the 2 blocks, we can ignore those
differences in the comparison. Without this feature, we would only match blocks
containing operations without results, due to the SSA nature of GIMPLE.
For example, with this feature, we reduce the following function to its minimum
at the gimple level, rather than at the rtl level.

int f(int c, int b, int d)
{
  int r, e;

  if (c)
    r = b + d;
  else
    {
      e = b + d;
      r = e;
    }

  return r;
}

;; Function f (f)

f (int c, int b, int d)
{
  int e;

<bb 2>:
  e_6 = b_3(D) + d_4(D);
  return e_6;

}
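
Before the cleanup, the function looks roughly like this at the gimple level
(again a hand-written sketch; SSA version numbers are illustrative):
...
<bb 3>:
  r_5 = b_3(D) + d_4(D);
  goto <bb 5>;

<bb 4>:
  e_6 = b_3(D) + d_4(D);

<bb 5>:
  # r_1 = PHI <r_5(3), e_6(4)>
  return r_1;
...
When comparing bb 3 and bb 4, the 2 statements match in everything but the
lhs, so the pair (r_5, e_6) is recorded as an equivalence; the phi
alternatives r_5 and e_6 then compare equal through that record, and one of
the blocks can be removed.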

I'll send the patch with the testcases in a separate email.

OK for trunk?

Thanks,
- Tom

2011-06-08  Tom de Vries  <tom@codesourcery.com>

	PR middle-end/43864
	* tree-cfgcleanup.c (int_int_splay_lookup, int_int_splay_insert)
	(int_int_splay_node_contained_in, int_int_splay_contained_in)
	(equiv_lookup, equiv_insert, equiv_contained_in, equiv_init)
	(equiv_delete, gimple_base_equal_p, pt_solution_equal_p, gimple_equal_p)
	(bb_gimple_equal_p, update_debug_stmts, cleanup_duplicate_preds_1)
	(same_or_local_phi_alternatives, cleanup_duplicate_preds): New functions.
	(cleanup_tree_cfg_bb): Use cleanup_duplicate_preds.

Comments

Richard Biener June 8, 2011, 9:55 a.m. UTC | #1
On Wed, Jun 8, 2011 at 11:42 AM, Tom de Vries <vries@codesourcery.com> wrote:
[ ... ]
> OK for trunk?

I don't like that you hook this into cleanup_tree_cfg - that is called
_way_ too often.

This also duplicates the literal matching done on the RTL level - instead
I think this optimization would be more related to value-numbering
(either that of SCCVN/FRE/PRE or that of DOM which also does
jump-threading).

Richard.

Steven Bosscher June 8, 2011, 10:36 a.m. UTC | #2
On Wed, Jun 8, 2011 at 11:55 AM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Wed, Jun 8, 2011 at 11:42 AM, Tom de Vries <vries@codesourcery.com> wrote:
[ ... ]
>
> I don't like that you hook this into cleanup_tree_cfg - that is called
> _way_ too often.
>
> This also duplicates the literal matching done on the RTL level - instead
> I think this optimization would be more related to value-numbering
> (either that of SCCVN/FRE/PRE or that of DOM which also does
> jump-threading).

And while at it: Put it in a separate file ;-)

Ciao!
Steven
Jeff Law June 10, 2011, 6:39 p.m. UTC | #3
On 06/08/11 03:42, Tom de Vries wrote:
> Hi Richard,
> 
[ ... ]

FWIW, I've seen at least one paper which claims that extending value
numbering redundancy elimination to handle blocks is effective at
eliminating duplicates.  Redundancy elimination for blocks turns into
CFG manipulations.

We want to do this early in the pipeline so that register numbering
doesn't get in the way.

I'm going to let you and Richi iterate, but wanted to chime in with my
general support for detecting and eliminating blocks early in the
pipeline (gimple).

jeff

Patch

Index: gcc/tree-cfgcleanup.c
===================================================================
--- gcc/tree-cfgcleanup.c	(revision 173703)
+++ gcc/tree-cfgcleanup.c	(working copy)
@@ -641,6 +641,552 @@  cleanup_omp_return (basic_block bb)
   return true;
 }
 
+/* Returns true if S contains (I1, I2).  */
+
+static bool
+int_int_splay_lookup (splay_tree s, unsigned int i1, unsigned int i2)
+{
+  splay_tree_node node;
+
+  if (s == NULL)
+    return false;
+
+  node = splay_tree_lookup (s, i1);
+  return node && node->value == i2;
+}
+
+/* Attempts to insert (I1, I2) into *S.  Returns true if successful.
+   Allocates *S if necessary.  */
+
+static bool
+int_int_splay_insert (splay_tree *s, unsigned int i1 , unsigned int i2)
+{
+  if (*s != NULL)
+    {
+      /* Check for existing element, which would otherwise be silently
+	 overwritten by splay_tree_insert.  */
+      if (splay_tree_lookup (*s, i1))
+	return false;
+    }
+  else
+    *s = splay_tree_new (splay_tree_compare_ints, 0, 0);
+
+  splay_tree_insert (*s, i1, i2);
+  return true;
+}
+
+/* Returns 0 if (NODE->key, NODE->value) is an element of S.  Otherwise,
+   returns 1.  */
+
+static int
+int_int_splay_node_contained_in (splay_tree_node node, void *s)
+{
+  splay_tree_node snode = splay_tree_lookup ((splay_tree)s, node->key);
+  return (!snode || node->value != snode->value) ? 1 : 0;
+}
+
+/* Returns true if all elements of S1 are also in S2.  */
+
+static bool
+int_int_splay_contained_in (splay_tree s1, splay_tree s2)
+{
+  if (s1 == NULL)
+    return true;
+  if (s2 == NULL)
+    return false;
+  return splay_tree_foreach (s1, int_int_splay_node_contained_in, s2) == 0;
+}
+
+typedef splay_tree equiv_t;
+
+/* Returns true if EQUIV contains (SSA_NAME_VERSION (VAL1),
+                                   SSA_NAME_VERSION (VAL2)).  */
+
+static bool
+equiv_lookup (equiv_t equiv, tree val1, tree val2)
+{
+  if (val1 == NULL_TREE || val2 == NULL_TREE
+      || TREE_CODE (val1) != SSA_NAME || TREE_CODE (val2) != SSA_NAME)
+    return false;
+
+  return int_int_splay_lookup (equiv, SSA_NAME_VERSION (val1),
+			       SSA_NAME_VERSION (val2));
+}
+
+/* Attempts to insert (SSA_NAME_VERSION (VAL1), SSA_NAME_VERSION (VAL2)) into
+   EQUIV, provided they are defined in BB1 and BB2.  Returns true if successful.
+   Allocates *EQUIV if necessary.  */
+
+static bool
+equiv_insert (equiv_t *equiv, tree val1, tree val2,
+	      basic_block bb1, basic_block bb2)
+{
+  if (val1 == NULL_TREE || val2 == NULL_TREE
+      || TREE_CODE (val1) != SSA_NAME || TREE_CODE (val2) != SSA_NAME
+      || gimple_bb (SSA_NAME_DEF_STMT (val1)) != bb1
+      || gimple_bb (SSA_NAME_DEF_STMT (val2)) != bb2)
+    return false;
+
+  return int_int_splay_insert (equiv, SSA_NAME_VERSION (val1),
+			       SSA_NAME_VERSION (val2));
+}
+
+/* Returns true if all elements of S1 are also in S2.  */
+
+static bool
+equiv_contained_in (equiv_t s1, equiv_t s2)
+{
+  return int_int_splay_contained_in (s1, s2);
+}
+
+/* Init equiv_t *S.  */
+
+static void
+equiv_init (equiv_t *s)
+{
+  *s = NULL;
+}
+
+/* Delete equiv_t *S and reinit.  */
+
+static void
+equiv_delete (equiv_t *s)
+{
+  if (!*s)
+    return;
+
+  splay_tree_delete (*s);
+  *s = NULL;
+}
+
+/* Check whether S1 and S2 are equal, considering the fields in
+   gimple_statement_base.  Ignores fields uid, location, bb, and block.  */
+
+static bool
+gimple_base_equal_p (gimple s1, gimple s2)
+{
+  if (gimple_code (s1) != gimple_code (s2))
+    return false;
+
+  if (gimple_no_warning_p (s1) != gimple_no_warning_p (s2))
+    return false;
+
+  /* For pass-local flags visited and plf we would like to be more aggressive.
+     But that means we must have a way to find out whether the flags are
+     currently in use or not.  */
+  if (gimple_visited_p (s1) != gimple_visited_p (s2))
+    return false;
+
+  if (is_gimple_assign (s1)
+      && (gimple_assign_nontemporal_move_p (s1)
+          != gimple_assign_nontemporal_move_p (s2)))
+    return false;
+
+  if (gimple_plf (s1, GF_PLF_1) != gimple_plf (s2, GF_PLF_1))
+    return false;
+
+  if (gimple_plf (s1, GF_PLF_2) != gimple_plf (s2, GF_PLF_2))
+    return false;
+
+  /* The modified field is set when allocating, but only reset for the first
+     time once ssa_operands_active.  So before ssa_operands_active, the field
+     signals that the ssa operands have not been scanned, and after
+     ssa_operands_active it signals that the ssa operands might be invalid.
+     We check here only for the latter case.  */
+  if (ssa_operands_active ()
+      && (gimple_modified_p (s1) || gimple_modified_p (s2)))
+    return false;
+
+  if (gimple_has_volatile_ops (s1) != gimple_has_volatile_ops (s2))
+    return false;
+
+  if (s1->gsbase.subcode != s2->gsbase.subcode)
+    return false;
+
+  if (gimple_num_ops (s1) != gimple_num_ops (s2))
+    return false;
+
+  return true;
+}
+
+/* Return true if P1 and P2 can be considered equal.  */
+
+static bool
+pt_solution_equal_p (struct pt_solution *p1, struct pt_solution *p2)
+{
+  if (pt_solution_empty_p (p1) != pt_solution_empty_p (p2))
+    return false;
+  if (pt_solution_empty_p (p1))
+    return true;
+
+  /* TODO: make this less conservative.  */
+  return (p1->anything && p2->anything);
+}
+
+/* Return true if gimple statements S1 and S2 are equal.  EQUIV contains pairs
+   of local defs that can be considered equivalent at entry, and if equal
+   contains at exit the defs and vdefs of S1 and S2.  */
+
+static bool
+gimple_equal_p (equiv_t *equiv, gimple s1, gimple s2)
+{
+  unsigned int i;
+  enum gimple_statement_structure_enum gss;
+  tree lhs1, lhs2;
+  basic_block bb1 = gimple_bb (s1), bb2 = gimple_bb (s2);
+
+  /* Handle omp gimples conservatively.  */
+  if (is_gimple_omp (s1) || is_gimple_omp (s2))
+    return false;
+
+  if (!gimple_base_equal_p (s1, s2))
+    return false;
+
+  gss = gimple_statement_structure (s1);
+  switch (gss)
+    {
+    case GSS_CALL:
+      if (!pt_solution_equal_p (gimple_call_use_set (s1),
+				gimple_call_use_set (s2))
+	  || !pt_solution_equal_p (gimple_call_clobber_set (s1),
+				   gimple_call_clobber_set (s2))
+	  || !gimple_call_same_target_p (s1, s2))
+	return false;
+      /* Fallthru.  */
+
+    case GSS_WITH_MEM_OPS_BASE:
+    case GSS_WITH_MEM_OPS:
+      {
+	tree vdef1 = gimple_vdef (s1), vdef2 = gimple_vdef (s2);
+	tree vuse1 = gimple_vuse (s1), vuse2 = gimple_vuse (s2);
+	if (vuse1 != vuse2 && !equiv_lookup (*equiv, vuse1, vuse2))
+	  return false;
+	if (vdef1 != vdef2 && !equiv_insert (equiv, vdef1, vdef2, bb1, bb2))
+	  return false;
+      }
+      /* Fallthru.  */
+
+    case GSS_WITH_OPS:
+      /* Ignore gimple_def_ops and gimple_use_ops.  They are duplicates of
+	 gimple_vdef, gimple_vuse and gimple_ops, and checked elsewhere.  */
+      /* Fallthru.  */
+
+    case GSS_BASE:
+      break;
+
+    default:
+      return false;
+    }
+
+  /* Find lhs.  */
+  lhs1 = gimple_get_lhs (s1);
+  lhs2 = gimple_get_lhs (s2);
+
+  /* Handle ops.  */
+  for (i = 0; i < gimple_num_ops (s1); ++i)
+    {
+      tree t1 = gimple_op (s1, i);
+      tree t2 = gimple_op (s2, i);
+      if (t1 == NULL_TREE && t2 == NULL_TREE)
+        continue;
+      if (t1 == NULL_TREE || t2 == NULL_TREE)
+        return false;
+      /* Skip lhs.  */
+      if (lhs1 == t1 && i == 0)
+	continue;
+      if (!operand_equal_p (t1, t2, 0) && !equiv_lookup (*equiv, t1, t2))
+	return false;
+    }
+
+  /* Handle lhs.  */
+  if (lhs1 != lhs2 && !equiv_insert (equiv, lhs1, lhs2, bb1, bb2))
+    return false;
+
+  return true;
+}
+
+/* Return true if BB1 and BB2 contain the same non-debug gimple statements, and
+   if the def pairs in PHI_EQUIV are found to be equivalent defs in BB1 and
+   BB2.  */
+
+static bool
+bb_gimple_equal_p (equiv_t phi_equiv, basic_block bb1, basic_block bb2)
+{
+  gimple_stmt_iterator gsi1 = gsi_start_nondebug_bb (bb1);
+  gimple_stmt_iterator gsi2 = gsi_start_nondebug_bb (bb2);
+  bool end1, end2;
+  equiv_t equiv;
+  bool equal = true;
+
+  end1 = gsi_end_p (gsi1);
+  end2 = gsi_end_p (gsi2);
+
+  /* Don't handle empty blocks, these are handled elsewhere in the cleanup.  */
+  if (end1 || end2)
+    return false;
+
+  /* TODO: handle blocks with phi-nodes.  We'll have to find corresponding
+     phi-nodes in bb1 and bb2, with the same alternatives for the same
+     preds.  */
+  if (phi_nodes (bb1) != NULL || phi_nodes (bb2) != NULL)
+    return false;
+
+  equiv_init (&equiv);
+  while (true)
+    {
+      if (end1 && end2)
+        break;
+      if (end1 || end2
+	  || !gimple_equal_p (&equiv, gsi_stmt (gsi1), gsi_stmt (gsi2)))
+	{
+	  equal = false;
+	  break;
+	}
+
+      gsi_next_nondebug (&gsi1);
+      gsi_next_nondebug (&gsi2);
+      end1 = gsi_end_p (gsi1);
+      end2 = gsi_end_p (gsi2);
+    }
+
+  /* equiv now contains all bb1,bb2 def pairs which are equivalent.
+     Check if the phi alternatives are indeed equivalent.  */
+  equal = equal && equiv_contained_in (phi_equiv, equiv);
+
+  equiv_delete (&equiv);
+
+  return equal;
+}
+
+/* Resets all debug statements in BBUSE that have uses that are not
+   dominated by their defs.  */
+
+static void
+update_debug_stmts (basic_block bbuse)
+{
+  use_operand_p use_p;
+  ssa_op_iter oi;
+  basic_block bbdef;
+  gimple stmt, def_stmt;
+  gimple_stmt_iterator gsi;
+  tree name;
+
+  for (gsi = gsi_start_bb (bbuse); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      if (!is_gimple_debug (stmt))
+	continue;
+      gcc_assert (gimple_debug_bind_p (stmt));
+
+      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
+
+      FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, oi, SSA_OP_USE)
+	{
+	  name = USE_FROM_PTR (use_p);
+	  gcc_assert (TREE_CODE (name) == SSA_NAME);
+
+	  def_stmt = SSA_NAME_DEF_STMT (name);
+	  gcc_assert (def_stmt != NULL);
+
+	  bbdef = gimple_bb (def_stmt);
+	  if (bbdef == NULL || bbuse == bbdef
+	      || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef))
+	    continue;
+
+	  gimple_debug_bind_reset_value (stmt);
+	  update_stmt (stmt);
+	}
+    }
+}
+
+/* E1 and E2 have a common dest.  Detect if E1->src and E2->src are duplicates,
+   and if so, redirect the predecessor edges of E1->src to E2->src and remove
+   E1->src.  Returns true if any changes were made.  */
+
+static bool
+cleanup_duplicate_preds_1 (equiv_t phi_equiv, edge e1, edge e2)
+{
+  edge pred_edge;
+  basic_block bb1, bb2, pred;
+  basic_block bb_dom = NULL, bb2_dom = NULL;
+  unsigned int i;
+  basic_block bb = e1->dest;
+  gcc_assert (bb == e2->dest);
+
+  if (e1->flags != e2->flags)
+    return false;
+
+  bb1 = e1->src;
+  bb2 = e2->src;
+
+  /* TODO: We could allow multiple successor edges here, as long as bb1 and bb2
+     have the same successors.  */
+  if (EDGE_COUNT (bb1->succs) != 1 || EDGE_COUNT (bb2->succs) != 1)
+    return false;
+
+  if (!bb_gimple_equal_p (phi_equiv, bb1, bb2))
+    return false;
+
+  if (dump_file)
+    fprintf (dump_file, "cleanup_duplicate_preds: "
+             "cleaning up <bb %d>, duplicate of <bb %d>\n", bb1->index,
+             bb2->index);
+
+  /* Calculate the changes to be made to the dominator info.  */
+  if (dom_info_available_p (CDI_DOMINATORS))
+    {
+      /* Calculate bb2_dom.  */
+      bb2_dom = nearest_common_dominator (CDI_DOMINATORS, bb2, bb1);
+      if (bb2_dom == bb1 || bb2_dom == bb2)
+        bb2_dom = get_immediate_dominator (CDI_DOMINATORS, bb2_dom);
+
+      /* Calculate bb_dom.  */
+      bb_dom = get_immediate_dominator (CDI_DOMINATORS, bb);
+      if (bb == bb2)
+        bb_dom = bb2_dom;
+      else if (bb_dom == bb1 || bb_dom == bb2)
+        bb_dom = bb2;
+      else
+        {
+          /* Count the predecessors of bb (other than bb1 or bb2), not dominated
+             by bb.  If there are none, merging bb1 and bb2 will mean that bb2
+             dominates bb.  */
+          int not_dominated = 0;
+          for (i = 0; i < EDGE_COUNT (bb->preds); ++i)
+            {
+              pred_edge = EDGE_PRED (bb, i);
+              pred = pred_edge->src;
+              if (pred == bb1 || pred == bb2)
+                continue;
+              if (dominated_by_p (CDI_DOMINATORS, pred, bb))
+                continue;
+              not_dominated++;
+            }
+          if (not_dominated == 0)
+            bb_dom = bb2;
+        }
+    }
+
+  /* Redirect the incoming edges of bb1 to bb2.  */
+  for (i = EDGE_COUNT (bb1->preds); i > 0 ; --i)
+    {
+      pred_edge = EDGE_PRED (bb1, i - 1);
+      pred = pred_edge->src;
+      pred_edge = redirect_edge_and_branch (pred_edge, bb2);
+      gcc_assert (pred_edge != NULL);
+      /* The set of successors of pred have changed.  */
+      bitmap_set_bit (cfgcleanup_altered_bbs, pred->index);
+    }
+
+  /* The set of predecessors has changed for both bb and bb2.  */
+  bitmap_set_bit (cfgcleanup_altered_bbs, bb->index);
+  bitmap_set_bit (cfgcleanup_altered_bbs, bb2->index);
+
+  /* bb1 has no incoming edges anymore, and has become unreachable.  */
+  delete_basic_block (bb1);
+  bitmap_clear_bit (cfgcleanup_altered_bbs, bb1->index);
+
+  /* Update dominator info.  */
+  if (dom_info_available_p (CDI_DOMINATORS))
+    {
+      /* Note: update order is relevant.  */
+      set_immediate_dominator (CDI_DOMINATORS, bb2, bb2_dom);
+      if (bb != bb2)
+	set_immediate_dominator (CDI_DOMINATORS, bb, bb_dom);
+      verify_dominators (CDI_DOMINATORS);
+    }
+
+  /* Reset invalidated debug statements.  */
+  update_debug_stmts (bb2);
+
+  return true;
+}
+
+/* Returns whether for all phis in E1->dest the phi alternatives for E1 and
+   E2 are either:
+   - equal, or
+   - defined locally in E1->src and E2->src.
+   In the latter case, register the alternatives in *PHI_EQUIV.  */
+
+static bool
+same_or_local_phi_alternatives (equiv_t *phi_equiv, edge e1, edge e2)
+{
+  int n1 = e1->dest_idx;
+  int n2 = e2->dest_idx;
+  gimple_stmt_iterator gsi;
+  basic_block dest = e1->dest;
+  gcc_assert (dest == e2->dest);
+
+  for (gsi = gsi_start_phis (dest); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple phi = gsi_stmt (gsi);
+      tree val1 = gimple_phi_arg_def (phi, n1);
+      tree val2 = gimple_phi_arg_def (phi, n2);
+
+      gcc_assert (val1 != NULL_TREE);
+      gcc_assert (val2 != NULL_TREE);
+
+      if (operand_equal_for_phi_arg_p (val1, val2))
+	continue;
+
+      if (!equiv_insert (phi_equiv, val1, val2, e1->src, e2->src))
+	return false;
+    }
+
+  return true;
+}
+
+/* Detect duplicate predecessor blocks of BB and clean them up.  Return true if
+   any changes were made.  */
+
+static bool
+cleanup_duplicate_preds (basic_block bb)
+{
+  edge e1, e2, e1_swapped, e2_swapped;
+  unsigned int i, j, n;
+  equiv_t phi_equiv;
+  bool changed;
+
+  if (optimize < 2)
+    return false;
+
+  n = EDGE_COUNT (bb->preds);
+
+  for (i = 0; i < n; ++i)
+    {
+      e1 = EDGE_PRED (bb, i);
+      if (e1->flags & EDGE_COMPLEX)
+	continue;
+      for (j = i + 1; j < n; ++j)
+        {
+          e2 = EDGE_PRED (bb, j);
+	  if (e2->flags & EDGE_COMPLEX)
+	    continue;
+
+	  /* Block e1->src might be deleted.  If bb and e1->src are the same
+	     block, delete e2->src instead, by swapping e1 and e2.  */
+	  e1_swapped = (bb == e1->src) ? e2: e1;
+	  e2_swapped = (bb == e1->src) ? e1: e2;
+
+	  /* For all phis in bb, the phi alternatives for e1 and e2 need to have
+	     the same value.  */
+	  equiv_init (&phi_equiv);
+	  if (same_or_local_phi_alternatives (&phi_equiv, e1_swapped, e2_swapped))
+	    /* Collapse e1->src and e2->src if they are duplicates.  */
+	    changed = cleanup_duplicate_preds_1 (phi_equiv, e1_swapped, e2_swapped);
+	  else
+	    changed = false;
+
+	  equiv_delete (&phi_equiv);
+
+	  if (changed)
+	    return true;
+        }
+    }
+
+  return false;
+}
+
 /* Tries to cleanup cfg in basic block BB.  Returns true if anything
    changes.  */
 
@@ -668,6 +1210,9 @@  cleanup_tree_cfg_bb (basic_block bb)
       return true;
     }
 
+  if (cleanup_duplicate_preds (bb))
+    return true;
+
   return retval;
 }