diff mbox

Combine four insns

Message ID 4C5C20D0.5020105@codesourcery.com
State New
Headers show

Commit Message

Bernd Schmidt Aug. 6, 2010, 2:48 p.m. UTC
I was slightly bored while waiting for some SPEC runs, so I played with
the combiner a little.  The following extends it to do four-insn
combinations.

Conceptually, the following is one motivation for the change: consider a
RISC target (probably not very prevalent when the combiner was written),
where arithmetic ops don't allow constants, only registers.  Then, to
combine multiple such operations into one (or rather, two), you need a
four-instruction window.  This is what happens e.g. on Thumb-1; PR42172
is such an example.  We have

	ldrb	r3, [r0]
	mov	r2, #7
	bic	r3, r2
	add	r2, r2, #49
	bic	r3, r2
	sub	r2, r2, #48
	orr	r3, r2
	add	r2, r2, #56
	bic	r3, r2
	add	r2, r2, #63
	and	r3, r2
	strb	r3, [r0]

which can be optimized into

	mov	r3, #8
	strb	r3, [r0]

by the patch below.  I'm attaching a file with a few more examples I
found.  The same patterns occur quite frequently - several times e.g. in
Linux TCP code.

The downside is a compile-time cost, which appears to be about 1% user
time on a full bootstrap.  To put that in persepective, it's 12s of real
time.

real 16m13.446s user 103m3.607s sys 3m2.235s
real 16m25.534s user 104m0.686s sys 3m4.158s

I'd argue that compile-time shouldn't be our top priority, as it's one
of the few things that still benefits from Moore's Law, while the same
may not be true for the programs we compile.  I expect people will argue
a 1% slowdown is unacceptable, but in that case I think we need to
discuss whether users are complaining about slow compiles more often
than they complain about missed optimizations - in my experience the
reverse is true.

Bootstrapped on i686-linux, a slightly earlier version also
regression-tested.


Bernd
-       andl    $4, %eax
-       cmpl    $1, %eax
-       sbbl    %eax, %eax
-       notl    %eax
+       sall    $5, %eax
        andl    $128, %eax
===========
-       movl    8(%eax), %edx
-       xorb    %dl, %dl
-       orl     $11, %edx
-       movl    %edx, 8(%eax)
+       movb    $11, 8(%eax)
==========
-       movzbl  %al, %edx
-       sall    $16, %edx
-       movl    92(%edi), %ecx
-       andl    $-16711681, %ecx
-       orl     %ecx, %edx
-       movl    %edx, 92(%edi)
+       movb    %al, 94(%edi)
==========
-       andl    $2, %eax
-       cmpl    $1, %eax
-       sbbl    %eax, %eax
-       notl    %eax
+       sall    $30, %eax
+       sarl    $31, %eax
===========
-       movl    %ecx, %edx
-       sall    $8, %edx
-       shrw    $8, %ax
-       orl     %edx, %eax
+       rolw    $8, %ax
===========
-       movl    %edx, %eax
-       sall    $5, %eax
-       imull   $39, %edx, %edx
-       subl    %edx, %ecx
-       leal    (%eax,%ecx), %edx
+       imull   $-7, %edx, %edx
+       addl    %ecx, %edx
==========
        movl    24(%esp), %eax
-       movzbl  792(%eax), %ebx
-       movl    %ebx, %eax
-       andl    $-80, %eax
-       orl     $64, %eax
-       andl    $15, %ebx
-       orl     %eax, %ebx
-       movl    24(%esp), %eax
-       movb    %bl, 792(%eax)
+       orb     $64, 792(%eax)
==========
-       movzbl  1(%eax), %ecx
-       movl    %ecx, %edx
-       shrb    $3, %dl
-       andl    $3, %edx
-       orl     $1, %edx
-       leal    0(,%edx,8), %ebp
-       movl    %ecx, %edx
-       andl    $-25, %edx
-       orl     %ebp, %edx
-       movb    %dl, 1(%eax)
+       orb     $8, 1(%eax)
==========
-       mov     r3, #2
-       bic     r0, r3
-       lsl     r0, r0, #24
-       lsr     r0, r0, #24
+       mov     r3, #253
+       and     r0, r3
==========
-       ldr     r3, [r0, r3]
-       mov     r0, #128
-       lsl     r0, r0, #17
-       and     r0, r3
-       neg     r3, r0
-       adc     r0, r0, r3
-       sub     r0, r0, #1
+       ldr     r0, [r0, r3]
+       lsl     r0, r0, #7
+       asr     r0, r0, #31
==========
-       bic     r2, r1, #-134217728
+       bic     r1, r1, #-134217728
-       rsb     r3, r3, r3, lsl #9
-       add     r3, r3, r3, lsl #18
-       negs    r1, r3
-       orr     r1, r2, r1, lsl #27
+       orr     r1, r1, r3, lsl #27
=========
 xfs_dir2_sf_toino8.isra.8:
-       @ args = 0, pretend = 0, frame = 72
+       @ args = 0, pretend = 0, frame = 40
==========
-       mvn     r2, fp
-       add     r3, fp, #1
-       adds    r2, r2, r7
-       adds    r4, r3, r2
+       mov     r4, r7
PR target/42172
	* combine.c (combine_validate_cost): New arg I0.  All callers changed.
	Take its cost into account if nonnull.
	(insn_a_feeds_b): New static function.
	(combine_instructions): Look for four-insn combinations.
	(can_combine_p): New args PRED2, SUCC2.  All callers changed.  Take
	them into account when computing all_adjacent and looking for other
	uses.
	(combinable_i3pat): New args I0DEST, I0_NOT_IN_SRC.  All callers
	changed.  Treat them like I1DEST and I1_NOT_IN_SRC.
	(try_combine): New arg I0.  Handle four-insn combinations.
	(distribute_notes): New arg ELIM_I0.  All callers changed.  Treat it
	like ELIM_I1.

Comments

Richard Biener Aug. 6, 2010, 3:04 p.m. UTC | #1
On Fri, Aug 6, 2010 at 4:48 PM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> I was slightly bored while waiting for some SPEC runs, so I played with
> the combiner a little.  The following extends it to do four-insn
> combinations.
>
> Conceptually, the following is one motivation for the change: consider a
> RISC target (probably not very prevalent when the combiner was written),
> where arithmetic ops don't allow constants, only registers.  Then, to
> combine multiple such operations into one (or rather, two), you need a
> four-instruction window.  This is what happens e.g. on Thumb-1; PR42172
> is such an example.  We have
>
>        ldrb    r3, [r0]
>        mov     r2, #7
>        bic     r3, r2
>        add     r2, r2, #49
>        bic     r3, r2
>        sub     r2, r2, #48
>        orr     r3, r2
>        add     r2, r2, #56
>        bic     r3, r2
>        add     r2, r2, #63
>        and     r3, r2
>        strb    r3, [r0]
>
> which can be optimized into
>
>        mov     r3, #8
>        strb    r3, [r0]
>
> by the patch below.  I'm attaching a file with a few more examples I
> found.  The same patterns occur quite frequently - several times e.g. in
> Linux TCP code.
>
> The downside is a compile-time cost, which appears to be about 1% user
> time on a full bootstrap.  To put that in persepective, it's 12s of real
> time.
>
> real 16m13.446s user 103m3.607s sys 3m2.235s
> real 16m25.534s user 104m0.686s sys 3m4.158s
>
> I'd argue that compile-time shouldn't be our top priority, as it's one
> of the few things that still benefits from Moore's Law, while the same
> may not be true for the programs we compile.  I expect people will argue
> a 1% slowdown is unacceptable, but in that case I think we need to
> discuss whether users are complaining about slow compiles more often
> than they complain about missed optimizations - in my experience the
> reverse is true.
>
> Bootstrapped on i686-linux, a slightly earlier version also
> regression-tested.

Do you have statistics how many two, three and four insn combinations
a) are tried, b) can be validated, c) are used in the end, for example
during a GCC bootstrap?

It might make sense to restrict 4 insn combinations to
-fexpensive-optimizations (thus, not enable it at -O1).

Richard.

>
> Bernd
>
Steven Bosscher Aug. 6, 2010, 3:07 p.m. UTC | #2
On Fri, Aug 6, 2010 at 4:48 PM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> I'd argue that compile-time shouldn't be our top priority, as it's one
> of the few things that still benefits from Moore's Law, while the same
> may not be true for the programs we compile.  I expect people will argue
> a 1% slowdown is unacceptable, but in that case I think we need to
> discuss whether users are complaining about slow compiles more often
> than they complain about missed optimizations - in my experience the
> reverse is true.

It depends where you look. In the free software community, the
majority of complaints is about compile time, but perhaps for your
customers the missed optimizations are more important.

But perhaps the optimization can be performed in another place than
combine? If it's just a relatively small set of common patterns, a
quick GIMPLE pass may be preferable.

(I know, it's the old argument we've had before: You have a real fix
now for a real problem, others (incl. me) believe that to fix this
problem in combine is a step away from sane instruction selection that
GCC may not have for years to come...)

What does the code look like, that is optimized so much better with 4
insns to combine? It looks like code that could come from bitfield
manipulations, which is an area where it's well-known that GCC does
not always optimize too well.

Ciao!
Steven
Paolo Bonzini Aug. 6, 2010, 4:45 p.m. UTC | #3
On 08/06/2010 05:07 PM, Steven Bosscher wrote:
> On Fri, Aug 6, 2010 at 4:48 PM, Bernd Schmidt<bernds@codesourcery.com>  wrote:
>> I'd argue that compile-time shouldn't be our top priority, as it's one
>> of the few things that still benefits from Moore's Law, while the same
>> may not be true for the programs we compile.  I expect people will argue
>> a 1% slowdown is unacceptable, but in that case I think we need to
>> discuss whether users are complaining about slow compiles more often
>> than they complain about missed optimizations - in my experience the
>> reverse is true.
>
> It depends where you look. In the free software community, the
> majority of complaints is about compile time, but perhaps for your
> customers the missed optimizations are more important.
>
> But perhaps the optimization can be performed in another place than
> combine? If it's just a relatively small set of common patterns, a
> quick GIMPLE pass may be preferable.
>
> (I know, it's the old argument we've had before: You have a real fix
> now for a real problem, others (incl. me) believe that to fix this
> problem in combine is a step away from sane instruction selection that
> GCC may not have for years to come...)

In this case I actually think this argument is not going to work, since 
we have not even a prototype of a sane instruction selection pass and 
not even someone who is thinking about working on it.

Paolo
Steven Bosscher Aug. 6, 2010, 5:22 p.m. UTC | #4
On Fri, Aug 6, 2010 at 6:45 PM, Paolo Bonzini <bonzini@gnu.org> wrote:
>> But perhaps the optimization can be performed in another place than
>> combine? If it's just a relatively small set of common patterns, a
>> quick GIMPLE pass may be preferable.

I think this part of my message was much more interesting to respond to.


>> (I know, it's the old argument we've had before: You have a real fix
>> now for a real problem, others (incl. me) believe that to fix this
>> problem in combine is a step away from sane instruction selection that
>> GCC may not have for years to come...)
>
> In this case I actually think this argument is not going to work, since we
> have not even a prototype of a sane instruction selection pass and not even
> someone who is thinking about working on it.

Agreed. There is the work from Preston Briggs, but that appears to
have gone no-where, unfortunately.

Ciao!
Steven
Jeff Law Aug. 6, 2010, 6:02 p.m. UTC | #5
On 08/06/10 11:22, Steven Bosscher wrote:
> On Fri, Aug 6, 2010 at 6:45 PM, Paolo Bonzini<bonzini@gnu.org>  wrote:
>>> But perhaps the optimization can be performed in another place than
>>> combine? If it's just a relatively small set of common patterns, a
>>> quick GIMPLE pass may be preferable.
> I think this part of my message was much more interesting to respond to.
This code naturally belongs in combine, putting it anywhere else is 
just, umm, silly.

jeff
Vladimir Makarov Aug. 6, 2010, 6:56 p.m. UTC | #6
On 08/06/2010 01:22 PM, Steven Bosscher wrote:
> On Fri, Aug 6, 2010 at 6:45 PM, Paolo Bonzini<bonzini@gnu.org>  wrote:
>    
> (I know, it's the old argument we've had before: You have a real fix
>>> now for a real problem, others (incl. me) believe that to fix this
>>> problem in combine is a step away from sane instruction selection that
>>> GCC may not have for years to come...)
>>>        
>> In this case I actually think this argument is not going to work, since we
>> have not even a prototype of a sane instruction selection pass and not even
>> someone who is thinking about working on it.
>>      
>    
To be honest I am periodically thinking about working on it but I did 
not make a final decision yet and don't know when I can start.
> Agreed. There is the work from Preston Briggs, but that appears to
> have gone no-where, unfortunately.
>
>    
IMHO the code should have become public if we want to see a progress on 
the problem solution.  But it is a Google property and they probably 
decided not to give it gcc community.  Although I heard that Ken Zadeck 
has access to it.  We will see what the final result will be.

I only can guess from speculations on info I have that Preston's code is 
probably not so valuable for GCC because as I understand the backend was 
only specialized for x86/x86_64.

The real problem is not in implementing modern pattern matching itself 
(there are a lot free pattern matching code and tools for code 
selection).  The real problem is in how to feed all md files (in which 
implementation a lot of gcc community efforts were made) to pattern 
matcher (in any case most probably we will not need define_split in md 
files if we use a modern pattern matcher).

Another problem, most modern pattern matchers works on trees not on 
DAGs.  There are some solutions (heuristic or expensive optimal) for 
this problem.  The combiner already solves the problem in some way.

In any case I'd not expect, from a new code selection pass, big code or 
compile time improvements (e.g. code selection in LLVM is very expensive 
and takes more time than GCC combiner).  But a new code selection pass 
could be more readable and easy for maintenance.
Steven Bosscher Aug. 6, 2010, 7:02 p.m. UTC | #7
On Fri, Aug 6, 2010 at 8:56 PM, Vladimir N. Makarov <vmakarov@redhat.com> wrote:
> IMHO the code should have become public if we want to see a progress on the
> problem solution.  But it is a Google property and they probably decided not
> to give it gcc community.  Although I heard that Ken Zadeck has access to
> it.  We will see what the final result will be.

http://gcc.gnu.org/viewcvs/branches/gimple-back-end/

But at that point, everyone stopped working on this. Perhaps Diego or
Ian can explain why.

Ciao!
Steven
Bernd Schmidt Aug. 6, 2010, 7:19 p.m. UTC | #8
On 08/06/2010 05:07 PM, Steven Bosscher wrote:

> It depends where you look. In the free software community, the
> majority of complaints is about compile time, but perhaps for your
> customers the missed optimizations are more important.

The latter I believe is certainly true.  But it also seems to happen
every now and then that I stumble on discussions on the Web where
someone claims gcc code generation is "crazy bad" for their code, or
where they compare gcc output to other compilers.

> But perhaps the optimization can be performed in another place than
> combine? If it's just a relatively small set of common patterns, a
> quick GIMPLE pass may be preferable.
> 
> (I know, it's the old argument we've had before: You have a real fix
> now for a real problem, others (incl. me) believe that to fix this
> problem in combine is a step away from sane instruction selection that
> GCC may not have for years to come...)

I don't think making an existing pass more powerful can be viewed as a
step away from a sane implementation.  It may set the bar higher for
comparisons, but that's as it should be - an alternative solution must
prove its superiority.  The combiner is one way of solving the problem,
and one that works in the context of gcc - alternative implementations
would first have to exist and then prove that they achieve better results.

> What does the code look like, that is optimized so much better with 4
> insns to combine? It looks like code that could come from bitfield
> manipulations, which is an area where it's well-known that GCC does
> not always optimize too well.

Here are some typical examples.  Some of it is bitfield operations.
PR42172 is.  Also, from gimplify.s:

gimple_set_plf (gimple stmt, enum plf_mask plf, unsigned char val_p)
{
  if (val_p)
    stmt->gsbase.plf |= (unsigned int) plf;
  else
    stmt->gsbase.plf &= ~((unsigned int) plf);
}

(inlined)
        call    gimple_build_goto
-       movzbl  1(%eax), %ecx
-       movl    %ecx, %edx
-       shrb    $3, %dl
-       andl    $3, %edx
-       orl     $1, %edx
-       leal    0(,%edx,8), %ebp
-       movl    %ecx, %edx
-       andl    $-25, %edx
-       orl     %ebp, %edx
-       movb    %dl, 1(%eax)
+       orb     $8, 1(%eax)
=============
Sometimes bitfield operations are written manually as masks and shifts.
 From one of my own programs:
            ciabalarm = (ciabalarm & ~0xff) | val;

-       movzbl  %dl, %ebx
-       movl    ciabalarm, %eax
-       xorb    %al, %al
-       orl     %eax, %ebx
-       movl    %ebx, ciabalarm
+       movb    %dl, ciabalarm
IIRC gcc actually used to be better at optimizing this; I seem to recall
relying on it with gcc-2.7.2.  I think the x86 backend has changed and
now allows fewer complex insns, making it harder for the combiner to do
its thing.  So this is probably a regression.
=============
From attr.i (kernel):

  if (((inode)->i_flags & 256))
   return -26;

-       andl    $256, %eax
-       cmpl    $1, %eax
-       sbbl    %eax, %eax
-       notl    %eax
+       sall    $23, %eax
+       sarl    $31, %eax
        andl    $-26, %eax
=============
Elsewhere in the kernel:
static inline __attribute__((always_inline)) __u8 rol8(__u8 word,
unsigned int shift)
{
 return (word << shift) | (word >> (8 - shift));
}

-       movl    %ecx, %edx
-       sall    $8, %edx
-       shrw    $8, %ax
-       orl     %edx, %eax
+       rolw    $8, %ax

These are all things which in gcc have always been combine's job to deal
with.


Bernd
Jeff Law Aug. 6, 2010, 7:37 p.m. UTC | #9
On 08/06/10 13:19, Bernd Schmidt wrote:
>> What does the code look like, that is optimized so much better with 4
>> insns to combine? It looks like code that could come from bitfield
>> manipulations, which is an area where it's well-known that GCC does
>> not always optimize too well.
> Here are some typical examples.  Some of it is bitfield operations.
> PR42172 is.  Also, from gimplify.s:
It's also worth noting that some ports have hacks to encourage 4->1 or 
4->2 combinations.  Basically they have patterns which represent an 
intermediate step in a 4->1 or 4->2 combination even if there is no 
machine instruction which implements the insn appearing in the 
intermediate step.  I've used (and recommended) this trick numerous 
times through the years, so I suspect these patterns probably exist in 
many ports.


Here's an example from h8300.md:


;; This is a "bridge" instruction.  Combine can't cram enough insns
;; together to crate a MAC instruction directly, but it can create
;; this instruction, which then allows combine to create the real
;; MAC insn.
;;
;; Unfortunately, if combine doesn't create a MAC instruction, this
;; insn must generate reasonably correct code.  Egad.
(define_insn ""
   [(set (match_operand:SI 0 "register_operand" "=a")
         (mult:SI
           (sign_extend:SI
             (mem:HI (post_inc:SI (match_operand:SI 1 "register_operand" 
"r"))))
           (sign_extend:SI
             (mem:HI (post_inc:SI (match_operand:SI 2 "register_operand" 
"r"))))))]
   "TARGET_MAC"
   "clrmac\;mac  @%2+,@%1+"
   [(set_attr "length" "6")
    (set_attr "cc" "none_0hit")])

(define_insn ""
   [(set (match_operand:SI 0 "register_operand" "=a")
         (plus:SI (mult:SI
           (sign_extend:SI (mem:HI
             (post_inc:SI (match_operand:SI 1 "register_operand" "r"))))
           (sign_extend:SI (mem:HI
             (post_inc:SI (match_operand:SI 2 "register_operand" "r")))))
               (match_operand:SI 3 "register_operand" "0")))]
   "TARGET_MAC"
   "mac  @%2+,@%1+"
   [(set_attr "length" "4")
    (set_attr "cc" "none_0hit")])
Bernd Schmidt Aug. 6, 2010, 7:43 p.m. UTC | #10
On 08/06/2010 09:37 PM, Jeff Law wrote:
> It's also worth noting that some ports have hacks to encourage 4->1 or
> 4->2 combinations.  Basically they have patterns which represent an
> intermediate step in a 4->1 or 4->2 combination even if there is no
> machine instruction which implements the insn appearing in the
> intermediate step.  I've used (and recommended) this trick numerous
> times through the years, so I suspect these patterns probably exist in
> many ports.

Yes.  Such combiner bridges may still be useful to help with five-insn
combinations :)

However, as I found when trying to fix PR42172 for the Thumb, describing
insns that don't actually exist on the machine can degrade code quality.
 For that PR, it seemed like it would be easy to try to just use
nonmemory_operand for the second input, and add a few alternatives with
constraints for certain constants to output the right sequence later on.
 However - some and operations are expanded into

rn = rn << 10;
rn = rn >> 20;

and if the shifts aren't present in the RTL, naturally they can't be
combined with anything else, which can be a problem.

So, half a dozen preliminary Thumb-1 patches later, I decided to do it
in the combiner, which turned out to be significantly easier (unless a
reviewer finds a big thinko...)


Bernd
Bernd Schmidt Aug. 6, 2010, 8:08 p.m. UTC | #11
On 08/06/2010 05:04 PM, Richard Guenther wrote:
> Do you have statistics how many two, three and four insn combinations
> a) are tried, b) can be validated, c) are used in the end, for example
> during a GCC bootstrap?

Here are some low-tech statistics for my collection of .i files.  This
includes gcc, Linux kernel (double counted here since there are two
versions for different architectures), SPECint2000, Toshi's stress
testsuite, a popular embedded benchmark and a few other things.  Let me
know if you want something else.  I only did a) and c) because I
realized too late you probably asked about which ones were discarded due
to costs.

$ grep Trying.four log |wc -l
307743
$ grep Trying.three log |wc -l
592776
$ grep Trying.two log |wc -l
1643112
$ grep Succeeded.two log |wc -l
204808
$ grep Succeeded.three.into.two log |wc -l
2976
$ grep Succeeded.three.into.one log |wc -l
12473
$ grep Succeeded.four.into.two log |wc -l
244
$ grep Succeeded.four.into.one log |wc -l
140

> It might make sense to restrict 4 insn combinations to
> -fexpensive-optimizations (thus, not enable it at -O1).

This, certainly.


Bernd
Richard Biener Aug. 6, 2010, 8:37 p.m. UTC | #12
On Fri, Aug 6, 2010 at 10:08 PM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> On 08/06/2010 05:04 PM, Richard Guenther wrote:
>> Do you have statistics how many two, three and four insn combinations
>> a) are tried, b) can be validated, c) are used in the end, for example
>> during a GCC bootstrap?
>
> Here are some low-tech statistics for my collection of .i files.  This
> includes gcc, Linux kernel (double counted here since there are two
> versions for different architectures), SPECint2000, Toshi's stress
> testsuite, a popular embedded benchmark and a few other things.  Let me
> know if you want something else.  I only did a) and c) because I
> realized too late you probably asked about which ones were discarded due
> to costs.

Right.

> $ grep Trying.four log |wc -l
> 307743
> $ grep Trying.three log |wc -l
> 592776
> $ grep Trying.two log |wc -l
> 1643112
> $ grep Succeeded.two log |wc -l
> 204808
> $ grep Succeeded.three.into.two log |wc -l
> 2976
> $ grep Succeeded.three.into.one log |wc -l
> 12473
> $ grep Succeeded.four.into.two log |wc -l
> 244
> $ grep Succeeded.four.into.one log |wc -l
> 140

No four into three?  So overall it's one order of magnitude less
three than two and two order of magnitude less four than three.

I think it's still reasonable (and I agree with that combine is a proper
place to do this).  Can you, for the list of examples you found where
it helps file enhancement bugreports so that we might do sth here
on the tree level or do better initial expansion?

Thanks,
Richard.

>> It might make sense to restrict 4 insn combinations to
>> -fexpensive-optimizations (thus, not enable it at -O1).
>
> This, certainly.
>
>
> Bernd
>
Steven Bosscher Aug. 6, 2010, 8:43 p.m. UTC | #13
On Fri, Aug 6, 2010 at 8:02 PM, Jeff Law <law@redhat.com> wrote:
>  On 08/06/10 11:22, Steven Bosscher wrote:
>>
>> On Fri, Aug 6, 2010 at 6:45 PM, Paolo Bonzini<bonzini@gnu.org>  wrote:
>>>>
>>>> But perhaps the optimization can be performed in another place than
>>>> combine? If it's just a relatively small set of common patterns, a
>>>> quick GIMPLE pass may be preferable.
>>
>> I think this part of my message was much more interesting to respond to.
>
> This code naturally belongs in combine, putting it anywhere else is just,
> umm, silly.

I suppose you meant to say you are of the opinion that it's silly, or
did you actually intend to put that as a fact? *That* would be silly
(fact), thank you very much.

Anyway.

If it's patterns we could combine at the GIMPLE level, it wouldn't be
so silly to handle it there. Weren't you once one of the people
talking about a tree-combine pass?

Or if this is a case the compiler could already expand from GIMPLE to
good initial RTL (i.e. TER), that wouldn't be a silly place to put it
either.

Ciao!
Steven
Richard Biener Aug. 6, 2010, 8:48 p.m. UTC | #14
On Fri, Aug 6, 2010 at 10:43 PM, Steven Bosscher <stevenb.gcc@gmail.com> wrote:
> On Fri, Aug 6, 2010 at 8:02 PM, Jeff Law <law@redhat.com> wrote:
>>  On 08/06/10 11:22, Steven Bosscher wrote:
>>>
>>> On Fri, Aug 6, 2010 at 6:45 PM, Paolo Bonzini<bonzini@gnu.org>  wrote:
>>>>>
>>>>> But perhaps the optimization can be performed in another place than
>>>>> combine? If it's just a relatively small set of common patterns, a
>>>>> quick GIMPLE pass may be preferable.
>>>
>>> I think this part of my message was much more interesting to respond to.
>>
>> This code naturally belongs in combine, putting it anywhere else is just,
>> umm, silly.
>
> I suppose you meant to say you are of the opinion that it's silly, or
> did you actually intend to put that as a fact? *That* would be silly
> (fact), thank you very much.
>
> Anyway.
>
> If it's patterns we could combine at the GIMPLE level, it wouldn't be
> so silly to handle it there. Weren't you once one of the people
> talking about a tree-combine pass?

Combining at GIMPLE level is difficult because complex instructions
span multiple GIMPLE statements (unless we invent more complex
ones, of course).  You also face the problem of doing target-independent
costs.

> Or if this is a case the compiler could already expand from GIMPLE to
> good initial RTL (i.e. TER), that wouldn't be a silly place to put it
> either.

That's true.  I think we should really differentiate when combine does
things because simplify-rtx does sth or because we can match a more
complex instruction.  The former should be handled at the GIMPLE level
and I'd like to see enhancement bugreports with cases that we miss
currently.

Richard.

> Ciao!
> Steven
>
Chris Lattner Aug. 6, 2010, 9:11 p.m. UTC | #15
On Aug 6, 2010, at 11:56 AM, Vladimir N. Makarov wrote:

> 
> In any case I'd not expect, from a new code selection pass, big code or compile time improvements (e.g. code selection in LLVM is very expensive and takes more time than GCC combiner).  But a new code selection pass could be more readable and easy for maintenance.

I agree with you that it is reasonably expensive, but FYI, the selection dag time in LLVM doesn't correspond to combine times.  The SelectionDAG phases in LLVM (which does do dag-based, not tree-based, pattern matching) does lowering, matching, pre-regalloc scheduling, and optimization.  It's more fair to compare it to expand, combine, pre-ra scheduling, and whatever optimization passes gcc runs early in the rtl pipeline.

-Chris
Jeff Law Aug. 6, 2010, 9:45 p.m. UTC | #16
On 08/06/10 13:43, Bernd Schmidt wrote:
> On 08/06/2010 09:37 PM, Jeff Law wrote:
>> It's also worth noting that some ports have hacks to encourage 4->1 or
>> 4->2 combinations.  Basically they have patterns which represent an
>> intermediate step in a 4->1 or 4->2 combination even if there is no
>> machine instruction which implements the insn appearing in the
>> intermediate step.  I've used (and recommended) this trick numerous
>> times through the years, so I suspect these patterns probably exist in
>> many ports.
> Yes.  Such combiner bridges may still be useful to help with five-insn
> combinations :)
Yea, though I expect there to be an ever-decreasing payoff for allowing 
larger bundles of instructions to be combined.
> However, as I found when trying to fix PR42172 for the Thumb, describing
> insns that don't actually exist on the machine can degrade code quality.
Precisely.  Creating those funky bridge instructions was never a 
solution, it was a hack and sometimes the hack created worse code.  Far 
better to actually fix combine.
Jeff
Jeff Law Aug. 6, 2010, 9:49 p.m. UTC | #17
On 08/06/10 14:43, Steven Bosscher wrote:
>
> If it's patterns we could combine at the GIMPLE level, it wouldn't be
> so silly to handle it there. Weren't you once one of the people
> talking about a tree-combine pass?
The specific cases I've seen won't combine at the gimple level.  For 
example, if you look at the H8, you need to have a post-inc memory 
operand which I really doubt we're going to ever expose at the gimple level.

I still think a tree combiner would be a great thing to have as it would 
eliminate tree-ssa-forwprop with a more generic engine.

> Or if this is a case the compiler could already expand from GIMPLE to
> good initial RTL (i.e. TER), that wouldn't be a silly place to put it
> either.
Again, I don't think the majority of the cases where 4->1 or 4->2 
combinations occur are going to be anything that we'd expose that early 
in the pipeline.

Jeff
Jeff Law Aug. 6, 2010, 9:52 p.m. UTC | #18
On 08/06/10 14:37, Richard Guenther wrote:
>>> It might make sense to restrict 4 insn combinations to
>>> -fexpensive-optimizations (thus, not enable it at -O1).
>> This, certainly.
Agreed.

Jeff
Bernd Schmidt Aug. 6, 2010, 10:41 p.m. UTC | #19
On 08/06/2010 10:37 PM, Richard Guenther wrote:
> No four into three?

I did not implement this.  It's probably possible, but I'm not convinced
it would be worthwhile.  There are a few other things we could try with
the combiner, such as 2->2 combinations, which is similar to what I did
with reload_combine recently.

> I think it's still reasonable (and I agree with that combine is a proper
> place to do this).  Can you, for the list of examples you found where
> it helps file enhancement bugreports so that we might do sth here
> on the tree level or do better initial expansion?

Filed PRs 45214 through 45218, which should give reasonably good
coverage.  I'll add a note to 42172.


Bernd
Richard Biener Aug. 6, 2010, 11:47 p.m. UTC | #20
On Sat, Aug 7, 2010 at 12:41 AM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> On 08/06/2010 10:37 PM, Richard Guenther wrote:
>> No four into three?
>
> I did not implement this.  It's probably possible, but I'm not convinced
> it would be worthwhile.  There are a few other things we could try with
> the combiner, such as 2->2 combinations, which is similar to what I did
> with reload_combine recently.
>
>> I think it's still reasonable (and I agree with that combine is a proper
>> place to do this).  Can you, for the list of examples you found where
>> it helps file enhancement bugreports so that we might do sth here
>> on the tree level or do better initial expansion?
>
> Filed PRs 45214 through 45218, which should give reasonably good
> coverage.  I'll add a note to 42172.

Thanks for doing this.  I think the patch is ok if you guard the
4-way combines by flag_expensive_optimizations.

Thanks,
Richard.

>
> Bernd
>
Eric Botcazou Aug. 7, 2010, 8:10 a.m. UTC | #21
> Thanks for doing this.  I think the patch is ok if you guard the
> 4-way combines by flag_expensive_optimizations.

Combining Steven and Bernd's figures, 1% of a bootstrap time is 37% of the 
combiner's time.  The result is 0.18% more combined insns.  It seems to me 
that we are already very far in the directory of diminishing returns.

Needless to say that, by fixing the problems earlier, we not only save 1% of 
compilation time but speed up the compiler by not generating the dreadful RTL 
in the first place and carrying it over to the combiner.

Bernd is essentially of the opinion that compilation time doesn't matter.  It 
seems to me that, even if we were to adopt this position, this shouldn't mean 
wasting compilation time, which I think is the case here.

So I think that the patch shouldn't go in at this point.
Paolo Bonzini Aug. 8, 2010, 11:39 a.m. UTC | #22
On Fri, Aug 6, 2010 at 14:56, Vladimir N. Makarov <vmakarov@redhat.com> wrote:
> Another problem, most modern pattern matchers works on trees not on DAGs.
> There are some solutions (heuristic or expensive optimal) for this problem.
> The combiner already solves the problem in some way.

One idea I had was to separate combination to multi-output
instructions and combination to single-output instruction.  The former
is probably done only in very special cases (conditional codes,
divmod, etc.), the latter could be done using tree-based pattern
selection.

Paolo
Mark Mitchell Aug. 9, 2010, 2:50 p.m. UTC | #23
Jeff Law wrote:

>> Yes.  Such combiner bridges may still be useful to help with five-insn
>> combinations :)

> Yea, though I expect there to be an ever-decreasing payoff for allowing
> larger bundles of instructions to be combined.

We all agree that faster compilers are better, all other things equal.
There was a suggestion for a heuristic that might speed up things
considerably without losing many opportunities, and Bernd thought that
was a good idea.  Perhaps that will get back most of the time and this
discussion will be semi-moot.

Also, I think that the number of instructions combine should consider
might as well be a --param.  As far as I can tell, it would be
reasonably easy to modify the pass to pass around an array of
instructions to combine, rather than i0, i1, i2, i3.  Then, distributors
could set whatever default (2, 3, 4, 5, 100, whatever) they think
appropriate for their users.  Bernd, is this something that you could do
readily?

We still have to have the argument about FSF GCC, but it would be
trivial for a distributor (or even an end user building from source) to
adjust the default as they like.  And, furthermore, it would allow
people to make this trade-off at run-time, independent of the default.
I think it would also make the algorithm clearer.

As to the more general question of compile-time vs. optimization, I
think it's clear that to CodeSourcery's customer base the latter matters
more than the former -- but only up to a point.  Few people want 100%
more compilation time in order to get 0.01% more performance.  But, many
people would happily take 10% more compilation time to get 1% more
performance, and almost all would accept 1% more compilation time to get
1% more performance.

Like Bernd, I doubt that compilation time with optimization enabled per
se is the dominant concern for most people.  GCC's own build times are
increasingly dominated by configure scripts, make overhead, and linking.
 Incremental linking would probably do more to improve the typical
programmers compile-edit-debug cycle (which is probably done with -g,
not -O2!) than anything else.  Compile-time improvements at -O2 benefit
distributors or others who are doing builds of massive amounts of
software, but I'm skeptical that GCC is getting slower faster than
hardware is getting cheaper.

So, I think that combining four instructions by default is plausible.
But, defaults are always arguable.  I think that for FSF GCC we
shouldn't spend too much time worrying about it.  The userbase is too
diverse to make it easy for us to please everybody.  Make it easy for
distributors and let them pick.  They have a better chance of getting
things set up correctly for their users, since their users are a
narrower set of people.
Bernd Schmidt Aug. 9, 2010, 3 p.m. UTC | #24
On 08/09/2010 04:50 PM, Mark Mitchell wrote:

> Also, I think that the number of instructions combine should consider
> might as well be a --param.  As far as I can tell, it would be
> reasonably easy to modify the pass to pass around an array of
> instructions to combine, rather than i0, i1, i2, i3.  Then, distributors
> could set whatever default (2, 3, 4, 5, 100, whatever) they think
> appropriate for their users.  Bernd, is this something that you could do
> readily?

Not readily since it would require rather more surgery to make sure all
the lifetimes, register deaths etc. are tracked.  Also, the way combine
keeps track of LOG_LINKS (i.e. single-basic block only, one user for
each reg only) makes larger windows have dubious value.

What I do want to try is to do something similar with fwprop (i.e. if
substitution fails or is unprofitable, try substituting more uses).  If
that turns out feasible, it should be possible to have it controlled by
a param.

> Like Bernd, I doubt that compilation time with optimization enabled per
> se is the dominant concern for most people.  GCC's own build times are
> increasingly dominated by configure scripts, make overhead, and linking.

I've been noticing fixincludes too.


Bernd
Chris Lattner Aug. 9, 2010, 4 p.m. UTC | #25
On Aug 9, 2010, at 7:50 AM, Mark Mitchell wrote:

> Also, I think that the number of instructions combine should consider
> might as well be a --param. ...  Then, distributors
> could set whatever default (2, 3, 4, 5, 100, whatever) they think
> appropriate for their users.
> 
> We still have to have the argument about FSF GCC, but it would be
> trivial for a distributor (or even an end user building from source) to
> adjust the default as they like.
...
> But, defaults are always arguable.  I think that for FSF GCC we
> shouldn't spend too much time worrying about it.  The userbase is too
> diverse to make it easy for us to please everybody.  Make it easy for
> distributors and let them pick.  They have a better chance of getting
> things set up correctly for their users, since their users are a
> narrower set of people.

I find this attitude to be really interesting, as the approach was similar in the -fomit-frame-pointer thread.  While it's always easy to add "yet another knob" and push the onus/responsibility back onto distributors, this is problematic in other ways.  In particular, this neutralizes one of the major advantages that GCC has brought to the FOSS world over the last couple decades: it provides a standardized interface to the C compiler which makefiles and programmers know.

I realize that this specific example isn't a big deal, and GCC has more params than normal distributors would actually care about in practice, but the general attitude is interesting.  If nothing else, if the distributor wanted to change something, they could always hack up the source before they ship it.

Is "throw in a param and let the distributors decide" really a great solution to issues like these?

-Chris
Richard Biener Aug. 9, 2010, 4:02 p.m. UTC | #26
On Mon, Aug 9, 2010 at 6:00 PM, Chris Lattner <clattner@apple.com> wrote:
>
> On Aug 9, 2010, at 7:50 AM, Mark Mitchell wrote:
>
>> Also, I think that the number of instructions combine should consider
>> might as well be a --param. ...  Then, distributors
>> could set whatever default (2, 3, 4, 5, 100, whatever) they think
>> appropriate for their users.
>>
>> We still have to have the argument about FSF GCC, but it would be
>> trivial for a distributor (or even an end user building from source) to
>> adjust the default as they like.
> ...
>> But, defaults are always arguable.  I think that for FSF GCC we
>> shouldn't spend too much time worrying about it.  The userbase is too
>> diverse to make it easy for us to please everybody.  Make it easy for
>> distributors and let them pick.  They have a better chance of getting
>> things set up correctly for their users, since their users are a
>> narrower set of people.
>
> I find this attitude to be really interesting, as the approach was similar in the -fomit-frame-pointer thread.  While it's always easy to add "yet another knob" and push the onus/responsibility back onto distributors, this is problematic in other ways.  In particular, this neutralizes one of the major advantages that GCC has brought to the FOSS world over the last couple decades: it provides a standardized interface to the C compiler which makefiles and programmers know.
>
> I realize that this specific example isn't a big deal, and GCC has more params than normal distributors would actually care about in practice, but the general attitude is interesting.  If nothing else, if the distributor wanted to change something, they could always hack up the source before they ship it.
>
> Is "throw in a param and let the distributors decide" really a great solution to issues like these?

No, it is not.  It also makes reproducing bugs harder and explodes
the testing matrix.

So I'd like to have _less_ knobs, not more.

Please.

Richard.

> -Chris
>
>
Mark Mitchell Aug. 9, 2010, 4:06 p.m. UTC | #27
Chris Lattner wrote:

> Is "throw in a param and let the distributors decide" really a great solution to issues like these?

Do you have a better one?

I see no inherent reason to expect that the right answer for small ARM
microcontrollers is the same as for MIPS control-plane processors or x86
server processors.  And, even for a given processor, I'm not sure that
all users want the same thing.  As you say, since it's open-source
software, users/distributors can always modify it anyhow.  A --param
just makes that easier, and exposes the choice to users.

I'm all for setting things up so that they work well by default.  But,
I'm skeptical that there's a "right answer" to this particular question.
Chris Lattner Aug. 9, 2010, 4:18 p.m. UTC | #28
On Aug 9, 2010, at 9:06 AM, Mark Mitchell wrote:

> Chris Lattner wrote:
> 
>> Is "throw in a param and let the distributors decide" really a great solution to issues like these?
> 
> Do you have a better one?

Yes, pick an answer.  Either one is better than a new param IMO.

> I see no inherent reason to expect that the right answer for small ARM
> microcontrollers is the same as for MIPS control-plane processors or x86
> server processors.

So make the setting be a target property?

> I'm all for setting things up so that they work well by default.  But,
> I'm skeptical that there's a "right answer" to this particular question.

It's even better to just rip out combine of course, but I understand that's not a short-term solution. :)

-Chris
Mark Mitchell Aug. 9, 2010, 4:27 p.m. UTC | #29
Chris Lattner wrote:

>>> Is "throw in a param and let the distributors decide" really a great solution to issues like these?
>> Do you have a better one?
> 
> Yes, pick an answer.  Either one is better than a new param IMO.

All I can say is that I disagree.

Even if we didn't want to expose this to users, it would be better if
the code were structured that way.  There's no good justification for
hard-coding constants into the compiler, and that's essentially what
we've done for combine.  We've just done it by coding the algorithm that
way instead of by having "3" or "4" somewhere in the code.

And, the general rule in GCC is that magic constants should be exposed
as --params.  If we want to change that rule, of course, we can do so,
but it's certainly been useful to people.

I don't recommend messing about with --params for most people.  In fact,
CodeSourcery has a knowledgebase entry recommending that people stick
with -Os, -O1, -O2, or -O3!  But, that doesn't mean they aren't useful.
Joseph Myers Aug. 9, 2010, 5:27 p.m. UTC | #30
On Mon, 9 Aug 2010, Richard Guenther wrote:

> > Is "throw in a param and let the distributors decide" really a great 
> > solution to issues like these?
> 
> No, it is not.  It also makes reproducing bugs harder and explodes
> the testing matrix.
> 
> So I'd like to have _less_ knobs, not more.

Knobs for users are at least better than configure-time knobs (and params 
are to a large extent knobs for GCC developers rather than for normal 
users).  For almost any piece of free software - not just GCC, or even 
mainly GCC - it's a pain for those using the software on multiple 
GNU/Linux distributions, or on GNU/Linux and on other systems, when the 
same software gratuitously behaves differently on different systems 
because the respective distributors / builders decided to configure it 
differently.  For the multi-platform user it's better not to have all the 
configure-time knobs, so they only need to set up their personal 
configuration to override cases where their taste differs from one fixed 
default, and not override every case where one distributor has decided to 
change the default.
Chris Lattner Aug. 10, 2010, 3:32 p.m. UTC | #31
On Aug 9, 2010, at 9:27 AM, Mark Mitchell wrote:

> Chris Lattner wrote:
> 
>>>> Is "throw in a param and let the distributors decide" really a great solution to issues like these?
>>> Do you have a better one?
>> 
>> Yes, pick an answer.  Either one is better than a new param IMO.
> 
> All I can say is that I disagree.
> 
> Even if we didn't want to expose this to users, it would be better if
> the code were structured that way.  There's no good justification for
> hard-coding constants into the compiler, and that's essentially what
> we've done for combine.  We've just done it by coding the algorithm that
> way instead of by having "3" or "4" somewhere in the code.

Sure, cleaning up and generalizing the code is different from suggesting to distributors that they make diverging variants of GCC.

> And, the general rule in GCC is that magic constants should be exposed
> as --params.  If we want to change that rule, of course, we can do so,
> but it's certainly been useful to people.
> 
> I don't recommend messing about with --params for most people.  In fact,
> CodeSourcery has a knowledgebase entry recommending that people stick
> with -Os, -O1, -O2, or -O3!  But, that doesn't mean they aren't useful.

I think that suggesting people stick to -OX is a great idea :)

-Chris
Ian Lance Taylor Aug. 12, 2010, 3:50 a.m. UTC | #32
On Fri, Aug 6, 2010 at 11:56 AM, Vladimir N. Makarov
<vmakarov@redhat.com> wrote:
> On 08/06/2010 01:22 PM, Steven Bosscher wrote:

>> Agreed. There is the work from Preston Briggs, but that appears to
>> have gone no-where, unfortunately.
>>
> IMHO the code should have become public if we want to see a progress on the
> problem solution.  But it is a Google property and they probably decided not
> to give it gcc community.  Although I heard that Ken Zadeck has access to
> it.  We will see what the final result will be.
>
> I only can guess from speculations on info I have that Preston's code is
> probably not so valuable for GCC because as I understand the backend was
> only specialized for x86/x86_64.

The work by Preston and others was incomplete, was x86 specific, did
not work with existing MD files, and the tests that worked showed that
it didn't do any better than IRA.  So the project was cancelled.

There is nothing secret about the code, but since it essentially means
replacing the RTL backend I'm not sure it's useful in the context of
gcc.  It could only ever have been used if it were significantly
better than what we have today.

Ian
diff mbox

Patch

Index: combine.c
===================================================================
--- combine.c	(revision 162821)
+++ combine.c	(working copy)
@@ -385,10 +385,10 @@  static void init_reg_last (void);
 static void setup_incoming_promotions (rtx);
 static void set_nonzero_bits_and_sign_copies (rtx, const_rtx, void *);
 static int cant_combine_insn_p (rtx);
-static int can_combine_p (rtx, rtx, rtx, rtx, rtx *, rtx *);
-static int combinable_i3pat (rtx, rtx *, rtx, rtx, int, rtx *);
+static int can_combine_p (rtx, rtx, rtx, rtx, rtx, rtx, rtx *, rtx *);
+static int combinable_i3pat (rtx, rtx *, rtx, rtx, rtx, int, int, rtx *);
 static int contains_muldiv (rtx);
-static rtx try_combine (rtx, rtx, rtx, int *);
+static rtx try_combine (rtx, rtx, rtx, rtx, int *);
 static void undo_all (void);
 static void undo_commit (void);
 static rtx *find_split_point (rtx *, rtx, bool);
@@ -438,7 +438,7 @@  static void reg_dead_at_p_1 (rtx, const_
 static int reg_dead_at_p (rtx, rtx);
 static void move_deaths (rtx, rtx, int, rtx, rtx *);
 static int reg_bitfield_target_p (rtx, rtx);
-static void distribute_notes (rtx, rtx, rtx, rtx, rtx, rtx);
+static void distribute_notes (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
 static void distribute_links (rtx);
 static void mark_used_regs_combine (rtx);
 static void record_promoted_value (rtx, rtx);
@@ -766,7 +766,7 @@  do_SUBST_MODE (rtx *into, enum machine_m
 
 /* Subroutine of try_combine.  Determine whether the combine replacement
    patterns NEWPAT, NEWI2PAT and NEWOTHERPAT are cheaper according to
-   insn_rtx_cost that the original instruction sequence I1, I2, I3 and
+   insn_rtx_cost that the original instruction sequence I0, I1, I2, I3 and
    undobuf.other_insn.  Note that I1 and/or NEWI2PAT may be NULL_RTX.
    NEWOTHERPAT and undobuf.other_insn may also both be NULL_RTX.  This
    function returns false, if the costs of all instructions can be
@@ -774,10 +774,10 @@  do_SUBST_MODE (rtx *into, enum machine_m
    sequence.  */
 
 static bool
-combine_validate_cost (rtx i1, rtx i2, rtx i3, rtx newpat, rtx newi2pat,
-		       rtx newotherpat)
+combine_validate_cost (rtx i0, rtx i1, rtx i2, rtx i3, rtx newpat,
+		       rtx newi2pat, rtx newotherpat)
 {
-  int i1_cost, i2_cost, i3_cost;
+  int i0_cost, i1_cost, i2_cost, i3_cost;
   int new_i2_cost, new_i3_cost;
   int old_cost, new_cost;
 
@@ -788,13 +788,23 @@  combine_validate_cost (rtx i1, rtx i2, r
   if (i1)
     {
       i1_cost = INSN_COST (i1);
-      old_cost = (i1_cost > 0 && i2_cost > 0 && i3_cost > 0)
-		 ? i1_cost + i2_cost + i3_cost : 0;
+      if (i0)
+	{
+	  i0_cost = INSN_COST (i0);
+	  old_cost = (i0_cost > 0 && i1_cost > 0 && i2_cost > 0 && i3_cost > 0
+		      ? i0_cost + i1_cost + i2_cost + i3_cost : 0);
+	}
+      else
+	{
+	  old_cost = (i1_cost > 0 && i2_cost > 0 && i3_cost > 0
+		      ? i1_cost + i2_cost + i3_cost : 0);
+	  i0_cost = 0;
+	}
     }
   else
     {
       old_cost = (i2_cost > 0 && i3_cost > 0) ? i2_cost + i3_cost : 0;
-      i1_cost = 0;
+      i1_cost = i0_cost = 0;
     }
 
   /* Calculate the replacement insn_rtx_costs.  */
@@ -833,7 +843,16 @@  combine_validate_cost (rtx i1, rtx i2, r
     {
       if (dump_file)
 	{
-	  if (i1)
+	  if (i0)
+	    {
+	      fprintf (dump_file,
+		       "rejecting combination of insns %d, %d, %d and %d\n",
+		       INSN_UID (i0), INSN_UID (i1), INSN_UID (i2),
+		       INSN_UID (i3));
+	      fprintf (dump_file, "original costs %d + %d + %d + %d = %d\n",
+		       i0_cost, i1_cost, i2_cost, i3_cost, old_cost);
+	    }
+	  else if (i1)
 	    {
 	      fprintf (dump_file,
 		       "rejecting combination of insns %d, %d and %d\n",
@@ -1010,6 +1029,19 @@  clear_log_links (void)
     if (INSN_P (insn))
       free_INSN_LIST_list (&LOG_LINKS (insn));
 }
+
+/* Walk the LOG_LINKS of insn B to see if we find a reference to A.  Return
+   true if we found a LOG_LINK that proves that A feeds B.  */
+
+static bool
+insn_a_feeds_b (rtx a, rtx b)
+{
+  rtx links;
+  for (links = LOG_LINKS (b); links; links = XEXP (links, 1))
+    if (XEXP (links, 0) == a)
+      return true;
+  return false;
+}
 
 /* Main entry point for combiner.  F is the first insn of the function.
    NREGS is the first unused pseudo-reg number.
@@ -1150,7 +1182,7 @@  combine_instructions (rtx f, unsigned in
 	      /* Try this insn with each insn it links back to.  */
 
 	      for (links = LOG_LINKS (insn); links; links = XEXP (links, 1))
-		if ((next = try_combine (insn, XEXP (links, 0),
+		if ((next = try_combine (insn, XEXP (links, 0), NULL_RTX,
 					 NULL_RTX, &new_direct_jump_p)) != 0)
 		  goto retry;
 
@@ -1168,8 +1200,8 @@  combine_instructions (rtx f, unsigned in
 		  for (nextlinks = LOG_LINKS (link);
 		       nextlinks;
 		       nextlinks = XEXP (nextlinks, 1))
-		    if ((next = try_combine (insn, link,
-					     XEXP (nextlinks, 0),
+		    if ((next = try_combine (insn, link, XEXP (nextlinks, 0),
+					     NULL_RTX,
 					     &new_direct_jump_p)) != 0)
 		      goto retry;
 		}
@@ -1187,14 +1219,14 @@  combine_instructions (rtx f, unsigned in
 		  && NONJUMP_INSN_P (prev)
 		  && sets_cc0_p (PATTERN (prev)))
 		{
-		  if ((next = try_combine (insn, prev,
-					   NULL_RTX, &new_direct_jump_p)) != 0)
+		  if ((next = try_combine (insn, prev, NULL_RTX, NULL_RTX,
+					   &new_direct_jump_p)) != 0)
 		    goto retry;
 
 		  for (nextlinks = LOG_LINKS (prev); nextlinks;
 		       nextlinks = XEXP (nextlinks, 1))
-		    if ((next = try_combine (insn, prev,
-					     XEXP (nextlinks, 0),
+		    if ((next = try_combine (insn, prev, XEXP (nextlinks, 0),
+					     NULL_RTX,
 					     &new_direct_jump_p)) != 0)
 		      goto retry;
 		}
@@ -1207,14 +1239,14 @@  combine_instructions (rtx f, unsigned in
 		  && GET_CODE (PATTERN (insn)) == SET
 		  && reg_mentioned_p (cc0_rtx, SET_SRC (PATTERN (insn))))
 		{
-		  if ((next = try_combine (insn, prev,
-					   NULL_RTX, &new_direct_jump_p)) != 0)
+		  if ((next = try_combine (insn, prev, NULL_RTX, NULL_RTX,
+					   &new_direct_jump_p)) != 0)
 		    goto retry;
 
 		  for (nextlinks = LOG_LINKS (prev); nextlinks;
 		       nextlinks = XEXP (nextlinks, 1))
-		    if ((next = try_combine (insn, prev,
-					     XEXP (nextlinks, 0),
+		    if ((next = try_combine (insn, prev, XEXP (nextlinks, 0),
+					     NULL_RTX,
 					     &new_direct_jump_p)) != 0)
 		      goto retry;
 		}
@@ -1230,7 +1262,8 @@  combine_instructions (rtx f, unsigned in
 		    && NONJUMP_INSN_P (prev)
 		    && sets_cc0_p (PATTERN (prev))
 		    && (next = try_combine (insn, XEXP (links, 0),
-					    prev, &new_direct_jump_p)) != 0)
+					    prev, NULL_RTX,
+					    &new_direct_jump_p)) != 0)
 		  goto retry;
 #endif
 
@@ -1240,10 +1273,64 @@  combine_instructions (rtx f, unsigned in
 		for (nextlinks = XEXP (links, 1); nextlinks;
 		     nextlinks = XEXP (nextlinks, 1))
 		  if ((next = try_combine (insn, XEXP (links, 0),
-					   XEXP (nextlinks, 0),
+					   XEXP (nextlinks, 0), NULL_RTX,
 					   &new_direct_jump_p)) != 0)
 		    goto retry;
 
+	      /* Try four-instruction combinations.  */
+	      for (links = LOG_LINKS (insn); links; links = XEXP (links, 1))
+		{
+		  rtx next1;
+		  rtx link = XEXP (links, 0);
+
+		  /* If the linked insn has been replaced by a note, then there
+		     is no point in pursuing this chain any further.  */
+		  if (NOTE_P (link))
+		    continue;
+
+		  for (next1 = LOG_LINKS (link); next1; next1 = XEXP (next1, 1))
+		    {
+		      rtx link1 = XEXP (next1, 0);
+		      if (NOTE_P (link1))
+			continue;
+		      /* I0 -> I1 -> I2 -> I3.  */
+		      for (nextlinks = LOG_LINKS (link1); nextlinks;
+			   nextlinks = XEXP (nextlinks, 1))
+			if ((next = try_combine (insn, link, link1,
+						 XEXP (nextlinks, 0),
+						 &new_direct_jump_p)) != 0)
+			  goto retry;
+		      /* I0, I1 -> I2, I2 -> I3.  */
+		      for (nextlinks = XEXP (next1, 1); nextlinks;
+			   nextlinks = XEXP (nextlinks, 1))
+			if ((next = try_combine (insn, link, link1,
+						 XEXP (nextlinks, 0),
+						 &new_direct_jump_p)) != 0)
+			  goto retry;
+		    }
+
+		  for (next1 = XEXP (links, 1); next1; next1 = XEXP (next1, 1))
+		    {
+		      rtx link1 = XEXP (next1, 0);
+		      if (NOTE_P (link1))
+			continue;
+		      /* I0 -> I2; I1, I2 -> I3.  */
+		      for (nextlinks = LOG_LINKS (link); nextlinks;
+			   nextlinks = XEXP (nextlinks, 1))
+			if ((next = try_combine (insn, link, link1,
+						 XEXP (nextlinks, 0),
+						 &new_direct_jump_p)) != 0)
+			  goto retry;
+		      /* I0 -> I1; I1, I2 -> I3.  */
+		      for (nextlinks = LOG_LINKS (link1); nextlinks;
+			   nextlinks = XEXP (nextlinks, 1))
+			if ((next = try_combine (insn, link, link1,
+						 XEXP (nextlinks, 0),
+						 &new_direct_jump_p)) != 0)
+			  goto retry;
+		    }
+		}
+
 	      /* Try this insn with each REG_EQUAL note it links back to.  */
 	      for (links = LOG_LINKS (insn); links; links = XEXP (links, 1))
 		{
@@ -1267,7 +1354,7 @@  combine_instructions (rtx f, unsigned in
 		      i2mod = temp;
 		      i2mod_old_rhs = copy_rtx (orig);
 		      i2mod_new_rhs = copy_rtx (note);
-		      next = try_combine (insn, i2mod, NULL_RTX,
+		      next = try_combine (insn, i2mod, NULL_RTX, NULL_RTX,
 					  &new_direct_jump_p);
 		      i2mod = NULL_RTX;
 		      if (next)
@@ -1529,9 +1616,10 @@  set_nonzero_bits_and_sign_copies (rtx x,
     }
 }
 
-/* See if INSN can be combined into I3.  PRED and SUCC are optionally
-   insns that were previously combined into I3 or that will be combined
-   into the merger of INSN and I3.
+/* See if INSN can be combined into I3.  PRED, PRED2, SUCC and SUCC2 are
+   optionally insns that were previously combined into I3 or that will be
+   combined into the merger of INSN and I3.  The order is PRED, PRED2,
+   INSN, SUCC, SUCC2, I3.
 
    Return 0 if the combination is not allowed for any reason.
 
@@ -1540,7 +1628,8 @@  set_nonzero_bits_and_sign_copies (rtx x,
    will return 1.  */
 
 static int
-can_combine_p (rtx insn, rtx i3, rtx pred ATTRIBUTE_UNUSED, rtx succ,
+can_combine_p (rtx insn, rtx i3, rtx pred ATTRIBUTE_UNUSED,
+	       rtx pred2 ATTRIBUTE_UNUSED, rtx succ, rtx succ2,
 	       rtx *pdest, rtx *psrc)
 {
   int i;
@@ -1550,10 +1639,25 @@  can_combine_p (rtx insn, rtx i3, rtx pre
 #ifdef AUTO_INC_DEC
   rtx link;
 #endif
-  int all_adjacent = (succ ? (next_active_insn (insn) == succ
-			      && next_active_insn (succ) == i3)
-		      : next_active_insn (insn) == i3);
+  bool all_adjacent = true;
 
+  if (succ)
+    {
+      if (succ2)
+	{
+	  if (next_active_insn (succ2) != i3)
+	    all_adjacent = false;
+	  if (next_active_insn (succ) != succ2)
+	    all_adjacent = false;
+	}
+      else if (next_active_insn (succ) != i3)
+	all_adjacent = false;
+      if (next_active_insn (insn) != succ)
+	all_adjacent = false;
+    }
+  else if (next_active_insn (insn) != i3)
+    all_adjacent = false;
+    
   /* Can combine only if previous insn is a SET of a REG, a SUBREG or CC0.
      or a PARALLEL consisting of such a SET and CLOBBERs.
 
@@ -1678,11 +1782,15 @@  can_combine_p (rtx insn, rtx i3, rtx pre
       /* Don't substitute into an incremented register.  */
       || FIND_REG_INC_NOTE (i3, dest)
       || (succ && FIND_REG_INC_NOTE (succ, dest))
+      || (succ2 && FIND_REG_INC_NOTE (succ2, dest))
       /* Don't substitute into a non-local goto, this confuses CFG.  */
       || (JUMP_P (i3) && find_reg_note (i3, REG_NON_LOCAL_GOTO, NULL_RTX))
       /* Make sure that DEST is not used after SUCC but before I3.  */
-      || (succ && ! all_adjacent
-	  && reg_used_between_p (dest, succ, i3))
+      || (!all_adjacent
+	  && ((succ2
+	       && (reg_used_between_p (dest, succ2, i3)
+		   || reg_used_between_p (dest, succ, succ2)))
+	      || (!succ2 && succ && reg_used_between_p (dest, succ, i3))))
       /* Make sure that the value that is to be substituted for the register
 	 does not use any registers whose values alter in between.  However,
 	 If the insns are adjacent, a use can't cross a set even though we
@@ -1765,13 +1873,12 @@  can_combine_p (rtx insn, rtx i3, rtx pre
 
   if (GET_CODE (src) == ASM_OPERANDS || volatile_refs_p (src))
     {
-      /* Make sure succ doesn't contain a volatile reference.  */
+      /* Make sure neither succ nor succ2 contains a volatile reference.  */
+      if (succ2 != 0 && volatile_refs_p (PATTERN (succ2)))
+	return 0;
       if (succ != 0 && volatile_refs_p (PATTERN (succ)))
 	return 0;
-
-      for (p = NEXT_INSN (insn); p != i3; p = NEXT_INSN (p))
-	if (INSN_P (p) && p != succ && volatile_refs_p (PATTERN (p)))
-	  return 0;
+      /* We'll check insns between INSN and I3 below.  */
     }
 
   /* If INSN is an asm, and DEST is a hard register, reject, since it has
@@ -1785,7 +1892,7 @@  can_combine_p (rtx insn, rtx i3, rtx pre
      they might affect machine state.  */
 
   for (p = NEXT_INSN (insn); p != i3; p = NEXT_INSN (p))
-    if (INSN_P (p) && p != succ && volatile_insn_p (PATTERN (p)))
+    if (INSN_P (p) && p != succ && p != succ2 && volatile_insn_p (PATTERN (p)))
       return 0;
 
   /* If INSN contains an autoincrement or autodecrement, make sure that
@@ -1801,8 +1908,12 @@  can_combine_p (rtx insn, rtx i3, rtx pre
 	    || reg_used_between_p (XEXP (link, 0), insn, i3)
 	    || (pred != NULL_RTX
 		&& reg_overlap_mentioned_p (XEXP (link, 0), PATTERN (pred)))
+	    || (pred2 != NULL_RTX
+		&& reg_overlap_mentioned_p (XEXP (link, 0), PATTERN (pred2)))
 	    || (succ != NULL_RTX
 		&& reg_overlap_mentioned_p (XEXP (link, 0), PATTERN (succ)))
+	    || (succ2 != NULL_RTX
+		&& reg_overlap_mentioned_p (XEXP (link, 0), PATTERN (succ2)))
 	    || reg_overlap_mentioned_p (XEXP (link, 0), PATTERN (i3))))
       return 0;
 #endif
@@ -1836,8 +1947,8 @@  can_combine_p (rtx insn, rtx i3, rtx pre
    of a PARALLEL of the pattern.  We validate that it is valid for combining.
 
    One problem is if I3 modifies its output, as opposed to replacing it
-   entirely, we can't allow the output to contain I2DEST or I1DEST as doing
-   so would produce an insn that is not equivalent to the original insns.
+   entirely, we can't allow the output to contain I2DEST, I1DEST or I0DEST as
+   doing so would produce an insn that is not equivalent to the original insns.
 
    Consider:
 
@@ -1858,7 +1969,8 @@  can_combine_p (rtx insn, rtx i3, rtx pre
    must reject the combination.  This case occurs when I2 and I1 both
    feed into I3, rather than when I1 feeds into I2, which feeds into I3.
    If I1_NOT_IN_SRC is nonzero, it means that finding I1 in the source
-   of a SET must prevent combination from occurring.
+   of a SET must prevent combination from occurring.  The same situation
+   can occur for I0, in which case I0_NOT_IN_SRC is set.
 
    Before doing the above check, we first try to expand a field assignment
    into a set of logical operations.
@@ -1870,8 +1982,8 @@  can_combine_p (rtx insn, rtx i3, rtx pre
    Return 1 if the combination is valid, zero otherwise.  */
 
 static int
-combinable_i3pat (rtx i3, rtx *loc, rtx i2dest, rtx i1dest,
-		  int i1_not_in_src, rtx *pi3dest_killed)
+combinable_i3pat (rtx i3, rtx *loc, rtx i2dest, rtx i1dest, rtx i0dest,
+		  int i1_not_in_src, int i0_not_in_src, rtx *pi3dest_killed)
 {
   rtx x = *loc;
 
@@ -1895,9 +2007,11 @@  combinable_i3pat (rtx i3, rtx *loc, rtx 
       if ((inner_dest != dest &&
 	   (!MEM_P (inner_dest)
 	    || rtx_equal_p (i2dest, inner_dest)
-	    || (i1dest && rtx_equal_p (i1dest, inner_dest)))
+	    || (i1dest && rtx_equal_p (i1dest, inner_dest))
+	    || (i0dest && rtx_equal_p (i0dest, inner_dest)))
 	   && (reg_overlap_mentioned_p (i2dest, inner_dest)
-	       || (i1dest && reg_overlap_mentioned_p (i1dest, inner_dest))))
+	       || (i1dest && reg_overlap_mentioned_p (i1dest, inner_dest))
+	       || (i0dest && reg_overlap_mentioned_p (i0dest, inner_dest))))
 
 	  /* This is the same test done in can_combine_p except we can't test
 	     all_adjacent; we don't have to, since this instruction will stay
@@ -1913,7 +2027,8 @@  combinable_i3pat (rtx i3, rtx *loc, rtx 
 	      && REGNO (inner_dest) < FIRST_PSEUDO_REGISTER
 	      && (! HARD_REGNO_MODE_OK (REGNO (inner_dest),
 					GET_MODE (inner_dest))))
-	  || (i1_not_in_src && reg_overlap_mentioned_p (i1dest, src)))
+	  || (i1_not_in_src && reg_overlap_mentioned_p (i1dest, src))
+	  || (i0_not_in_src && reg_overlap_mentioned_p (i0dest, src)))
 	return 0;
 
       /* If DEST is used in I3, it is being killed in this insn, so
@@ -1953,8 +2068,8 @@  combinable_i3pat (rtx i3, rtx *loc, rtx 
       int i;
 
       for (i = 0; i < XVECLEN (x, 0); i++)
-	if (! combinable_i3pat (i3, &XVECEXP (x, 0, i), i2dest, i1dest,
-				i1_not_in_src, pi3dest_killed))
+	if (! combinable_i3pat (i3, &XVECEXP (x, 0, i), i2dest, i1dest, i0dest,
+				i1_not_in_src, i0_not_in_src, pi3dest_killed))
 	  return 0;
     }
 
@@ -2364,15 +2479,15 @@  update_cfg_for_uncondjump (rtx insn)
     single_succ_edge (bb)->flags |= EDGE_FALLTHRU;
 }
 
+/* Try to combine the insns I0, I1 and I2 into I3.
+   Here I0, I1 and I2 appear earlier than I3.
+   I0 and I1 can be zero; then we combine just I2 into I3, or I1 and I2 into
+   I3.
 
-/* Try to combine the insns I1 and I2 into I3.
-   Here I1 and I2 appear earlier than I3.
-   I1 can be zero; then we combine just I2 into I3.
-
-   If we are combining three insns and the resulting insn is not recognized,
-   try splitting it into two insns.  If that happens, I2 and I3 are retained
-   and I1 is pseudo-deleted by turning it into a NOTE.  Otherwise, I1 and I2
-   are pseudo-deleted.
+   If we are combining more than two insns and the resulting insn is not
+   recognized, try splitting it into two insns.  If that happens, I2 and I3
+   are retained and I1/I0 are pseudo-deleted by turning them into a NOTE.
+   Otherwise, I0, I1 and I2 are pseudo-deleted.
 
    Return 0 if the combination does not work.  Then nothing is changed.
    If we did the combination, return the insn at which combine should
@@ -2382,34 +2497,38 @@  update_cfg_for_uncondjump (rtx insn)
    new direct jump instruction.  */
 
 static rtx
-try_combine (rtx i3, rtx i2, rtx i1, int *new_direct_jump_p)
+try_combine (rtx i3, rtx i2, rtx i1, rtx i0, int *new_direct_jump_p)
 {
   /* New patterns for I3 and I2, respectively.  */
   rtx newpat, newi2pat = 0;
   rtvec newpat_vec_with_clobbers = 0;
-  int substed_i2 = 0, substed_i1 = 0;
-  /* Indicates need to preserve SET in I1 or I2 in I3 if it is not dead.  */
-  int added_sets_1, added_sets_2;
+  int substed_i2 = 0, substed_i1 = 0, substed_i0 = 0;
+  /* Indicates need to preserve SET in I0, I1 or I2 in I3 if it is not
+     dead.  */
+  int added_sets_0, added_sets_1, added_sets_2;
   /* Total number of SETs to put into I3.  */
   int total_sets;
-  /* Nonzero if I2's body now appears in I3.  */
-  int i2_is_used;
+  /* Nonzero if I2's or I1's body now appears in I3.  */
+  int i2_is_used, i1_is_used;
   /* INSN_CODEs for new I3, new I2, and user of condition code.  */
   int insn_code_number, i2_code_number = 0, other_code_number = 0;
   /* Contains I3 if the destination of I3 is used in its source, which means
      that the old life of I3 is being killed.  If that usage is placed into
      I2 and not in I3, a REG_DEAD note must be made.  */
   rtx i3dest_killed = 0;
-  /* SET_DEST and SET_SRC of I2 and I1.  */
-  rtx i2dest = 0, i2src = 0, i1dest = 0, i1src = 0;
+  /* SET_DEST and SET_SRC of I2, I1 and I0.  */
+  rtx i2dest = 0, i2src = 0, i1dest = 0, i1src = 0, i0dest = 0, i0src = 0;
   /* Set if I2DEST was reused as a scratch register.  */
   bool i2scratch = false;
-  /* PATTERN (I1) and PATTERN (I2), or a copy of it in certain cases.  */
-  rtx i1pat = 0, i2pat = 0;
+  /* The PATTERNs of I0, I1, and I2, or a copy of them in certain cases.  */
+  rtx i0pat = 0, i1pat = 0, i2pat = 0;
   /* Indicates if I2DEST or I1DEST is in I2SRC or I1_SRC.  */
   int i2dest_in_i2src = 0, i1dest_in_i1src = 0, i2dest_in_i1src = 0;
-  int i2dest_killed = 0, i1dest_killed = 0;
+  int i0dest_in_i0src = 0, i1dest_in_i0src = 0, i2dest_in_i0src = 0;
+  int i2dest_killed = 0, i1dest_killed = 0, i0dest_killed;
   int i1_feeds_i3 = 0;
+  int i1_feeds_i3_n = 0, i1_feeds_i2_n = 0, i0_feeds_i3_n = 0;
+  int i0_feeds_i2_n = 0, i0_feeds_i1_n = 0;
   /* Notes that must be added to REG_NOTES in I3 and I2.  */
   rtx new_i3_notes, new_i2_notes;
   /* Notes that we substituted I3 into I2 instead of the normal case.  */
@@ -2431,6 +2550,7 @@  try_combine (rtx i3, rtx i2, rtx i1, int
   if (cant_combine_insn_p (i3)
       || cant_combine_insn_p (i2)
       || (i1 && cant_combine_insn_p (i1))
+      || (i0 && cant_combine_insn_p (i0))
       || likely_spilled_retval_p (i3))
     return 0;
 
@@ -2442,7 +2562,10 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
-      if (i1)
+      if (i0)
+	fprintf (dump_file, "\nTrying %d, %d, %d -> %d:\n",
+		 INSN_UID (i0), INSN_UID (i1), INSN_UID (i2), INSN_UID (i3));
+      else if (i1)
 	fprintf (dump_file, "\nTrying %d, %d -> %d:\n",
 		 INSN_UID (i1), INSN_UID (i2), INSN_UID (i3));
       else
@@ -2450,8 +2573,12 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 		 INSN_UID (i2), INSN_UID (i3));
     }
 
-  /* If I1 and I2 both feed I3, they can be in any order.  To simplify the
-     code below, set I1 to be the earlier of the two insns.  */
+  /* If multiple insns feed into one of I2 or I3, they can be in any
+     order.  To simplify the code below, reorder them in sequence.  */
+  if (i0 && DF_INSN_LUID (i0) > DF_INSN_LUID (i2))
+    temp = i2, i2 = i0, i0 = temp;
+  if (i0 && DF_INSN_LUID (i0) > DF_INSN_LUID (i1))
+    temp = i1, i1 = i0, i0 = temp;
   if (i1 && DF_INSN_LUID (i1) > DF_INSN_LUID (i2))
     temp = i1, i1 = i2, i2 = temp;
 
@@ -2673,8 +2800,11 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 #endif
 
   /* Verify that I2 and I1 are valid for combining.  */
-  if (! can_combine_p (i2, i3, i1, NULL_RTX, &i2dest, &i2src)
-      || (i1 && ! can_combine_p (i1, i3, NULL_RTX, i2, &i1dest, &i1src)))
+  if (! can_combine_p (i2, i3, i0, i1, NULL_RTX, NULL_RTX, &i2dest, &i2src)
+      || (i1 && ! can_combine_p (i1, i3, i0, NULL_RTX, i2, NULL_RTX,
+				 &i1dest, &i1src))
+      || (i0 && ! can_combine_p (i0, i3, NULL_RTX, NULL_RTX, i1, i2,
+				 &i0dest, &i0src)))
     {
       undo_all ();
       return 0;
@@ -2685,16 +2815,27 @@  try_combine (rtx i3, rtx i2, rtx i1, int
   i2dest_in_i2src = reg_overlap_mentioned_p (i2dest, i2src);
   i1dest_in_i1src = i1 && reg_overlap_mentioned_p (i1dest, i1src);
   i2dest_in_i1src = i1 && reg_overlap_mentioned_p (i2dest, i1src);
+  i0dest_in_i0src = i0 && reg_overlap_mentioned_p (i0dest, i0src);
+  i1dest_in_i0src = i0 && reg_overlap_mentioned_p (i1dest, i0src);
+  i2dest_in_i0src = i0 && reg_overlap_mentioned_p (i2dest, i0src);
   i2dest_killed = dead_or_set_p (i2, i2dest);
   i1dest_killed = i1 && dead_or_set_p (i1, i1dest);
+  i0dest_killed = i0 && dead_or_set_p (i0, i0dest);
 
   /* See if I1 directly feeds into I3.  It does if I1DEST is not used
      in I2SRC.  */
   i1_feeds_i3 = i1 && ! reg_overlap_mentioned_p (i1dest, i2src);
+  i1_feeds_i2_n = i1 && insn_a_feeds_b (i1, i2);
+  i1_feeds_i3_n = i1 && insn_a_feeds_b (i1, i3);
+  i0_feeds_i3_n = i0 && insn_a_feeds_b (i0, i3);
+  i0_feeds_i2_n = i0 && insn_a_feeds_b (i0, i2);
+  i0_feeds_i1_n = i0 && insn_a_feeds_b (i0, i1);
 
   /* Ensure that I3's pattern can be the destination of combines.  */
-  if (! combinable_i3pat (i3, &PATTERN (i3), i2dest, i1dest,
+  if (! combinable_i3pat (i3, &PATTERN (i3), i2dest, i1dest, i0dest,
 			  i1 && i2dest_in_i1src && i1_feeds_i3,
+			  i0 && ((i2dest_in_i0src && i0_feeds_i3_n)
+				 || (i1dest_in_i0src && !i0_feeds_i1_n)),
 			  &i3dest_killed))
     {
       undo_all ();
@@ -2706,6 +2847,7 @@  try_combine (rtx i3, rtx i2, rtx i1, int
      here.  */
   if (GET_CODE (i2src) == MULT
       || (i1 != 0 && GET_CODE (i1src) == MULT)
+      || (i0 != 0 && GET_CODE (i0src) == MULT)
       || (GET_CODE (PATTERN (i3)) == SET
 	  && GET_CODE (SET_SRC (PATTERN (i3))) == MULT))
     have_mult = 1;
@@ -2745,14 +2887,22 @@  try_combine (rtx i3, rtx i2, rtx i1, int
      feed into I3, the set in I1 needs to be kept around if I1DEST dies
      or is set in I3.  Otherwise (if I1 feeds I2 which feeds I3), the set
      in I1 needs to be kept around unless I1DEST dies or is set in either
-     I2 or I3.  We can distinguish these cases by seeing if I2SRC mentions
-     I1DEST.  If so, we know I1 feeds into I2.  */
+     I2 or I3.  The same consideration applies to I0.  */
 
-  added_sets_2 = ! dead_or_set_p (i3, i2dest);
+  added_sets_2 = !dead_or_set_p (i3, i2dest);
 
-  added_sets_1
-    = i1 && ! (i1_feeds_i3 ? dead_or_set_p (i3, i1dest)
-	       : (dead_or_set_p (i3, i1dest) || dead_or_set_p (i2, i1dest)));
+  if (i1)
+    added_sets_1 = !((i1_feeds_i3_n && dead_or_set_p (i3, i1dest))
+		     || (i1_feeds_i2_n && dead_or_set_p (i2, i1dest)));
+  else
+    added_sets_1 = 0;
+
+  if (i0)
+    added_sets_0 =  !((i0_feeds_i3_n && dead_or_set_p (i3, i0dest))
+		      || (i0_feeds_i2_n && dead_or_set_p (i2, i0dest))
+		      || (i0_feeds_i1_n && dead_or_set_p (i1, i0dest)));
+  else
+    added_sets_0 = 0;
 
   /* If the set in I2 needs to be kept around, we must make a copy of
      PATTERN (I2), so that when we substitute I1SRC for I1DEST in
@@ -2777,6 +2927,14 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 	i1pat = copy_rtx (PATTERN (i1));
     }
 
+  if (added_sets_0)
+    {
+      if (GET_CODE (PATTERN (i0)) == PARALLEL)
+	i0pat = gen_rtx_SET (VOIDmode, i0dest, copy_rtx (i0src));
+      else
+	i0pat = copy_rtx (PATTERN (i0));
+    }
+
   combine_merges++;
 
   /* Substitute in the latest insn for the regs set by the earlier ones.  */
@@ -2825,8 +2983,8 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 					      i2src, const0_rtx))
 	      != GET_MODE (SET_DEST (newpat))))
 	{
-	  if (can_change_dest_mode(SET_DEST (newpat), added_sets_2,
-				   compare_mode))
+	  if (can_change_dest_mode (SET_DEST (newpat), added_sets_2,
+				    compare_mode))
 	    {
 	      unsigned int regno = REGNO (SET_DEST (newpat));
 	      rtx new_dest;
@@ -2889,13 +3047,14 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 
       n_occurrences = 0;		/* `subst' counts here */
 
-      /* If I1 feeds into I2 (not into I3) and I1DEST is in I1SRC, we
-	 need to make a unique copy of I2SRC each time we substitute it
-	 to avoid self-referential rtl.  */
+      /* If I1 feeds into I2 and I1DEST is in I1SRC, we need to make a
+	 unique copy of I2SRC each time we substitute it to avoid
+	 self-referential rtl.  */
 
       subst_low_luid = DF_INSN_LUID (i2);
       newpat = subst (PATTERN (i3), i2dest, i2src, 0,
-		      ! i1_feeds_i3 && i1dest_in_i1src);
+		      ((i1_feeds_i2_n && i1dest_in_i1src)
+		       || (i0_feeds_i2_n && i0dest_in_i0src)));
       substed_i2 = 1;
 
       /* Record whether i2's body now appears within i3's body.  */
@@ -2911,13 +3070,14 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 	 This happens if I1DEST is mentioned in I2 and dies there, and
 	 has disappeared from the new pattern.  */
       if ((FIND_REG_INC_NOTE (i1, NULL_RTX) != 0
-	   && !i1_feeds_i3
+	   && i1_feeds_i2_n
 	   && dead_or_set_p (i2, i1dest)
 	   && !reg_overlap_mentioned_p (i1dest, newpat))
 	  /* Before we can do this substitution, we must redo the test done
 	     above (see detailed comments there) that ensures  that I1DEST
 	     isn't mentioned in any SETs in NEWPAT that are field assignments.  */
-          || !combinable_i3pat (NULL_RTX, &newpat, i1dest, NULL_RTX, 0, 0))
+          || !combinable_i3pat (NULL_RTX, &newpat, i1dest, NULL_RTX, NULL_RTX,
+				0, 0, 0))
 	{
 	  undo_all ();
 	  return 0;
@@ -2925,8 +3085,29 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 
       n_occurrences = 0;
       subst_low_luid = DF_INSN_LUID (i1);
-      newpat = subst (newpat, i1dest, i1src, 0, 0);
+      newpat = subst (newpat, i1dest, i1src, 0,
+		      i0_feeds_i1_n && i0dest_in_i0src);
       substed_i1 = 1;
+      i1_is_used = n_occurrences;
+    }
+  if (i0 && GET_CODE (newpat) != CLOBBER)
+    {
+      if ((FIND_REG_INC_NOTE (i0, NULL_RTX) != 0
+	   && ((i0_feeds_i2_n && dead_or_set_p (i2, i0dest))
+	       || (i0_feeds_i1_n && dead_or_set_p (i1, i0dest)))
+	   && !reg_overlap_mentioned_p (i0dest, newpat))
+          || !combinable_i3pat (NULL_RTX, &newpat, i0dest, NULL_RTX, NULL_RTX,
+				0, 0, 0))
+	{
+	  undo_all ();
+	  return 0;
+	}
+
+      n_occurrences = 0;
+      subst_low_luid = DF_INSN_LUID (i1);
+      newpat = subst (newpat, i0dest, i0src, 0,
+		      i0_feeds_i1_n && i0dest_in_i0src);
+      substed_i0 = 1;
     }
 
   /* Fail if an autoincrement side-effect has been duplicated.  Be careful
@@ -2934,7 +3115,12 @@  try_combine (rtx i3, rtx i2, rtx i1, int
   if ((FIND_REG_INC_NOTE (i2, NULL_RTX) != 0
        && i2_is_used + added_sets_2 > 1)
       || (i1 != 0 && FIND_REG_INC_NOTE (i1, NULL_RTX) != 0
-	  && (n_occurrences + added_sets_1 + (added_sets_2 && ! i1_feeds_i3)
+	  && (i1_is_used + added_sets_1 + (added_sets_2 && i1_feeds_i2_n)
+	      > 1))
+      || (i0 != 0 && FIND_REG_INC_NOTE (i0, NULL_RTX) != 0
+	  && (n_occurrences + added_sets_0
+	      + (added_sets_1 && i0_feeds_i1_n)
+	      + (added_sets_2 && i0_feeds_i2_n)
 	      > 1))
       /* Fail if we tried to make a new register.  */
       || max_reg_num () != maxreg
@@ -2954,14 +3140,15 @@  try_combine (rtx i3, rtx i2, rtx i1, int
      we must make a new PARALLEL for the latest insn
      to hold additional the SETs.  */
 
-  if (added_sets_1 || added_sets_2)
+  if (added_sets_0 || added_sets_1 || added_sets_2)
     {
+      int extra_sets = added_sets_0 + added_sets_1 + added_sets_2;
       combine_extras++;
 
       if (GET_CODE (newpat) == PARALLEL)
 	{
 	  rtvec old = XVEC (newpat, 0);
-	  total_sets = XVECLEN (newpat, 0) + added_sets_1 + added_sets_2;
+	  total_sets = XVECLEN (newpat, 0) + extra_sets;
 	  newpat = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (total_sets));
 	  memcpy (XVEC (newpat, 0)->elem, &old->elem[0],
 		  sizeof (old->elem[0]) * old->num_elem);
@@ -2969,25 +3156,31 @@  try_combine (rtx i3, rtx i2, rtx i1, int
       else
 	{
 	  rtx old = newpat;
-	  total_sets = 1 + added_sets_1 + added_sets_2;
+	  total_sets = 1 + extra_sets;
 	  newpat = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (total_sets));
 	  XVECEXP (newpat, 0, 0) = old;
 	}
 
+      if (added_sets_0)
+	XVECEXP (newpat, 0, --total_sets) = i0pat;
+
       if (added_sets_1)
-	XVECEXP (newpat, 0, --total_sets) = i1pat;
+	{
+	  rtx t = i1pat;
+	  if (i0_feeds_i1_n)
+	    t = subst (t, i0dest, i0src, 0, 0);
 
+	  XVECEXP (newpat, 0, --total_sets) = t;
+	}
       if (added_sets_2)
 	{
-	  /* If there is no I1, use I2's body as is.  We used to also not do
-	     the subst call below if I2 was substituted into I3,
-	     but that could lose a simplification.  */
-	  if (i1 == 0)
-	    XVECEXP (newpat, 0, --total_sets) = i2pat;
-	  else
-	    /* See comment where i2pat is assigned.  */
-	    XVECEXP (newpat, 0, --total_sets)
-	      = subst (i2pat, i1dest, i1src, 0, 0);
+	  rtx t = i2pat;
+	  if (i0_feeds_i2_n)
+	    t = subst (t, i0dest, i0src, 0, 0);
+	  if (i1_feeds_i2_n)
+	    t = subst (t, i1dest, i1src, 0, 0);
+
+	  XVECEXP (newpat, 0, --total_sets) = t;
 	}
     }
 
@@ -3543,7 +3736,7 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 
   /* Only allow this combination if insn_rtx_costs reports that the
      replacement instructions are cheaper than the originals.  */
-  if (!combine_validate_cost (i1, i2, i3, newpat, newi2pat, other_pat))
+  if (!combine_validate_cost (i0, i1, i2, i3, newpat, newi2pat, other_pat))
     {
       undo_all ();
       return 0;
@@ -3642,7 +3835,8 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 	}
 
       distribute_notes (new_other_notes, undobuf.other_insn,
-			undobuf.other_insn, NULL_RTX, NULL_RTX, NULL_RTX);
+			undobuf.other_insn, NULL_RTX, NULL_RTX, NULL_RTX,
+			NULL_RTX);
     }
 
   if (swap_i2i3)
@@ -3689,21 +3883,26 @@  try_combine (rtx i3, rtx i2, rtx i1, int
     }
 
   {
-    rtx i3notes, i2notes, i1notes = 0;
-    rtx i3links, i2links, i1links = 0;
+    rtx i3notes, i2notes, i1notes = 0, i0notes = 0;
+    rtx i3links, i2links, i1links = 0, i0links = 0;
     rtx midnotes = 0;
+    int from_luid;
     unsigned int regno;
     /* Compute which registers we expect to eliminate.  newi2pat may be setting
        either i3dest or i2dest, so we must check it.  Also, i1dest may be the
        same as i3dest, in which case newi2pat may be setting i1dest.  */
     rtx elim_i2 = ((newi2pat && reg_set_p (i2dest, newi2pat))
-		   || i2dest_in_i2src || i2dest_in_i1src
+		   || i2dest_in_i2src || i2dest_in_i1src || i2dest_in_i0src
 		   || !i2dest_killed
 		   ? 0 : i2dest);
-    rtx elim_i1 = (i1 == 0 || i1dest_in_i1src
+    rtx elim_i1 = (i1 == 0 || i1dest_in_i1src || i1dest_in_i0src
 		   || (newi2pat && reg_set_p (i1dest, newi2pat))
 		   || !i1dest_killed
 		   ? 0 : i1dest);
+    rtx elim_i0 = (i0 == 0 || i0dest_in_i0src
+		   || (newi2pat && reg_set_p (i0dest, newi2pat))
+		   || !i0dest_killed
+		   ? 0 : i0dest);
 
     /* Get the old REG_NOTES and LOG_LINKS from all our insns and
        clear them.  */
@@ -3711,6 +3910,8 @@  try_combine (rtx i3, rtx i2, rtx i1, int
     i2notes = REG_NOTES (i2), i2links = LOG_LINKS (i2);
     if (i1)
       i1notes = REG_NOTES (i1), i1links = LOG_LINKS (i1);
+    if (i0)
+      i0notes = REG_NOTES (i0), i0links = LOG_LINKS (i0);
 
     /* Ensure that we do not have something that should not be shared but
        occurs multiple times in the new insns.  Check this by first
@@ -3719,6 +3920,7 @@  try_combine (rtx i3, rtx i2, rtx i1, int
     reset_used_flags (i3notes);
     reset_used_flags (i2notes);
     reset_used_flags (i1notes);
+    reset_used_flags (i0notes);
     reset_used_flags (newpat);
     reset_used_flags (newi2pat);
     if (undobuf.other_insn)
@@ -3727,6 +3929,7 @@  try_combine (rtx i3, rtx i2, rtx i1, int
     i3notes = copy_rtx_if_shared (i3notes);
     i2notes = copy_rtx_if_shared (i2notes);
     i1notes = copy_rtx_if_shared (i1notes);
+    i0notes = copy_rtx_if_shared (i0notes);
     newpat = copy_rtx_if_shared (newpat);
     newi2pat = copy_rtx_if_shared (newi2pat);
     if (undobuf.other_insn)
@@ -3753,6 +3956,8 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 
 	if (substed_i1)
 	  replace_rtx (call_usage, i1dest, i1src);
+	if (substed_i0)
+	  replace_rtx (call_usage, i0dest, i0src);
 
 	CALL_INSN_FUNCTION_USAGE (i3) = call_usage;
       }
@@ -3827,43 +4032,58 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 	SET_INSN_DELETED (i1);
       }
 
+    if (i0)
+      {
+	LOG_LINKS (i0) = 0;
+	REG_NOTES (i0) = 0;
+	if (MAY_HAVE_DEBUG_INSNS)
+	  propagate_for_debug (i0, i3, i0dest, i0src, false);
+	SET_INSN_DELETED (i0);
+      }
+
     /* Get death notes for everything that is now used in either I3 or
        I2 and used to die in a previous insn.  If we built two new
        patterns, move from I1 to I2 then I2 to I3 so that we get the
        proper movement on registers that I2 modifies.  */
 
-    if (newi2pat)
-      {
-	move_deaths (newi2pat, NULL_RTX, DF_INSN_LUID (i1), i2, &midnotes);
-	move_deaths (newpat, newi2pat, DF_INSN_LUID (i1), i3, &midnotes);
-      }
+    if (i0)
+      from_luid = DF_INSN_LUID (i0);
+    else if (i1)
+      from_luid = DF_INSN_LUID (i1);
     else
-      move_deaths (newpat, NULL_RTX, i1 ? DF_INSN_LUID (i1) : DF_INSN_LUID (i2),
-		   i3, &midnotes);
+      from_luid = DF_INSN_LUID (i2);
+    if (newi2pat)
+      move_deaths (newi2pat, NULL_RTX, from_luid, i2, &midnotes);
+    move_deaths (newpat, newi2pat, from_luid, i3, &midnotes);
 
     /* Distribute all the LOG_LINKS and REG_NOTES from I1, I2, and I3.  */
     if (i3notes)
       distribute_notes (i3notes, i3, i3, newi2pat ? i2 : NULL_RTX,
-			elim_i2, elim_i1);
+			elim_i2, elim_i1, elim_i0);
     if (i2notes)
       distribute_notes (i2notes, i2, i3, newi2pat ? i2 : NULL_RTX,
-			elim_i2, elim_i1);
+			elim_i2, elim_i1, elim_i0);
     if (i1notes)
       distribute_notes (i1notes, i1, i3, newi2pat ? i2 : NULL_RTX,
-			elim_i2, elim_i1);
+			elim_i2, elim_i1, elim_i0);
+    if (i0notes)
+      distribute_notes (i0notes, i0, i3, newi2pat ? i2 : NULL_RTX,
+			elim_i2, elim_i1, elim_i0);
     if (midnotes)
       distribute_notes (midnotes, NULL_RTX, i3, newi2pat ? i2 : NULL_RTX,
-			elim_i2, elim_i1);
+			elim_i2, elim_i1, elim_i0);
 
     /* Distribute any notes added to I2 or I3 by recog_for_combine.  We
        know these are REG_UNUSED and want them to go to the desired insn,
        so we always pass it as i3.  */
 
     if (newi2pat && new_i2_notes)
-      distribute_notes (new_i2_notes, i2, i2, NULL_RTX, NULL_RTX, NULL_RTX);
+      distribute_notes (new_i2_notes, i2, i2, NULL_RTX, NULL_RTX, NULL_RTX,
+			NULL_RTX);
 
     if (new_i3_notes)
-      distribute_notes (new_i3_notes, i3, i3, NULL_RTX, NULL_RTX, NULL_RTX);
+      distribute_notes (new_i3_notes, i3, i3, NULL_RTX, NULL_RTX, NULL_RTX,
+			NULL_RTX);
 
     /* If I3DEST was used in I3SRC, it really died in I3.  We may need to
        put a REG_DEAD note for it somewhere.  If NEWI2PAT exists and sets
@@ -3877,39 +4097,51 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 	if (newi2pat && reg_set_p (i3dest_killed, newi2pat))
 	  distribute_notes (alloc_reg_note (REG_DEAD, i3dest_killed,
 					    NULL_RTX),
-			    NULL_RTX, i2, NULL_RTX, elim_i2, elim_i1);
+			    NULL_RTX, i2, NULL_RTX, elim_i2, elim_i1, elim_i0);
 	else
 	  distribute_notes (alloc_reg_note (REG_DEAD, i3dest_killed,
 					    NULL_RTX),
 			    NULL_RTX, i3, newi2pat ? i2 : NULL_RTX,
-			    elim_i2, elim_i1);
+			    elim_i2, elim_i1, elim_i0);
       }
 
     if (i2dest_in_i2src)
       {
+	rtx new_note = alloc_reg_note (REG_DEAD, i2dest, NULL_RTX);
 	if (newi2pat && reg_set_p (i2dest, newi2pat))
-	  distribute_notes (alloc_reg_note (REG_DEAD, i2dest, NULL_RTX),
-			    NULL_RTX, i2, NULL_RTX, NULL_RTX, NULL_RTX);
-	else
-	  distribute_notes (alloc_reg_note (REG_DEAD, i2dest, NULL_RTX),
-			    NULL_RTX, i3, newi2pat ? i2 : NULL_RTX,
+	  distribute_notes (new_note,  NULL_RTX, i2, NULL_RTX, NULL_RTX,
 			    NULL_RTX, NULL_RTX);
+	else
+	  distribute_notes (new_note, NULL_RTX, i3, newi2pat ? i2 : NULL_RTX,
+			    NULL_RTX, NULL_RTX, NULL_RTX);
       }
 
     if (i1dest_in_i1src)
       {
+	rtx new_note = alloc_reg_note (REG_DEAD, i1dest, NULL_RTX);
 	if (newi2pat && reg_set_p (i1dest, newi2pat))
-	  distribute_notes (alloc_reg_note (REG_DEAD, i1dest, NULL_RTX),
-			    NULL_RTX, i2, NULL_RTX, NULL_RTX, NULL_RTX);
+	  distribute_notes (new_note, NULL_RTX, i2, NULL_RTX, NULL_RTX,
+			    NULL_RTX, NULL_RTX);
 	else
-	  distribute_notes (alloc_reg_note (REG_DEAD, i1dest, NULL_RTX),
-			    NULL_RTX, i3, newi2pat ? i2 : NULL_RTX,
+	  distribute_notes (new_note, NULL_RTX, i3, newi2pat ? i2 : NULL_RTX,
+			    NULL_RTX, NULL_RTX, NULL_RTX);
+      }
+
+    if (i0dest_in_i0src)
+      {
+	rtx new_note = alloc_reg_note (REG_DEAD, i0dest, NULL_RTX);
+	if (newi2pat && reg_set_p (i0dest, newi2pat))
+	  distribute_notes (new_note, NULL_RTX, i2, NULL_RTX, NULL_RTX,
 			    NULL_RTX, NULL_RTX);
+	else
+	  distribute_notes (new_note, NULL_RTX, i3, newi2pat ? i2 : NULL_RTX,
+			    NULL_RTX, NULL_RTX, NULL_RTX);
       }
 
     distribute_links (i3links);
     distribute_links (i2links);
     distribute_links (i1links);
+    distribute_links (i0links);
 
     if (REG_P (i2dest))
       {
@@ -3959,6 +4191,23 @@  try_combine (rtx i3, rtx i2, rtx i1, int
 	  INC_REG_N_SETS (regno, -1);
       }
 
+    if (i0 && REG_P (i0dest))
+      {
+	rtx link;
+	rtx i0_insn = 0, i0_val = 0, set;
+
+	for (link = LOG_LINKS (i3); link; link = XEXP (link, 1))
+	  if ((set = single_set (XEXP (link, 0))) != 0
+	      && rtx_equal_p (i0dest, SET_DEST (set)))
+	    i0_insn = XEXP (link, 0), i0_val = SET_SRC (set);
+
+	record_value_for_reg (i0dest, i0_insn, i0_val);
+
+	regno = REGNO (i0dest);
+	if (! added_sets_0 && ! i0dest_in_i0src)
+	  INC_REG_N_SETS (regno, -1);
+      }
+
     /* Update reg_stat[].nonzero_bits et al for any changes that may have
        been made to this insn.  The order of
        set_nonzero_bits_and_sign_copies() is important.  Because newi2pat
@@ -3978,6 +4227,16 @@  try_combine (rtx i3, rtx i2, rtx i1, int
       df_insn_rescan (undobuf.other_insn);
     }
 
+  if (i0 && !(NOTE_P(i0) && (NOTE_KIND (i0) == NOTE_INSN_DELETED)))
+    {
+      if (dump_file)
+	{
+	  fprintf (dump_file, "modifying insn i1 ");
+	  dump_insn_slim (dump_file, i0);
+	}
+      df_insn_rescan (i0);
+    }
+
   if (i1 && !(NOTE_P(i1) && (NOTE_KIND (i1) == NOTE_INSN_DELETED)))
     {
       if (dump_file)
@@ -12668,7 +12927,7 @@  reg_bitfield_target_p (rtx x, rtx body)
 
 static void
 distribute_notes (rtx notes, rtx from_insn, rtx i3, rtx i2, rtx elim_i2,
-		  rtx elim_i1)
+		  rtx elim_i1, rtx elim_i0)
 {
   rtx note, next_note;
   rtx tem;
@@ -12914,7 +13173,8 @@  distribute_notes (rtx notes, rtx from_in
 			&& !(i2mod
 			     && reg_overlap_mentioned_p (XEXP (note, 0),
 							 i2mod_old_rhs)))
-		       || rtx_equal_p (XEXP (note, 0), elim_i1))
+		       || rtx_equal_p (XEXP (note, 0), elim_i1)
+		       || rtx_equal_p (XEXP (note, 0), elim_i0))
 		break;
 	      tem = i3;
 	    }
@@ -12981,7 +13241,7 @@  distribute_notes (rtx notes, rtx from_in
 			  REG_NOTES (tem) = NULL;
 
 			  distribute_notes (old_notes, tem, tem, NULL_RTX,
-					    NULL_RTX, NULL_RTX);
+					    NULL_RTX, NULL_RTX, NULL_RTX);
 			  distribute_links (LOG_LINKS (tem));
 
 			  SET_INSN_DELETED (tem);
@@ -12998,7 +13258,7 @@  distribute_notes (rtx notes, rtx from_in
 
 			      distribute_notes (old_notes, cc0_setter,
 						cc0_setter, NULL_RTX,
-						NULL_RTX, NULL_RTX);
+						NULL_RTX, NULL_RTX, NULL_RTX);
 			      distribute_links (LOG_LINKS (cc0_setter));
 
 			      SET_INSN_DELETED (cc0_setter);
@@ -13118,7 +13378,8 @@  distribute_notes (rtx notes, rtx from_in
 							     NULL_RTX);
 
 			      distribute_notes (new_note, place, place,
-						NULL_RTX, NULL_RTX, NULL_RTX);
+						NULL_RTX, NULL_RTX, NULL_RTX,
+						NULL_RTX);
 			    }
 			  else if (! refers_to_regno_p (i, i + 1,
 							PATTERN (place), 0)