diff mbox

[Patch,AVR] : Improve loading of 32-bit constants

Message ID 4E144C61.60600@gjlay.de
State New
Headers show

Commit Message

Georg-Johann Lay July 6, 2011, 11:52 a.m. UTC
For loading a 32-bit constant in a register, there is room for
improvement:

* SF can be handled the same way as SI and therefore the patch
  adds a peep2 to produce a *reload_insf analogous to *reload_insi.

* If the destination register overlaps NO_LD_REGS, values already
  loaded into some other byte can be reused by a simple MOV.
  This is helpful when moving values like, e.g., -2, -100 etc. because
  all high bytes are 0xff.

* 0.0f can be directly moved to memory.

* The mov insns contain "!d" constraint. I see no reason to make "d"
  expensive and discourage use of d-regs.  A "*d" to hide it is better
  because it neither puts additional pressure on "d" nor
  discourages "d".

The patch is basically a rewrite of output_reload_insisf.

Tested without regressions.

Ok to commit?

Johann

	* config/avr/avr.md (*reload_insi): Change predicate #1 to
	const_int_operand.  Ditto for peep2 producing this insn.
	Add argument to output_reload_insisf call.
	(*movsi,*movsf): Add argument to output_movsisf call.  Change "!d"
	constraint to "*d".
	(*reload_insf): New insn and new peep2 to produce it.
	* config/avr/avr-protos.h (output_movsisf): Change prototype.
	(output_reload_insisf): Change prototype.
	* config/avr/avr.c (avr_asm_len): New function.
	(output_reload_insisf): Rewrite.
	(output_movsisf): Change prototype.  Call output_reload_insisf for
	all CONST_INT and CONST_DOUBLE.  Allow moving 0.0f to memory.
	(adjust_insn_length): Add argument to output_movsisf and
	output_reload_insisf call.

Comments

Denis Chertykov July 6, 2011, 1:49 p.m. UTC | #1
2011/7/6 Georg-Johann Lay <avr@gjlay.de>:
> For loading a 32-bit constant in a register, there is room for
> improvement:
>
> * SF can be handled the same way as SI and therefore the patch
>  adds a peep2 to produce a *reload_insf analogon to *reload_insi.
>
> * If the destination register overlaps NO_LD_REGS, values already
>  loaded into some other byte can be reused by a simple MOV.
>  This is helpful then moving values like, e.g. -2, -100 etc. because
>  all high bytes are 0xff.
>
> * 0.0f can be directly moved to memory.
>
> * The mov insns contain "!d" constraint. I see no reason to make "d"
>  expensive and discourage use of d-regs.  A "*d" to hide is better
>  because it does it neither puts additional pressure on "d" nor
>  discourages "d".
>

I would like to have a real code examples.

Denis.
Georg-Johann Lay July 6, 2011, 3:56 p.m. UTC | #2
Denis Chertykov wrote:
> 2011/7/6 Georg-Johann Lay <avr@gjlay.de>:
>> For loading a 32-bit constant in a register, there is room for
>> improvement:
>>
>> * SF can be handled the same way as SI and therefore the patch
>>  adds a peep2 to produce a *reload_insf analogon to *reload_insi.
>>
>> * If the destination register overlaps NO_LD_REGS, values already
>>  loaded into some other byte can be reused by a simple MOV.
>>  This is helpful then moving values like, e.g. -2, -100 etc. because
>>  all high bytes are 0xff.
>>
>> * 0.0f can be directly moved to memory.
>>
>> * The mov insns contain "!d" constraint. I see no reason to make "d"
>>  expensive and discourage use of d-regs.  A "*d" to hide is better
>>  because it does it neither puts additional pressure on "d" nor
>>  discourages "d".
>>
> 
> I would like to have a real code examples.
> 
> Denis.

Hi Denis.

Attached you find a small C file and the asm that is generated by new
and old versions (-Os -mmcu=atmega88 -S -dp).

I took away some regs as potential clobbers (or -fno-peephole2) to
show the effect of high register pressure.  But even if a clobber was
available you can see that the new version is smarter in reusing
values, e.g. note the loading of -1L to r22-r25.

Johann
.file	"oint.c"
__SREG__ = 0x3f
__SP_H__ = 0x3e
__SP_L__ = 0x3d
__tmp_reg__ = 0
__zero_reg__ = 1
	.global __do_copy_data
	.global __do_clear_bss
	.text
.global	foo1
	.type	foo1, @function
foo1:
	push r10	 ;  16	*pushqi/1	[length = 1]
	push r11	 ;  17	*pushqi/1	[length = 1]
	push r12	 ;  18	*pushqi/1	[length = 1]
	push r13	 ;  19	*pushqi/1	[length = 1]
	push r14	 ;  20	*pushqi/1	[length = 1]
	push r15	 ;  21	*pushqi/1	[length = 1]
	push r16	 ;  22	*pushqi/1	[length = 1]
	push r17	 ;  23	*pushqi/1	[length = 1]
/* prologue: function */
/* frame size = 0 */
/* stack size = 8 */
.L__stack_usage = 8
	movw r18,r22	 ;  2	*movsi/1	[length = 2]
	movw r20,r24
	ldi r22,lo8(-1)	 ;  7	*movsi/5	[length = 4]
	ldi r23,hi8(-1)
	ldi r24,hlo8(-1)
	ldi r25,hhi8(-1)
	mov __tmp_reg__,r31	 ;  9	*movsi/6	[length = 10]
	ldi r31,lo8(-2)
	mov r14,r31
	ldi r31,hi8(-2)
	mov r15,r31
	ldi r31,hlo8(-2)
	mov r16,r31
	ldi r31,hhi8(-2)
	mov r17,r31
	mov r31,__tmp_reg__
	mov __tmp_reg__,r31	 ;  10	*movsi/6	[length = 10]
	ldi r31,lo8(-16744448)
	mov r10,r31
	ldi r31,hi8(-16744448)
	mov r11,r31
	ldi r31,hlo8(-16744448)
	mov r12,r31
	ldi r31,hhi8(-16744448)
	mov r13,r31
	mov r31,__tmp_reg__
	rcall ibar	 ;  11	call_insn/3	[length = 1]
/* epilogue start */
	pop r17	 ;  26	popqi	[length = 1]
	pop r16	 ;  27	popqi	[length = 1]
	pop r15	 ;  28	popqi	[length = 1]
	pop r14	 ;  29	popqi	[length = 1]
	pop r13	 ;  30	popqi	[length = 1]
	pop r12	 ;  31	popqi	[length = 1]
	pop r11	 ;  32	popqi	[length = 1]
	pop r10	 ;  33	popqi	[length = 1]
	ret	 ;  34	return_from_epilogue	[length = 1]
	.size	foo1, .-foo1
.global	foo2
	.type	foo2, @function
foo2:
	push r10	 ;  16	*pushqi/1	[length = 1]
	push r11	 ;  17	*pushqi/1	[length = 1]
	push r12	 ;  18	*pushqi/1	[length = 1]
	push r13	 ;  19	*pushqi/1	[length = 1]
	push r14	 ;  20	*pushqi/1	[length = 1]
	push r15	 ;  21	*pushqi/1	[length = 1]
	push r16	 ;  22	*pushqi/1	[length = 1]
	push r17	 ;  23	*pushqi/1	[length = 1]
/* prologue: function */
/* frame size = 0 */
/* stack size = 8 */
.L__stack_usage = 8
	movw r18,r22	 ;  2	*movsi/1	[length = 2]
	movw r20,r24
	mov __tmp_reg__,r31	 ;  9	*movsi/6	[length = 10]
	ldi r31,lo8(65537)
	mov r14,r31
	ldi r31,hi8(65537)
	mov r15,r31
	ldi r31,hlo8(65537)
	mov r16,r31
	ldi r31,hhi8(65537)
	mov r17,r31
	mov r31,__tmp_reg__
	mov __tmp_reg__,r31	 ;  10	*movsi/6	[length = 10]
	ldi r31,lo8(-64504)
	mov r10,r31
	ldi r31,hi8(-64504)
	mov r11,r31
	ldi r31,hlo8(-64504)
	mov r12,r31
	ldi r31,hhi8(-64504)
	mov r13,r31
	mov r31,__tmp_reg__
	rcall ibar	 ;  11	call_insn/3	[length = 1]
/* epilogue start */
	pop r17	 ;  26	popqi	[length = 1]
	pop r16	 ;  27	popqi	[length = 1]
	pop r15	 ;  28	popqi	[length = 1]
	pop r14	 ;  29	popqi	[length = 1]
	pop r13	 ;  30	popqi	[length = 1]
	pop r12	 ;  31	popqi	[length = 1]
	pop r11	 ;  32	popqi	[length = 1]
	pop r10	 ;  33	popqi	[length = 1]
	ret	 ;  34	return_from_epilogue	[length = 1]
	.size	foo2, .-foo2
.global	foo3
	.type	foo3, @function
foo3:
	push r10	 ;  16	*pushqi/1	[length = 1]
	push r11	 ;  17	*pushqi/1	[length = 1]
	push r12	 ;  18	*pushqi/1	[length = 1]
	push r13	 ;  19	*pushqi/1	[length = 1]
	push r14	 ;  20	*pushqi/1	[length = 1]
	push r15	 ;  21	*pushqi/1	[length = 1]
	push r16	 ;  22	*pushqi/1	[length = 1]
	push r17	 ;  23	*pushqi/1	[length = 1]
/* prologue: function */
/* frame size = 0 */
/* stack size = 8 */
.L__stack_usage = 8
	movw r18,r22	 ;  2	*movsi/1	[length = 2]
	movw r20,r24
	mov __tmp_reg__,r31	 ;  9	*movsf/6	[length = 10]
	ldi r31,lo8(0xc0400000)
	mov r14,r31
	ldi r31,hi8(0xc0400000)
	mov r15,r31
	ldi r31,hlo8(0xc0400000)
	mov r16,r31
	ldi r31,hhi8(0xc0400000)
	mov r17,r31
	mov r31,__tmp_reg__
	mov __tmp_reg__,r31	 ;  10	*movsf/6	[length = 10]
	ldi r31,lo8(0x40000000)
	mov r10,r31
	ldi r31,hi8(0x40000000)
	mov r11,r31
	ldi r31,hlo8(0x40000000)
	mov r12,r31
	ldi r31,hhi8(0x40000000)
	mov r13,r31
	mov r31,__tmp_reg__
	rcall fbar	 ;  11	call_insn/3	[length = 1]
/* epilogue start */
	pop r17	 ;  26	popqi	[length = 1]
	pop r16	 ;  27	popqi	[length = 1]
	pop r15	 ;  28	popqi	[length = 1]
	pop r14	 ;  29	popqi	[length = 1]
	pop r13	 ;  30	popqi	[length = 1]
	pop r12	 ;  31	popqi	[length = 1]
	pop r11	 ;  32	popqi	[length = 1]
	pop r10	 ;  33	popqi	[length = 1]
	ret	 ;  34	return_from_epilogue	[length = 1]
	.size	foo3, .-foo3
.file	"oint.c"
__SREG__ = 0x3f
__SP_H__ = 0x3e
__SP_L__ = 0x3d
__tmp_reg__ = 0
__zero_reg__ = 1
	.text
.global	foo1
	.type	foo1, @function
foo1:
	push r10	 ;  16	*pushqi/1	[length = 1]
	push r11	 ;  17	*pushqi/1	[length = 1]
	push r12	 ;  18	*pushqi/1	[length = 1]
	push r13	 ;  19	*pushqi/1	[length = 1]
	push r14	 ;  20	*pushqi/1	[length = 1]
	push r15	 ;  21	*pushqi/1	[length = 1]
	push r16	 ;  22	*pushqi/1	[length = 1]
	push r17	 ;  23	*pushqi/1	[length = 1]
/* prologue: function */
/* frame size = 0 */
/* stack size = 8 */
.L__stack_usage = 8
	movw r18,r22	 ;  2	*movsi/1	[length = 2]
	movw r20,r24
	ldi r22,lo8(-1)	 ;  7	*movsi/5	[length = 3]
	ldi r23,lo8(-1)
	movw r24,r22
	ldi r17,lo8(-2)	 ;  9	*movsi/6	[length = 6]
	mov r14,r17
	clr r15
	dec r15
	ldi r16,lo8(-1)
	ldi r17,lo8(-1)
	clr r10	 ;  10	*movsi/6	[length = 7]
	set
	clr r11
	bld r11,7
	clr r12
	clr r13
	dec r13
	rcall ibar	 ;  11	*call_insn/2	[length = 1]
/* epilogue start */
	pop r17	 ;  26	popqi	[length = 1]
	pop r16	 ;  27	popqi	[length = 1]
	pop r15	 ;  28	popqi	[length = 1]
	pop r14	 ;  29	popqi	[length = 1]
	pop r13	 ;  30	popqi	[length = 1]
	pop r12	 ;  31	popqi	[length = 1]
	pop r11	 ;  32	popqi	[length = 1]
	pop r10	 ;  33	popqi	[length = 1]
	ret	 ;  34	return_from_epilogue	[length = 1]
	.size	foo1, .-foo1
.global	foo2
	.type	foo2, @function
foo2:
	push r10	 ;  16	*pushqi/1	[length = 1]
	push r11	 ;  17	*pushqi/1	[length = 1]
	push r12	 ;  18	*pushqi/1	[length = 1]
	push r13	 ;  19	*pushqi/1	[length = 1]
	push r14	 ;  20	*pushqi/1	[length = 1]
	push r15	 ;  21	*pushqi/1	[length = 1]
	push r16	 ;  22	*pushqi/1	[length = 1]
	push r17	 ;  23	*pushqi/1	[length = 1]
/* prologue: function */
/* frame size = 0 */
/* stack size = 8 */
.L__stack_usage = 8
	movw r18,r22	 ;  2	*movsi/1	[length = 2]
	movw r20,r24
	ldi r17,lo8(1)	 ;  9	*movsi/6	[length = 4]
	mov r14,r17
	clr r15
	movw r16,r14
	set	 ;  10	*movsi/6	[length = 8]
	clr r10
	bld r10,3
	clr r11
	bld r11,2
	clr r12
	dec r12
	mov r13,r12
	rcall ibar	 ;  11	*call_insn/2	[length = 1]
/* epilogue start */
	pop r17	 ;  26	popqi	[length = 1]
	pop r16	 ;  27	popqi	[length = 1]
	pop r15	 ;  28	popqi	[length = 1]
	pop r14	 ;  29	popqi	[length = 1]
	pop r13	 ;  30	popqi	[length = 1]
	pop r12	 ;  31	popqi	[length = 1]
	pop r11	 ;  32	popqi	[length = 1]
	pop r10	 ;  33	popqi	[length = 1]
	ret	 ;  34	return_from_epilogue	[length = 1]
	.size	foo2, .-foo2
.global	foo3
	.type	foo3, @function
foo3:
	push r10	 ;  16	*pushqi/1	[length = 1]
	push r11	 ;  17	*pushqi/1	[length = 1]
	push r12	 ;  18	*pushqi/1	[length = 1]
	push r13	 ;  19	*pushqi/1	[length = 1]
	push r14	 ;  20	*pushqi/1	[length = 1]
	push r15	 ;  21	*pushqi/1	[length = 1]
	push r16	 ;  22	*pushqi/1	[length = 1]
	push r17	 ;  23	*pushqi/1	[length = 1]
/* prologue: function */
/* frame size = 0 */
/* stack size = 8 */
.L__stack_usage = 8
	movw r18,r22	 ;  2	*movsi/1	[length = 2]
	movw r20,r24
	clr r14	 ;  9	*movsf/6	[length = 4]
	clr r15
	ldi r16,lo8(64)
	ldi r17,lo8(-64)
	clr r10	 ;  10	*movsf/6	[length = 6]
	clr r11
	clr r12
	set
	clr r13
	bld r13,6
	rcall fbar	 ;  11	*call_insn/2	[length = 1]
/* epilogue start */
	pop r17	 ;  26	popqi	[length = 1]
	pop r16	 ;  27	popqi	[length = 1]
	pop r15	 ;  28	popqi	[length = 1]
	pop r14	 ;  29	popqi	[length = 1]
	pop r13	 ;  30	popqi	[length = 1]
	pop r12	 ;  31	popqi	[length = 1]
	pop r11	 ;  32	popqi	[length = 1]
	pop r10	 ;  33	popqi	[length = 1]
	ret	 ;  34	return_from_epilogue	[length = 1]
	.size	foo3, .-foo3
	.ident	"GCC: (GNU) 4.7.0 20110704 (experimental)"
Denis Chertykov July 6, 2011, 5:17 p.m. UTC | #3
2011/7/6 Georg-Johann Lay <avr@gjlay.de>:
> Denis Chertykov wrote:
>> 2011/7/6 Georg-Johann Lay <avr@gjlay.de>:
>>> For loading a 32-bit constant in a register, there is room for
>>> improvement:
>>>
>>> * SF can be handled the same way as SI and therefore the patch
>>>  adds a peep2 to produce a *reload_insf analogon to *reload_insi.
>>>
>>> * If the destination register overlaps NO_LD_REGS, values already
>>>  loaded into some other byte can be reused by a simple MOV.
>>>  This is helpful then moving values like, e.g. -2, -100 etc. because
>>>  all high bytes are 0xff.
>>>
>>> * 0.0f can be directly moved to memory.
>>>
>>> * The mov insns contain "!d" constraint. I see no reason to make "d"
>>>  expensive and discourage use of d-regs.  A "*d" to hide is better
>>>  because it does it neither puts additional pressure on "d" nor
>>>  discourages "d".
>>>
>>
>> I would like to have a real code examples.
>>
>> Denis.
>
> Hi Denis.
>
> Attached you find a small C file and the asm that is generated by new
> and old versions (-Os -mmcu=atmega88 -S -dp).
>
> I took away some regs as potential clobbers (or -fno-peephole2) to
> show the effect of high register pressure.  Bit even if a clobber was
> available you can see that the new version is smarter in reusing
> values, e.g. note the loading of -1L to r22-r25.

I have asked about example of *d instead of !d.
Just svn GCC with *d vs svn GCC !d.


Denis.
Georg-Johann Lay July 6, 2011, 5:55 p.m. UTC | #4
Denis Chertykov wrote:
> 2011/7/6 Georg-Johann Lay <avr@gjlay.de>:
>> Denis Chertykov wrote:
>>> 2011/7/6 Georg-Johann Lay <avr@gjlay.de>:
>>>> For loading a 32-bit constant in a register, there is room for
>>>> improvement:
>>>>
>>>> * SF can be handled the same way as SI and therefore the patch
>>>>  adds a peep2 to produce a *reload_insf analogon to *reload_insi.
>>>>
>>>> * If the destination register overlaps NO_LD_REGS, values already
>>>>  loaded into some other byte can be reused by a simple MOV.
>>>>  This is helpful then moving values like, e.g. -2, -100 etc. because
>>>>  all high bytes are 0xff.
>>>>
>>>> * 0.0f can be directly moved to memory.
>>>>
>>>> * The mov insns contain "!d" constraint. I see no reason to make "d"
>>>>  expensive and discourage use of d-regs.  A "*d" to hide is better
>>>>  because it does it neither puts additional pressure on "d" nor
>>>>  discourages "d".
>>>>
>>> I would like to have a real code examples.
>>>
>>> Denis.
>> Hi Denis.
>>
>> Attached you find a small C file and the asm that is generated by new
>> and old versions (-Os -mmcu=atmega88 -S -dp).
>>
>> I took away some regs as potential clobbers (or -fno-peephole2) to
>> show the effect of high register pressure.  Bit even if a clobber was
>> available you can see that the new version is smarter in reusing
>> values, e.g. note the loading of -1L to r22-r25.
> 
> I have asked about example of *d instead of !d.
> Just svn GCC with *d vs svn GCC !d.
> 
> 
> Denis.

Ah, I couldn't deduce that from your question.

I thought it could help in cases like these:

long z;

void inc (long y)
{
    z += y;
}

that gets compiled with -Os to

inc:
	push r16
	push r17
/* prologue: function */
/* frame size = 0 */
/* stack size = 2 */
.L__stack_usage = 2
	lds r16,z
	lds r17,z+1
	lds r18,z+2
	lds r19,z+3
	add r16,r22
	adc r17,r23
	adc r18,r24
	adc r19,r25
	sts z,r16
	sts z+1,r17
	sts z+2,r18
	sts z+3,r19
/* epilogue start */
	pop r17
	pop r16
	ret

But with the *d the code is still the same and R16 is chosen instead of
the better R18.  Maybe that's an IRA issue.

Looking again at the "*d" vs. "!d", I think the alternative is
superfluous because there is an "r" alternative and "d" is a subset of
"r", so the allocator can always switch to "r" if it does not like or see "d".

I think we can remove that alternative; it's just confusing.

Johann
Georg-Johann Lay July 6, 2011, 6:35 p.m. UTC | #5
Denis Chertykov wrote:
> I have asked about example of *d instead of !d.
> Just svn GCC with *d vs svn GCC !d.
> 
> Denis.

Is the patch ok with the original !d instead of *d ?
It is still an improvement, and !d vs. *d doesn't matter because
there's always r, I think.

Johann
Denis Chertykov July 7, 2011, 6:11 a.m. UTC | #6
2011/7/6 Georg-Johann Lay <avr@gjlay.de>:
> Denis Chertykov wrote:
>> I have asked about example of *d instead of !d.
>> Just svn GCC with *d vs svn GCC !d.
>>
>> Denis.
>
> Is the patch ok with the original !d instead of *d ?

Ok.

Denis.
diff mbox

Patch

Index: config/avr/avr.md
===================================================================
--- config/avr/avr.md	(revision 175811)
+++ config/avr/avr.md	(working copy)
@@ -402,10 +402,10 @@  (define_expand "movsi"
 
 
 
-(define_peephole2 ; movsi_lreg_const
+(define_peephole2 ; *reload_insi
   [(match_scratch:QI 2 "d")
    (set (match_operand:SI 0 "l_register_operand" "")
-        (match_operand:SI 1 "immediate_operand" ""))
+        (match_operand:SI 1 "const_int_operand" ""))
    (match_dup 2)]
   "(operands[1] != const0_rtx
     && operands[1] != constm1_rtx)"
@@ -416,22 +416,26 @@  (define_peephole2 ; movsi_lreg_const
 ;; '*' because it is not used in rtl generation.
 (define_insn "*reload_insi"
   [(set (match_operand:SI 0 "register_operand" "=r")
-        (match_operand:SI 1 "immediate_operand" "i"))
+        (match_operand:SI 1 "const_int_operand" "n"))
    (clobber (match_operand:QI 2 "register_operand" "=&d"))]
   "reload_completed"
-  "* return output_reload_insisf (insn, operands, NULL);"
+  {
+    return output_reload_insisf (insn, operands, operands[2], NULL);
+  }
   [(set_attr "length" "8")
-   (set_attr "cc" "none")])
+   (set_attr "cc" "clobber")])
 
 
 (define_insn "*movsi"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r,Qm,!d,r")
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r,Qm,*d,r")
         (match_operand:SI 1 "general_operand"       "r,L,Qm,rL,i,i"))]
   "(register_operand (operands[0],SImode)
     || register_operand (operands[1],SImode) || const0_rtx == operands[1])"
-  "* return output_movsisf (insn, operands, NULL);"
+  {
+    return output_movsisf (insn, operands, NULL_RTX, NULL);
+  }
   [(set_attr "length" "4,4,8,9,4,10")
-   (set_attr "cc" "none,set_zn,clobber,clobber,none,clobber")])
+   (set_attr "cc" "none,set_zn,clobber,clobber,clobber,clobber")])
 
 ;; fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
 ;; move floating point numbers (32 bit)
@@ -451,13 +455,39 @@  (define_expand "movsf"
 }")
 
 (define_insn "*movsf"
-  [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,r,Qm,!d,r")
-        (match_operand:SF 1 "general_operand"       "r,G,Qm,r,F,F"))]
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,r,Qm,*d,r")
+        (match_operand:SF 1 "general_operand"       "r,G,Qm,rG,F,F"))]
   "register_operand (operands[0], SFmode)
-   || register_operand (operands[1], SFmode)"
-  "* return output_movsisf (insn, operands, NULL);"
+   || register_operand (operands[1], SFmode)
+   || operands[1] == CONST0_RTX (SFmode)"
+  {
+    return output_movsisf (insn, operands, NULL_RTX, NULL);
+  }
   [(set_attr "length" "4,4,8,9,4,10")
-   (set_attr "cc" "none,set_zn,clobber,clobber,none,clobber")])
+   (set_attr "cc" "none,set_zn,clobber,clobber,clobber,clobber")])
+
+(define_peephole2 ; *reload_insf
+  [(match_scratch:QI 2 "d")
+   (set (match_operand:SF 0 "l_register_operand" "")
+        (match_operand:SF 1 "const_double_operand" ""))
+   (match_dup 2)]
+  "operands[1] != CONST0_RTX (SFmode)"
+  [(parallel [(set (match_dup 0) 
+                   (match_dup 1))
+              (clobber (match_dup 2))])]
+  "")
+
+;; '*' because it is not used in rtl generation.
+(define_insn "*reload_insf"
+  [(set (match_operand:SF 0 "register_operand" "=r")
+        (match_operand:SF 1 "const_double_operand" "F"))
+   (clobber (match_operand:QI 2 "register_operand" "=&d"))]
+  "reload_completed"
+  {
+    return output_reload_insisf (insn, operands, operands[2], NULL);
+  }
+  [(set_attr "length" "8")
+   (set_attr "cc" "clobber")])
 
 ;;=========================================================================
 ;; move string (like memcpy)
Index: config/avr/avr-protos.h
===================================================================
--- config/avr/avr-protos.h	(revision 175811)
+++ config/avr/avr-protos.h	(working copy)
@@ -56,7 +56,7 @@  extern const char *out_movhi_r_mr (rtx i
 extern const char *out_movhi_mr_r (rtx insn, rtx op[], int *l);
 extern const char *out_movsi_r_mr (rtx insn, rtx op[], int *l);
 extern const char *out_movsi_mr_r (rtx insn, rtx op[], int *l);
-extern const char *output_movsisf (rtx insn, rtx operands[], int *l);
+extern const char *output_movsisf (rtx insn, rtx operands[], rtx clobber, int *l);
 extern const char *out_tstsi (rtx insn, rtx src, int *l);
 extern const char *out_tsthi (rtx insn, rtx src, int *l);
 extern const char *ret_cond_branch (rtx x, int len, int reverse);
@@ -85,7 +85,7 @@  extern const char *avr_out_sbxx_branch (
 extern int extra_constraint_Q (rtx x);
 extern int adjust_insn_length (rtx insn, int len);
 extern const char *output_reload_inhi (rtx insn, rtx *operands, int *len);
-extern const char *output_reload_insisf (rtx insn, rtx *operands, int *len);
+extern const char *output_reload_insisf (rtx insn, rtx *operands, rtx clobber, int *len);
 extern enum reg_class secondary_input_reload_class (enum reg_class,
 						    enum machine_mode,
 						    rtx);
Index: config/avr/avr.c
===================================================================
--- config/avr/avr.c	(revision 175811)
+++ config/avr/avr.c	(working copy)
@@ -1184,6 +1184,32 @@  avr_legitimize_address (rtx x, rtx oldx,
 }
 
 
+/* Helper function to print assembler resp. track instruction
+   sequence lengths.
+   
+   If PLEN == NULL:
+       Output assembler code from template TPL with operands supplied
+       by OPERANDS.  This is just forwarding to output_asm_insn.
+   
+   If PLEN != NULL:
+       Add N_WORDS to *PLEN.
+       Don't output anything.
+*/
+
+static void
+avr_asm_len (const char* tpl, rtx* operands, int* plen, int n_words)
+{
+  if (NULL == plen)
+    {
+      output_asm_insn (tpl, operands);
+    }
+  else
+    {
+      *plen += n_words;
+    }
+}
+
+
 /* Return a pointer register name as a string.  */
 
 static const char *
@@ -2600,7 +2626,7 @@  out_movsi_mr_r (rtx insn, rtx op[], int
 }
 
 const char *
-output_movsisf(rtx insn, rtx operands[], int *l)
+output_movsisf (rtx insn, rtx operands[], rtx clobber_reg, int *l)
 {
   int dummy;
   rtx dest = operands[0];
@@ -2643,6 +2669,11 @@  output_movsisf(rtx insn, rtx operands[],
 		      AS2 (mov,%D0,%D1));
 	    }
 	}
+      else if (CONST_INT_P (src)
+               || CONST_DOUBLE_P (src))
+        {
+          return output_reload_insisf (insn, operands, clobber_reg, real_l);
+        }
       else if (CONSTANT_P (src))
 	{
 	  if (test_hard_reg_class (LD_REGS, dest)) /* ldi d,i */
@@ -2653,68 +2684,6 @@  output_movsisf(rtx insn, rtx operands[],
 		      AS2 (ldi,%C0,hlo8(%1)) CR_TAB
 		      AS2 (ldi,%D0,hhi8(%1)));
 	    }
-	  
-	  if (GET_CODE (src) == CONST_INT)
-	    {
-	      const char *const clr_op0 =
-		AVR_HAVE_MOVW ? (AS1 (clr,%A0) CR_TAB
-				AS1 (clr,%B0) CR_TAB
-				AS2 (movw,%C0,%A0))
-			     : (AS1 (clr,%A0) CR_TAB
-				AS1 (clr,%B0) CR_TAB
-				AS1 (clr,%C0) CR_TAB
-				AS1 (clr,%D0));
-
-	      if (src == const0_rtx) /* mov r,L */
-		{
-		  *l = AVR_HAVE_MOVW ? 3 : 4;
-		  return clr_op0;
-		}
-	      else if (src == const1_rtx)
-		{
-		  if (!real_l)
-		    output_asm_insn (clr_op0, operands);
-		  *l = AVR_HAVE_MOVW ? 4 : 5;
-		  return AS1 (inc,%A0);
-		}
-	      else if (src == constm1_rtx)
-		{
-		  /* Immediate constants -1 to any register */
-		  if (AVR_HAVE_MOVW)
-		    {
-		      *l = 4;
-		      return (AS1 (clr,%A0)     CR_TAB
-			      AS1 (dec,%A0)     CR_TAB
-			      AS2 (mov,%B0,%A0) CR_TAB
-			      AS2 (movw,%C0,%A0));
-		    }
-		  *l = 5;
-		  return (AS1 (clr,%A0)     CR_TAB
-			  AS1 (dec,%A0)     CR_TAB
-			  AS2 (mov,%B0,%A0) CR_TAB
-			  AS2 (mov,%C0,%A0) CR_TAB
-			  AS2 (mov,%D0,%A0));
-		}
-	      else
-		{
-		  int bit_nr = exact_log2 (INTVAL (src));
-
-		  if (bit_nr >= 0)
-		    {
-		      *l = AVR_HAVE_MOVW ? 5 : 6;
-		      if (!real_l)
-			{
-			  output_asm_insn (clr_op0, operands);
-			  output_asm_insn ("set", operands);
-			}
-		      if (!real_l)
-			avr_output_bld (operands, bit_nr);
-
-		      return "";
-		    }
-		}
-	    }
-	  
 	  /* Last resort, better than loading from memory.  */
 	  *l = 10;
 	  return (AS2 (mov,__tmp_reg__,r31) CR_TAB
@@ -2735,7 +2704,7 @@  output_movsisf(rtx insn, rtx operands[],
     {
       const char *templ;
 
-      if (src == const0_rtx)
+      if (src == CONST0_RTX (GET_MODE (dest)))
 	  operands[1] = zero_reg_rtx;
 
       templ = out_movsi_mr_r (insn, operands, real_l);
@@ -4612,7 +4581,7 @@  adjust_insn_length (rtx insn, int len)
 	      break;
 	    case SImode:
 	    case SFmode:
-	      output_movsisf (insn, op, &len);
+	      output_movsisf (insn, op, NULL_RTX, &len);
 	      break;
 	    default:
 	      break;
@@ -4683,7 +4652,7 @@  adjust_insn_length (rtx insn, int len)
 	      break;
 	    case SImode:
 	    case SFmode:
-	      output_reload_insisf (insn, op, &len);
+	      output_reload_insisf (insn, op, XEXP (op[2], 0), &len);
 	      break;
 	    default:
 	      break;
@@ -6212,53 +6181,199 @@  output_reload_inhi (rtx insn ATTRIBUTE_U
 }
 
 
+/* Reload a SI or SF compile time constant (OP[1]) into a GPR (OP[0]).
+   CLOBBER_REG is a QI clobber reg needed to move vast majority of consts
+   into a NO_LD_REGS.  If CLOBBER_REG is NULL_RTX we either don't need a
+   clobber reg or have to cook one up.
+
+   LEN == NULL: Output instructions.
+   
+   LEN != NULL: Output nothing.  Increment *LEN by number of words occupied
+                by the insns printed.
+
+   Return "".  */
+
 const char *
-output_reload_insisf (rtx insn ATTRIBUTE_UNUSED, rtx *operands, int *len)
+output_reload_insisf (rtx insn ATTRIBUTE_UNUSED,
+                      rtx *op, rtx clobber_reg, int *len)
 {
-  rtx src = operands[1];
-  int cnst = (GET_CODE (src) == CONST_INT);
+  rtx src = op[1];
+  rtx dest = op[0];
+  rtx xval, xdest[4];
+  int ival[4];
+  int clobber_val = 1234;
+  bool cooked_clobber_p = false;
+  bool set_p = false;
+  unsigned int n;
+  enum machine_mode mode = GET_MODE (dest);
+  
+  gcc_assert (REG_P (dest));
 
   if (len)
+    *len = 0;
+  
+  /* (REG:SI 14) is special: It's neither in LD_REGS nor in NO_LD_REGS
+     but has some subregs that are in LD_REGS.  Use the MSB (REG:QI 17).  */
+  
+  if (14 == REGNO (dest))
     {
-      if (cnst)
-	*len = 4 + ((INTVAL (src) & 0xff) != 0)
-		+ ((INTVAL (src) & 0xff00) != 0)
-		+ ((INTVAL (src) & 0xff0000) != 0)
-		+ ((INTVAL (src) & 0xff000000) != 0);
-      else
-	*len = 8;
-
-      return "";
+      clobber_reg = gen_rtx_REG (QImode, 17);
     }
 
-  if (cnst && ((INTVAL (src) & 0xff) == 0))
-    output_asm_insn (AS2 (mov, %A0, __zero_reg__), operands);
-  else
-    {
-      output_asm_insn (AS2 (ldi, %2, lo8(%1)), operands);
-      output_asm_insn (AS2 (mov, %A0, %2), operands);
-    }
-  if (cnst && ((INTVAL (src) & 0xff00) == 0))
-    output_asm_insn (AS2 (mov, %B0, __zero_reg__), operands);
-  else
+  /* We might need a clobber reg but don't have one.  Look at the value
+     to be loaded more closely.  A clobber is only needed if it contains
+     a byte that is neither 0, -1 or a power of 2.  */
+  
+  if (NULL_RTX == clobber_reg
+      && !test_hard_reg_class (LD_REGS, dest))
     {
-      output_asm_insn (AS2 (ldi, %2, hi8(%1)), operands);
-      output_asm_insn (AS2 (mov, %B0, %2), operands);
+      for (n = 0; n < GET_MODE_SIZE (mode); n++)
+        {
+          xval = simplify_gen_subreg (QImode, src, mode, n);
+
+          if (!(const0_rtx == xval
+                || constm1_rtx == xval
+                || single_one_operand (xval, QImode)))
+            {
+              /* We have no clobber reg but need one.  Cook one up.
+                 That's cheaper than loading from constant pool.  */
+              
+              cooked_clobber_p = true;
+              clobber_reg = gen_rtx_REG (QImode, 31);
+              avr_asm_len ("mov __tmp_reg__,%0", &clobber_reg, len, 1);
+              break;
+            }
+        }
     }
-  if (cnst && ((INTVAL (src) & 0xff0000) == 0))
-    output_asm_insn (AS2 (mov, %C0, __zero_reg__), operands);
-  else
+
+  /* Now start filling DEST from LSB to MSB.  */
+  
+  for (n = 0; n < GET_MODE_SIZE (mode); n++)
     {
-      output_asm_insn (AS2 (ldi, %2, hlo8(%1)), operands);
-      output_asm_insn (AS2 (mov, %C0, %2), operands);
+      bool done_byte = false;
+      unsigned int j;
+      rtx xop[3];
+
+      /* Crop the n-th sub-byte.  */
+      
+      xval = simplify_gen_subreg (QImode, src, mode, n);
+      xdest[n] = simplify_gen_subreg (QImode, dest, mode, n);
+      ival[n] = INTVAL (xval);
+
+      /* Look if we can reuse the low word by means of MOVW.  */
+      
+      if (n == 2
+          && AVR_HAVE_MOVW)
+        {
+          rtx lo16 = simplify_gen_subreg (HImode, src, mode, 0);
+          rtx hi16 = simplify_gen_subreg (HImode, src, mode, 2);
+
+          if (INTVAL (lo16) == INTVAL (hi16))
+            {
+              avr_asm_len ("movw %C0,%A0", &op[0], len, 1);
+              break;
+            }
+        }
+
+      /* Use CLR to zero a value so that cc0 is set as expected
+         for zero.  */
+      
+      if (ival[n] == 0)
+        {
+          avr_asm_len ("clr %0", &xdest[n], len, 1);
+          continue;
+        }
+
+      if (clobber_val == ival[n]
+          && REGNO (clobber_reg) == REGNO (xdest[n]))
+        {
+          continue;
+        }
+
+      /* LD_REGS can use LDI to move a constant value */
+      
+      if (test_hard_reg_class (LD_REGS, xdest[n]))
+        {
+          xop[0] = xdest[n];
+          xop[1] = xval;
+          avr_asm_len ("ldi %0,lo8(%1)", xop, len, 1);
+          continue;
+        }
+
+      /* Try to reuse value already loaded in some lower byte. */
+      
+      for (j = 0; j < n; j++)
+        if (ival[j] == ival[n])
+          {
+            xop[0] = xdest[n];
+            xop[1] = xdest[j];
+            
+            avr_asm_len ("mov %0,%1", xop, len, 1);
+            done_byte = true;
+            break;
+          }
+
+      if (done_byte)
+        continue;
+
+      /* Need no clobber reg for -1: Use CLR/DEC */
+      
+      if (-1 == ival[n])
+        {
+          avr_asm_len ("clr %0" CR_TAB
+                       "dec %0", &xdest[n], len, 2);
+          continue;
+        }
+
+      /* Use T flag or INC to manage powers of 2 if we have
+         no clobber reg.  */
+
+      if (NULL_RTX == clobber_reg
+          && single_one_operand (xval, QImode))
+        {
+          if (1 == ival[n])
+            {
+              avr_asm_len ("clr %0" CR_TAB
+                           "inc %0", &xdest[n], len, 2);
+              continue;
+            }
+          
+          xop[0] = xdest[n];
+          xop[1] = GEN_INT (exact_log2 (ival[n] & GET_MODE_MASK (QImode)));
+
+          gcc_assert (constm1_rtx != xop[1]);
+
+          if (!set_p)
+            {
+              set_p = true;
+              avr_asm_len ("set", xop, len, 1);
+            }
+
+          avr_asm_len ("clr %0" CR_TAB
+                       "bld %0,%1", xop, len, 2);
+          continue;
+        }
+
+      /* We actually need the LD_REGS clobber reg.  */
+
+      gcc_assert (NULL_RTX != clobber_reg);
+        
+      xop[0] = xdest[n];
+      xop[1] = xval;
+      xop[2] = clobber_reg;
+      clobber_val = ival[n];
+        
+      avr_asm_len ("ldi %2,lo8(%1)" CR_TAB
+                   "mov %0,%2", xop, len, 2);
     }
-  if (cnst && ((INTVAL (src) & 0xff000000) == 0))
-    output_asm_insn (AS2 (mov, %D0, __zero_reg__), operands);
-  else
+  
+  /* If we cooked up a clobber reg above, restore it.  */
+  
+  if (cooked_clobber_p)
     {
-      output_asm_insn (AS2 (ldi, %2, hhi8(%1)), operands);
-      output_asm_insn (AS2 (mov, %D0, %2), operands);
+      avr_asm_len ("mov %0,__tmp_reg__", &clobber_reg, len, 1);
     }
+  
   return "";
 }