Message ID | 4E144C61.60600@gjlay.de |
---|---|
State | New |
Headers | show |
2011/7/6 Georg-Johann Lay <avr@gjlay.de>: > For loading a 32-bit constant in a register, there is room for > improvement: > > * SF can be handled the same way as SI and therefore the patch > adds a peep2 to produce a *reload_insf analogon to *reload_insi. > > * If the destination register overlaps NO_LD_REGS, values already > loaded into some other byte can be reused by a simple MOV. > This is helpful when moving values like, e.g. -2, -100 etc. because > all high bytes are 0xff. > > * 0.0f can be directly moved to memory. > > * The mov insns contain "!d" constraint. I see no reason to make "d" > expensive and discourage use of d-regs. A "*d" to hide is better > because it neither puts additional pressure on "d" nor > discourages "d". > I would like to have real code examples. Denis.
Denis Chertykov wrote: > 2011/7/6 Georg-Johann Lay <avr@gjlay.de>: >> For loading a 32-bit constant in a register, there is room for >> improvement: >> >> * SF can be handled the same way as SI and therefore the patch >> adds a peep2 to produce a *reload_insf analogon to *reload_insi. >> >> * If the destination register overlaps NO_LD_REGS, values already >> loaded into some other byte can be reused by a simple MOV. >> This is helpful when moving values like, e.g. -2, -100 etc. because >> all high bytes are 0xff. >> >> * 0.0f can be directly moved to memory. >> >> * The mov insns contain "!d" constraint. I see no reason to make "d" >> expensive and discourage use of d-regs. A "*d" to hide is better >> because it neither puts additional pressure on "d" nor >> discourages "d". >> > > I would like to have real code examples. > > Denis. Hi Denis. Attached you find a small C file and the asm that is generated by new and old versions (-Os -mmcu=atmega88 -S -dp). I took away some regs as potential clobbers (or -fno-peephole2) to show the effect of high register pressure. But even if a clobber was available you can see that the new version is smarter in reusing values, e.g. note the loading of -1L to r22-r25. 
Johann .file "oint.c" __SREG__ = 0x3f __SP_H__ = 0x3e __SP_L__ = 0x3d __tmp_reg__ = 0 __zero_reg__ = 1 .global __do_copy_data .global __do_clear_bss .text .global foo1 .type foo1, @function foo1: push r10 ; 16 *pushqi/1 [length = 1] push r11 ; 17 *pushqi/1 [length = 1] push r12 ; 18 *pushqi/1 [length = 1] push r13 ; 19 *pushqi/1 [length = 1] push r14 ; 20 *pushqi/1 [length = 1] push r15 ; 21 *pushqi/1 [length = 1] push r16 ; 22 *pushqi/1 [length = 1] push r17 ; 23 *pushqi/1 [length = 1] /* prologue: function */ /* frame size = 0 */ /* stack size = 8 */ .L__stack_usage = 8 movw r18,r22 ; 2 *movsi/1 [length = 2] movw r20,r24 ldi r22,lo8(-1) ; 7 *movsi/5 [length = 4] ldi r23,hi8(-1) ldi r24,hlo8(-1) ldi r25,hhi8(-1) mov __tmp_reg__,r31 ; 9 *movsi/6 [length = 10] ldi r31,lo8(-2) mov r14,r31 ldi r31,hi8(-2) mov r15,r31 ldi r31,hlo8(-2) mov r16,r31 ldi r31,hhi8(-2) mov r17,r31 mov r31,__tmp_reg__ mov __tmp_reg__,r31 ; 10 *movsi/6 [length = 10] ldi r31,lo8(-16744448) mov r10,r31 ldi r31,hi8(-16744448) mov r11,r31 ldi r31,hlo8(-16744448) mov r12,r31 ldi r31,hhi8(-16744448) mov r13,r31 mov r31,__tmp_reg__ rcall ibar ; 11 call_insn/3 [length = 1] /* epilogue start */ pop r17 ; 26 popqi [length = 1] pop r16 ; 27 popqi [length = 1] pop r15 ; 28 popqi [length = 1] pop r14 ; 29 popqi [length = 1] pop r13 ; 30 popqi [length = 1] pop r12 ; 31 popqi [length = 1] pop r11 ; 32 popqi [length = 1] pop r10 ; 33 popqi [length = 1] ret ; 34 return_from_epilogue [length = 1] .size foo1, .-foo1 .global foo2 .type foo2, @function foo2: push r10 ; 16 *pushqi/1 [length = 1] push r11 ; 17 *pushqi/1 [length = 1] push r12 ; 18 *pushqi/1 [length = 1] push r13 ; 19 *pushqi/1 [length = 1] push r14 ; 20 *pushqi/1 [length = 1] push r15 ; 21 *pushqi/1 [length = 1] push r16 ; 22 *pushqi/1 [length = 1] push r17 ; 23 *pushqi/1 [length = 1] /* prologue: function */ /* frame size = 0 */ /* stack size = 8 */ .L__stack_usage = 8 movw r18,r22 ; 2 *movsi/1 [length = 2] movw r20,r24 mov __tmp_reg__,r31 ; 9 
*movsi/6 [length = 10] ldi r31,lo8(65537) mov r14,r31 ldi r31,hi8(65537) mov r15,r31 ldi r31,hlo8(65537) mov r16,r31 ldi r31,hhi8(65537) mov r17,r31 mov r31,__tmp_reg__ mov __tmp_reg__,r31 ; 10 *movsi/6 [length = 10] ldi r31,lo8(-64504) mov r10,r31 ldi r31,hi8(-64504) mov r11,r31 ldi r31,hlo8(-64504) mov r12,r31 ldi r31,hhi8(-64504) mov r13,r31 mov r31,__tmp_reg__ rcall ibar ; 11 call_insn/3 [length = 1] /* epilogue start */ pop r17 ; 26 popqi [length = 1] pop r16 ; 27 popqi [length = 1] pop r15 ; 28 popqi [length = 1] pop r14 ; 29 popqi [length = 1] pop r13 ; 30 popqi [length = 1] pop r12 ; 31 popqi [length = 1] pop r11 ; 32 popqi [length = 1] pop r10 ; 33 popqi [length = 1] ret ; 34 return_from_epilogue [length = 1] .size foo2, .-foo2 .global foo3 .type foo3, @function foo3: push r10 ; 16 *pushqi/1 [length = 1] push r11 ; 17 *pushqi/1 [length = 1] push r12 ; 18 *pushqi/1 [length = 1] push r13 ; 19 *pushqi/1 [length = 1] push r14 ; 20 *pushqi/1 [length = 1] push r15 ; 21 *pushqi/1 [length = 1] push r16 ; 22 *pushqi/1 [length = 1] push r17 ; 23 *pushqi/1 [length = 1] /* prologue: function */ /* frame size = 0 */ /* stack size = 8 */ .L__stack_usage = 8 movw r18,r22 ; 2 *movsi/1 [length = 2] movw r20,r24 mov __tmp_reg__,r31 ; 9 *movsf/6 [length = 10] ldi r31,lo8(0xc0400000) mov r14,r31 ldi r31,hi8(0xc0400000) mov r15,r31 ldi r31,hlo8(0xc0400000) mov r16,r31 ldi r31,hhi8(0xc0400000) mov r17,r31 mov r31,__tmp_reg__ mov __tmp_reg__,r31 ; 10 *movsf/6 [length = 10] ldi r31,lo8(0x40000000) mov r10,r31 ldi r31,hi8(0x40000000) mov r11,r31 ldi r31,hlo8(0x40000000) mov r12,r31 ldi r31,hhi8(0x40000000) mov r13,r31 mov r31,__tmp_reg__ rcall fbar ; 11 call_insn/3 [length = 1] /* epilogue start */ pop r17 ; 26 popqi [length = 1] pop r16 ; 27 popqi [length = 1] pop r15 ; 28 popqi [length = 1] pop r14 ; 29 popqi [length = 1] pop r13 ; 30 popqi [length = 1] pop r12 ; 31 popqi [length = 1] pop r11 ; 32 popqi [length = 1] pop r10 ; 33 popqi [length = 1] ret ; 34 return_from_epilogue 
[length = 1] .size foo3, .-foo3 .file "oint.c" __SREG__ = 0x3f __SP_H__ = 0x3e __SP_L__ = 0x3d __tmp_reg__ = 0 __zero_reg__ = 1 .text .global foo1 .type foo1, @function foo1: push r10 ; 16 *pushqi/1 [length = 1] push r11 ; 17 *pushqi/1 [length = 1] push r12 ; 18 *pushqi/1 [length = 1] push r13 ; 19 *pushqi/1 [length = 1] push r14 ; 20 *pushqi/1 [length = 1] push r15 ; 21 *pushqi/1 [length = 1] push r16 ; 22 *pushqi/1 [length = 1] push r17 ; 23 *pushqi/1 [length = 1] /* prologue: function */ /* frame size = 0 */ /* stack size = 8 */ .L__stack_usage = 8 movw r18,r22 ; 2 *movsi/1 [length = 2] movw r20,r24 ldi r22,lo8(-1) ; 7 *movsi/5 [length = 3] ldi r23,lo8(-1) movw r24,r22 ldi r17,lo8(-2) ; 9 *movsi/6 [length = 6] mov r14,r17 clr r15 dec r15 ldi r16,lo8(-1) ldi r17,lo8(-1) clr r10 ; 10 *movsi/6 [length = 7] set clr r11 bld r11,7 clr r12 clr r13 dec r13 rcall ibar ; 11 *call_insn/2 [length = 1] /* epilogue start */ pop r17 ; 26 popqi [length = 1] pop r16 ; 27 popqi [length = 1] pop r15 ; 28 popqi [length = 1] pop r14 ; 29 popqi [length = 1] pop r13 ; 30 popqi [length = 1] pop r12 ; 31 popqi [length = 1] pop r11 ; 32 popqi [length = 1] pop r10 ; 33 popqi [length = 1] ret ; 34 return_from_epilogue [length = 1] .size foo1, .-foo1 .global foo2 .type foo2, @function foo2: push r10 ; 16 *pushqi/1 [length = 1] push r11 ; 17 *pushqi/1 [length = 1] push r12 ; 18 *pushqi/1 [length = 1] push r13 ; 19 *pushqi/1 [length = 1] push r14 ; 20 *pushqi/1 [length = 1] push r15 ; 21 *pushqi/1 [length = 1] push r16 ; 22 *pushqi/1 [length = 1] push r17 ; 23 *pushqi/1 [length = 1] /* prologue: function */ /* frame size = 0 */ /* stack size = 8 */ .L__stack_usage = 8 movw r18,r22 ; 2 *movsi/1 [length = 2] movw r20,r24 ldi r17,lo8(1) ; 9 *movsi/6 [length = 4] mov r14,r17 clr r15 movw r16,r14 set ; 10 *movsi/6 [length = 8] clr r10 bld r10,3 clr r11 bld r11,2 clr r12 dec r12 mov r13,r12 rcall ibar ; 11 *call_insn/2 [length = 1] /* epilogue start */ pop r17 ; 26 popqi [length = 1] pop r16 ; 27 
popqi [length = 1] pop r15 ; 28 popqi [length = 1] pop r14 ; 29 popqi [length = 1] pop r13 ; 30 popqi [length = 1] pop r12 ; 31 popqi [length = 1] pop r11 ; 32 popqi [length = 1] pop r10 ; 33 popqi [length = 1] ret ; 34 return_from_epilogue [length = 1] .size foo2, .-foo2 .global foo3 .type foo3, @function foo3: push r10 ; 16 *pushqi/1 [length = 1] push r11 ; 17 *pushqi/1 [length = 1] push r12 ; 18 *pushqi/1 [length = 1] push r13 ; 19 *pushqi/1 [length = 1] push r14 ; 20 *pushqi/1 [length = 1] push r15 ; 21 *pushqi/1 [length = 1] push r16 ; 22 *pushqi/1 [length = 1] push r17 ; 23 *pushqi/1 [length = 1] /* prologue: function */ /* frame size = 0 */ /* stack size = 8 */ .L__stack_usage = 8 movw r18,r22 ; 2 *movsi/1 [length = 2] movw r20,r24 clr r14 ; 9 *movsf/6 [length = 4] clr r15 ldi r16,lo8(64) ldi r17,lo8(-64) clr r10 ; 10 *movsf/6 [length = 6] clr r11 clr r12 set clr r13 bld r13,6 rcall fbar ; 11 *call_insn/2 [length = 1] /* epilogue start */ pop r17 ; 26 popqi [length = 1] pop r16 ; 27 popqi [length = 1] pop r15 ; 28 popqi [length = 1] pop r14 ; 29 popqi [length = 1] pop r13 ; 30 popqi [length = 1] pop r12 ; 31 popqi [length = 1] pop r11 ; 32 popqi [length = 1] pop r10 ; 33 popqi [length = 1] ret ; 34 return_from_epilogue [length = 1] .size foo3, .-foo3 .ident "GCC: (GNU) 4.7.0 20110704 (experimental)"
2011/7/6 Georg-Johann Lay <avr@gjlay.de>: > Denis Chertykov wrote: >> 2011/7/6 Georg-Johann Lay <avr@gjlay.de>: >>> For loading a 32-bit constant in a register, there is room for >>> improvement: >>> >>> * SF can be handled the same way as SI and therefore the patch >>> adds a peep2 to produce a *reload_insf analogon to *reload_insi. >>> >>> * If the destination register overlaps NO_LD_REGS, values already >>> loaded into some other byte can be reused by a simple MOV. >>> This is helpful when moving values like, e.g. -2, -100 etc. because >>> all high bytes are 0xff. >>> >>> * 0.0f can be directly moved to memory. >>> >>> * The mov insns contain "!d" constraint. I see no reason to make "d" >>> expensive and discourage use of d-regs. A "*d" to hide is better >>> because it neither puts additional pressure on "d" nor >>> discourages "d". >>> >> >> I would like to have real code examples. >> >> Denis. > > Hi Denis. > > Attached you find a small C file and the asm that is generated by new > and old versions (-Os -mmcu=atmega88 -S -dp). > > I took away some regs as potential clobbers (or -fno-peephole2) to > show the effect of high register pressure. But even if a clobber was > available you can see that the new version is smarter in reusing > values, e.g. note the loading of -1L to r22-r25. I have asked about example of *d instead of !d. Just svn GCC with *d vs svn GCC !d. Denis.
Denis Chertykov wrote: > 2011/7/6 Georg-Johann Lay <avr@gjlay.de>: >> Denis Chertykov wrote: >>> 2011/7/6 Georg-Johann Lay <avr@gjlay.de>: >>>> For loading a 32-bit constant in a register, there is room for >>>> improvement: >>>> >>>> * SF can be handled the same way as SI and therefore the patch >>>> adds a peep2 to produce a *reload_insf analogon to *reload_insi. >>>> >>>> * If the destination register overlaps NO_LD_REGS, values already >>>> loaded into some other byte can be reused by a simple MOV. >>>> This is helpful when moving values like, e.g. -2, -100 etc. because >>>> all high bytes are 0xff. >>>> >>>> * 0.0f can be directly moved to memory. >>>> >>>> * The mov insns contain "!d" constraint. I see no reason to make "d" >>>> expensive and discourage use of d-regs. A "*d" to hide is better >>>> because it neither puts additional pressure on "d" nor >>>> discourages "d". >>>> >>> I would like to have real code examples. >>> >>> Denis. >> Hi Denis. >> >> Attached you find a small C file and the asm that is generated by new >> and old versions (-Os -mmcu=atmega88 -S -dp). >> >> I took away some regs as potential clobbers (or -fno-peephole2) to >> show the effect of high register pressure. But even if a clobber was >> available you can see that the new version is smarter in reusing >> values, e.g. note the loading of -1L to r22-r25. > > I have asked about example of *d instead of !d. > Just svn GCC with *d vs svn GCC !d. > > > Denis. Ah, I couldn't deduce that from your question. 
I thought it could help in cases like these: long z; void inc (long y) { z += y; } that gets compiled with -Os to inc: push r16 push r17 /* prologue: function */ /* frame size = 0 */ /* stack size = 2 */ .L__stack_usage = 2 lds r16,z lds r17,z+1 lds r18,z+2 lds r19,z+3 add r16,r22 adc r17,r23 adc r18,r24 adc r19,r25 sts z,r16 sts z+1,r17 sts z+2,r18 sts z+3,r19 /* epilogue start */ pop r17 pop r16 ret But with the *d the code is still the same and R16 is chosen instead of better R18. Maybe that's an IRA issue. Looking again at the "*d" resp. "!d", I think the alternative is superfluous because there is a "r" alternative and "d" is a subset of "r", so allocator can always switch to "r" if it does not like or see "d". I think we can remove that alternative, it's just confusing. Johann
Denis Chertykov wrote: > I have asked about example of *d instead of !d. > Just svn GCC with *d vs svn GCC !d. > > Denis. Is the patch ok with the original !d instead of *d ? It still improves and the !d vs. *d doesn't matter because there's always r I think. Johann
2011/7/6 Georg-Johann Lay <avr@gjlay.de>: > Denis Chertykov wrote: >> I have asked about example of *d instead of !d. >> Just svn GCC with *d vs svn GCC !d. >> >> Denis. > > Is the patch ok with the original !d instead of *d ? Ok. Denis.
Index: config/avr/avr.md =================================================================== --- config/avr/avr.md (revision 175811) +++ config/avr/avr.md (working copy) @@ -402,10 +402,10 @@ (define_expand "movsi" -(define_peephole2 ; movsi_lreg_const +(define_peephole2 ; *reload_insi [(match_scratch:QI 2 "d") (set (match_operand:SI 0 "l_register_operand" "") - (match_operand:SI 1 "immediate_operand" "")) + (match_operand:SI 1 "const_int_operand" "")) (match_dup 2)] "(operands[1] != const0_rtx && operands[1] != constm1_rtx)" @@ -416,22 +416,26 @@ (define_peephole2 ; movsi_lreg_const ;; '*' because it is not used in rtl generation. (define_insn "*reload_insi" [(set (match_operand:SI 0 "register_operand" "=r") - (match_operand:SI 1 "immediate_operand" "i")) + (match_operand:SI 1 "const_int_operand" "n")) (clobber (match_operand:QI 2 "register_operand" "=&d"))] "reload_completed" - "* return output_reload_insisf (insn, operands, NULL);" + { + return output_reload_insisf (insn, operands, operands[2], NULL); + } [(set_attr "length" "8") - (set_attr "cc" "none")]) + (set_attr "cc" "clobber")]) (define_insn "*movsi" - [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r,Qm,!d,r") + [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r,Qm,*d,r") (match_operand:SI 1 "general_operand" "r,L,Qm,rL,i,i"))] "(register_operand (operands[0],SImode) || register_operand (operands[1],SImode) || const0_rtx == operands[1])" - "* return output_movsisf (insn, operands, NULL);" + { + return output_movsisf (insn, operands, NULL_RTX, NULL); + } [(set_attr "length" "4,4,8,9,4,10") - (set_attr "cc" "none,set_zn,clobber,clobber,none,clobber")]) + (set_attr "cc" "none,set_zn,clobber,clobber,clobber,clobber")]) ;; fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff ;; move floating point numbers (32 bit) @@ -451,13 +455,39 @@ (define_expand "movsf" }") (define_insn "*movsf" - [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,r,Qm,!d,r") - (match_operand:SF 1 
"general_operand" "r,G,Qm,r,F,F"))] + [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,r,Qm,*d,r") + (match_operand:SF 1 "general_operand" "r,G,Qm,rG,F,F"))] "register_operand (operands[0], SFmode) - || register_operand (operands[1], SFmode)" - "* return output_movsisf (insn, operands, NULL);" + || register_operand (operands[1], SFmode) + || operands[1] == CONST0_RTX (SFmode)" + { + return output_movsisf (insn, operands, NULL_RTX, NULL); + } [(set_attr "length" "4,4,8,9,4,10") - (set_attr "cc" "none,set_zn,clobber,clobber,none,clobber")]) + (set_attr "cc" "none,set_zn,clobber,clobber,clobber,clobber")]) + +(define_peephole2 ; *reload_insf + [(match_scratch:QI 2 "d") + (set (match_operand:SF 0 "l_register_operand" "") + (match_operand:SF 1 "const_double_operand" "")) + (match_dup 2)] + "operands[1] != CONST0_RTX (SFmode)" + [(parallel [(set (match_dup 0) + (match_dup 1)) + (clobber (match_dup 2))])] + "") + +;; '*' because it is not used in rtl generation. +(define_insn "*reload_insf" + [(set (match_operand:SF 0 "register_operand" "=r") + (match_operand:SF 1 "const_double_operand" "F")) + (clobber (match_operand:QI 2 "register_operand" "=&d"))] + "reload_completed" + { + return output_reload_insisf (insn, operands, operands[2], NULL); + } + [(set_attr "length" "8") + (set_attr "cc" "clobber")]) ;;========================================================================= ;; move string (like memcpy) Index: config/avr/avr-protos.h =================================================================== --- config/avr/avr-protos.h (revision 175811) +++ config/avr/avr-protos.h (working copy) @@ -56,7 +56,7 @@ extern const char *out_movhi_r_mr (rtx i extern const char *out_movhi_mr_r (rtx insn, rtx op[], int *l); extern const char *out_movsi_r_mr (rtx insn, rtx op[], int *l); extern const char *out_movsi_mr_r (rtx insn, rtx op[], int *l); -extern const char *output_movsisf (rtx insn, rtx operands[], int *l); +extern const char *output_movsisf (rtx insn, rtx operands[], 
rtx clobber, int *l); extern const char *out_tstsi (rtx insn, rtx src, int *l); extern const char *out_tsthi (rtx insn, rtx src, int *l); extern const char *ret_cond_branch (rtx x, int len, int reverse); @@ -85,7 +85,7 @@ extern const char *avr_out_sbxx_branch ( extern int extra_constraint_Q (rtx x); extern int adjust_insn_length (rtx insn, int len); extern const char *output_reload_inhi (rtx insn, rtx *operands, int *len); -extern const char *output_reload_insisf (rtx insn, rtx *operands, int *len); +extern const char *output_reload_insisf (rtx insn, rtx *operands, rtx clobber, int *len); extern enum reg_class secondary_input_reload_class (enum reg_class, enum machine_mode, rtx); Index: config/avr/avr.c =================================================================== --- config/avr/avr.c (revision 175811) +++ config/avr/avr.c (working copy) @@ -1184,6 +1184,32 @@ avr_legitimize_address (rtx x, rtx oldx, } +/* Helper function to print assembler resp. track instruction + sequence lengths. + + If PLEN == NULL: + Output assembler code from template TPL with operands supplied + by OPERANDS. This is just forwarding to output_asm_insn. + + If PLEN != NULL: + Add N_WORDS to *PLEN. + Don't output anything. +*/ + +static void +avr_asm_len (const char* tpl, rtx* operands, int* plen, int n_words) +{ + if (NULL == plen) + { + output_asm_insn (tpl, operands); + } + else + { + *plen += n_words; + } +} + + /* Return a pointer register name as a string. 
*/ static const char * @@ -2600,7 +2626,7 @@ out_movsi_mr_r (rtx insn, rtx op[], int } const char * -output_movsisf(rtx insn, rtx operands[], int *l) +output_movsisf (rtx insn, rtx operands[], rtx clobber_reg, int *l) { int dummy; rtx dest = operands[0]; @@ -2643,6 +2669,11 @@ output_movsisf(rtx insn, rtx operands[], AS2 (mov,%D0,%D1)); } } + else if (CONST_INT_P (src) + || CONST_DOUBLE_P (src)) + { + return output_reload_insisf (insn, operands, clobber_reg, real_l); + } else if (CONSTANT_P (src)) { if (test_hard_reg_class (LD_REGS, dest)) /* ldi d,i */ @@ -2653,68 +2684,6 @@ output_movsisf(rtx insn, rtx operands[], AS2 (ldi,%C0,hlo8(%1)) CR_TAB AS2 (ldi,%D0,hhi8(%1))); } - - if (GET_CODE (src) == CONST_INT) - { - const char *const clr_op0 = - AVR_HAVE_MOVW ? (AS1 (clr,%A0) CR_TAB - AS1 (clr,%B0) CR_TAB - AS2 (movw,%C0,%A0)) - : (AS1 (clr,%A0) CR_TAB - AS1 (clr,%B0) CR_TAB - AS1 (clr,%C0) CR_TAB - AS1 (clr,%D0)); - - if (src == const0_rtx) /* mov r,L */ - { - *l = AVR_HAVE_MOVW ? 3 : 4; - return clr_op0; - } - else if (src == const1_rtx) - { - if (!real_l) - output_asm_insn (clr_op0, operands); - *l = AVR_HAVE_MOVW ? 4 : 5; - return AS1 (inc,%A0); - } - else if (src == constm1_rtx) - { - /* Immediate constants -1 to any register */ - if (AVR_HAVE_MOVW) - { - *l = 4; - return (AS1 (clr,%A0) CR_TAB - AS1 (dec,%A0) CR_TAB - AS2 (mov,%B0,%A0) CR_TAB - AS2 (movw,%C0,%A0)); - } - *l = 5; - return (AS1 (clr,%A0) CR_TAB - AS1 (dec,%A0) CR_TAB - AS2 (mov,%B0,%A0) CR_TAB - AS2 (mov,%C0,%A0) CR_TAB - AS2 (mov,%D0,%A0)); - } - else - { - int bit_nr = exact_log2 (INTVAL (src)); - - if (bit_nr >= 0) - { - *l = AVR_HAVE_MOVW ? 5 : 6; - if (!real_l) - { - output_asm_insn (clr_op0, operands); - output_asm_insn ("set", operands); - } - if (!real_l) - avr_output_bld (operands, bit_nr); - - return ""; - } - } - } - /* Last resort, better than loading from memory. 
*/ *l = 10; return (AS2 (mov,__tmp_reg__,r31) CR_TAB @@ -2735,7 +2704,7 @@ output_movsisf(rtx insn, rtx operands[], { const char *templ; - if (src == const0_rtx) + if (src == CONST0_RTX (GET_MODE (dest))) operands[1] = zero_reg_rtx; templ = out_movsi_mr_r (insn, operands, real_l); @@ -4612,7 +4581,7 @@ adjust_insn_length (rtx insn, int len) break; case SImode: case SFmode: - output_movsisf (insn, op, &len); + output_movsisf (insn, op, NULL_RTX, &len); break; default: break; @@ -4683,7 +4652,7 @@ adjust_insn_length (rtx insn, int len) break; case SImode: case SFmode: - output_reload_insisf (insn, op, &len); + output_reload_insisf (insn, op, XEXP (op[2], 0), &len); break; default: break; @@ -6212,53 +6181,199 @@ output_reload_inhi (rtx insn ATTRIBUTE_U } +/* Reload a SI or SF compile time constant (OP[1]) into a GPR (OP[0]). + CLOBBER_REG is a QI clobber reg needed to move vast majority of consts + into a NO_LD_REGS. If CLOBBER_REG is NULL_RTX we either don't need a + clobber reg or have to cook one up. + + LEN == NULL: Output instructions. + + LEN != NULL: Output nothing. Increment *LEN by number of words occupied + by the insns printed. + + Return "". */ + const char * -output_reload_insisf (rtx insn ATTRIBUTE_UNUSED, rtx *operands, int *len) +output_reload_insisf (rtx insn ATTRIBUTE_UNUSED, + rtx *op, rtx clobber_reg, int *len) { - rtx src = operands[1]; - int cnst = (GET_CODE (src) == CONST_INT); + rtx src = op[1]; + rtx dest = op[0]; + rtx xval, xdest[4]; + int ival[4]; + int clobber_val = 1234; + bool cooked_clobber_p = false; + bool set_p = false; + unsigned int n; + enum machine_mode mode = GET_MODE (dest); + + gcc_assert (REG_P (dest)); if (len) + *len = 0; + + /* (REG:SI 14) is special: It's neither in LD_REGS nor in NO_LD_REGS + but has some subregs that are in LD_REGS. Use the MSB (REG:QI 17). 
*/ + + if (14 == REGNO (dest)) { - if (cnst) - *len = 4 + ((INTVAL (src) & 0xff) != 0) - + ((INTVAL (src) & 0xff00) != 0) - + ((INTVAL (src) & 0xff0000) != 0) - + ((INTVAL (src) & 0xff000000) != 0); - else - *len = 8; - - return ""; + clobber_reg = gen_rtx_REG (QImode, 17); } - if (cnst && ((INTVAL (src) & 0xff) == 0)) - output_asm_insn (AS2 (mov, %A0, __zero_reg__), operands); - else - { - output_asm_insn (AS2 (ldi, %2, lo8(%1)), operands); - output_asm_insn (AS2 (mov, %A0, %2), operands); - } - if (cnst && ((INTVAL (src) & 0xff00) == 0)) - output_asm_insn (AS2 (mov, %B0, __zero_reg__), operands); - else + /* We might need a clobber reg but don't have one. Look at the value + to be loaded more closely. A clobber is only needed if it contains + a byte that is neither 0, -1 or a power of 2. */ + + if (NULL_RTX == clobber_reg + && !test_hard_reg_class (LD_REGS, dest)) { - output_asm_insn (AS2 (ldi, %2, hi8(%1)), operands); - output_asm_insn (AS2 (mov, %B0, %2), operands); + for (n = 0; n < GET_MODE_SIZE (mode); n++) + { + xval = simplify_gen_subreg (QImode, src, mode, n); + + if (!(const0_rtx == xval + || constm1_rtx == xval + || single_one_operand (xval, QImode))) + { + /* We have no clobber reg but need one. Cook one up. + That's cheaper than loading from constant pool. */ + + cooked_clobber_p = true; + clobber_reg = gen_rtx_REG (QImode, 31); + avr_asm_len ("mov __tmp_reg__,%0", &clobber_reg, len, 1); + break; + } + } } - if (cnst && ((INTVAL (src) & 0xff0000) == 0)) - output_asm_insn (AS2 (mov, %C0, __zero_reg__), operands); - else + + /* Now start filling DEST from LSB to MSB. */ + + for (n = 0; n < GET_MODE_SIZE (mode); n++) { - output_asm_insn (AS2 (ldi, %2, hlo8(%1)), operands); - output_asm_insn (AS2 (mov, %C0, %2), operands); + bool done_byte = false; + unsigned int j; + rtx xop[3]; + + /* Crop the n-th sub-byte. 
*/ + + xval = simplify_gen_subreg (QImode, src, mode, n); + xdest[n] = simplify_gen_subreg (QImode, dest, mode, n); + ival[n] = INTVAL (xval); + + /* Look if we can reuse the low word by means of MOVW. */ + + if (n == 2 + && AVR_HAVE_MOVW) + { + rtx lo16 = simplify_gen_subreg (HImode, src, mode, 0); + rtx hi16 = simplify_gen_subreg (HImode, src, mode, 2); + + if (INTVAL (lo16) == INTVAL (hi16)) + { + avr_asm_len ("movw %C0,%A0", &op[0], len, 1); + break; + } + } + + /* Use CLR to zero a value so that cc0 is set as expected + for zero. */ + + if (ival[n] == 0) + { + avr_asm_len ("clr %0", &xdest[n], len, 1); + continue; + } + + if (clobber_val == ival[n] + && REGNO (clobber_reg) == REGNO (xdest[n])) + { + continue; + } + + /* LD_REGS can use LDI to move a constant value */ + + if (test_hard_reg_class (LD_REGS, xdest[n])) + { + xop[0] = xdest[n]; + xop[1] = xval; + avr_asm_len ("ldi %0,lo8(%1)", xop, len, 1); + continue; + } + + /* Try to reuse value already loaded in some lower byte. */ + + for (j = 0; j < n; j++) + if (ival[j] == ival[n]) + { + xop[0] = xdest[n]; + xop[1] = xdest[j]; + + avr_asm_len ("mov %0,%1", xop, len, 1); + done_byte = true; + break; + } + + if (done_byte) + continue; + + /* Need no clobber reg for -1: Use CLR/DEC */ + + if (-1 == ival[n]) + { + avr_asm_len ("clr %0" CR_TAB + "dec %0", &xdest[n], len, 2); + continue; + } + + /* Use T flag or INC to manage powers of 2 if we have + no clobber reg. */ + + if (NULL_RTX == clobber_reg + && single_one_operand (xval, QImode)) + { + if (1 == ival[n]) + { + avr_asm_len ("clr %0" CR_TAB + "inc %0", &xdest[n], len, 2); + continue; + } + + xop[0] = xdest[n]; + xop[1] = GEN_INT (exact_log2 (ival[n] & GET_MODE_MASK (QImode))); + + gcc_assert (constm1_rtx != xop[1]); + + if (!set_p) + { + set_p = true; + avr_asm_len ("set", xop, len, 1); + } + + avr_asm_len ("clr %0" CR_TAB + "bld %0,%1", xop, len, 2); + continue; + } + + /* We actually need the LD_REGS clobber reg. 
*/ + + gcc_assert (NULL_RTX != clobber_reg); + + xop[0] = xdest[n]; + xop[1] = xval; + xop[2] = clobber_reg; + clobber_val = ival[n]; + + avr_asm_len ("ldi %2,lo8(%1)" CR_TAB + "mov %0,%2", xop, len, 2); } - if (cnst && ((INTVAL (src) & 0xff000000) == 0)) - output_asm_insn (AS2 (mov, %D0, __zero_reg__), operands); - else + + /* If we cooked up a clobber reg above, restore it. */ + + if (cooked_clobber_p) { - output_asm_insn (AS2 (ldi, %2, hhi8(%1)), operands); - output_asm_insn (AS2 (mov, %D0, %2), operands); + avr_asm_len ("mov %0,__tmp_reg__", &clobber_reg, len, 1); } + return ""; }