Message ID | 20121003142206.9zuttl646c808so4-nzlynne@webmail.spamcop.net |
---|---|
State | New |
Headers | show |
On Wed, Oct 3, 2012 at 8:22 PM, Joern Rennecke <joern.rennecke@embecosm.com> wrote: > The ARCompact architecture has some pipelining features that result in > the vanilla branch shortening not always converging. > > Moreover, there are some short, complex branch instructions with very short > offsets; replacing them with a multi-insn sequence when the offset doesn't > reach makes the code significantly longer. Thus, when starting branch > shortening with pessimistic assumptions, the short branches are often not > used because of the pessimistic branch length causing the offsets going out > of range. > This problem can be avoided when starting with a low estimate and working > upwards. However, that makes the incidence of infinite branch shortening > cycles higher, and also makes it impossible to just break out after some > iteration count. > > To address these issues, I've made the generator programs recognize the > optional lock_length attribute. > > To quote from the documentation added for this feature: > > If you define the `lock_length' attribute, branch shortening will work > the other way round: it starts out assuming minimum instruction lengths > and iterates from there. In addition, the value of the `lock_length' > attribute does not decrease across iterations, and the value computed > for the `length' attribute will be no smaller than that of the > `lock_length' attribute. I miss a few things in this description: - what is the value of lock_length supposed to be? From the "lock" prefix it sounds like it is something unchanging, maybe even constant, thus a maximum? - the length attribute still needs to be specified when lock_length is? how do they relate? Is lock_length always smaller / bigger than length? - what happens if you have patterns with lock_length and patterns without? - what patterns does lock_length apply to? In general optimistically attacking this kind of problem should be always better - did you try simply switching this for all targets? It shouldn't be slower and the only thing you need to guarantee is that during iteration you never make insn-lenghts smaller again. Richard. > bootstrapped and regression tested on i686-pc-linux-gnu > > 2012-10-03 Joern Rennecke <joern.rennecke@embecosm.com> > > * final.c (get_attr_length_1): Use direct recursion rather than > calling get_attr_length. > (get_attr_lock_length): New function. > (INSN_VARIABLE_LENGTH_P): Define. > (shorten_branches): Take HAVE_ATTR_lock_length into account. > Don't overwrite non-delay slot insn lengths with the lengths of > delay slot insns with same uid. > * genattrtab.c (lock_length_str): New variable. > (make_length_attrs): New parameter base. > (main): Initialize lock_length_str. > Generate lock_lengths attributes. > * genattr.c (gen_attr): Emit declarations for lock_length attribute > related functions. > * doc/md.texi (node Insn Lengths): Document lock_length attribute. > > Index: doc/md.texi > =================================================================== > --- doc/md.texi (revision 192036) > +++ doc/md.texi (working copy) > @@ -8004,6 +8004,20 @@ (define_insn "jump" > (const_int 6)))]) > @end smallexample > > +@cindex lock_length > +Usually, branch shortening is done assuming the worst case (i.e. longest) > +lengths, and then iterating (if optimizing) to smaller lengths till > +no further changed occur. This does not work so well for architectures > +that have very small minimum offsets and considerable jumps in instruction > +lengths. > + > +If you define the @code{lock_length} attribute, branch shortening will > +work the other way round: it starts out assuming minimum instruction > +lengths and iterates from there. In addition, the value of the > +@code{lock_length} attribute does not decrease across iterations, and > +the value computed for the @code{length} attribute will be no smaller > +than that of the @code{lock_length} attribute. > + > @end ifset > @ifset INTERNALS > @node Constant Attributes > Index: final.c > =================================================================== > --- final.c (revision 192036) > +++ final.c (working copy) > @@ -312,6 +312,7 @@ dbr_sequence_length (void) > `insn_current_length'. */ > > static int *insn_lengths; > +static char *uid_lock_length; > > VEC(int,heap) *insn_addresses_; > > @@ -447,6 +448,20 @@ get_attr_length (rtx insn) > return get_attr_length_1 (insn, insn_default_length); > } > > +#ifdef HAVE_ATTR_lock_length > +int > +get_attr_lock_length (rtx insn) > +{ > + if (uid_lock_length && insn_lengths_max_uid > INSN_UID (insn)) > + return uid_lock_length[INSN_UID (insn)]; > + return get_attr_length_1 (insn, insn_min_lock_length); > +} > +#define INSN_VARIABLE_LENGTH_P(INSN) \ > + (insn_variable_length_p (INSN) || insn_variable_lock_length_p (INSN)) > +#else > +#define INSN_VARIABLE_LENGTH_P(INSN) (insn_variable_length_p (INSN)) > +#endif > + > /* Obtain the current length of an insn. If branch shortening has been > done, > get its actual length. Otherwise, get its minimum length. */ > int > @@ -865,6 +880,7 @@ shorten_branches (rtx first ATTRIBUTE_UN > rtx body; > int uid; > rtx align_tab[MAX_CODE_ALIGN]; > + int (*length_fun) (rtx); > > #endif > > @@ -875,6 +891,10 @@ shorten_branches (rtx first ATTRIBUTE_UN > free (uid_shuid); > > uid_shuid = XNEWVEC (int, max_uid); > +#ifdef HAVE_ATTR_lock_length > + uid_lock_length = XNEWVEC (char, max_uid); > + memset (uid_lock_length, 0, max_uid); > +#endif > > if (max_labelno != max_label_num ()) > { > @@ -1067,6 +1087,10 @@ shorten_branches (rtx first ATTRIBUTE_UN > #endif /* CASE_VECTOR_SHORTEN_MODE */ > > /* Compute initial lengths, addresses, and varying flags for each insn. > */ > + length_fun = insn_default_length; > +#ifdef HAVE_ATTR_lock_length > + length_fun = insn_min_length; > +#endif > for (insn_current_address = 0, insn = first; > insn != 0; > insn_current_address += insn_lengths[uid], insn = NEXT_INSN (insn)) > @@ -1131,26 +1155,32 @@ shorten_branches (rtx first ATTRIBUTE_UN > inner_length = (asm_insn_count (PATTERN (inner_insn)) > * insn_default_length (inner_insn)); > else > - inner_length = insn_default_length (inner_insn); > + inner_length = length_fun (inner_insn); > > insn_lengths[inner_uid] = inner_length; > if (const_delay_slots) > { > if ((varying_length[inner_uid] > - = insn_variable_length_p (inner_insn)) != 0) > + = INSN_VARIABLE_LENGTH_P (inner_insn)) != 0) > varying_length[uid] = 1; > INSN_ADDRESSES (inner_uid) = (insn_current_address > + insn_lengths[uid]); > } > else > - varying_length[inner_uid] = 0; > + { > + /* We'd need to make this code a bit more complicated > + to properly support non-const-delay-slots with the > + lock_length attribute. */ > + gcc_assert (length_fun == &insn_default_length); > + varying_length[inner_uid] = 0; > + } > insn_lengths[uid] += inner_length; > } > } > else if (GET_CODE (body) != USE && GET_CODE (body) != CLOBBER) > { > - insn_lengths[uid] = insn_default_length (insn); > - varying_length[uid] = insn_variable_length_p (insn); > + insn_lengths[uid] = length_fun (insn); > + varying_length[uid] = INSN_VARIABLE_LENGTH_P (insn); > } > > /* If needed, do any adjustment. */ > @@ -1220,6 +1250,7 @@ shorten_branches (rtx first ATTRIBUTE_UN > rtx prev; > int rel_align = 0; > addr_diff_vec_flags flags; > + enum machine_mode vec_mode; > > /* Avoid automatic aggregate initialization. */ > flags = ADDR_DIFF_VEC_FLAGS (body); > @@ -1298,9 +1329,15 @@ shorten_branches (rtx first ATTRIBUTE_UN > else > max_addr += align_fuzz (max_lab, rel_lab, 0, 0); > } > - PUT_MODE (body, CASE_VECTOR_SHORTEN_MODE (min_addr - rel_addr, > - max_addr - rel_addr, > - body)); > + vec_mode = CASE_VECTOR_SHORTEN_MODE (min_addr - rel_addr, > + max_addr - rel_addr, > body); > + if (!uid_lock_length > + || !uid_lock_length[uid] > + || (GET_MODE_SIZE (vec_mode) > + >= GET_MODE_SIZE (GET_MODE (body)))) > + PUT_MODE (body, vec_mode); > + if (uid_lock_length) > + uid_lock_length[uid] = 1; > if (JUMP_TABLES_IN_TEXT_SECTION > || readonly_data_section == text_section) > { > @@ -1360,18 +1397,44 @@ shorten_branches (rtx first ATTRIBUTE_UN > else > inner_length = insn_current_length (inner_insn); > > - if (inner_length != insn_lengths[inner_uid]) > + /* We can't record lengths of delay slot insns. */ > + if (i == 0 && inner_length != insn_lengths[inner_uid]) > { > - insn_lengths[inner_uid] = inner_length; > - something_changed = 1; > +#ifdef HAVE_ATTR_lock_length > + int lock_length = insn_current_lock_length > (inner_insn); > + > + if (lock_length > uid_lock_length[inner_uid]) > + uid_lock_length[inner_uid] = lock_length; > + else > + lock_length = uid_lock_length[inner_uid]; > + if (inner_length < lock_length) > + inner_length = lock_length; > + if (inner_length != insn_lengths[inner_uid]) > +#endif > + { > + insn_lengths[inner_uid] = inner_length; > + something_changed = 1; > + } > } > - insn_current_address += insn_lengths[inner_uid]; > + insn_current_address += inner_length; > new_length += inner_length; > } > } > else > { > new_length = insn_current_length (insn); > +#ifdef HAVE_ATTR_lock_length > + { > + int lock_length = insn_current_lock_length (insn); > + > + if (lock_length > uid_lock_length[uid]) > + uid_lock_length[uid] = lock_length; > + else > + lock_length = uid_lock_length[uid]; > + if (new_length < lock_length) > + new_length = lock_length; > + } > +#endif > insn_current_address += new_length; > } > > @@ -1388,12 +1451,17 @@ shorten_branches (rtx first ATTRIBUTE_UN > something_changed = 1; > } > } > +#ifndef HAVE_ATTR_lock_length > /* For a non-optimizing compile, do only a single pass. */ > if (!optimize) > break; > +#endif > } > > free (varying_length); > + if (uid_lock_length) > + free (uid_lock_length); > + uid_lock_length = 0; > > #endif /* HAVE_ATTR_length */ > } > Index: genattr.c > =================================================================== > --- genattr.c (revision 192036) > +++ genattr.c (working copy) > @@ -71,6 +71,10 @@ extern int insn_default_length (rtx);\n\ > extern int insn_min_length (rtx);\n\ > extern int insn_variable_length_p (rtx);\n\ > extern int insn_current_length (rtx);\n\n\ > +extern int insn_default_lock_length (rtx);\n\ > +extern int insn_min_lock_length (rtx);\n\ > +extern int insn_variable_lock_length_p (rtx);\n\ > +extern int insn_current_lock_length (rtx);\n\n\ > #include \"insn-addr.h\"\n"); > } > } > Index: genattrtab.c > =================================================================== > --- genattrtab.c (revision 192036) > +++ genattrtab.c (working copy) > @@ -242,6 +242,7 @@ struct attr_value_list **insn_code_value > > static const char *alternative_name; > static const char *length_str; > +static const char *lock_length_str; > static const char *delay_type_str; > static const char *delay_1_0_str; > static const char *num_delay_slots_str; > @@ -1541,14 +1542,14 @@ substitute_address (rtx exp, rtx (*no_ad > */ > > static void > -make_length_attrs (void) > +make_length_attrs (const char **base) > { > static const char *new_names[] = > { > - "*insn_default_length", > - "*insn_min_length", > - "*insn_variable_length_p", > - "*insn_current_length" > + "*insn_default_%s", > + "*insn_min_%s", > + "*insn_variable_%s_p", > + "*insn_current_%s" > }; > static rtx (*const no_address_fn[]) (rtx) > = {identity_fn,identity_fn, zero_fn, zero_fn}; > @@ -1561,7 +1562,7 @@ make_length_attrs (void) > > /* See if length attribute is defined. If so, it must be numeric. Make > it special so we don't output anything for it. */ > - length_attr = find_attr (&length_str, 0); > + length_attr = find_attr (base, 0); > if (length_attr == 0) > return; > > @@ -1574,11 +1575,14 @@ make_length_attrs (void) > /* Make each new attribute, in turn. */ > for (i = 0; i < ARRAY_SIZE (new_names); i++) > { > - make_internal_attr (new_names[i], > + const char *p = attr_printf (strlen (new_names[i]) - 2 + strlen > (*base), > + new_names[i], *base); > + > + make_internal_attr (p, > substitute_address > (length_attr->default_val->value, > no_address_fn[i], > address_fn[i]), > ATTR_NONE); > - new_attr = find_attr (&new_names[i], 0); > + new_attr = find_attr (&p, 0); > for (av = length_attr->first_value; av; av = av->next) > for (ie = av->first_insn; ie; ie = ie->next) > { > @@ -5178,6 +5182,7 @@ main (int argc, char **argv) > > alternative_name = DEF_ATTR_STRING ("alternative"); > length_str = DEF_ATTR_STRING ("length"); > + lock_length_str = DEF_ATTR_STRING ("lock_length"); > delay_type_str = DEF_ATTR_STRING ("*delay_type"); > delay_1_0_str = DEF_ATTR_STRING ("*delay_1_0"); > num_delay_slots_str = DEF_ATTR_STRING ("*num_delay_slots"); > @@ -5274,7 +5279,8 @@ main (int argc, char **argv) > fill_attr (attr); > > /* Construct extra attributes for `length'. */ > - make_length_attrs (); > + make_length_attrs (&length_str); > + make_length_attrs (&lock_length_str); > > /* Perform any possible optimizations to speed up compilation. */ > optimize_attrs (); >
Index: doc/md.texi =================================================================== --- doc/md.texi (revision 192036) +++ doc/md.texi (working copy) @@ -8004,6 +8004,20 @@ (define_insn "jump" (const_int 6)))]) @end smallexample +@cindex lock_length +Usually, branch shortening is done assuming the worst case (i.e. longest) +lengths, and then iterating (if optimizing) to smaller lengths till +no further changed occur. This does not work so well for architectures +that have very small minimum offsets and considerable jumps in instruction +lengths. + +If you define the @code{lock_length} attribute, branch shortening will +work the other way round: it starts out assuming minimum instruction +lengths and iterates from there. In addition, the value of the +@code{lock_length} attribute does not decrease across iterations, and +the value computed for the @code{length} attribute will be no smaller +than that of the @code{lock_length} attribute. + @end ifset @ifset INTERNALS @node Constant Attributes Index: final.c =================================================================== --- final.c (revision 192036) +++ final.c (working copy) @@ -312,6 +312,7 @@ dbr_sequence_length (void) `insn_current_length'. */ static int *insn_lengths; +static char *uid_lock_length; VEC(int,heap) *insn_addresses_; @@ -447,6 +448,20 @@ get_attr_length (rtx insn) return get_attr_length_1 (insn, insn_default_length); } +#ifdef HAVE_ATTR_lock_length +int +get_attr_lock_length (rtx insn) +{ + if (uid_lock_length && insn_lengths_max_uid > INSN_UID (insn)) + return uid_lock_length[INSN_UID (insn)]; + return get_attr_length_1 (insn, insn_min_lock_length); +} +#define INSN_VARIABLE_LENGTH_P(INSN) \ + (insn_variable_length_p (INSN) || insn_variable_lock_length_p (INSN)) +#else +#define INSN_VARIABLE_LENGTH_P(INSN) (insn_variable_length_p (INSN)) +#endif + /* Obtain the current length of an insn. If branch shortening has been done, get its actual length. Otherwise, get its minimum length. */ int @@ -865,6 +880,7 @@ shorten_branches (rtx first ATTRIBUTE_UN rtx body; int uid; rtx align_tab[MAX_CODE_ALIGN]; + int (*length_fun) (rtx); #endif @@ -875,6 +891,10 @@ shorten_branches (rtx first ATTRIBUTE_UN free (uid_shuid); uid_shuid = XNEWVEC (int, max_uid); +#ifdef HAVE_ATTR_lock_length + uid_lock_length = XNEWVEC (char, max_uid); + memset (uid_lock_length, 0, max_uid); +#endif if (max_labelno != max_label_num ()) { @@ -1067,6 +1087,10 @@ shorten_branches (rtx first ATTRIBUTE_UN #endif /* CASE_VECTOR_SHORTEN_MODE */ /* Compute initial lengths, addresses, and varying flags for each insn. */ + length_fun = insn_default_length; +#ifdef HAVE_ATTR_lock_length + length_fun = insn_min_length; +#endif for (insn_current_address = 0, insn = first; insn != 0; insn_current_address += insn_lengths[uid], insn = NEXT_INSN (insn)) @@ -1131,26 +1155,32 @@ shorten_branches (rtx first ATTRIBUTE_UN inner_length = (asm_insn_count (PATTERN (inner_insn)) * insn_default_length (inner_insn)); else - inner_length = insn_default_length (inner_insn); + inner_length = length_fun (inner_insn); insn_lengths[inner_uid] = inner_length; if (const_delay_slots) { if ((varying_length[inner_uid] - = insn_variable_length_p (inner_insn)) != 0) + = INSN_VARIABLE_LENGTH_P (inner_insn)) != 0) varying_length[uid] = 1; INSN_ADDRESSES (inner_uid) = (insn_current_address + insn_lengths[uid]); } else - varying_length[inner_uid] = 0; + { + /* We'd need to make this code a bit more complicated + to properly support non-const-delay-slots with the + lock_length attribute. */ + gcc_assert (length_fun == &insn_default_length); + varying_length[inner_uid] = 0; + } insn_lengths[uid] += inner_length; } } else if (GET_CODE (body) != USE && GET_CODE (body) != CLOBBER) { - insn_lengths[uid] = insn_default_length (insn); - varying_length[uid] = insn_variable_length_p (insn); + insn_lengths[uid] = length_fun (insn); + varying_length[uid] = INSN_VARIABLE_LENGTH_P (insn); } /* If needed, do any adjustment. */ @@ -1220,6 +1250,7 @@ shorten_branches (rtx first ATTRIBUTE_UN rtx prev; int rel_align = 0; addr_diff_vec_flags flags; + enum machine_mode vec_mode; /* Avoid automatic aggregate initialization. */ flags = ADDR_DIFF_VEC_FLAGS (body); @@ -1298,9 +1329,15 @@ shorten_branches (rtx first ATTRIBUTE_UN else max_addr += align_fuzz (max_lab, rel_lab, 0, 0); } - PUT_MODE (body, CASE_VECTOR_SHORTEN_MODE (min_addr - rel_addr, - max_addr - rel_addr, - body)); + vec_mode = CASE_VECTOR_SHORTEN_MODE (min_addr - rel_addr, + max_addr - rel_addr, body); + if (!uid_lock_length + || !uid_lock_length[uid] + || (GET_MODE_SIZE (vec_mode) + >= GET_MODE_SIZE (GET_MODE (body)))) + PUT_MODE (body, vec_mode); + if (uid_lock_length) + uid_lock_length[uid] = 1; if (JUMP_TABLES_IN_TEXT_SECTION || readonly_data_section == text_section) { @@ -1360,18 +1397,44 @@ shorten_branches (rtx first ATTRIBUTE_UN else inner_length = insn_current_length (inner_insn); - if (inner_length != insn_lengths[inner_uid]) + /* We can't record lengths of delay slot insns. */ + if (i == 0 && inner_length != insn_lengths[inner_uid]) { - insn_lengths[inner_uid] = inner_length; - something_changed = 1; +#ifdef HAVE_ATTR_lock_length + int lock_length = insn_current_lock_length (inner_insn); + + if (lock_length > uid_lock_length[inner_uid]) + uid_lock_length[inner_uid] = lock_length; + else + lock_length = uid_lock_length[inner_uid]; + if (inner_length < lock_length) + inner_length = lock_length; + if (inner_length != insn_lengths[inner_uid]) +#endif + { + insn_lengths[inner_uid] = inner_length; + something_changed = 1; + } } - insn_current_address += insn_lengths[inner_uid]; + insn_current_address += inner_length; new_length += inner_length; } } else { new_length = insn_current_length (insn); +#ifdef HAVE_ATTR_lock_length + { + int lock_length = insn_current_lock_length (insn); + + if (lock_length > uid_lock_length[uid]) + uid_lock_length[uid] = lock_length; + else + lock_length = uid_lock_length[uid]; + if (new_length < lock_length) + new_length = lock_length; + } +#endif insn_current_address += new_length; } @@ -1388,12 +1451,17 @@ shorten_branches (rtx first ATTRIBUTE_UN something_changed = 1; } } +#ifndef HAVE_ATTR_lock_length /* For a non-optimizing compile, do only a single pass. */ if (!optimize) break; +#endif } free (varying_length); + if (uid_lock_length) + free (uid_lock_length); + uid_lock_length = 0; #endif /* HAVE_ATTR_length */ } Index: genattr.c =================================================================== --- genattr.c (revision 192036) +++ genattr.c (working copy) @@ -71,6 +71,10 @@ extern int insn_default_length (rtx);\n\ extern int insn_min_length (rtx);\n\ extern int insn_variable_length_p (rtx);\n\ extern int insn_current_length (rtx);\n\n\ +extern int insn_default_lock_length (rtx);\n\ +extern int insn_min_lock_length (rtx);\n\ +extern int insn_variable_lock_length_p (rtx);\n\ +extern int insn_current_lock_length (rtx);\n\n\ #include \"insn-addr.h\"\n"); } } Index: genattrtab.c =================================================================== --- genattrtab.c (revision 192036) +++ genattrtab.c (working copy) @@ -242,6 +242,7 @@ struct attr_value_list **insn_code_value static const char *alternative_name; static const char *length_str; +static const char *lock_length_str; static const char *delay_type_str; static const char *delay_1_0_str; static const char *num_delay_slots_str; @@ -1541,14 +1542,14 @@ substitute_address (rtx exp, rtx (*no_ad */ static void -make_length_attrs (void) +make_length_attrs (const char **base) { static const char *new_names[] = { - "*insn_default_length", - "*insn_min_length", - "*insn_variable_length_p", - "*insn_current_length" + "*insn_default_%s", + "*insn_min_%s", + "*insn_variable_%s_p", + "*insn_current_%s" }; static rtx (*const no_address_fn[]) (rtx) = {identity_fn,identity_fn, zero_fn, zero_fn}; @@ -1561,7 +1562,7 @@ make_length_attrs (void) /* See if length attribute is defined. If so, it must be numeric. Make it special so we don't output anything for it. */ - length_attr = find_attr (&length_str, 0); + length_attr = find_attr (base, 0); if (length_attr == 0) return; @@ -1574,11 +1575,14 @@ make_length_attrs (void) /* Make each new attribute, in turn. */ for (i = 0; i < ARRAY_SIZE (new_names); i++) { - make_internal_attr (new_names[i], + const char *p = attr_printf (strlen (new_names[i]) - 2 + strlen (*base), + new_names[i], *base); + + make_internal_attr (p, substitute_address (length_attr->default_val->value, no_address_fn[i], address_fn[i]), ATTR_NONE); - new_attr = find_attr (&new_names[i], 0); + new_attr = find_attr (&p, 0); for (av = length_attr->first_value; av; av = av->next) for (ie = av->first_insn; ie; ie = ie->next) { @@ -5178,6 +5182,7 @@ main (int argc, char **argv) alternative_name = DEF_ATTR_STRING ("alternative"); length_str = DEF_ATTR_STRING ("length"); + lock_length_str = DEF_ATTR_STRING ("lock_length"); delay_type_str = DEF_ATTR_STRING ("*delay_type"); delay_1_0_str = DEF_ATTR_STRING ("*delay_1_0"); num_delay_slots_str = DEF_ATTR_STRING ("*num_delay_slots"); @@ -5274,7 +5279,8 @@ main (int argc, char **argv) fill_attr (attr); /* Construct extra attributes for `length'. */ - make_length_attrs (); + make_length_attrs (&length_str); + make_length_attrs (&lock_length_str); /* Perform any possible optimizations to speed up compilation. */ optimize_attrs ();