Message ID | CAAe5K+U957NHUd8r4Mvy36-E_285ESW265OJ9j67Jm-7nR583g@mail.gmail.com |
---|---|
State | New |
Headers | show |
The patch is good for google branches for now while waiting for upstream review. David On Sun, Dec 4, 2011 at 10:26 PM, Teresa Johnson <tejohnson@google.com> wrote: > Latest patch which improves the efficiency as described below is > included here. Boostrapped and checked again with > x86_64-unknown-linux-gnu. Could someone review? > > Thanks, > Teresa > > 2011-12-04 Teresa Johnson <tejohnson@google.com> > > * loop-unroll.c (decide_unroll_constant_iterations): Call loop > unroll target hook. > * config/i386/i386.c (ix86_loop_unroll_adjust): New function. > (TARGET_LOOP_UNROLL_ADJUST): Define hook for x86. > > =================================================================== > --- loop-unroll.c (revision 181902) > +++ loop-unroll.c (working copy) > @@ -547,6 +547,9 @@ decide_unroll_constant_iterations (struc > if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) > nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); > > + if (targetm.loop_unroll_adjust) > + nunroll = targetm.loop_unroll_adjust (nunroll, loop); > + > /* Skip big loops. */ > if (nunroll <= 1) > { > Index: config/i386/i386.c > =================================================================== > --- config/i386/i386.c (revision 181902) > +++ config/i386/i386.c (working copy) > @@ -60,6 +60,7 @@ along with GCC; see the file COPYING3. > #include "fibheap.h" > #include "opts.h" > #include "diagnostic.h" > +#include "cfgloop.h" > > enum upper_128bits_state > { > @@ -38370,6 +38371,82 @@ ix86_autovectorize_vector_sizes (void) > return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0; > } > > +/* If LOOP contains a possible LCP stalling instruction on corei7, > + calculate new number of times to unroll instead of NUNROLL so that > + the unrolled loop will still likely fit into the loop stream detector. */ > +static unsigned > +ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop) > +{ > + basic_block *body, bb; > + unsigned i; > + rtx insn; > + bool found = false; > + unsigned newunroll; > + > + if (ix86_tune != PROCESSOR_COREI7_64 && > + ix86_tune != PROCESSOR_COREI7_32) > + return nunroll; > + > + /* Look for instructions that store a constant into HImode (16-bit) > + memory. These require a length-changing prefix and on corei7 are > + prone to LCP stalls. These stalls can be avoided if the loop > + is streamed from the loop stream detector. */ > + body = get_loop_body (loop); > + for (i = 0; i < loop->num_nodes; i++) > + { > + bb = body[i]; > + > + FOR_BB_INSNS (bb, insn) > + { > + rtx set_expr, dest; > + set_expr = single_set (insn); > + if (!set_expr) > + continue; > + > + dest = SET_DEST (set_expr); > + > + /* Don't reduce unroll factor in loops with floating point > + computation, which tend to benefit more heavily from > + larger unroll factors and are less likely to bottleneck > + at the decoder. */ > + if (FLOAT_MODE_P (GET_MODE (dest))) > + { > + free (body); > + return nunroll; > + } > + > + if (!found > + && GET_MODE (dest) == HImode > + && CONST_INT_P (SET_SRC (set_expr)) > + && MEM_P (dest)) > + { > + found = true; > + /* Keep walking loop body to look for FP computations above. */ > + } > + } > + } > + free (body); > + > + if (!found) > + return nunroll; > + > + if (dump_file) > + { > + fprintf (dump_file, > + ";; Loop contains HImode store of const (possible LCP > stalls),\n"); > + fprintf (dump_file, > + " reduce unroll factor to fit into Loop Stream Detector\n"); > + } > + > + /* On corei7 the loop stream detector can hold 28 uops, so > + don't allow unrolling to exceed that many instructions. */ > + newunroll = 28 / loop->av_ninsns; > + if (newunroll < nunroll) > + return newunroll; > + > + return nunroll; > +} > + > /* Initialize the GCC target structure. */ > #undef TARGET_RETURN_IN_MEMORY > #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory > @@ -38685,6 +38762,9 @@ ix86_autovectorize_vector_sizes (void) > #define TARGET_INIT_LIBFUNCS darwin_rename_builtins > #endif > > +#undef TARGET_LOOP_UNROLL_ADJUST > +#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust > + > struct gcc_target targetm = TARGET_INITIALIZER; > > > #include "gt-i386.h" > > > On Fri, Dec 2, 2011 at 12:11 PM, Teresa Johnson <tejohnson@google.com> wrote: >> On Fri, Dec 2, 2011 at 11:36 AM, Andi Kleen <andi@firstfloor.org> wrote: >>> Teresa Johnson <tejohnson@google.com> writes: >>> >>> Interesting optimization. I would be concerned a little bit >>> about compile time, does it make a measurable difference? >> >> I haven't measured compile time explicitly, but I don't it should, >> especially after I address your efficiency suggestion (see below), >> since it will just have one pass over the instructions in innermost >> loops. >> >>> >>>> The attached patch detects loops containing instructions that tend to >>>> incur high LCP (loop changing prefix) stalls on Core i7, and limits >>>> their unroll factor to try to keep the unrolled loop body small enough >>>> to fit in the Corei7's loop stream detector which can hide LCP stalls >>>> in loops. >>> >>> One more optimization would be to optimize padding for this case, >>> the LSD only works if the loop is not spread over too many 32 byte >>> chunks. So if you detect the loop is LSD worthy always pad to 32 bytes >>> at the beginning. >> >> Thanks for the suggestion, I will look at doing that in follow-on tuning. >> >>> >>>> To do this I leveraged the existing TARGET_LOOP_UNROLL_ADJUST target >>>> hook, which was previously only defined for s390. I added one >>>> additional call to this target hook, when unrolling for constant trip >>>> count loops. Previously it was only called for runtime computed trip >>>> counts. Andreas, can you comment on the effect for s390 of this >>>> additional call of the target hook, since I can't measure that? >>> >>> On Sandy-Bridge there's also the decoded icache which is much larger, >>> but also has some restrictions. It would be nice if this optimization >>> was general enough to handle this case too. >>> >>> In general I notice that the tree loop unroller is too aggressive recently: >>> a lot of loops that probably shouldn't be unrolled (like containing >>> function calls etc.) are unrolled at -O3. So probably a better cost >>> model for unrolling would make sense anyways. >> >> These are both good suggestions, and I will look into Sandy Bridge >> heuristics in follow-on work, since we will need to tune for that >> soon. >> >>> >>>> + /* Don't reduce unroll factor in loops with floating point >>>> + computation, which tend to benefit more heavily from >>>> + larger unroll factors and are less likely to bottleneck >>>> + at the decoder. */ >>>> + has_FP = loop_has_FP_comp(loop); >>> >>> You could cache the loop body and pass it in here. >> >> That is a great idea, and in fact, I think I will do away with this >> separate function completely for this patch. I can more efficiently >> look for the FP computation while I am looking for the half word >> stores. I'll do that and send a follow up with the new patch. >> >>> >>> Patch looks reasonable to me, but I cannot approve. >> >> Thanks! >> Teresa >> >>> >>> -Andi >>> >>> -- >>> ak@linux.intel.com -- Speaking for myself only >> >> >> >> -- >> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413 > > > > -- > Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
Ping - now that stage 1 is open, could someone review? Thanks, Teresa On Sun, Dec 4, 2011 at 10:26 PM, Teresa Johnson <tejohnson@google.com> wrote: > Latest patch which improves the efficiency as described below is > included here. Boostrapped and checked again with > x86_64-unknown-linux-gnu. Could someone review? > > Thanks, > Teresa > > 2011-12-04 Teresa Johnson <tejohnson@google.com> > > * loop-unroll.c (decide_unroll_constant_iterations): Call loop > unroll target hook. > * config/i386/i386.c (ix86_loop_unroll_adjust): New function. > (TARGET_LOOP_UNROLL_ADJUST): Define hook for x86. > > =================================================================== > --- loop-unroll.c (revision 181902) > +++ loop-unroll.c (working copy) > @@ -547,6 +547,9 @@ decide_unroll_constant_iterations (struc > if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) > nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); > > + if (targetm.loop_unroll_adjust) > + nunroll = targetm.loop_unroll_adjust (nunroll, loop); > + > /* Skip big loops. */ > if (nunroll <= 1) > { > Index: config/i386/i386.c > =================================================================== > --- config/i386/i386.c (revision 181902) > +++ config/i386/i386.c (working copy) > @@ -60,6 +60,7 @@ along with GCC; see the file COPYING3. > #include "fibheap.h" > #include "opts.h" > #include "diagnostic.h" > +#include "cfgloop.h" > > enum upper_128bits_state > { > @@ -38370,6 +38371,82 @@ ix86_autovectorize_vector_sizes (void) > return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0; > } > > +/* If LOOP contains a possible LCP stalling instruction on corei7, > + calculate new number of times to unroll instead of NUNROLL so that > + the unrolled loop will still likely fit into the loop stream detector. */ > +static unsigned > +ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop) > +{ > + basic_block *body, bb; > + unsigned i; > + rtx insn; > + bool found = false; > + unsigned newunroll; > + > + if (ix86_tune != PROCESSOR_COREI7_64 && > + ix86_tune != PROCESSOR_COREI7_32) > + return nunroll; > + > + /* Look for instructions that store a constant into HImode (16-bit) > + memory. These require a length-changing prefix and on corei7 are > + prone to LCP stalls. These stalls can be avoided if the loop > + is streamed from the loop stream detector. */ > + body = get_loop_body (loop); > + for (i = 0; i < loop->num_nodes; i++) > + { > + bb = body[i]; > + > + FOR_BB_INSNS (bb, insn) > + { > + rtx set_expr, dest; > + set_expr = single_set (insn); > + if (!set_expr) > + continue; > + > + dest = SET_DEST (set_expr); > + > + /* Don't reduce unroll factor in loops with floating point > + computation, which tend to benefit more heavily from > + larger unroll factors and are less likely to bottleneck > + at the decoder. */ > + if (FLOAT_MODE_P (GET_MODE (dest))) > + { > + free (body); > + return nunroll; > + } > + > + if (!found > + && GET_MODE (dest) == HImode > + && CONST_INT_P (SET_SRC (set_expr)) > + && MEM_P (dest)) > + { > + found = true; > + /* Keep walking loop body to look for FP computations above. */ > + } > + } > + } > + free (body); > + > + if (!found) > + return nunroll; > + > + if (dump_file) > + { > + fprintf (dump_file, > + ";; Loop contains HImode store of const (possible LCP > stalls),\n"); > + fprintf (dump_file, > + " reduce unroll factor to fit into Loop Stream Detector\n"); > + } > + > + /* On corei7 the loop stream detector can hold 28 uops, so > + don't allow unrolling to exceed that many instructions. */ > + newunroll = 28 / loop->av_ninsns; > + if (newunroll < nunroll) > + return newunroll; > + > + return nunroll; > +} > + > /* Initialize the GCC target structure. */ > #undef TARGET_RETURN_IN_MEMORY > #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory > @@ -38685,6 +38762,9 @@ ix86_autovectorize_vector_sizes (void) > #define TARGET_INIT_LIBFUNCS darwin_rename_builtins > #endif > > +#undef TARGET_LOOP_UNROLL_ADJUST > +#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust > + > struct gcc_target targetm = TARGET_INITIALIZER; > > > #include "gt-i386.h" > > > On Fri, Dec 2, 2011 at 12:11 PM, Teresa Johnson <tejohnson@google.com> wrote: >> On Fri, Dec 2, 2011 at 11:36 AM, Andi Kleen <andi@firstfloor.org> wrote: >>> Teresa Johnson <tejohnson@google.com> writes: >>> >>> Interesting optimization. I would be concerned a little bit >>> about compile time, does it make a measurable difference? >> >> I haven't measured compile time explicitly, but I don't it should, >> especially after I address your efficiency suggestion (see below), >> since it will just have one pass over the instructions in innermost >> loops. >> >>> >>>> The attached patch detects loops containing instructions that tend to >>>> incur high LCP (loop changing prefix) stalls on Core i7, and limits >>>> their unroll factor to try to keep the unrolled loop body small enough >>>> to fit in the Corei7's loop stream detector which can hide LCP stalls >>>> in loops. >>> >>> One more optimization would be to optimize padding for this case, >>> the LSD only works if the loop is not spread over too many 32 byte >>> chunks. So if you detect the loop is LSD worthy always pad to 32 bytes >>> at the beginning. >> >> Thanks for the suggestion, I will look at doing that in follow-on tuning. >> >>> >>>> To do this I leveraged the existing TARGET_LOOP_UNROLL_ADJUST target >>>> hook, which was previously only defined for s390. I added one >>>> additional call to this target hook, when unrolling for constant trip >>>> count loops. Previously it was only called for runtime computed trip >>>> counts. Andreas, can you comment on the effect for s390 of this >>>> additional call of the target hook, since I can't measure that? >>> >>> On Sandy-Bridge there's also the decoded icache which is much larger, >>> but also has some restrictions. It would be nice if this optimization >>> was general enough to handle this case too. >>> >>> In general I notice that the tree loop unroller is too aggressive recently: >>> a lot of loops that probably shouldn't be unrolled (like containing >>> function calls etc.) are unrolled at -O3. So probably a better cost >>> model for unrolling would make sense anyways. >> >> These are both good suggestions, and I will look into Sandy Bridge >> heuristics in follow-on work, since we will need to tune for that >> soon. >> >>> >>>> + /* Don't reduce unroll factor in loops with floating point >>>> + computation, which tend to benefit more heavily from >>>> + larger unroll factors and are less likely to bottleneck >>>> + at the decoder. */ >>>> + has_FP = loop_has_FP_comp(loop); >>> >>> You could cache the loop body and pass it in here. >> >> That is a great idea, and in fact, I think I will do away with this >> separate function completely for this patch. I can more efficiently >> look for the FP computation while I am looking for the half word >> stores. I'll do that and send a follow up with the new patch. >> >>> >>> Patch looks reasonable to me, but I cannot approve. >> >> Thanks! >> Teresa >> >>> >>> -Andi >>> >>> -- >>> ak@linux.intel.com -- Speaking for myself only >> >> >> >> -- >> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413 > > > > -- > Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
Pulling this one back as I have a better solution, patch coming shortly. Thanks, Teresa On Fri, Mar 16, 2012 at 3:33 PM, Teresa Johnson <tejohnson@google.com> wrote: > > Ping - now that stage 1 is open, could someone review? > > Thanks, > Teresa > > On Sun, Dec 4, 2011 at 10:26 PM, Teresa Johnson <tejohnson@google.com> wrote: > > Latest patch which improves the efficiency as described below is > > included here. Boostrapped and checked again with > > x86_64-unknown-linux-gnu. Could someone review? > > > > Thanks, > > Teresa > > > > 2011-12-04 Teresa Johnson <tejohnson@google.com> > > > > * loop-unroll.c (decide_unroll_constant_iterations): Call loop > > unroll target hook. > > * config/i386/i386.c (ix86_loop_unroll_adjust): New function. > > (TARGET_LOOP_UNROLL_ADJUST): Define hook for x86. > > > > =================================================================== > > --- loop-unroll.c (revision 181902) > > +++ loop-unroll.c (working copy) > > @@ -547,6 +547,9 @@ decide_unroll_constant_iterations (struc > > if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) > > nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); > > > > + if (targetm.loop_unroll_adjust) > > + nunroll = targetm.loop_unroll_adjust (nunroll, loop); > > + > > /* Skip big loops. */ > > if (nunroll <= 1) > > { > > Index: config/i386/i386.c > > =================================================================== > > --- config/i386/i386.c (revision 181902) > > +++ config/i386/i386.c (working copy) > > @@ -60,6 +60,7 @@ along with GCC; see the file COPYING3. > > #include "fibheap.h" > > #include "opts.h" > > #include "diagnostic.h" > > +#include "cfgloop.h" > > > > enum upper_128bits_state > > { > > @@ -38370,6 +38371,82 @@ ix86_autovectorize_vector_sizes (void) > > return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0; > > } > > > > +/* If LOOP contains a possible LCP stalling instruction on corei7, > > + calculate new number of times to unroll instead of NUNROLL so that > > + the unrolled loop will still likely fit into the loop stream detector. */ > > +static unsigned > > +ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop) > > +{ > > + basic_block *body, bb; > > + unsigned i; > > + rtx insn; > > + bool found = false; > > + unsigned newunroll; > > + > > + if (ix86_tune != PROCESSOR_COREI7_64 && > > + ix86_tune != PROCESSOR_COREI7_32) > > + return nunroll; > > + > > + /* Look for instructions that store a constant into HImode (16-bit) > > + memory. These require a length-changing prefix and on corei7 are > > + prone to LCP stalls. These stalls can be avoided if the loop > > + is streamed from the loop stream detector. */ > > + body = get_loop_body (loop); > > + for (i = 0; i < loop->num_nodes; i++) > > + { > > + bb = body[i]; > > + > > + FOR_BB_INSNS (bb, insn) > > + { > > + rtx set_expr, dest; > > + set_expr = single_set (insn); > > + if (!set_expr) > > + continue; > > + > > + dest = SET_DEST (set_expr); > > + > > + /* Don't reduce unroll factor in loops with floating point > > + computation, which tend to benefit more heavily from > > + larger unroll factors and are less likely to bottleneck > > + at the decoder. */ > > + if (FLOAT_MODE_P (GET_MODE (dest))) > > + { > > + free (body); > > + return nunroll; > > + } > > + > > + if (!found > > + && GET_MODE (dest) == HImode > > + && CONST_INT_P (SET_SRC (set_expr)) > > + && MEM_P (dest)) > > + { > > + found = true; > > + /* Keep walking loop body to look for FP computations above. */ > > + } > > + } > > + } > > + free (body); > > + > > + if (!found) > > + return nunroll; > > + > > + if (dump_file) > > + { > > + fprintf (dump_file, > > + ";; Loop contains HImode store of const (possible LCP > > stalls),\n"); > > + fprintf (dump_file, > > + " reduce unroll factor to fit into Loop Stream Detector\n"); > > + } > > + > > + /* On corei7 the loop stream detector can hold 28 uops, so > > + don't allow unrolling to exceed that many instructions. */ > > + newunroll = 28 / loop->av_ninsns; > > + if (newunroll < nunroll) > > + return newunroll; > > + > > + return nunroll; > > +} > > + > > /* Initialize the GCC target structure. */ > > #undef TARGET_RETURN_IN_MEMORY > > #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory > > @@ -38685,6 +38762,9 @@ ix86_autovectorize_vector_sizes (void) > > #define TARGET_INIT_LIBFUNCS darwin_rename_builtins > > #endif > > > > +#undef TARGET_LOOP_UNROLL_ADJUST > > +#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust > > + > > struct gcc_target targetm = TARGET_INITIALIZER; > > > > > > #include "gt-i386.h" > > > > > > On Fri, Dec 2, 2011 at 12:11 PM, Teresa Johnson <tejohnson@google.com> wrote: > >> On Fri, Dec 2, 2011 at 11:36 AM, Andi Kleen <andi@firstfloor.org> wrote: > >>> Teresa Johnson <tejohnson@google.com> writes: > >>> > >>> Interesting optimization. I would be concerned a little bit > >>> about compile time, does it make a measurable difference? > >> > >> I haven't measured compile time explicitly, but I don't it should, > >> especially after I address your efficiency suggestion (see below), > >> since it will just have one pass over the instructions in innermost > >> loops. > >> > >>> > >>>> The attached patch detects loops containing instructions that tend to > >>>> incur high LCP (loop changing prefix) stalls on Core i7, and limits > >>>> their unroll factor to try to keep the unrolled loop body small enough > >>>> to fit in the Corei7's loop stream detector which can hide LCP stalls > >>>> in loops. > >>> > >>> One more optimization would be to optimize padding for this case, > >>> the LSD only works if the loop is not spread over too many 32 byte > >>> chunks. So if you detect the loop is LSD worthy always pad to 32 bytes > >>> at the beginning. > >> > >> Thanks for the suggestion, I will look at doing that in follow-on tuning. > >> > >>> > >>>> To do this I leveraged the existing TARGET_LOOP_UNROLL_ADJUST target > >>>> hook, which was previously only defined for s390. I added one > >>>> additional call to this target hook, when unrolling for constant trip > >>>> count loops. Previously it was only called for runtime computed trip > >>>> counts. Andreas, can you comment on the effect for s390 of this > >>>> additional call of the target hook, since I can't measure that? > >>> > >>> On Sandy-Bridge there's also the decoded icache which is much larger, > >>> but also has some restrictions. It would be nice if this optimization > >>> was general enough to handle this case too. > >>> > >>> In general I notice that the tree loop unroller is too aggressive recently: > >>> a lot of loops that probably shouldn't be unrolled (like containing > >>> function calls etc.) are unrolled at -O3. So probably a better cost > >>> model for unrolling would make sense anyways. > >> > >> These are both good suggestions, and I will look into Sandy Bridge > >> heuristics in follow-on work, since we will need to tune for that > >> soon. > >> > >>> > >>>> + /* Don't reduce unroll factor in loops with floating point > >>>> + computation, which tend to benefit more heavily from > >>>> + larger unroll factors and are less likely to bottleneck > >>>> + at the decoder. */ > >>>> + has_FP = loop_has_FP_comp(loop); > >>> > >>> You could cache the loop body and pass it in here. > >> > >> That is a great idea, and in fact, I think I will do away with this > >> separate function completely for this patch. I can more efficiently > >> look for the FP computation while I am looking for the half word > >> stores. I'll do that and send a follow up with the new patch. > >> > >>> > >>> Patch looks reasonable to me, but I cannot approve. > >> > >> Thanks! > >> Teresa > >> > >>> > >>> -Andi > >>> > >>> -- > >>> ak@linux.intel.com -- Speaking for myself only > >> > >> > >> > >> -- > >> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413 > > > > > > > > -- > > Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413 > > > > -- > Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413 -- Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
=================================================================== --- loop-unroll.c (revision 181902) +++ loop-unroll.c (working copy) @@ -547,6 +547,9 @@ decide_unroll_constant_iterations (struc if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); + if (targetm.loop_unroll_adjust) + nunroll = targetm.loop_unroll_adjust (nunroll, loop); + /* Skip big loops. */ if (nunroll <= 1) { Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 181902) +++ config/i386/i386.c (working copy) @@ -60,6 +60,7 @@ along with GCC; see the file COPYING3. #include "fibheap.h" #include "opts.h" #include "diagnostic.h" +#include "cfgloop.h" enum upper_128bits_state { @@ -38370,6 +38371,82 @@ ix86_autovectorize_vector_sizes (void) return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0; } +/* If LOOP contains a possible LCP stalling instruction on corei7, + calculate new number of times to unroll instead of NUNROLL so that + the unrolled loop will still likely fit into the loop stream detector. */ +static unsigned +ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop) +{ + basic_block *body, bb; + unsigned i; + rtx insn; + bool found = false; + unsigned newunroll; + + if (ix86_tune != PROCESSOR_COREI7_64 && + ix86_tune != PROCESSOR_COREI7_32) + return nunroll; + + /* Look for instructions that store a constant into HImode (16-bit) + memory. These require a length-changing prefix and on corei7 are + prone to LCP stalls. These stalls can be avoided if the loop + is streamed from the loop stream detector. */ + body = get_loop_body (loop); + for (i = 0; i < loop->num_nodes; i++) + { + bb = body[i]; + + FOR_BB_INSNS (bb, insn) + { + rtx set_expr, dest; + set_expr = single_set (insn); + if (!set_expr) + continue; + + dest = SET_DEST (set_expr); + + /* Don't reduce unroll factor in loops with floating point + computation, which tend to benefit more heavily from + larger unroll factors and are less likely to bottleneck + at the decoder. */ + if (FLOAT_MODE_P (GET_MODE (dest))) + { + free (body); + return nunroll; + } + + if (!found + && GET_MODE (dest) == HImode + && CONST_INT_P (SET_SRC (set_expr)) + && MEM_P (dest)) + { + found = true; + /* Keep walking loop body to look for FP computations above. */ + } + } + } + free (body); + + if (!found) + return nunroll; + + if (dump_file) + { + fprintf (dump_file, + ";; Loop contains HImode store of const (possible LCP stalls),\n"); + fprintf (dump_file, + " reduce unroll factor to fit into Loop Stream Detector\n"); + } + + /* On corei7 the loop stream detector can hold 28 uops, so + don't allow unrolling to exceed that many instructions. */ + newunroll = 28 / loop->av_ninsns; + if (newunroll < nunroll) + return newunroll; + + return nunroll; +} + /* Initialize the GCC target structure. */ #undef TARGET_RETURN_IN_MEMORY