diff mbox

[i386] Limit unroll factor for certain loops on Corei7

Message ID CAAe5K+U957NHUd8r4Mvy36-E_285ESW265OJ9j67Jm-7nR583g@mail.gmail.com
State New
Headers show

Commit Message

Teresa Johnson Dec. 5, 2011, 6:26 a.m. UTC
Latest patch which improves the efficiency as described below is
included here. Boostrapped and checked again with
x86_64-unknown-linux-gnu. Could someone review?

Thanks,
Teresa

2011-12-04  Teresa Johnson  <tejohnson@google.com>

	* loop-unroll.c (decide_unroll_constant_iterations): Call loop
	unroll target hook.
	* config/i386/i386.c (ix86_loop_unroll_adjust): New function.
	(TARGET_LOOP_UNROLL_ADJUST): Define hook for x86.

 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
@@ -38685,6 +38762,9 @@ ix86_autovectorize_vector_sizes (void)
 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
 #endif

+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
+
 struct gcc_target targetm = TARGET_INITIALIZER;


 #include "gt-i386.h"


On Fri, Dec 2, 2011 at 12:11 PM, Teresa Johnson <tejohnson@google.com> wrote:
> On Fri, Dec 2, 2011 at 11:36 AM, Andi Kleen <andi@firstfloor.org> wrote:
>> Teresa Johnson <tejohnson@google.com> writes:
>>
>> Interesting optimization. I would be concerned a little bit
>> about compile time, does it make a measurable difference?
>
> I haven't measured compile time explicitly, but I don't it should,
> especially after I address your efficiency suggestion (see below),
> since it will just have one pass over the instructions in innermost
> loops.
>
>>
>>> The attached patch detects loops containing instructions that tend to
>>> incur high LCP (loop changing prefix) stalls on Core i7, and limits
>>> their unroll factor to try to keep the unrolled loop body small enough
>>> to fit in the Corei7's loop stream detector which can hide LCP stalls
>>> in loops.
>>
>> One more optimization would be to optimize padding for this case,
>> the LSD only works if the loop is not spread over too many 32 byte
>> chunks. So if you detect the loop is LSD worthy always pad to 32 bytes
>> at the beginning.
>
> Thanks for the suggestion, I will look at doing that in follow-on tuning.
>
>>
>>> To do this I leveraged the existing TARGET_LOOP_UNROLL_ADJUST target
>>> hook, which was previously only defined for s390. I added one
>>> additional call to this target hook, when unrolling for constant trip
>>> count loops. Previously it was only called for runtime computed trip
>>> counts. Andreas, can you comment on the effect for s390 of this
>>> additional call of the target hook, since I can't measure that?
>>
>> On Sandy-Bridge there's also the decoded icache which is much larger,
>> but also has some restrictions. It would be nice if this optimization
>> was general enough to handle this case too.
>>
>> In general I notice that the tree loop unroller is too aggressive recently:
>> a lot of loops that probably shouldn't be unrolled (like containing
>> function calls etc.) are unrolled at -O3. So probably a better cost
>> model for unrolling would make sense anyways.
>
> These are both good suggestions, and I will look into Sandy Bridge
> heuristics in follow-on work, since we will need to tune for that
> soon.
>
>>
>>> +  /* Don't reduce unroll factor in loops with floating point
>>> +     computation, which tend to benefit more heavily from
>>> +     larger unroll factors and are less likely to bottleneck
>>> +     at the decoder. */
>>> +  has_FP = loop_has_FP_comp(loop);
>>
>> You could cache the loop body and pass it in here.
>
> That is a great idea, and in fact, I think I will do away with this
> separate function completely for this patch. I can more efficiently
> look for the FP computation while I am looking for the half word
> stores. I'll do that and send a follow up with the new patch.
>
>>
>> Patch looks reasonable to me, but I cannot approve.
>
> Thanks!
> Teresa
>
>>
>> -Andi
>>
>> --
>> ak@linux.intel.com -- Speaking for myself only
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413

Comments

Xinliang David Li Dec. 9, 2011, 10:47 p.m. UTC | #1
The patch is good for google branches for now while waiting for upstream review.

David

On Sun, Dec 4, 2011 at 10:26 PM, Teresa Johnson <tejohnson@google.com> wrote:
> Latest patch which improves the efficiency as described below is
> included here. Boostrapped and checked again with
> x86_64-unknown-linux-gnu. Could someone review?
>
> Thanks,
> Teresa
>
> 2011-12-04  Teresa Johnson  <tejohnson@google.com>
>
>        * loop-unroll.c (decide_unroll_constant_iterations): Call loop
>        unroll target hook.
>        * config/i386/i386.c (ix86_loop_unroll_adjust): New function.
>        (TARGET_LOOP_UNROLL_ADJUST): Define hook for x86.
>
> ===================================================================
> --- loop-unroll.c       (revision 181902)
> +++ loop-unroll.c       (working copy)
> @@ -547,6 +547,9 @@ decide_unroll_constant_iterations (struc
>   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
>     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
>
> +  if (targetm.loop_unroll_adjust)
> +    nunroll = targetm.loop_unroll_adjust (nunroll, loop);
> +
>   /* Skip big loops.  */
>   if (nunroll <= 1)
>     {
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 181902)
> +++ config/i386/i386.c  (working copy)
> @@ -60,6 +60,7 @@ along with GCC; see the file COPYING3.
>  #include "fibheap.h"
>  #include "opts.h"
>  #include "diagnostic.h"
> +#include "cfgloop.h"
>
>  enum upper_128bits_state
>  {
> @@ -38370,6 +38371,82 @@ ix86_autovectorize_vector_sizes (void)
>   return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
>  }
>
> +/* If LOOP contains a possible LCP stalling instruction on corei7,
> +   calculate new number of times to unroll instead of NUNROLL so that
> +   the unrolled loop will still likely fit into the loop stream detector. */
> +static unsigned
> +ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
> +{
> +  basic_block *body, bb;
> +  unsigned i;
> +  rtx insn;
> +  bool found = false;
> +  unsigned newunroll;
> +
> +  if (ix86_tune != PROCESSOR_COREI7_64 &&
> +      ix86_tune != PROCESSOR_COREI7_32)
> +    return nunroll;
> +
> +  /* Look for instructions that store a constant into HImode (16-bit)
> +     memory. These require a length-changing prefix and on corei7 are
> +     prone to LCP stalls. These stalls can be avoided if the loop
> +     is streamed from the loop stream detector. */
> +  body = get_loop_body (loop);
> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      bb = body[i];
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          rtx set_expr, dest;
> +          set_expr = single_set (insn);
> +          if (!set_expr)
> +            continue;
> +
> +          dest = SET_DEST (set_expr);
> +
> +          /* Don't reduce unroll factor in loops with floating point
> +             computation, which tend to benefit more heavily from
> +             larger unroll factors and are less likely to bottleneck
> +             at the decoder. */
> +          if (FLOAT_MODE_P (GET_MODE (dest)))
> +          {
> +            free (body);
> +            return nunroll;
> +          }
> +
> +          if (!found
> +              && GET_MODE (dest) == HImode
> +              && CONST_INT_P (SET_SRC (set_expr))
> +              && MEM_P (dest))
> +            {
> +              found = true;
> +              /* Keep walking loop body to look for FP computations above. */
> +            }
> +        }
> +    }
> +  free (body);
> +
> +  if (!found)
> +    return nunroll;
> +
> +  if (dump_file)
> +    {
> +      fprintf (dump_file,
> +               ";; Loop contains HImode store of const (possible LCP
> stalls),\n");
> +      fprintf (dump_file,
> +               "   reduce unroll factor to fit into Loop Stream Detector\n");
> +    }
> +
> +  /* On corei7 the loop stream detector can hold 28 uops, so
> +     don't allow unrolling to exceed that many instructions. */
> +  newunroll = 28 / loop->av_ninsns;
> +  if (newunroll < nunroll)
> +    return newunroll;
> +
> +  return nunroll;
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_RETURN_IN_MEMORY
>  #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
> @@ -38685,6 +38762,9 @@ ix86_autovectorize_vector_sizes (void)
>  #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
>  #endif
>
> +#undef TARGET_LOOP_UNROLL_ADJUST
> +#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>
>
>  #include "gt-i386.h"
>
>
> On Fri, Dec 2, 2011 at 12:11 PM, Teresa Johnson <tejohnson@google.com> wrote:
>> On Fri, Dec 2, 2011 at 11:36 AM, Andi Kleen <andi@firstfloor.org> wrote:
>>> Teresa Johnson <tejohnson@google.com> writes:
>>>
>>> Interesting optimization. I would be concerned a little bit
>>> about compile time, does it make a measurable difference?
>>
>> I haven't measured compile time explicitly, but I don't it should,
>> especially after I address your efficiency suggestion (see below),
>> since it will just have one pass over the instructions in innermost
>> loops.
>>
>>>
>>>> The attached patch detects loops containing instructions that tend to
>>>> incur high LCP (loop changing prefix) stalls on Core i7, and limits
>>>> their unroll factor to try to keep the unrolled loop body small enough
>>>> to fit in the Corei7's loop stream detector which can hide LCP stalls
>>>> in loops.
>>>
>>> One more optimization would be to optimize padding for this case,
>>> the LSD only works if the loop is not spread over too many 32 byte
>>> chunks. So if you detect the loop is LSD worthy always pad to 32 bytes
>>> at the beginning.
>>
>> Thanks for the suggestion, I will look at doing that in follow-on tuning.
>>
>>>
>>>> To do this I leveraged the existing TARGET_LOOP_UNROLL_ADJUST target
>>>> hook, which was previously only defined for s390. I added one
>>>> additional call to this target hook, when unrolling for constant trip
>>>> count loops. Previously it was only called for runtime computed trip
>>>> counts. Andreas, can you comment on the effect for s390 of this
>>>> additional call of the target hook, since I can't measure that?
>>>
>>> On Sandy-Bridge there's also the decoded icache which is much larger,
>>> but also has some restrictions. It would be nice if this optimization
>>> was general enough to handle this case too.
>>>
>>> In general I notice that the tree loop unroller is too aggressive recently:
>>> a lot of loops that probably shouldn't be unrolled (like containing
>>> function calls etc.) are unrolled at -O3. So probably a better cost
>>> model for unrolling would make sense anyways.
>>
>> These are both good suggestions, and I will look into Sandy Bridge
>> heuristics in follow-on work, since we will need to tune for that
>> soon.
>>
>>>
>>>> +  /* Don't reduce unroll factor in loops with floating point
>>>> +     computation, which tend to benefit more heavily from
>>>> +     larger unroll factors and are less likely to bottleneck
>>>> +     at the decoder. */
>>>> +  has_FP = loop_has_FP_comp(loop);
>>>
>>> You could cache the loop body and pass it in here.
>>
>> That is a great idea, and in fact, I think I will do away with this
>> separate function completely for this patch. I can more efficiently
>> look for the FP computation while I am looking for the half word
>> stores. I'll do that and send a follow up with the new patch.
>>
>>>
>>> Patch looks reasonable to me, but I cannot approve.
>>
>> Thanks!
>> Teresa
>>
>>>
>>> -Andi
>>>
>>> --
>>> ak@linux.intel.com -- Speaking for myself only
>>
>>
>>
>> --
>> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
Teresa Johnson March 16, 2012, 10:33 p.m. UTC | #2
Ping - now that stage 1 is open, could someone review?

Thanks,
Teresa

On Sun, Dec 4, 2011 at 10:26 PM, Teresa Johnson <tejohnson@google.com> wrote:
> Latest patch which improves the efficiency as described below is
> included here. Boostrapped and checked again with
> x86_64-unknown-linux-gnu. Could someone review?
>
> Thanks,
> Teresa
>
> 2011-12-04  Teresa Johnson  <tejohnson@google.com>
>
>        * loop-unroll.c (decide_unroll_constant_iterations): Call loop
>        unroll target hook.
>        * config/i386/i386.c (ix86_loop_unroll_adjust): New function.
>        (TARGET_LOOP_UNROLL_ADJUST): Define hook for x86.
>
> ===================================================================
> --- loop-unroll.c       (revision 181902)
> +++ loop-unroll.c       (working copy)
> @@ -547,6 +547,9 @@ decide_unroll_constant_iterations (struc
>   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
>     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
>
> +  if (targetm.loop_unroll_adjust)
> +    nunroll = targetm.loop_unroll_adjust (nunroll, loop);
> +
>   /* Skip big loops.  */
>   if (nunroll <= 1)
>     {
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 181902)
> +++ config/i386/i386.c  (working copy)
> @@ -60,6 +60,7 @@ along with GCC; see the file COPYING3.
>  #include "fibheap.h"
>  #include "opts.h"
>  #include "diagnostic.h"
> +#include "cfgloop.h"
>
>  enum upper_128bits_state
>  {
> @@ -38370,6 +38371,82 @@ ix86_autovectorize_vector_sizes (void)
>   return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
>  }
>
> +/* If LOOP contains a possible LCP stalling instruction on corei7,
> +   calculate new number of times to unroll instead of NUNROLL so that
> +   the unrolled loop will still likely fit into the loop stream detector. */
> +static unsigned
> +ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
> +{
> +  basic_block *body, bb;
> +  unsigned i;
> +  rtx insn;
> +  bool found = false;
> +  unsigned newunroll;
> +
> +  if (ix86_tune != PROCESSOR_COREI7_64 &&
> +      ix86_tune != PROCESSOR_COREI7_32)
> +    return nunroll;
> +
> +  /* Look for instructions that store a constant into HImode (16-bit)
> +     memory. These require a length-changing prefix and on corei7 are
> +     prone to LCP stalls. These stalls can be avoided if the loop
> +     is streamed from the loop stream detector. */
> +  body = get_loop_body (loop);
> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      bb = body[i];
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          rtx set_expr, dest;
> +          set_expr = single_set (insn);
> +          if (!set_expr)
> +            continue;
> +
> +          dest = SET_DEST (set_expr);
> +
> +          /* Don't reduce unroll factor in loops with floating point
> +             computation, which tend to benefit more heavily from
> +             larger unroll factors and are less likely to bottleneck
> +             at the decoder. */
> +          if (FLOAT_MODE_P (GET_MODE (dest)))
> +          {
> +            free (body);
> +            return nunroll;
> +          }
> +
> +          if (!found
> +              && GET_MODE (dest) == HImode
> +              && CONST_INT_P (SET_SRC (set_expr))
> +              && MEM_P (dest))
> +            {
> +              found = true;
> +              /* Keep walking loop body to look for FP computations above. */
> +            }
> +        }
> +    }
> +  free (body);
> +
> +  if (!found)
> +    return nunroll;
> +
> +  if (dump_file)
> +    {
> +      fprintf (dump_file,
> +               ";; Loop contains HImode store of const (possible LCP
> stalls),\n");
> +      fprintf (dump_file,
> +               "   reduce unroll factor to fit into Loop Stream Detector\n");
> +    }
> +
> +  /* On corei7 the loop stream detector can hold 28 uops, so
> +     don't allow unrolling to exceed that many instructions. */
> +  newunroll = 28 / loop->av_ninsns;
> +  if (newunroll < nunroll)
> +    return newunroll;
> +
> +  return nunroll;
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_RETURN_IN_MEMORY
>  #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
> @@ -38685,6 +38762,9 @@ ix86_autovectorize_vector_sizes (void)
>  #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
>  #endif
>
> +#undef TARGET_LOOP_UNROLL_ADJUST
> +#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>
>
>  #include "gt-i386.h"
>
>
> On Fri, Dec 2, 2011 at 12:11 PM, Teresa Johnson <tejohnson@google.com> wrote:
>> On Fri, Dec 2, 2011 at 11:36 AM, Andi Kleen <andi@firstfloor.org> wrote:
>>> Teresa Johnson <tejohnson@google.com> writes:
>>>
>>> Interesting optimization. I would be concerned a little bit
>>> about compile time, does it make a measurable difference?
>>
>> I haven't measured compile time explicitly, but I don't it should,
>> especially after I address your efficiency suggestion (see below),
>> since it will just have one pass over the instructions in innermost
>> loops.
>>
>>>
>>>> The attached patch detects loops containing instructions that tend to
>>>> incur high LCP (loop changing prefix) stalls on Core i7, and limits
>>>> their unroll factor to try to keep the unrolled loop body small enough
>>>> to fit in the Corei7's loop stream detector which can hide LCP stalls
>>>> in loops.
>>>
>>> One more optimization would be to optimize padding for this case,
>>> the LSD only works if the loop is not spread over too many 32 byte
>>> chunks. So if you detect the loop is LSD worthy always pad to 32 bytes
>>> at the beginning.
>>
>> Thanks for the suggestion, I will look at doing that in follow-on tuning.
>>
>>>
>>>> To do this I leveraged the existing TARGET_LOOP_UNROLL_ADJUST target
>>>> hook, which was previously only defined for s390. I added one
>>>> additional call to this target hook, when unrolling for constant trip
>>>> count loops. Previously it was only called for runtime computed trip
>>>> counts. Andreas, can you comment on the effect for s390 of this
>>>> additional call of the target hook, since I can't measure that?
>>>
>>> On Sandy-Bridge there's also the decoded icache which is much larger,
>>> but also has some restrictions. It would be nice if this optimization
>>> was general enough to handle this case too.
>>>
>>> In general I notice that the tree loop unroller is too aggressive recently:
>>> a lot of loops that probably shouldn't be unrolled (like containing
>>> function calls etc.) are unrolled at -O3. So probably a better cost
>>> model for unrolling would make sense anyways.
>>
>> These are both good suggestions, and I will look into Sandy Bridge
>> heuristics in follow-on work, since we will need to tune for that
>> soon.
>>
>>>
>>>> +  /* Don't reduce unroll factor in loops with floating point
>>>> +     computation, which tend to benefit more heavily from
>>>> +     larger unroll factors and are less likely to bottleneck
>>>> +     at the decoder. */
>>>> +  has_FP = loop_has_FP_comp(loop);
>>>
>>> You could cache the loop body and pass it in here.
>>
>> That is a great idea, and in fact, I think I will do away with this
>> separate function completely for this patch. I can more efficiently
>> look for the FP computation while I am looking for the half word
>> stores. I'll do that and send a follow up with the new patch.
>>
>>>
>>> Patch looks reasonable to me, but I cannot approve.
>>
>> Thanks!
>> Teresa
>>
>>>
>>> -Andi
>>>
>>> --
>>> ak@linux.intel.com -- Speaking for myself only
>>
>>
>>
>> --
>> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
Teresa Johnson March 30, 2012, 2:14 p.m. UTC | #3
Pulling this one back as I have a better solution, patch coming shortly.

Thanks,
Teresa

On Fri, Mar 16, 2012 at 3:33 PM, Teresa Johnson <tejohnson@google.com> wrote:
>
> Ping - now that stage 1 is open, could someone review?
>
> Thanks,
> Teresa
>
> On Sun, Dec 4, 2011 at 10:26 PM, Teresa Johnson <tejohnson@google.com> wrote:
> > Latest patch which improves the efficiency as described below is
> > included here. Boostrapped and checked again with
> > x86_64-unknown-linux-gnu. Could someone review?
> >
> > Thanks,
> > Teresa
> >
> > 2011-12-04  Teresa Johnson  <tejohnson@google.com>
> >
> >        * loop-unroll.c (decide_unroll_constant_iterations): Call loop
> >        unroll target hook.
> >        * config/i386/i386.c (ix86_loop_unroll_adjust): New function.
> >        (TARGET_LOOP_UNROLL_ADJUST): Define hook for x86.
> >
> > ===================================================================
> > --- loop-unroll.c       (revision 181902)
> > +++ loop-unroll.c       (working copy)
> > @@ -547,6 +547,9 @@ decide_unroll_constant_iterations (struc
> >   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
> >     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
> >
> > +  if (targetm.loop_unroll_adjust)
> > +    nunroll = targetm.loop_unroll_adjust (nunroll, loop);
> > +
> >   /* Skip big loops.  */
> >   if (nunroll <= 1)
> >     {
> > Index: config/i386/i386.c
> > ===================================================================
> > --- config/i386/i386.c  (revision 181902)
> > +++ config/i386/i386.c  (working copy)
> > @@ -60,6 +60,7 @@ along with GCC; see the file COPYING3.
> >  #include "fibheap.h"
> >  #include "opts.h"
> >  #include "diagnostic.h"
> > +#include "cfgloop.h"
> >
> >  enum upper_128bits_state
> >  {
> > @@ -38370,6 +38371,82 @@ ix86_autovectorize_vector_sizes (void)
> >   return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
> >  }
> >
> > +/* If LOOP contains a possible LCP stalling instruction on corei7,
> > +   calculate new number of times to unroll instead of NUNROLL so that
> > +   the unrolled loop will still likely fit into the loop stream detector. */
> > +static unsigned
> > +ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
> > +{
> > +  basic_block *body, bb;
> > +  unsigned i;
> > +  rtx insn;
> > +  bool found = false;
> > +  unsigned newunroll;
> > +
> > +  if (ix86_tune != PROCESSOR_COREI7_64 &&
> > +      ix86_tune != PROCESSOR_COREI7_32)
> > +    return nunroll;
> > +
> > +  /* Look for instructions that store a constant into HImode (16-bit)
> > +     memory. These require a length-changing prefix and on corei7 are
> > +     prone to LCP stalls. These stalls can be avoided if the loop
> > +     is streamed from the loop stream detector. */
> > +  body = get_loop_body (loop);
> > +  for (i = 0; i < loop->num_nodes; i++)
> > +    {
> > +      bb = body[i];
> > +
> > +      FOR_BB_INSNS (bb, insn)
> > +        {
> > +          rtx set_expr, dest;
> > +          set_expr = single_set (insn);
> > +          if (!set_expr)
> > +            continue;
> > +
> > +          dest = SET_DEST (set_expr);
> > +
> > +          /* Don't reduce unroll factor in loops with floating point
> > +             computation, which tend to benefit more heavily from
> > +             larger unroll factors and are less likely to bottleneck
> > +             at the decoder. */
> > +          if (FLOAT_MODE_P (GET_MODE (dest)))
> > +          {
> > +            free (body);
> > +            return nunroll;
> > +          }
> > +
> > +          if (!found
> > +              && GET_MODE (dest) == HImode
> > +              && CONST_INT_P (SET_SRC (set_expr))
> > +              && MEM_P (dest))
> > +            {
> > +              found = true;
> > +              /* Keep walking loop body to look for FP computations above. */
> > +            }
> > +        }
> > +    }
> > +  free (body);
> > +
> > +  if (!found)
> > +    return nunroll;
> > +
> > +  if (dump_file)
> > +    {
> > +      fprintf (dump_file,
> > +               ";; Loop contains HImode store of const (possible LCP
> > stalls),\n");
> > +      fprintf (dump_file,
> > +               "   reduce unroll factor to fit into Loop Stream Detector\n");
> > +    }
> > +
> > +  /* On corei7 the loop stream detector can hold 28 uops, so
> > +     don't allow unrolling to exceed that many instructions. */
> > +  newunroll = 28 / loop->av_ninsns;
> > +  if (newunroll < nunroll)
> > +    return newunroll;
> > +
> > +  return nunroll;
> > +}
> > +
> >  /* Initialize the GCC target structure.  */
> >  #undef TARGET_RETURN_IN_MEMORY
> >  #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
> > @@ -38685,6 +38762,9 @@ ix86_autovectorize_vector_sizes (void)
> >  #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
> >  #endif
> >
> > +#undef TARGET_LOOP_UNROLL_ADJUST
> > +#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
> > +
> >  struct gcc_target targetm = TARGET_INITIALIZER;
> >
> >
> >  #include "gt-i386.h"
> >
> >
> > On Fri, Dec 2, 2011 at 12:11 PM, Teresa Johnson <tejohnson@google.com> wrote:
> >> On Fri, Dec 2, 2011 at 11:36 AM, Andi Kleen <andi@firstfloor.org> wrote:
> >>> Teresa Johnson <tejohnson@google.com> writes:
> >>>
> >>> Interesting optimization. I would be concerned a little bit
> >>> about compile time, does it make a measurable difference?
> >>
> >> I haven't measured compile time explicitly, but I don't it should,
> >> especially after I address your efficiency suggestion (see below),
> >> since it will just have one pass over the instructions in innermost
> >> loops.
> >>
> >>>
> >>>> The attached patch detects loops containing instructions that tend to
> >>>> incur high LCP (loop changing prefix) stalls on Core i7, and limits
> >>>> their unroll factor to try to keep the unrolled loop body small enough
> >>>> to fit in the Corei7's loop stream detector which can hide LCP stalls
> >>>> in loops.
> >>>
> >>> One more optimization would be to optimize padding for this case,
> >>> the LSD only works if the loop is not spread over too many 32 byte
> >>> chunks. So if you detect the loop is LSD worthy always pad to 32 bytes
> >>> at the beginning.
> >>
> >> Thanks for the suggestion, I will look at doing that in follow-on tuning.
> >>
> >>>
> >>>> To do this I leveraged the existing TARGET_LOOP_UNROLL_ADJUST target
> >>>> hook, which was previously only defined for s390. I added one
> >>>> additional call to this target hook, when unrolling for constant trip
> >>>> count loops. Previously it was only called for runtime computed trip
> >>>> counts. Andreas, can you comment on the effect for s390 of this
> >>>> additional call of the target hook, since I can't measure that?
> >>>
> >>> On Sandy-Bridge there's also the decoded icache which is much larger,
> >>> but also has some restrictions. It would be nice if this optimization
> >>> was general enough to handle this case too.
> >>>
> >>> In general I notice that the tree loop unroller is too aggressive recently:
> >>> a lot of loops that probably shouldn't be unrolled (like containing
> >>> function calls etc.) are unrolled at -O3. So probably a better cost
> >>> model for unrolling would make sense anyways.
> >>
> >> These are both good suggestions, and I will look into Sandy Bridge
> >> heuristics in follow-on work, since we will need to tune for that
> >> soon.
> >>
> >>>
> >>>> +  /* Don't reduce unroll factor in loops with floating point
> >>>> +     computation, which tend to benefit more heavily from
> >>>> +     larger unroll factors and are less likely to bottleneck
> >>>> +     at the decoder. */
> >>>> +  has_FP = loop_has_FP_comp(loop);
> >>>
> >>> You could cache the loop body and pass it in here.
> >>
> >> That is a great idea, and in fact, I think I will do away with this
> >> separate function completely for this patch. I can more efficiently
> >> look for the FP computation while I am looking for the half word
> >> stores. I'll do that and send a follow up with the new patch.
> >>
> >>>
> >>> Patch looks reasonable to me, but I cannot approve.
> >>
> >> Thanks!
> >> Teresa
> >>
> >>>
> >>> -Andi
> >>>
> >>> --
> >>> ak@linux.intel.com -- Speaking for myself only
> >>
> >>
> >>
> >> --
> >> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
> >
> >
> >
> > --
> > Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413




--
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
diff mbox

Patch

===================================================================
--- loop-unroll.c	(revision 181902)
+++ loop-unroll.c	(working copy)
@@ -547,6 +547,9 @@  decide_unroll_constant_iterations (struc
   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

+  if (targetm.loop_unroll_adjust)
+    nunroll = targetm.loop_unroll_adjust (nunroll, loop);
+
   /* Skip big loops.  */
   if (nunroll <= 1)
     {
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 181902)
+++ config/i386/i386.c	(working copy)
@@ -60,6 +60,7 @@  along with GCC; see the file COPYING3.
 #include "fibheap.h"
 #include "opts.h"
 #include "diagnostic.h"
+#include "cfgloop.h"

 enum upper_128bits_state
 {
@@ -38370,6 +38371,82 @@  ix86_autovectorize_vector_sizes (void)
   return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
 }

+/* If LOOP contains a possible LCP stalling instruction on corei7,
+   calculate new number of times to unroll instead of NUNROLL so that
+   the unrolled loop will still likely fit into the loop stream detector. */
+static unsigned
+ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+  bool found = false;
+  unsigned newunroll;
+
+  if (ix86_tune != PROCESSOR_COREI7_64 &&
+      ix86_tune != PROCESSOR_COREI7_32)
+    return nunroll;
+
+  /* Look for instructions that store a constant into HImode (16-bit)
+     memory. These require a length-changing prefix and on corei7 are
+     prone to LCP stalls. These stalls can be avoided if the loop
+     is streamed from the loop stream detector. */
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          rtx set_expr, dest;
+          set_expr = single_set (insn);
+          if (!set_expr)
+            continue;
+
+          dest = SET_DEST (set_expr);
+
+          /* Don't reduce unroll factor in loops with floating point
+             computation, which tend to benefit more heavily from
+             larger unroll factors and are less likely to bottleneck
+             at the decoder. */
+          if (FLOAT_MODE_P (GET_MODE (dest)))
+          {
+            free (body);
+            return nunroll;
+          }
+
+          if (!found
+              && GET_MODE (dest) == HImode
+              && CONST_INT_P (SET_SRC (set_expr))
+              && MEM_P (dest))
+            {
+              found = true;
+              /* Keep walking loop body to look for FP computations above. */
+            }
+        }
+    }
+  free (body);
+
+  if (!found)
+    return nunroll;
+
+  if (dump_file)
+    {
+      fprintf (dump_file,
+               ";; Loop contains HImode store of const (possible LCP
stalls),\n");
+      fprintf (dump_file,
+               "   reduce unroll factor to fit into Loop Stream Detector\n");
+    }
+
+  /* On corei7 the loop stream detector can hold 28 uops, so
+     don't allow unrolling to exceed that many instructions. */
+  newunroll = 28 / loop->av_ninsns;
+  if (newunroll < nunroll)
+    return newunroll;
+
+  return nunroll;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_RETURN_IN_MEMORY