diff mbox

Reduce compilation time for IVOPT by skipping cost computation in use group

Message ID CAHFci2-Ltv=5Ee49f7-bGUiuCbGVR0LuxKpQd4v7Sxa5fcGgrg@mail.gmail.com
State New
Headers show

Commit Message

Bin.Cheng March 30, 2016, 12:11 p.m. UTC
On Wed, Mar 30, 2016 at 9:09 AM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Thu, Mar 24, 2016 at 6:26 PM, Bin Cheng <Bin.Cheng@arm.com> wrote:
>> Hi,
>> Quite lot of time is used when IVOPT computes cost for <use, cand> pairs.  As a matter of fact, some pairs are very similar to each other, and we can abstract and compute cost only once for these pairs.  This is a patch doing so, the idea is skipping cost computation for sub-uses in each group, of course it may result in different assembly code for some complicated cases because it estimates cost rather than doing real computation.  I did double check one of such case that the change in generated assembly is not degeneration.  For an IVOPT heavy program (spec2k/173), this patch reduces IVOPT's compilation time by 7~8%, as well as the memory consumption on my developing machine.
>>
>> Bootstrap & test on x86_64.
>>
>> For spec2k6 data on x86_64.  Maybe because I ran spec2k6 compiled with patched GCC in unclean environment, some cases are regressed by small amount (< %1).  I manually compared assembly code for several cases, including ones with the largest regression (still within <1%).  I could confirm that generated assembly code is exact the same as unpatched GCC, except for function emit_library_call_value_1 in 403.gcc/calls.c.
>>
>> In this case, difference of IVOPT dumps is as below:
>>
>> $ diff -y trunk/calls.c.154t.ivopts patch/calls.c.154t.ivopts
>>
>>   <bb 44>:                                                        <bb 44>:
>>   # val_21 = PHI <val_175(168), val_650(43)>                      # val_21 = PHI <val_175(168), val_650(43)>
>>   _811 = (void *) ivtmp.322_829;                                  _811 = (void *) ivtmp.322_829;
>>   MEM[base: _811, offset: -48B] = val_21;                     |   MEM[base: _811, offset: -32B] = val_21;
>>   _810 = (void *) ivtmp.322_829;                                  _810 = (void *) ivtmp.322_829;
>>   MEM[base: _810, offset: -40B] = mode_163;                   |   MEM[base: _810, offset: -24B] = mode_163;
>>   _182 = function_arg (&args_so_far, mode_163, 0B, 1);            _182 = function_arg (&args_so_far, mode_163, 0B, 1);
>>   _809 = (void *) ivtmp.322_829;                                  _809 = (void *) ivtmp.322_829;
>>   MEM[base: _809, offset: -32B] = _182;                       |   MEM[base: _809, offset: -16B] = _182;
>>   _807 = (void *) ivtmp.322_829;                                  _807 = (void *) ivtmp.322_829;
>>   MEM[base: _807, offset: -24B] = 0;                          |   MEM[base: _807, offset: -8B] = 0;
>>   _185 = (struct args_size *) ivtmp.322_829;                  |   _801 = ivtmp.322_829 + 16;
>>   _801 = ivtmp.322_829 + 18446744073709551600;                <
>>   _800 = (struct args_size *) _801;                               _800 = (struct args_size *) _801;
>>   _186 = _800;                                                |   _185 = _800;
>>                                                               >   _186 = (struct args_size *) ivtmp.322_829;
>>   _187 = _182 != 0B;                                              _187 = _182 != 0B;
>>   _188 = (int) _187;                                              _188 = (int) _187;
>>   locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1     locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1
>>   _802 = (void *) ivtmp.322_829;                                  _802 = (void *) ivtmp.322_829;
>>   _190 = MEM[base: _802, offset: 8B];                         |   _190 = MEM[base: _802, offset: 24B];
>>   if (_190 != 0B)                                                 if (_190 != 0B)
>>     goto <bb 45>;                                                   goto <bb 45>;
>>   else                                                            else
>>     goto <bb 46>;                                                   goto <bb 46>;
>>
>>   <bb 45>:                                                        <bb 45>:
>>   fancy_abort ("calls.c", 3724, &__FUNCTION__);                   fancy_abort ("calls.c", 3724, &__FUNCTION__);
>>
>> It's only an offset difference in IV.  And below is difference of generated assembly:
>> $ diff -y trunk/calls.S patch/calls.S
>> .L489:                                                          .L489:
>>         leaq    -80(%rbp), %rdi                                         leaq    -80(%rbp), %rdi
>>         xorl    %edx, %edx                                              xorl    %edx, %edx
>>         movl    $1, %ecx                                                movl    $1, %ecx
>>         movl    %r13d, %esi                                             movl    %r13d, %esi
>>         movq    %rax, -48(%r15)                               <
>>         movl    %r13d, -40(%r15)                              <
>>         call    function_arg                                  <
>>         movl    $0, -24(%r15)                                 <
>>         movq    %rax, -32(%r15)                                         movq    %rax, -32(%r15)
>>                                                               >         movl    %r13d, -24(%r15)
>>                                                               >         call    function_arg
>>         xorl    %edx, %edx                                              xorl    %edx, %edx
>>         pushq   %r12                                          |         movq    %rax, -16(%r15)
>>         testq   %rax, %rax                                              testq   %rax, %rax
>>         pushq   %r15                                          |         leaq    16(%r15), %rax     <--I1
>>         leaq    -16(%r15), %r9                                |         movl    $0, -8(%r15)
>>         leaq    -112(%rbp), %r8                                         leaq    -112(%rbp), %r8
>>                                                               >         pushq   %r12
>>         setne   %dl                                                     setne   %dl
>>         movl    %r13d, %edi                                   |         movq    %r15, %r9          <--I2
>>                                                               >         pushq   %rax               <--I3
>>         xorl    %ecx, %ecx                                              xorl    %ecx, %ecx
>>         xorl    %esi, %esi                                              xorl    %esi, %esi
>>                                                               >         movl    %r13d, %edi
>>         call    locate_and_pad_parm                                     call    locate_and_pad_parm
>>         cmpq    $0, 8(%r15)                                   |         cmpq    $0, 24(%r15)
>>         popq    %rax                                                    popq    %rax
>>         popq    %rdx                                                    popq    %rdx
>>         jne     .L602                                                   jne     .L602
>>
>> There is one additional move instruction (I2) after patching, but I believe it's a choice of RA.  If we switch %rax/%r9 in instructions I1/I2 as below:
>>         ...
>>         leaq    16(%r15), %r9
>>         ...
>>         movq    %r15, %rax
>>         pushq   %r15
>>
>> Then I2 becomes redundant and can be removed.
>>
>> I will collect performance data on AArch64 to make sure there is no breakage either.  So is it OK?
>
> I think the patch is ok - note that I have a hard time following the
> code, esp. the 'first' flag.
>
> +      /* Add cost for sub uses in group.  */
> +      do
> +       {
> +         /* Compute cost for the first sub_use with different offset to
> +            the first one and use it afterwards, because the cost could
> +            be very different if the offset is different.  */
> +         if (first && use->addr_offset != sub_use->addr_offset)
> +           {
> +             first = false;
> +             sub_cost = get_computation_cost (data, sub_use, cand, true,
> +                                              NULL, &can_autoinc, NULL);
> +             if (infinite_cost_p (sub_cost))
> +               {
> +                 cost = infinite_cost;
> +                 break;
> +               }
> +           }
> +
> +         cost = add_costs (cost, sub_cost);
> +         sub_use = sub_use->next;
> +       }
> +      while (sub_use);
>
> we start this loop with first = true, so for each sub-use we compute
> no new sub-cost until
> use->addr_offset changes for the first time after which we will never
> compute sub-cost
> again.  So we call get_computation_cost at most twice for al sub-uses.
>
> I suppose all sub-uses have equal ->addr_base.  Suppose sub-uses are then sorted
> after ->addr_offset what keeps that list from having three different
> addr_offset but
> with "very different cost"?  There seems to be group_address_uses but
> that suggests
> the cost might be actually the same for all sub-uses.
>
> So adding a little explaining before the loop over sub-uses would be
> appreciated.

Thanks for reviewing.  Here is the patch with trivially revised comments.
I also collected benchmark data for spec2k6 on AArch64, there is no
surprise except for case 456.hmmer.  I double checked generated
assembly and can confirm it's not real.
Will apply the patch later.

Thanks,
bin
>
> Thanks,
> Richard.
>
>> Thanks,
>> bin
>>
>> 2016-03-23  Bin Cheng  <bin.cheng@arm.com>
>>
>>         * tree-ssa-loop-ivopts.c (struct comp_cost): New scrach field.
>>         (no_cost, infinite_cost): Initialize the new field.
>>         (get_computation_cost_at): Record setup cost.
>>         (determine_use_iv_cost_address): Skip cost computation for sub
>>         uses if we can estimate it without losing accuracy.
>>

Comments

H.J. Lu March 31, 2016, 12:27 p.m. UTC | #1
On Wed, Mar 30, 2016 at 5:11 AM, Bin.Cheng <amker.cheng@gmail.com> wrote:
> On Wed, Mar 30, 2016 at 9:09 AM, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> On Thu, Mar 24, 2016 at 6:26 PM, Bin Cheng <Bin.Cheng@arm.com> wrote:
>>> Hi,
>>> Quite lot of time is used when IVOPT computes cost for <use, cand> pairs.  As a matter of fact, some pairs are very similar to each other, and we can abstract and compute cost only once for these pairs.  This is a patch doing so, the idea is skipping cost computation for sub-uses in each group, of course it may result in different assembly code for some complicated cases because it estimates cost rather than doing real computation.  I did double check one of such case that the change in generated assembly is not degeneration.  For an IVOPT heavy program (spec2k/173), this patch reduces IVOPT's compilation time by 7~8%, as well as the memory consumption on my developing machine.
>>>
>>> Bootstrap & test on x86_64.
>>>
>>> For spec2k6 data on x86_64.  Maybe because I ran spec2k6 compiled with patched GCC in unclean environment, some cases are regressed by small amount (< %1).  I manually compared assembly code for several cases, including ones with the largest regression (still within <1%).  I could confirm that generated assembly code is exact the same as unpatched GCC, except for function emit_library_call_value_1 in 403.gcc/calls.c.
>>>
>>> In this case, difference of IVOPT dumps is as below:
>>>
>>> $ diff -y trunk/calls.c.154t.ivopts patch/calls.c.154t.ivopts
>>>
>>>   <bb 44>:                                                        <bb 44>:
>>>   # val_21 = PHI <val_175(168), val_650(43)>                      # val_21 = PHI <val_175(168), val_650(43)>
>>>   _811 = (void *) ivtmp.322_829;                                  _811 = (void *) ivtmp.322_829;
>>>   MEM[base: _811, offset: -48B] = val_21;                     |   MEM[base: _811, offset: -32B] = val_21;
>>>   _810 = (void *) ivtmp.322_829;                                  _810 = (void *) ivtmp.322_829;
>>>   MEM[base: _810, offset: -40B] = mode_163;                   |   MEM[base: _810, offset: -24B] = mode_163;
>>>   _182 = function_arg (&args_so_far, mode_163, 0B, 1);            _182 = function_arg (&args_so_far, mode_163, 0B, 1);
>>>   _809 = (void *) ivtmp.322_829;                                  _809 = (void *) ivtmp.322_829;
>>>   MEM[base: _809, offset: -32B] = _182;                       |   MEM[base: _809, offset: -16B] = _182;
>>>   _807 = (void *) ivtmp.322_829;                                  _807 = (void *) ivtmp.322_829;
>>>   MEM[base: _807, offset: -24B] = 0;                          |   MEM[base: _807, offset: -8B] = 0;
>>>   _185 = (struct args_size *) ivtmp.322_829;                  |   _801 = ivtmp.322_829 + 16;
>>>   _801 = ivtmp.322_829 + 18446744073709551600;                <
>>>   _800 = (struct args_size *) _801;                               _800 = (struct args_size *) _801;
>>>   _186 = _800;                                                |   _185 = _800;
>>>                                                               >   _186 = (struct args_size *) ivtmp.322_829;
>>>   _187 = _182 != 0B;                                              _187 = _182 != 0B;
>>>   _188 = (int) _187;                                              _188 = (int) _187;
>>>   locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1     locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1
>>>   _802 = (void *) ivtmp.322_829;                                  _802 = (void *) ivtmp.322_829;
>>>   _190 = MEM[base: _802, offset: 8B];                         |   _190 = MEM[base: _802, offset: 24B];
>>>   if (_190 != 0B)                                                 if (_190 != 0B)
>>>     goto <bb 45>;                                                   goto <bb 45>;
>>>   else                                                            else
>>>     goto <bb 46>;                                                   goto <bb 46>;
>>>
>>>   <bb 45>:                                                        <bb 45>:
>>>   fancy_abort ("calls.c", 3724, &__FUNCTION__);                   fancy_abort ("calls.c", 3724, &__FUNCTION__);
>>>
>>> It's only an offset difference in IV.  And below is difference of generated assembly:
>>> $ diff -y trunk/calls.S patch/calls.S
>>> .L489:                                                          .L489:
>>>         leaq    -80(%rbp), %rdi                                         leaq    -80(%rbp), %rdi
>>>         xorl    %edx, %edx                                              xorl    %edx, %edx
>>>         movl    $1, %ecx                                                movl    $1, %ecx
>>>         movl    %r13d, %esi                                             movl    %r13d, %esi
>>>         movq    %rax, -48(%r15)                               <
>>>         movl    %r13d, -40(%r15)                              <
>>>         call    function_arg                                  <
>>>         movl    $0, -24(%r15)                                 <
>>>         movq    %rax, -32(%r15)                                         movq    %rax, -32(%r15)
>>>                                                               >         movl    %r13d, -24(%r15)
>>>                                                               >         call    function_arg
>>>         xorl    %edx, %edx                                              xorl    %edx, %edx
>>>         pushq   %r12                                          |         movq    %rax, -16(%r15)
>>>         testq   %rax, %rax                                              testq   %rax, %rax
>>>         pushq   %r15                                          |         leaq    16(%r15), %rax     <--I1
>>>         leaq    -16(%r15), %r9                                |         movl    $0, -8(%r15)
>>>         leaq    -112(%rbp), %r8                                         leaq    -112(%rbp), %r8
>>>                                                               >         pushq   %r12
>>>         setne   %dl                                                     setne   %dl
>>>         movl    %r13d, %edi                                   |         movq    %r15, %r9          <--I2
>>>                                                               >         pushq   %rax               <--I3
>>>         xorl    %ecx, %ecx                                              xorl    %ecx, %ecx
>>>         xorl    %esi, %esi                                              xorl    %esi, %esi
>>>                                                               >         movl    %r13d, %edi
>>>         call    locate_and_pad_parm                                     call    locate_and_pad_parm
>>>         cmpq    $0, 8(%r15)                                   |         cmpq    $0, 24(%r15)
>>>         popq    %rax                                                    popq    %rax
>>>         popq    %rdx                                                    popq    %rdx
>>>         jne     .L602                                                   jne     .L602
>>>
>>> There is one additional move instruction (I2) after patching, but I believe it's a choice of RA.  If we switch %rax/%r9 in instructions I1/I2 as below:
>>>         ...
>>>         leaq    16(%r15), %r9
>>>         ...
>>>         movq    %r15, %rax
>>>         pushq   %r15
>>>
>>> Then I2 becomes redundant and can be removed.
>>>
>>> I will collect performance data on AArch64 to make sure there is no breakage either.  So is it OK?
>>
>> I think the patch is ok - note that I have a hard time following the
>> code, esp. the 'first' flag.
>>
>> +      /* Add cost for sub uses in group.  */
>> +      do
>> +       {
>> +         /* Compute cost for the first sub_use with different offset to
>> +            the first one and use it afterwards, because the cost could
>> +            be very different if the offset is different.  */
>> +         if (first && use->addr_offset != sub_use->addr_offset)
>> +           {
>> +             first = false;
>> +             sub_cost = get_computation_cost (data, sub_use, cand, true,
>> +                                              NULL, &can_autoinc, NULL);
>> +             if (infinite_cost_p (sub_cost))
>> +               {
>> +                 cost = infinite_cost;
>> +                 break;
>> +               }
>> +           }
>> +
>> +         cost = add_costs (cost, sub_cost);
>> +         sub_use = sub_use->next;
>> +       }
>> +      while (sub_use);
>>
>> we start this loop with first = true, so for each sub-use we compute
>> no new sub-cost until
>> use->addr_offset changes for the first time after which we will never
>> compute sub-cost
>> again.  So we call get_computation_cost at most twice for al sub-uses.
>>
>> I suppose all sub-uses have equal ->addr_base.  Suppose sub-uses are then sorted
>> after ->addr_offset what keeps that list from having three different
>> addr_offset but
>> with "very different cost"?  There seems to be group_address_uses but
>> that suggests
>> the cost might be actually the same for all sub-uses.
>>
>> So adding a little explaining before the loop over sub-uses would be
>> appreciated.
>
> Thanks for reviewing.  Here is the patch with trivially revised comments.
> I also collected benchmark data for spec2k6 on AArch64, there is no
> surprise except for case 456.hmmer.  I double checked generated
> assembly and can confirm it's not real.
> Will apply the patch later.
>
> Thanks,
> bin
>>
>> Thanks,
>> Richard.
>>
>>> Thanks,
>>> bin
>>>
>>> 2016-03-23  Bin Cheng  <bin.cheng@arm.com>
>>>
>>>         * tree-ssa-loop-ivopts.c (struct comp_cost): New scrach field.
>>>         (no_cost, infinite_cost): Initialize the new field.
>>>         (get_computation_cost_at): Record setup cost.
>>>         (determine_use_iv_cost_address): Skip cost computation for sub
>>>         uses if we can estimate it without losing accuracy.
>>>

This may have caused bootstrap failure on ia32:

https://gcc.gnu.org/ml/gcc-regression/2016-03/msg00347.html
diff mbox

Patch

diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 5302edf..3c81bf7 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -168,10 +168,11 @@  struct comp_cost
 			   the computation (in no concrete units --
 			   complexity field should be larger for more
 			   complex expressions and addressing modes).  */
+  int scratch;		/* Scratch used during cost computation.  */
 };
 
-static const comp_cost no_cost = {0, 0};
-static const comp_cost infinite_cost = {INFTY, INFTY};
+static const comp_cost no_cost = {0, 0, 0};
+static const comp_cost infinite_cost = {INFTY, INFTY, INFTY};
 
 /* The candidate - cost pair.  */
 struct cost_pair
@@ -4947,6 +4948,8 @@  get_computation_cost_at (struct ivopts_data *data,
       cost.cost += add_cost (data->speed, TYPE_MODE (ctype));
     }
 
+  /* Record setup cost in scrach field.  */
+  cost.scratch = cost.cost;
   /* Set of invariants depended on by sub use has already been computed
      for the first use in the group.  */
   if (use->sub_id)
@@ -5082,12 +5085,12 @@  determine_use_iv_cost_address (struct ivopts_data *data,
 			       struct iv_use *use, struct iv_cand *cand)
 {
   bitmap depends_on;
-  bool can_autoinc;
+  bool can_autoinc, first;
   int inv_expr_id = -1;
   struct iv_use *sub_use;
-  comp_cost sub_cost;
   comp_cost cost = get_computation_cost (data, use, cand, true, &depends_on,
 					 &can_autoinc, &inv_expr_id);
+  comp_cost sub_cost = cost;
 
   if (cand->ainc_use == use)
     {
@@ -5099,13 +5102,47 @@  determine_use_iv_cost_address (struct ivopts_data *data,
       else if (cand->pos == IP_AFTER_USE || cand->pos == IP_BEFORE_USE)
 	cost = infinite_cost;
     }
-  for (sub_use = use->next;
-       sub_use && !infinite_cost_p (cost);
-       sub_use = sub_use->next)
+
+  if (!infinite_cost_p (cost) && use->next)
     {
-      sub_cost = get_computation_cost (data, sub_use, cand, true, NULL,
-				       &can_autoinc, NULL);
-      cost = add_costs (cost, sub_cost);
+      first = true;
+      sub_use = use->next;
+      /* We don't want to add setup cost for sub-uses.  */
+      sub_cost.cost -= sub_cost.scratch;
+      /* Add cost for sub uses in group.  */
+      do
+	{
+	  /* Compute cost for the first sub use with different offset to
+	     the main use and add it afterwards.  Costs for these uses
+	     could be quite different.  Given below uses in a group:
+	       use 0  : {base + A + offset_0, step}
+	       use 0.1: {base + A + offset_0, step}
+	       use 0.2: {base + A + offset_1, step}
+	       use 0.3: {base + A + offset_2, step}
+	     when we need to compute costs with candidate:
+	       cand 1 : {base + B + offset_0, step}
+
+	     The first sub use with different offset is use 0.2, its cost
+	     is larger than cost of use 0/0.1 because we need to compute:
+	       A - B + offset_1 - offset_0
+	     rather than:
+	       A - B.  */
+	  if (first && use->addr_offset != sub_use->addr_offset)
+	    {
+	      first = false;
+	      sub_cost = get_computation_cost (data, sub_use, cand, true,
+					       NULL, &can_autoinc, NULL);
+	      if (infinite_cost_p (sub_cost))
+		{
+		  cost = infinite_cost;
+		  break;
+		}
+	    }
+
+	  cost = add_costs (cost, sub_cost);
+	  sub_use = sub_use->next;
+	}
+      while (sub_use);
     }
 
   set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE, ERROR_MARK,