
Vect: use a small step to calculate induction for the unrolled loop (PR tree-optimization/110449)

Message ID SJ2PR01MB86357ECFCF921CF3114FC894E12FA@SJ2PR01MB8635.prod.exchangelabs.com
State New
Series Vect: use a small step to calculate induction for the unrolled loop (PR tree-optimization/110449)

Commit Message

Hao Liu OS July 5, 2023, 6:43 a.m. UTC
Hi,

If a loop is unrolled n times during vectorization, two steps are used to
calculate the induction variable:
  - The small step for the unrolled i-th copy: vec_1 = vec_iv + (VF/n * Step)
  - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)

This patch calculates an extra vec_n to replace vec_loop:
  vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.

This saves the register holding the large step and the related operations.
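
For illustration, here is a hand-written C model of the loop after the
patch (my own sketch using GCC's generic vector extension, not actual
vectorizer output; the name model_after and the values VF = 8, n = 2,
4-lane vectors and a suitably aligned arr are all assumptions):

  typedef float v4sf __attribute__ ((vector_size (16)));

  void
  model_after (float *arr, float freq, float step)
  {
    v4sf iv = { freq, freq + step, freq + 2 * step, freq + 3 * step };
    v4sf small = { 4 * step, 4 * step, 4 * step, 4 * step };  /* VF/n * Step */

    for (int i = 0; i < 1024; i += 8)
      {
        v4sf v1 = iv + small;           /* vec_1 = vec_iv + (VF/n * Step) */
        *(v4sf *) (arr + i) = iv;
        *(v4sf *) (arr + i + 4) = v1;
        /* Previously the IV advanced with a separate large step,
             iv = iv + (VF * Step) splat,
           which kept an extra vector register live.  The patch reuses the
           last small-step copy instead:  */
        iv = v1 + small;                /* vec_n = vec_prev + (VF/n * Step) */
      }
  }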

gcc/ChangeLog:

	PR tree-optimization/110449
	* tree-vect-loop.cc (vectorizable_induction): Use vec_n to replace
	vec_loop for the unrolled loop.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/pr110449.c: New testcase.
---
 gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
 gcc/tree-vect-loop.cc                       | 21 +++++++++--
 2 files changed, 58 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c

Comments

Richard Biener July 6, 2023, 12:44 p.m. UTC | #1
On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> If a loop is unrolled by n times during vectoriation, two steps are used to
> calculate the induction variable:
>   - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>   - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>
> This patch calculates an extra vec_n to replace vec_loop:
>   vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>
> So that we can save the large step register and related operations.

OK.  It would be nice to avoid the dead stmts created earlier though.

Thanks,
Richard.

> gcc/ChangeLog:
>
>         PR tree-optimization/110449
>         * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>         vec_loop for the unrolled loop.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/pr110449.c: New testcase.
> ---
>  gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
>  gcc/tree-vect-loop.cc                       | 21 +++++++++--
>  2 files changed, 58 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> new file mode 100644
> index 00000000000..bb3b6dcfe08
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
> +
> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
> +
> +   before (suggested_unroll_factor=2):
> +         fmov    s30, 8.0e+0
> +         fmov    s31, 4.0e+0
> +         dup     v27.4s, v30.s[0]
> +         dup     v28.4s, v31.s[0]
> +     .L6:
> +         mov     v30.16b, v31.16b
> +         fadd    v31.4s, v31.4s, v27.4s
> +         fadd    v29.4s, v30.4s, v28.4s
> +         stp     q30, q29, [x0]
> +         add     x0, x0, 32
> +         cmp     x1, x0
> +         bne     .L6
> +
> +   after:
> +         fmov    s31, 4.0e+0
> +         dup     v29.4s, v31.s[0]
> +     .L6:
> +         fadd    v30.4s, v31.4s, v29.4s
> +         stp     q31, q30, [x0]
> +         add     x0, x0, 32
> +         fadd    v31.4s, v29.4s, v30.4s
> +         cmp     x0, x1
> +         bne     .L6  */
> +
> +void
> +foo2 (float *arr, float freq, float step)
> +{
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      arr[i] = freq;
> +      freq += step;
> +    }
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 3b46c58a8d8..706ecbffd0c 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>                                    new_vec, step_vectype, NULL);
>
>        vec_def = induc_def;
> -      for (i = 1; i < ncopies; i++)
> +      for (i = 1; i < ncopies + 1; i++)
>         {
>           /* vec_i = vec_prev + vec_step  */
>           gimple_seq stmts = NULL;
> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>           vec_def = gimple_convert (&stmts, vectype, vec_def);
>
>           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> +         if (i < ncopies)
> +           {
> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> +           }
> +         else
> +           {
> +             /* vec_1 = vec_iv + (VF/n * S)
> +                vec_2 = vec_1 + (VF/n * S)
> +                ...
> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
> +
> +                vec_n is used as vec_loop to save the large step register and
> +                related operations.  */
> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
> +                          UNKNOWN_LOCATION);
> +           }
>         }
>      }
>
> --
> 2.34.1
Jeff Law July 6, 2023, 4:06 p.m. UTC | #2
On 7/6/23 06:44, Richard Biener via Gcc-patches wrote:
> On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> Hi,
>>
>> If a loop is unrolled by n times during vectoriation, two steps are used to
>> calculate the induction variable:
>>    - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>>    - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>>
>> This patch calculates an extra vec_n to replace vec_loop:
>>    vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>>
>> So that we can save the large step register and related operations.
> 
> OK.  It would be nice to avoid the dead stmts created earlier though.
> 
> Thanks,
> Richard.
> 
>> gcc/ChangeLog:
>>
>>          PR tree-optimization/110449
>>          * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>>          vec_loop for the unrolled loop.
>>
>> gcc/testsuite/ChangeLog:
>>
>>          * gcc.target/aarch64/pr110449.c: New testcase.
I didn't see Hao Liu in the MAINTAINERS file, so Hao probably doesn't have 
write access.  Therefore I went ahead and pushed this for Hao.

jeff
Richard Sandiford July 6, 2023, 5:50 p.m. UTC | #3
Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> Hi,
>>
>> If a loop is unrolled by n times during vectoriation, two steps are used to
>> calculate the induction variable:
>>   - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>>   - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>>
>> This patch calculates an extra vec_n to replace vec_loop:
>>   vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>>
>> So that we can save the large step register and related operations.
>
> OK.  It would be nice to avoid the dead stmts created earlier though.

FWIW, I still don't think we should do this.  Part of the point of
unrolling is to shorten loop-carried dependencies, whereas this patch
is going in the opposite direction.

Richard

>
> Thanks,
> Richard.
>
>> gcc/ChangeLog:
>>
>>         PR tree-optimization/110449
>>         * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>>         vec_loop for the unrolled loop.
>>
>> gcc/testsuite/ChangeLog:
>>
>>         * gcc.target/aarch64/pr110449.c: New testcase.
>> ---
>>  gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
>>  gcc/tree-vect-loop.cc                       | 21 +++++++++--
>>  2 files changed, 58 insertions(+), 3 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
>>
>> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>> new file mode 100644
>> index 00000000000..bb3b6dcfe08
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>> @@ -0,0 +1,40 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
>> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
>> +
>> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
>> +
>> +   before (suggested_unroll_factor=2):
>> +         fmov    s30, 8.0e+0
>> +         fmov    s31, 4.0e+0
>> +         dup     v27.4s, v30.s[0]
>> +         dup     v28.4s, v31.s[0]
>> +     .L6:
>> +         mov     v30.16b, v31.16b
>> +         fadd    v31.4s, v31.4s, v27.4s
>> +         fadd    v29.4s, v30.4s, v28.4s
>> +         stp     q30, q29, [x0]
>> +         add     x0, x0, 32
>> +         cmp     x1, x0
>> +         bne     .L6
>> +
>> +   after:
>> +         fmov    s31, 4.0e+0
>> +         dup     v29.4s, v31.s[0]
>> +     .L6:
>> +         fadd    v30.4s, v31.4s, v29.4s
>> +         stp     q31, q30, [x0]
>> +         add     x0, x0, 32
>> +         fadd    v31.4s, v29.4s, v30.4s
>> +         cmp     x0, x1
>> +         bne     .L6  */
>> +
>> +void
>> +foo2 (float *arr, float freq, float step)
>> +{
>> +  for (int i = 0; i < 1024; i++)
>> +    {
>> +      arr[i] = freq;
>> +      freq += step;
>> +    }
>> +}
>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> index 3b46c58a8d8..706ecbffd0c 100644
>> --- a/gcc/tree-vect-loop.cc
>> +++ b/gcc/tree-vect-loop.cc
>> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>                                    new_vec, step_vectype, NULL);
>>
>>        vec_def = induc_def;
>> -      for (i = 1; i < ncopies; i++)
>> +      for (i = 1; i < ncopies + 1; i++)
>>         {
>>           /* vec_i = vec_prev + vec_step  */
>>           gimple_seq stmts = NULL;
>> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>           vec_def = gimple_convert (&stmts, vectype, vec_def);
>>
>>           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
>> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
>> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> +         if (i < ncopies)
>> +           {
>> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
>> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> +           }
>> +         else
>> +           {
>> +             /* vec_1 = vec_iv + (VF/n * S)
>> +                vec_2 = vec_1 + (VF/n * S)
>> +                ...
>> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
>> +
>> +                vec_n is used as vec_loop to save the large step register and
>> +                related operations.  */
>> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
>> +                          UNKNOWN_LOCATION);
>> +           }
>>         }
>>      }
>>
>> --
>> 2.34.1
Hao Liu OS July 7, 2023, 2:23 a.m. UTC | #4
Hi Jeff,

Thanks for your help.

Actually, I have write access, as I was added to the "contributor list".  Anyway, it's very kind of you to help commit the patch.

Thanks,
-Hao
Richard Biener July 7, 2023, 5:32 a.m. UTC | #5
> On 06.07.2023 at 19:50, Richard Sandiford <richard.sandiford@arm.com> wrote:
> 
> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>> On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
>>> <gcc-patches@gcc.gnu.org> wrote:
>>> 
>>> Hi,
>>> 
>>> If a loop is unrolled by n times during vectoriation, two steps are used to
>>> calculate the induction variable:
>>>  - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>>>  - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>>> 
>>> This patch calculates an extra vec_n to replace vec_loop:
>>>  vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>>> 
>>> So that we can save the large step register and related operations.
>> 
>> OK.  It would be nice to avoid the dead stmts created earlier though.
> 
> FWIW, I still don't think we should do this.  Part of the point of
> unrolling is to shorten loop-carried dependencies, whereas this patch
> is going in the opposite direction.

Note that ncopies can be >1 without additional unrolling.  With non-VLA vectors all of the updates will be constant folded, btw.

Richard 

> Richard
> 
>> 
>> Thanks,
>> Richard.
>> 
>>> gcc/ChangeLog:
>>> 
>>>        PR tree-optimization/110449
>>>        * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>>>        vec_loop for the unrolled loop.
>>> 
>>> gcc/testsuite/ChangeLog:
>>> 
>>>        * gcc.target/aarch64/pr110449.c: New testcase.
>>> ---
>>> gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
>>> gcc/tree-vect-loop.cc                       | 21 +++++++++--
>>> 2 files changed, 58 insertions(+), 3 deletions(-)
>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
>>> 
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>>> new file mode 100644
>>> index 00000000000..bb3b6dcfe08
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>>> @@ -0,0 +1,40 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
>>> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
>>> +
>>> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
>>> +
>>> +   before (suggested_unroll_factor=2):
>>> +         fmov    s30, 8.0e+0
>>> +         fmov    s31, 4.0e+0
>>> +         dup     v27.4s, v30.s[0]
>>> +         dup     v28.4s, v31.s[0]
>>> +     .L6:
>>> +         mov     v30.16b, v31.16b
>>> +         fadd    v31.4s, v31.4s, v27.4s
>>> +         fadd    v29.4s, v30.4s, v28.4s
>>> +         stp     q30, q29, [x0]
>>> +         add     x0, x0, 32
>>> +         cmp     x1, x0
>>> +         bne     .L6
>>> +
>>> +   after:
>>> +         fmov    s31, 4.0e+0
>>> +         dup     v29.4s, v31.s[0]
>>> +     .L6:
>>> +         fadd    v30.4s, v31.4s, v29.4s
>>> +         stp     q31, q30, [x0]
>>> +         add     x0, x0, 32
>>> +         fadd    v31.4s, v29.4s, v30.4s
>>> +         cmp     x0, x1
>>> +         bne     .L6  */
>>> +
>>> +void
>>> +foo2 (float *arr, float freq, float step)
>>> +{
>>> +  for (int i = 0; i < 1024; i++)
>>> +    {
>>> +      arr[i] = freq;
>>> +      freq += step;
>>> +    }
>>> +}
>>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>>> index 3b46c58a8d8..706ecbffd0c 100644
>>> --- a/gcc/tree-vect-loop.cc
>>> +++ b/gcc/tree-vect-loop.cc
>>> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>>                                   new_vec, step_vectype, NULL);
>>> 
>>>       vec_def = induc_def;
>>> -      for (i = 1; i < ncopies; i++)
>>> +      for (i = 1; i < ncopies + 1; i++)
>>>        {
>>>          /* vec_i = vec_prev + vec_step  */
>>>          gimple_seq stmts = NULL;
>>> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>>          vec_def = gimple_convert (&stmts, vectype, vec_def);
>>> 
>>>          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
>>> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
>>> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>>> +         if (i < ncopies)
>>> +           {
>>> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
>>> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>>> +           }
>>> +         else
>>> +           {
>>> +             /* vec_1 = vec_iv + (VF/n * S)
>>> +                vec_2 = vec_1 + (VF/n * S)
>>> +                ...
>>> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
>>> +
>>> +                vec_n is used as vec_loop to save the large step register and
>>> +                related operations.  */
>>> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
>>> +                          UNKNOWN_LOCATION);
>>> +           }
>>>        }
>>>     }
>>> 
>>> --
>>> 2.34.1
Richard Sandiford July 7, 2023, 7:53 a.m. UTC | #6
Richard Biener <richard.guenther@gmail.com> writes:
>> On 06.07.2023 at 19:50, Richard Sandiford <richard.sandiford@arm.com> wrote:
>> 
>> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>>> On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>> 
>>>> Hi,
>>>> 
>>>> If a loop is unrolled by n times during vectoriation, two steps are used to
>>>> calculate the induction variable:
>>>>  - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>>>>  - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>>>> 
>>>> This patch calculates an extra vec_n to replace vec_loop:
>>>>  vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>>>> 
>>>> So that we can save the large step register and related operations.
>>> 
>>> OK.  It would be nice to avoid the dead stmts created earlier though.
>> 
>> FWIW, I still don't think we should do this.  Part of the point of
>> unrolling is to shorten loop-carried dependencies, whereas this patch
>> is going in the opposite direction.
>
> Note ncopies can be >1 without additional unrolling.

Yeah, true.  But I think even there, avoiding a longer loop-carried
dependency should be a good thing.

> With non VLA vectors all of the updates will be constant folded btw.

Are you sure?  The motivating example is an Advanced SIMD one,
not a VLA one.  No variable-length vectors are involved.

Maybe constant folding caps the dependency chain to length 2?
But 2 is still more than 1. :)
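
For concreteness, a scalar sketch of the two recurrences (my own
example; the function names and the values VF = 8, n = 2 are assumed),
showing the chain lengths being compared:

  /* Loop-carried chain of length 1: one add per iteration.  */
  float
  iv_large_step (float iv, float step, int iters)
  {
    float large = 8.0f * step;          /* VF * Step, computed once */
    for (int k = 0; k < iters; k++)
      iv = iv + large;
    return iv;
  }

  /* Loop-carried chain of length 2: two dependent adds per iteration.  */
  float
  iv_chained_small_steps (float iv, float step, int iters)
  {
    float small = 4.0f * step;          /* VF/n * Step */
    for (int k = 0; k < iters; k++)
      {
        float v1 = iv + small;          /* first add */
        iv = v1 + small;                /* second add depends on the first */
      }
    return iv;
  }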

Thanks,
Richard

>
> Richard 
>
>> Richard
>> 
>>> 
>>> Thanks,
>>> Richard.
>>> 
>>>> gcc/ChangeLog:
>>>> 
>>>>        PR tree-optimization/110449
>>>>        * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>>>>        vec_loop for the unrolled loop.
>>>> 
>>>> gcc/testsuite/ChangeLog:
>>>> 
>>>>        * gcc.target/aarch64/pr110449.c: New testcase.
>>>> ---
>>>> gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
>>>> gcc/tree-vect-loop.cc                       | 21 +++++++++--
>>>> 2 files changed, 58 insertions(+), 3 deletions(-)
>>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
>>>> 
>>>> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>>>> new file mode 100644
>>>> index 00000000000..bb3b6dcfe08
>>>> --- /dev/null
>>>> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>>>> @@ -0,0 +1,40 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
>>>> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
>>>> +
>>>> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
>>>> +
>>>> +   before (suggested_unroll_factor=2):
>>>> +         fmov    s30, 8.0e+0
>>>> +         fmov    s31, 4.0e+0
>>>> +         dup     v27.4s, v30.s[0]
>>>> +         dup     v28.4s, v31.s[0]
>>>> +     .L6:
>>>> +         mov     v30.16b, v31.16b
>>>> +         fadd    v31.4s, v31.4s, v27.4s
>>>> +         fadd    v29.4s, v30.4s, v28.4s
>>>> +         stp     q30, q29, [x0]
>>>> +         add     x0, x0, 32
>>>> +         cmp     x1, x0
>>>> +         bne     .L6
>>>> +
>>>> +   after:
>>>> +         fmov    s31, 4.0e+0
>>>> +         dup     v29.4s, v31.s[0]
>>>> +     .L6:
>>>> +         fadd    v30.4s, v31.4s, v29.4s
>>>> +         stp     q31, q30, [x0]
>>>> +         add     x0, x0, 32
>>>> +         fadd    v31.4s, v29.4s, v30.4s
>>>> +         cmp     x0, x1
>>>> +         bne     .L6  */
>>>> +
>>>> +void
>>>> +foo2 (float *arr, float freq, float step)
>>>> +{
>>>> +  for (int i = 0; i < 1024; i++)
>>>> +    {
>>>> +      arr[i] = freq;
>>>> +      freq += step;
>>>> +    }
>>>> +}
>>>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>>>> index 3b46c58a8d8..706ecbffd0c 100644
>>>> --- a/gcc/tree-vect-loop.cc
>>>> +++ b/gcc/tree-vect-loop.cc
>>>> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>>>                                   new_vec, step_vectype, NULL);
>>>> 
>>>>       vec_def = induc_def;
>>>> -      for (i = 1; i < ncopies; i++)
>>>> +      for (i = 1; i < ncopies + 1; i++)
>>>>        {
>>>>          /* vec_i = vec_prev + vec_step  */
>>>>          gimple_seq stmts = NULL;
>>>> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>>>          vec_def = gimple_convert (&stmts, vectype, vec_def);
>>>> 
>>>>          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
>>>> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
>>>> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>>>> +         if (i < ncopies)
>>>> +           {
>>>> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
>>>> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>>>> +           }
>>>> +         else
>>>> +           {
>>>> +             /* vec_1 = vec_iv + (VF/n * S)
>>>> +                vec_2 = vec_1 + (VF/n * S)
>>>> +                ...
>>>> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
>>>> +
>>>> +                vec_n is used as vec_loop to save the large step register and
>>>> +                related operations.  */
>>>> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
>>>> +                          UNKNOWN_LOCATION);
>>>> +           }
>>>>        }
>>>>     }
>>>> 
>>>> --
>>>> 2.34.1
Richard Biener July 7, 2023, 10:23 a.m. UTC | #7
On Fri, Jul 7, 2023 at 9:53 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> >> On 06.07.2023 at 19:50, Richard Sandiford <richard.sandiford@arm.com> wrote:
> >>
> >> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >>>> On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
> >>>> <gcc-patches@gcc.gnu.org> wrote:
> >>>>
> >>>> Hi,
> >>>>
> >>>> If a loop is unrolled by n times during vectoriation, two steps are used to
> >>>> calculate the induction variable:
> >>>>  - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
> >>>>  - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
> >>>>
> >>>> This patch calculates an extra vec_n to replace vec_loop:
> >>>>  vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
> >>>>
> >>>> So that we can save the large step register and related operations.
> >>>
> >>> OK.  It would be nice to avoid the dead stmts created earlier though.
> >>
> >> FWIW, I still don't think we should do this.  Part of the point of
> >> unrolling is to shorten loop-carried dependencies, whereas this patch
> >> is going in the opposite direction.
> >
> > Note ncopies can be >1 without additional unrolling.
>
> Yeah, true.  But I think even there, avoiding a longer loop-carried
> dependency should be a good thing.
>
> > With non VLA vectors all of the updates will be constant folded btw.
>
> Are you sure?  The motivating example is an Advanced SIMD one,
> not a VLA one.  No variable-length vectors are involved.
>
> Maybe constant folding caps the dependency chain to length 2?
> But 2 is still more than 1. :)

The

  /* (A +- CST1) +- CST2 -> A + CST3
     Use view_convert because it is safe for vectors and equivalent for
     scalars.  */
  (for outer_op (plus minus)
   (for inner_op (plus minus)
        neg_inner_op (minus plus)

pattern should apply here for example during forwprop.  It handles
vector constants just fine, so I wonder why it doesn't trigger.
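
For reference, a minimal sketch of the folding that pattern performs on
vector constants (my own example, not taken from match.pd or the PR;
fold_example is a made-up name):

  typedef int v4si __attribute__ ((vector_size (16)));

  /* forwprop should rewrite (a + c1) + c2 into a single a + { 3, 3, 3, 3 }.  */
  v4si
  fold_example (v4si a)
  {
    v4si c1 = { 1, 1, 1, 1 };
    v4si c2 = { 2, 2, 2, 2 };
    return (a + c1) + c2;
  }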

Richard.

> Thanks,
> Richard
>
> >
> > Richard
> >
> >> Richard
> >>
> >>>
> >>> Thanks,
> >>> Richard.
> >>>
> >>>> gcc/ChangeLog:
> >>>>
> >>>>        PR tree-optimization/110449
> >>>>        * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
> >>>>        vec_loop for the unrolled loop.
> >>>>
> >>>> gcc/testsuite/ChangeLog:
> >>>>
> >>>>        * gcc.target/aarch64/pr110449.c: New testcase.
> >>>> ---
> >>>> gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
> >>>> gcc/tree-vect-loop.cc                       | 21 +++++++++--
> >>>> 2 files changed, 58 insertions(+), 3 deletions(-)
> >>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
> >>>>
> >>>> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> >>>> new file mode 100644
> >>>> index 00000000000..bb3b6dcfe08
> >>>> --- /dev/null
> >>>> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> >>>> @@ -0,0 +1,40 @@
> >>>> +/* { dg-do compile } */
> >>>> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
> >>>> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
> >>>> +
> >>>> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
> >>>> +
> >>>> +   before (suggested_unroll_factor=2):
> >>>> +         fmov    s30, 8.0e+0
> >>>> +         fmov    s31, 4.0e+0
> >>>> +         dup     v27.4s, v30.s[0]
> >>>> +         dup     v28.4s, v31.s[0]
> >>>> +     .L6:
> >>>> +         mov     v30.16b, v31.16b
> >>>> +         fadd    v31.4s, v31.4s, v27.4s
> >>>> +         fadd    v29.4s, v30.4s, v28.4s
> >>>> +         stp     q30, q29, [x0]
> >>>> +         add     x0, x0, 32
> >>>> +         cmp     x1, x0
> >>>> +         bne     .L6
> >>>> +
> >>>> +   after:
> >>>> +         fmov    s31, 4.0e+0
> >>>> +         dup     v29.4s, v31.s[0]
> >>>> +     .L6:
> >>>> +         fadd    v30.4s, v31.4s, v29.4s
> >>>> +         stp     q31, q30, [x0]
> >>>> +         add     x0, x0, 32
> >>>> +         fadd    v31.4s, v29.4s, v30.4s
> >>>> +         cmp     x0, x1
> >>>> +         bne     .L6  */
> >>>> +
> >>>> +void
> >>>> +foo2 (float *arr, float freq, float step)
> >>>> +{
> >>>> +  for (int i = 0; i < 1024; i++)
> >>>> +    {
> >>>> +      arr[i] = freq;
> >>>> +      freq += step;
> >>>> +    }
> >>>> +}
> >>>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> >>>> index 3b46c58a8d8..706ecbffd0c 100644
> >>>> --- a/gcc/tree-vect-loop.cc
> >>>> +++ b/gcc/tree-vect-loop.cc
> >>>> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> >>>>                                   new_vec, step_vectype, NULL);
> >>>>
> >>>>       vec_def = induc_def;
> >>>> -      for (i = 1; i < ncopies; i++)
> >>>> +      for (i = 1; i < ncopies + 1; i++)
> >>>>        {
> >>>>          /* vec_i = vec_prev + vec_step  */
> >>>>          gimple_seq stmts = NULL;
> >>>> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> >>>>          vec_def = gimple_convert (&stmts, vectype, vec_def);
> >>>>
> >>>>          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> >>>> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
> >>>> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >>>> +         if (i < ncopies)
> >>>> +           {
> >>>> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
> >>>> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >>>> +           }
> >>>> +         else
> >>>> +           {
> >>>> +             /* vec_1 = vec_iv + (VF/n * S)
> >>>> +                vec_2 = vec_1 + (VF/n * S)
> >>>> +                ...
> >>>> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
> >>>> +
> >>>> +                vec_n is used as vec_loop to save the large step register and
> >>>> +                related operations.  */
> >>>> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
> >>>> +                          UNKNOWN_LOCATION);
> >>>> +           }
> >>>>        }
> >>>>     }
> >>>>
> >>>> --
> >>>> 2.34.1

Patch

diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
new file mode 100644
index 00000000000..bb3b6dcfe08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
@@ -0,0 +1,40 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
+/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
+
+/* Calculate the vectorized induction with a smaller step for an unrolled loop.
+
+   before (suggested_unroll_factor=2):
+	  fmov    s30, 8.0e+0
+	  fmov    s31, 4.0e+0
+	  dup     v27.4s, v30.s[0]
+	  dup     v28.4s, v31.s[0]
+     .L6:
+	  mov     v30.16b, v31.16b
+	  fadd    v31.4s, v31.4s, v27.4s
+	  fadd    v29.4s, v30.4s, v28.4s
+	  stp     q30, q29, [x0]
+	  add     x0, x0, 32
+	  cmp     x1, x0
+	  bne     .L6
+
+   after:
+	  fmov    s31, 4.0e+0
+	  dup     v29.4s, v31.s[0]
+     .L6:
+	  fadd    v30.4s, v31.4s, v29.4s
+	  stp     q31, q30, [x0]
+	  add     x0, x0, 32
+	  fadd    v31.4s, v29.4s, v30.4s
+	  cmp     x0, x1
+	  bne     .L6  */
+
+void
+foo2 (float *arr, float freq, float step)
+{
+  for (int i = 0; i < 1024; i++)
+    {
+      arr[i] = freq;
+      freq += step;
+    }
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 3b46c58a8d8..706ecbffd0c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10114,7 +10114,7 @@  vectorizable_induction (loop_vec_info loop_vinfo,
 				   new_vec, step_vectype, NULL);
 
       vec_def = induc_def;
-      for (i = 1; i < ncopies; i++)
+      for (i = 1; i < ncopies + 1; i++)
 	{
 	  /* vec_i = vec_prev + vec_step  */
 	  gimple_seq stmts = NULL;
@@ -10124,8 +10124,23 @@  vectorizable_induction (loop_vec_info loop_vinfo,
 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
  
 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
-	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
-	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+	  if (i < ncopies)
+	    {
+	      new_stmt = SSA_NAME_DEF_STMT (vec_def);
+	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+	    }
+	  else
+	    {
+	      /* vec_1 = vec_iv + (VF/n * S)
+		 vec_2 = vec_1 + (VF/n * S)
+		 ...
+		 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
+
+		 vec_n is used as vec_loop to save the large step register and
+		 related operations.  */
+	      add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+			   UNKNOWN_LOCATION);
+	    }
 	}
     }