
Don't reduce estimated unrolled size for innermost loop.

Message ID 20240513022737.3105192-1-hongtao.liu@intel.com
State New
Series Don't reduce estimated unrolled size for innermost loop.

Commit Message

liuhongt May 13, 2024, 2:27 a.m. UTC
As shown by the testcase in the PR, at -O3 cunrolli may prevent vectorization
of the innermost loop and increase register pressure.
The patch removes the 1/3 reduction of unr_insns for the innermost loop when
ul is UL_ALL.  The ul != UL_ALL check is needed since complete unrolling of
some small loops at -O2 relies on the reduction.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big impact for SPEC2017.
Ok for trunk?

gcc/ChangeLog:

	PR tree-optimization/112325
	* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
	new parameters: loop and ul, and remove unr_insns reduction
	for innermost loop.
	(try_unroll_loop_completely): Pass loop and ul to
	estimated_unrolled_size.

gcc/testsuite/ChangeLog:

	* gcc.dg/tree-ssa/pr112325.c: New test.
	* gcc.dg/vect/pr69783.c: Add extra option --param
	max-completely-peeled-insns=300.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
 gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
 3 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c

Comments

Richard Biener May 13, 2024, 7:40 a.m. UTC | #1
On Mon, May 13, 2024 at 4:29 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> As testcase in the PR, O3 cunrolli may prevent vectorization for the
> innermost loop and increase register pressure.
> The patch removes the 1/3 reduction of unr_insn for innermost loop for UL_ALL.
> ul != UR_ALL is needed since some small loop complete unrolling at O2 relies
> the reduction.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> No big impact for SPEC2017.
> Ok for trunk?

This removes the 1/3 reduction when unrolling a loop nest (the case I was
concerned about).  Unrolling of a nest is done by iterating in
tree_unroll_loops_completely, so the to-be-unrolled loop appears innermost.
So I think you need a new parameter on tree_unroll_loops_completely_1
indicating whether we're in the first iteration (or whether to assume
innermost loops will "simplify").
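
(For concreteness, a toy sketch of that structure - the driver iterates
until nothing changes, and only the first iteration knows that a loop is
genuinely innermost; the names below are illustrative stand-ins for
tree_unroll_loops_completely/_1, not the actual GCC code:)

  /* Toy model of the suggested plumbing.  Later driver iterations see
     outer loops whose inner loops were already unrolled away, so the
     optimistic size estimate should only be skipped when FIRST_P says
     we are in the first (cunrolli-style) iteration.  */
  static int
  unroll_loops_once (int first_p)
  {
    (void) first_p;  /* would be passed down to the size estimate */
    return 0;        /* no further changes */
  }

  static void
  unroll_loops_completely (void)
  {
    int changed, first_p = 1;
    do
      {
        changed = unroll_loops_once (first_p);
        first_p = 0;
      }
    while (changed);
  }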

Few comments below

> gcc/ChangeLog:
>
>         PR tree-optimization/112325
>         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
>         new parameters: loop and ul, and remove unr_insns reduction
>         for innermost loop.
>         (try_unroll_loop_completely): Pass loop and ul to
>         estimated_unrolled_size.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/tree-ssa/pr112325.c: New test.
>         * gcc.dg/vect/pr69783.c: Add extra option --param
>         max-completely-peeled-insns=300.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
>  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
>  gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
>  3 files changed, 71 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> new file mode 100644
> index 00000000000..14208b3e7f8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> @@ -0,0 +1,57 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> +
> +typedef unsigned short ggml_fp16_t;
> +static float table_f32_f16[1 << 16];
> +
> +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> +    unsigned short s;
> +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> +    return table_f32_f16[s];
> +}
> +
> +typedef struct {
> +    ggml_fp16_t d;
> +    ggml_fp16_t m;
> +    unsigned char qh[4];
> +    unsigned char qs[32 / 2];
> +} block_q5_1;
> +
> +typedef struct {
> +    float d;
> +    float s;
> +    char qs[32];
> +} block_q8_1;
> +
> +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> +    const int qk = 32;
> +    const int nb = n / qk;
> +
> +    const block_q5_1 * restrict x = vx;
> +    const block_q8_1 * restrict y = vy;
> +
> +    float sumf = 0.0;
> +
> +    for (int i = 0; i < nb; i++) {
> +        unsigned qh;
> +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> +
> +        int sumi = 0;
> +
> +        for (int j = 0; j < qk/2; ++j) {
> +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> +
> +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> +
> +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> +        }
> +
> +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> +    }
> +
> +    *s = sumf;
> +}
> +
> +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> index 5df95d0ce4e..a1f75514d72 100644
> --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-require-effective-target vect_float } */
> -/* { dg-additional-options "-Ofast -funroll-loops" } */
> +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */

If we rely on unrolling of a loop, can you put #pragma unroll [N]
before the respective loop instead?
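
(For illustration, a minimal example of that - the loop body below is a
placeholder, not the actual pr69783.c code:)

  void
  foo (float *restrict out, const float *restrict a, const float *restrict b)
  {
    /* #pragma GCC unroll N applies to the loop that immediately follows,
       so the test would request the unrolling it depends on explicitly
       instead of raising --param max-completely-peeled-insns globally.  */
  #pragma GCC unroll 4
    for (int j = 0; j < 516; j++)
      out[j] = a[j] * b[j];
  }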

>  #define NXX 516
>  #define NYY 516
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index bf017137260..5e0eca647a1 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
>
>  static unsigned HOST_WIDE_INT
>  estimated_unrolled_size (struct loop_size *size,
> -                        unsigned HOST_WIDE_INT nunroll)
> +                        unsigned HOST_WIDE_INT nunroll,
> +                        enum unroll_level ul,
> +                        class loop* loop)
>  {
>    HOST_WIDE_INT unr_insns = ((nunroll)
>                              * (HOST_WIDE_INT) (size->overall
> @@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
>      unr_insns = 0;
>    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
>
> -  unr_insns = unr_insns * 2 / 3;
> +  /* For the innermost loop, the loop body is not likely to be simplified
> +     by as much as 1/3, and unrolling may increase register pressure a lot.
> +     UL != UL_ALL is needed to still unroll small loops at O2.  */
> +  class loop *loop_father = loop_outer (loop);
> +  if (loop->inner || !loop_father

Do we ever get here for !loop_father?  We shouldn't.

> +      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)

This means you exempt all loops that are direct children of the loop
root tree.  That doesn't make much sense.

> +      || ul != UL_ALL)

This is also quite odd - we're being more optimistic for UL_NO_GROWTH
than for UL_ALL?  This doesn't make much sense.

Overall I think this means removal of being optimistic doesn't work so well?

If we need some extra leeway for UL_NO_GROWTH for what we expect
to unroll it might be better to add sth like --param
nogrowth-completely-peeled-insns
specifying a fixed surplus size?  Or we need to look at what's the problem
with the testcases regressing or the one you are trying to fix.
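
(A rough sketch of that fixed-surplus idea, in the style of the GCC code;
the nogrowth_surplus value would come from the --param suggested above,
which does not exist today:)

  /* Sketch only: instead of scaling the whole estimate by 2/3, allow a
     fixed number of extra insns before treating the unroll as growth.  */
  static bool
  unrolled_size_acceptable_p (long unr_insns, long ninsns,
                              long nogrowth_surplus)
  {
    /* UL_NO_GROWTH today requires unr_insns <= ninsns; the surplus gives
       small abstraction-removal loops some leeway without the 2/3
       heuristic.  */
    return unr_insns <= ninsns + nogrowth_surplus;
  }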

I did experiment with better estimating cleanup done at some point
(see attached),
but didn't get to finishing that (and as said, as we're running VN on the result
we'd ideally do that as part of the estimation somehow).

Richard.

> +    unr_insns = unr_insns * 2 / 3;
> +
>    if (unr_insns <= 0)
>      unr_insns = 1;
>
> @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
>
>           unsigned HOST_WIDE_INT ninsns = size.overall;
>           unsigned HOST_WIDE_INT unr_insns
> -           = estimated_unrolled_size (&size, n_unroll);
> +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
>           if (dump_file && (dump_flags & TDF_DETAILS))
>             {
>               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> --
> 2.31.1
>
Hongtao Liu May 15, 2024, 2:14 a.m. UTC | #2
On Mon, May 13, 2024 at 3:40 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Mon, May 13, 2024 at 4:29 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > As testcase in the PR, O3 cunrolli may prevent vectorization for the
> > innermost loop and increase register pressure.
> > The patch removes the 1/3 reduction of unr_insn for innermost loop for UL_ALL.
> > ul != UR_ALL is needed since some small loop complete unrolling at O2 relies
> > the reduction.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > No big impact for SPEC2017.
> > Ok for trunk?
>
> This removes the 1/3 reduction when unrolling a loop nest (the case I was
> concerned about).  Unrolling of a nest is by iterating in
> tree_unroll_loops_completely
> so the to be unrolled loop appears innermost.  So I think you need a new
> parameter on tree_unroll_loops_completely_1 indicating whether we're in the
> first iteration (or whether to assume inner most loops will "simplify").
yes, it would be better.
>
> Few comments below
>
> > gcc/ChangeLog:
> >
> >         PR tree-optimization/112325
> >         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
> >         new parameters: loop and ul, and remove unr_insns reduction
> >         for innermost loop.
> >         (try_unroll_loop_completely): Pass loop and ul to
> >         estimated_unrolled_size.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.dg/tree-ssa/pr112325.c: New test.
> >         * gcc.dg/vect/pr69783.c: Add extra option --param
> >         max-completely-peeled-insns=300.
> > ---
> >  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
> >  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
> >  gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
> >  3 files changed, 71 insertions(+), 4 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> >
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > new file mode 100644
> > index 00000000000..14208b3e7f8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > @@ -0,0 +1,57 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> > +
> > +typedef unsigned short ggml_fp16_t;
> > +static float table_f32_f16[1 << 16];
> > +
> > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> > +    unsigned short s;
> > +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> > +    return table_f32_f16[s];
> > +}
> > +
> > +typedef struct {
> > +    ggml_fp16_t d;
> > +    ggml_fp16_t m;
> > +    unsigned char qh[4];
> > +    unsigned char qs[32 / 2];
> > +} block_q5_1;
> > +
> > +typedef struct {
> > +    float d;
> > +    float s;
> > +    char qs[32];
> > +} block_q8_1;
> > +
> > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> > +    const int qk = 32;
> > +    const int nb = n / qk;
> > +
> > +    const block_q5_1 * restrict x = vx;
> > +    const block_q8_1 * restrict y = vy;
> > +
> > +    float sumf = 0.0;
> > +
> > +    for (int i = 0; i < nb; i++) {
> > +        unsigned qh;
> > +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> > +
> > +        int sumi = 0;
> > +
> > +        for (int j = 0; j < qk/2; ++j) {
> > +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> > +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> > +
> > +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> > +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> > +
> > +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> > +        }
> > +
> > +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> > +    }
> > +
> > +    *s = sumf;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > index 5df95d0ce4e..a1f75514d72 100644
> > --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> > +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > @@ -1,6 +1,6 @@
> >  /* { dg-do compile } */
> >  /* { dg-require-effective-target vect_float } */
> > -/* { dg-additional-options "-Ofast -funroll-loops" } */
> > +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
>
> If we rely on unrolling of a loop can you put #pragma unroll [N]
> before the respective loop
> instead?
>
> >  #define NXX 516
> >  #define NYY 516
> > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > index bf017137260..5e0eca647a1 100644
> > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
> >
> >  static unsigned HOST_WIDE_INT
> >  estimated_unrolled_size (struct loop_size *size,
> > -                        unsigned HOST_WIDE_INT nunroll)
> > +                        unsigned HOST_WIDE_INT nunroll,
> > +                        enum unroll_level ul,
> > +                        class loop* loop)
> >  {
> >    HOST_WIDE_INT unr_insns = ((nunroll)
> >                              * (HOST_WIDE_INT) (size->overall
> > @@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
> >      unr_insns = 0;
> >    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
> >
> > -  unr_insns = unr_insns * 2 / 3;
> > +  /* For innermost loop, loop body is not likely to be simplied as much as 1/3.
> > +     and may increase a lot of register pressure.
> > +     UL != UL_ALL is need to unroll small loop at O2.  */
> > +  class loop *loop_father = loop_outer (loop);
> > +  if (loop->inner || !loop_father
>
> Do we ever get here for !loop_father?  We shouldn't.
>
> > +      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
>
> This means you excempt all loops that are direct children of the loop
> root tree.  That doesn't make much sense.
>
> > +      || ul != UL_ALL)
>
> This is also quite odd - we're being more optimistic for UL_NO_GROWTH
> than for UL_ALL?  This doesn't make much sense.
>
> Overall I think this means removal of being optimistic doesn't work so well?
They're mostly used to avoid testcase regressions.  The regressed
testcases rely on the loops being completely unrolled by the first
unroll pass, but now they're only unrolled by the second one.
I checked some and the codegen is the same; I need to go through all of
them.  If the final codegen is the same or optimal, shall I just adjust
the testcases?

g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for warnings, line 56)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for warnings, line 66)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for warnings, line 56)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for warnings, line 66)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for warnings, line 56)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for warnings, line 66)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for warnings, line 56)
g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for warnings, line 66)
gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
gcc: gcc.dg/graphite/interchange-8.c execution test
gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized "Invalid sum"
gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last iteration exit edge was proved true."
gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2 iterations completely unrolled"
gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM <unsigned long> \\[\\(char \\*\\)\\&a8] = "
gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"
gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized "Invalid sum" 0
gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect "vectorized 3 loops" 1

>
> If we need some extra leeway for UL_NO_GROWTH for what we expect
> to unroll it might be better to add sth like --param
> nogrowth-completely-peeled-insns
> specifying a fixed surplus size?  Or we need to look at what's the problem
> with the testcases regressing or the one you are trying to fix.
>
> I did experiment with better estimating cleanup done at some point
> (see attached),
> but didn't get to finishing that (and as said, as we're running VN on the result
> we'd ideally do that as part of the estimation somehow).
>
> Richard.
>
> > +    unr_insns = unr_insns * 2 / 3;
> > +
> >    if (unr_insns <= 0)
> >      unr_insns = 1;
> >
> > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> >
> >           unsigned HOST_WIDE_INT ninsns = size.overall;
> >           unsigned HOST_WIDE_INT unr_insns
> > -           = estimated_unrolled_size (&size, n_unroll);
> > +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
> >           if (dump_file && (dump_flags & TDF_DETAILS))
> >             {
> >               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> > --
> > 2.31.1
> >
Richard Biener May 15, 2024, 9:24 a.m. UTC | #3
On Wed, May 15, 2024 at 4:15 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, May 13, 2024 at 3:40 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Mon, May 13, 2024 at 4:29 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > As testcase in the PR, O3 cunrolli may prevent vectorization for the
> > > innermost loop and increase register pressure.
> > > The patch removes the 1/3 reduction of unr_insn for innermost loop for UL_ALL.
> > > ul != UR_ALL is needed since some small loop complete unrolling at O2 relies
> > > the reduction.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > No big impact for SPEC2017.
> > > Ok for trunk?
> >
> > This removes the 1/3 reduction when unrolling a loop nest (the case I was
> > concerned about).  Unrolling of a nest is by iterating in
> > tree_unroll_loops_completely
> > so the to be unrolled loop appears innermost.  So I think you need a new
> > parameter on tree_unroll_loops_completely_1 indicating whether we're in the
> > first iteration (or whether to assume inner most loops will "simplify").
> yes, it would be better.
> >
> > Few comments below
> >
> > > gcc/ChangeLog:
> > >
> > >         PR tree-optimization/112325
> > >         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
> > >         new parameters: loop and ul, and remove unr_insns reduction
> > >         for innermost loop.
> > >         (try_unroll_loop_completely): Pass loop and ul to
> > >         estimated_unrolled_size.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.dg/tree-ssa/pr112325.c: New test.
> > >         * gcc.dg/vect/pr69783.c: Add extra option --param
> > >         max-completely-peeled-insns=300.
> > > ---
> > >  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
> > >  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
> > >  gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
> > >  3 files changed, 71 insertions(+), 4 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > >
> > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > new file mode 100644
> > > index 00000000000..14208b3e7f8
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > @@ -0,0 +1,57 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> > > +
> > > +typedef unsigned short ggml_fp16_t;
> > > +static float table_f32_f16[1 << 16];
> > > +
> > > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> > > +    unsigned short s;
> > > +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> > > +    return table_f32_f16[s];
> > > +}
> > > +
> > > +typedef struct {
> > > +    ggml_fp16_t d;
> > > +    ggml_fp16_t m;
> > > +    unsigned char qh[4];
> > > +    unsigned char qs[32 / 2];
> > > +} block_q5_1;
> > > +
> > > +typedef struct {
> > > +    float d;
> > > +    float s;
> > > +    char qs[32];
> > > +} block_q8_1;
> > > +
> > > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> > > +    const int qk = 32;
> > > +    const int nb = n / qk;
> > > +
> > > +    const block_q5_1 * restrict x = vx;
> > > +    const block_q8_1 * restrict y = vy;
> > > +
> > > +    float sumf = 0.0;
> > > +
> > > +    for (int i = 0; i < nb; i++) {
> > > +        unsigned qh;
> > > +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> > > +
> > > +        int sumi = 0;
> > > +
> > > +        for (int j = 0; j < qk/2; ++j) {
> > > +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> > > +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> > > +
> > > +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> > > +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> > > +
> > > +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> > > +        }
> > > +
> > > +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> > > +    }
> > > +
> > > +    *s = sumf;
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> > > diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > index 5df95d0ce4e..a1f75514d72 100644
> > > --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > @@ -1,6 +1,6 @@
> > >  /* { dg-do compile } */
> > >  /* { dg-require-effective-target vect_float } */
> > > -/* { dg-additional-options "-Ofast -funroll-loops" } */
> > > +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
> >
> > If we rely on unrolling of a loop can you put #pragma unroll [N]
> > before the respective loop
> > instead?
> >
> > >  #define NXX 516
> > >  #define NYY 516
> > > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > > index bf017137260..5e0eca647a1 100644
> > > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > > @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
> > >
> > >  static unsigned HOST_WIDE_INT
> > >  estimated_unrolled_size (struct loop_size *size,
> > > -                        unsigned HOST_WIDE_INT nunroll)
> > > +                        unsigned HOST_WIDE_INT nunroll,
> > > +                        enum unroll_level ul,
> > > +                        class loop* loop)
> > >  {
> > >    HOST_WIDE_INT unr_insns = ((nunroll)
> > >                              * (HOST_WIDE_INT) (size->overall
> > > @@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
> > >      unr_insns = 0;
> > >    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
> > >
> > > -  unr_insns = unr_insns * 2 / 3;
> > > +  /* For innermost loop, loop body is not likely to be simplied as much as 1/3.
> > > +     and may increase a lot of register pressure.
> > > +     UL != UL_ALL is need to unroll small loop at O2.  */
> > > +  class loop *loop_father = loop_outer (loop);
> > > +  if (loop->inner || !loop_father
> >
> > Do we ever get here for !loop_father?  We shouldn't.
> >
> > > +      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
> >
> > This means you excempt all loops that are direct children of the loop
> > root tree.  That doesn't make much sense.
> >
> > > +      || ul != UL_ALL)
> >
> > This is also quite odd - we're being more optimistic for UL_NO_GROWTH
> > than for UL_ALL?  This doesn't make much sense.
> >
> > Overall I think this means removal of being optimistic doesn't work so well?
> They're mostly used to avoid testcase regressions., the regressed
> testcases rely on the behavior of complete unroll from the first
> unroll, but now it's only unrolled by the second unroll.
> I checked some, the codegen are the same, I need to go through all of
> them, if the final codegen are the same or optimal, I'll just adjust
> testcases?
>
> ++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for
>
> g++warnings, line 56)
>
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for
>
> g++warnings, line 66)
>
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for
>
> g++warnings, line 56)
>
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for
>
> g++warnings, line 66)
>
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for
>
> g++warnings, line 56)
>
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for
>
> g++warnings, line 66)
>
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for
>
> g++warnings, line 56)
>
> g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for
>
> g++warnings, line 66)

This seems to expect unrolling for an init loop rolling 1 time.  I don't
see 1/3 of the stmts vanishing, but it's definitely an interesting corner
case.  That's why I was thinking of maybe adding a --param specifying
an absolute growth we consider "no growth" - ugly as well, of course, but
it would cover these small loops.

How do the sizes play out here after your change?  Before it's

size: 13-3, last_iteration: 2-2
  Loop size: 13
  Estimated size after unrolling: 13

and the init is quite complex with virtual pointer inits.  We do have

  size:   1 _14 = _5 + -1;
   Induction variable computation will be folded away.
  size:   1 _15 = _4 + 40;
 BB: 3, after_exit: 1

where we don't realize the + 40 of _15 will be folded into the dereferences
but that would only subtract 1.

  size:   3 C::C (_23, &MEM <const void *[8]> [(void *)&_ZTT2D1 + 48B]);

that's the biggest cost.

To diagnose the array bound issue we rely on early unrolling since we avoid
-Warray-bounds after late unrolling due to false positives.

This is definitely not an unrolling that preserves code size.

> gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
>
> gcc: gcc.dg/graphite/interchange-8.c execution test

An execute fail is bad ... can we avoid this (but file a bug report!) by
placing #pragma GCC unroll before the innermost loop?  We should
probably honor that in early unrolling (not sure if we do).

> gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized
> "Invalid sum"
>
> gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last
> iteration exit edge was proved true."
>
> gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2
> iterations completely unrolled"

again the current estimate is the same before/after unrolling, here
we expect to retain one compare & branch.

> gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM
> <unsigned long> \\[\\(char \\*\\)\\&a8] = "
>
> gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"

again, the 2/3 scaling is difficult to warrant.  The goal of the early
unrolling pass was abstraction-penalty removal, which works for low
trip-count loops.  So maybe that new --param for allowed growth should
scale, but instead of scaling by the loop size as 2/3 does, it should scale
by the number of times we peel, which means offsetting the body size
estimate by a constant.

Honza?  Any idea how to go forward here?

Thanks,
Richard.

> gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
>
> gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized
> "Invalid sum" 0
>
> gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
>
> gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect
> "vectorized 3 loops" 1
>
> >
> > If we need some extra leeway for UL_NO_GROWTH for what we expect
> > to unroll it might be better to add sth like --param
> > nogrowth-completely-peeled-insns
> > specifying a fixed surplus size?  Or we need to look at what's the problem
> > with the testcases regressing or the one you are trying to fix.
> >
> > I did experiment with better estimating cleanup done at some point
> > (see attached),
> > but didn't get to finishing that (and as said, as we're running VN on the result
> > we'd ideally do that as part of the estimation somehow).
> >
> > Richard.
> >
> > > +    unr_insns = unr_insns * 2 / 3;
> > > +
> > >    if (unr_insns <= 0)
> > >      unr_insns = 1;
> > >
> > > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> > >
> > >           unsigned HOST_WIDE_INT ninsns = size.overall;
> > >           unsigned HOST_WIDE_INT unr_insns
> > > -           = estimated_unrolled_size (&size, n_unroll);
> > > +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
> > >           if (dump_file && (dump_flags & TDF_DETAILS))
> > >             {
> > >               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> > > --
> > > 2.31.1
> > >
>
>
>
> --
> BR,
> Hongtao
Hongtao Liu May 15, 2024, 9:49 a.m. UTC | #4
C  -std=gnu++14 LP64 note (test for
> >
> > g++warnings, line 56)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for
> >
> > g++warnings, line 66)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for
> >
> > g++warnings, line 56)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for
> >
> > g++warnings, line 66)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for
> >
> > g++warnings, line 56)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for
> >
> > g++warnings, line 66)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for
> >
> > g++warnings, line 56)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for
> >
> > g++warnings, line 66)
>
> This seems to expect unrolling for an init loop rolling 1 times.  I don't
> see 1/3 of the stmts vanishing but it's definitely an interesting corner
> case.  That's why I was thinking of maybe adding a --param specifying
> an absolute growth we consider "no growth" - but of course that's
> ugly as well but it would cover these small loops.
>
> How do the sizes play out here after your change?  Before it's
>
> size: 13-3, last_iteration: 2-2
>   Loop size: 13
>   Estimated size after unrolling: 13
After:
size: 13-3, last_iteration: 2-2
  Loop size: 13
  Estimated size after unrolling: 20
Not unrolling loop 1: size would grow.
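
(A tiny standalone check of that arithmetic, using the formula from the
estimated_unrolled_size hunk earlier in the thread; n_unroll = 2 is
inferred from the dump and is an assumption:)

  #include <stdio.h>

  int
  main (void)
  {
    /* From the Warray-bounds-20.C dump: size: 13-3, last_iteration: 2-2.  */
    long overall = 13, eliminated = 3, last_iter = 2, last_iter_elim = 2;
    long n_unroll = 2;  /* assumed */

    long unr_insns = n_unroll * (overall - eliminated)
                     + (last_iter - last_iter_elim);
    printf ("without 2/3 reduction: %ld\n", unr_insns);          /* 20 */
    printf ("with 2/3 reduction:    %ld\n", unr_insns * 2 / 3);  /* 13 */
    return 0;
  }

So dropping the reduction turns the estimate from 13 (equal to the loop
size, i.e. "no growth") into 20, which matches the "size would grow"
message above.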

>
> and the init is quite complex with virtual pointer inits.  We do have
>
>   size:   1 _14 = _5 + -1;
>    Induction variable computation will be folded away.
>   size:   1 _15 = _4 + 40;
>  BB: 3, after_exit: 1
>
> where we don't realize the + 40 of _15 will be folded into the dereferences
> but that would only subtract 1.
>
>   size:   3 C::C (_23, &MEM <const void *[8]> [(void *)&_ZTT2D1 + 48B]);
>
> that's the biggest cost.
>
> To diagnose the array bound issue we rely on early unrolling since we avoid
> -Warray-bounds after late unrolling due to false positives.
>
> This is definitely not an unrolling that preserves code size.
>
> > gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
> >
> > gcc: gcc.dg/graphite/interchange-8.c execution test
>
> An execute fail is bad ... can we avoid this (but file a bugreport!) when
It's PR115101
> placing #pragma GCC unroll before the innermost loop?  We should
> probably honor that in early unrolling (not sure if we do).
>
> > gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized
> > "Invalid sum"
> >
> > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last
> > iteration exit edge was proved true."
> >
> > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2
> > iterations completely unrolled"
>
> again the current estimate is the same before/after unrolling, here
> we expect to retain one compare & branch.
>
> > gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM
> > <unsigned long> \\[\\(char \\*\\)\\&a8] = "
> >
> > gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"
>
> again the 2/3 scaling is difficult to warrant.  The goal of the early unrolling
> pass was abstraction penalty removal which works for low trip-count loops.
> So maybe that new --param for allowed growth should scale but instead
> of scaling by the loop size as 2/3 does it should scale by the number of
> times we peel which means offsetting the body size estimate by a constant.
>
> Honza?  Any idea how to go forward here?
>
> Thanks,
> Richard.
>
> > gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
> >
> > gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized
> > "Invalid sum" 0
> >
> > gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
> >
> > gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect
> > "vectorized 3 loops" 1
> >
> > >
> > > If we need some extra leeway for UL_NO_GROWTH for what we expect
> > > to unroll it might be better to add sth like --param
> > > nogrowth-completely-peeled-insns
> > > specifying a fixed surplus size?  Or we need to look at what's the problem
> > > with the testcases regressing or the one you are trying to fix.
> > >
> > > I did experiment with better estimating cleanup done at some point
> > > (see attached),
> > > but didn't get to finishing that (and as said, as we're running VN on the result
> > > we'd ideally do that as part of the estimation somehow).
> > >
> > > Richard.
> > >
> > > > +    unr_insns = unr_insns * 2 / 3;
> > > > +
> > > >    if (unr_insns <= 0)
> > > >      unr_insns = 1;
> > > >
> > > > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> > > >
> > > >           unsigned HOST_WIDE_INT ninsns = size.overall;
> > > >           unsigned HOST_WIDE_INT unr_insns
> > > > -           = estimated_unrolled_size (&size, n_unroll);
> > > > +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
> > > >           if (dump_file && (dump_flags & TDF_DETAILS))
> > > >             {
> > > >               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> > > > --
> > > > 2.31.1
> > > >
> >
> >
> >
> > --
> > BR,
> > Hongtao
Hongtao Liu May 21, 2024, 2:35 a.m. UTC | #5
On Wed, May 15, 2024 at 5:24 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, May 15, 2024 at 4:15 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Mon, May 13, 2024 at 3:40 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Mon, May 13, 2024 at 4:29 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > >
> > > > As testcase in the PR, O3 cunrolli may prevent vectorization for the
> > > > innermost loop and increase register pressure.
> > > > The patch removes the 1/3 reduction of unr_insn for innermost loop for UL_ALL.
> > > > ul != UR_ALL is needed since some small loop complete unrolling at O2 relies
> > > > the reduction.
> > > >
> > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > No big impact for SPEC2017.
> > > > Ok for trunk?
> > >
> > > This removes the 1/3 reduction when unrolling a loop nest (the case I was
> > > concerned about).  Unrolling of a nest is by iterating in
> > > tree_unroll_loops_completely
> > > so the to be unrolled loop appears innermost.  So I think you need a new
> > > parameter on tree_unroll_loops_completely_1 indicating whether we're in the
> > > first iteration (or whether to assume inner most loops will "simplify").
> > yes, it would be better.
> > >
> > > Few comments below
> > >
> > > > gcc/ChangeLog:
> > > >
> > > >         PR tree-optimization/112325
> > > >         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
> > > >         new parameters: loop and ul, and remove unr_insns reduction
> > > >         for innermost loop.
> > > >         (try_unroll_loop_completely): Pass loop and ul to
> > > >         estimated_unrolled_size.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >         * gcc.dg/tree-ssa/pr112325.c: New test.
> > > >         * gcc.dg/vect/pr69783.c: Add extra option --param
> > > >         max-completely-peeled-insns=300.
> > > > ---
> > > >  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
> > > >  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
> > > >  gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
> > > >  3 files changed, 71 insertions(+), 4 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > >
> > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > new file mode 100644
> > > > index 00000000000..14208b3e7f8
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > @@ -0,0 +1,57 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> > > > +
> > > > +typedef unsigned short ggml_fp16_t;
> > > > +static float table_f32_f16[1 << 16];
> > > > +
> > > > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> > > > +    unsigned short s;
> > > > +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> > > > +    return table_f32_f16[s];
> > > > +}
> > > > +
> > > > +typedef struct {
> > > > +    ggml_fp16_t d;
> > > > +    ggml_fp16_t m;
> > > > +    unsigned char qh[4];
> > > > +    unsigned char qs[32 / 2];
> > > > +} block_q5_1;
> > > > +
> > > > +typedef struct {
> > > > +    float d;
> > > > +    float s;
> > > > +    char qs[32];
> > > > +} block_q8_1;
> > > > +
> > > > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> > > > +    const int qk = 32;
> > > > +    const int nb = n / qk;
> > > > +
> > > > +    const block_q5_1 * restrict x = vx;
> > > > +    const block_q8_1 * restrict y = vy;
> > > > +
> > > > +    float sumf = 0.0;
> > > > +
> > > > +    for (int i = 0; i < nb; i++) {
> > > > +        unsigned qh;
> > > > +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> > > > +
> > > > +        int sumi = 0;
> > > > +
> > > > +        for (int j = 0; j < qk/2; ++j) {
> > > > +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> > > > +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> > > > +
> > > > +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> > > > +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> > > > +
> > > > +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> > > > +        }
> > > > +
> > > > +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> > > > +    }
> > > > +
> > > > +    *s = sumf;
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> > > > diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > index 5df95d0ce4e..a1f75514d72 100644
> > > > --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > @@ -1,6 +1,6 @@
> > > >  /* { dg-do compile } */
> > > >  /* { dg-require-effective-target vect_float } */
> > > > -/* { dg-additional-options "-Ofast -funroll-loops" } */
> > > > +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
> > >
> > > If we rely on unrolling of a loop can you put #pragma unroll [N]
> > > before the respective loop
> > > instead?
> > >
> > > >  #define NXX 516
> > > >  #define NYY 516
> > > > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > > > index bf017137260..5e0eca647a1 100644
> > > > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > > > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > > > @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
> > > >
> > > >  static unsigned HOST_WIDE_INT
> > > >  estimated_unrolled_size (struct loop_size *size,
> > > > -                        unsigned HOST_WIDE_INT nunroll)
> > > > +                        unsigned HOST_WIDE_INT nunroll,
> > > > +                        enum unroll_level ul,
> > > > +                        class loop* loop)
> > > >  {
> > > >    HOST_WIDE_INT unr_insns = ((nunroll)
> > > >                              * (HOST_WIDE_INT) (size->overall
> > > > @@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
> > > >      unr_insns = 0;
> > > >    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
> > > >
> > > > -  unr_insns = unr_insns * 2 / 3;
> > > > +  /* For innermost loop, loop body is not likely to be simplied as much as 1/3.
> > > > +     and may increase a lot of register pressure.
> > > > +     UL != UL_ALL is need to unroll small loop at O2.  */
> > > > +  class loop *loop_father = loop_outer (loop);
> > > > +  if (loop->inner || !loop_father
> > >
> > > Do we ever get here for !loop_father?  We shouldn't.
> > >
> > > > +      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
> > >
> > > This means you excempt all loops that are direct children of the loop
> > > root tree.  That doesn't make much sense.
> > >
> > > > +      || ul != UL_ALL)
> > >
> > > This is also quite odd - we're being more optimistic for UL_NO_GROWTH
> > > than for UL_ALL?  This doesn't make much sense.
> > >
> > > Overall I think this means removal of being optimistic doesn't work so well?
> > They're mostly used to avoid testcase regressions., the regressed
> > testcases rely on the behavior of complete unroll from the first
> > unroll, but now it's only unrolled by the second unroll.
> > I checked some, the codegen are the same, I need to go through all of
> > them, if the final codegen are the same or optimal, I'll just adjust
> > testcases?
> >
> > ++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for
> >
> > g++warnings, line 56)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for
> >
> > g++warnings, line 66)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for
> >
> > g++warnings, line 56)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for
> >
> > g++warnings, line 66)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for
> >
> > g++warnings, line 56)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for
> >
> > g++warnings, line 66)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for
> >
> > g++warnings, line 56)
> >
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for
> >
> > g++warnings, line 66)
>
> This seems to expect unrolling for an init loop rolling 1 times.  I don't
> see 1/3 of the stmts vanishing but it's definitely an interesting corner
> case.  That's why I was thinking of maybe adding a --param specifying
> an absolute growth we consider "no growth" - but of course that's
> ugly as well but it would cover these small loops.
>
> How do the sizes play out here after your change?  Before it's
>
> size: 13-3, last_iteration: 2-2
>   Loop size: 13
>   Estimated size after unrolling: 13
>
> and the init is quite complex with virtual pointer inits.  We do have
>
>   size:   1 _14 = _5 + -1;
>    Induction variable computation will be folded away.
>   size:   1 _15 = _4 + 40;
>  BB: 3, after_exit: 1
>
> where we don't realize the + 40 of _15 will be folded into the dereferences
> but that would only subtract 1.
>
>   size:   3 C::C (_23, &MEM <const void *[8]> [(void *)&_ZTT2D1 + 48B]);
>
> that's the biggest cost.
>
> To diagnose the array bound issue we rely on early unrolling since we avoid
> -Warray-bounds after late unrolling due to false positives.
>
> This is definitely not an unrolling that preserves code size.
>
> > gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
> >
> > gcc: gcc.dg/graphite/interchange-8.c execution test
>
> An execute fail is bad ... can we avoid this (but file a bugreport!) when
> placing #pragma GCC unroll before the innermost loop?  We should
> probably honor that in early unrolling (not sure if we do).
>
> > gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized
> > "Invalid sum"
> >
> > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last
> > iteration exit edge was proved true."
> >
> > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2
> > iterations completely unrolled"
>
> again the current estimate is the same before/after unrolling, here
> we expect to retain one compare & branch.
>
> > gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM
> > <unsigned long> \\[\\(char \\*\\)\\&a8] = "
> >
> > gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"
>
> again the 2/3 scaling is difficult to warrant.  The goal of the early unrolling
> pass was abstraction penalty removal which works for low trip-count loops.
> So maybe that new --param for allowed growth should scale but instead
> of scaling by the loop size as 2/3 does it should scale by the number of
> times we peel which means offsetting the body size estimate by a constant.
>
> Honza?  Any idea how to go forward here?
>
> Thanks,
> Richard.
>
> > gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
> >
> > gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized
> > "Invalid sum" 0
> >
> > gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
> >
> > gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect
> > "vectorized 3 loops" 1
> >
> > >
> > > If we need some extra leeway for UL_NO_GROWTH for what we expect
> > > to unroll it might be better to add sth like --param
> > > nogrowth-completely-peeled-insns
It's hard to find a default value satisfying all testcases:
some require unrolling a loop with a 7-insn increment, while others don't
want unrolling with a 5-insn increment.
The original 2/3 reduction happened to meet all those testcases (or the
testcases were constructed based on the old 2/3 behavior).
Can we define the parameter as a loop-size threshold below which we still
do the reduction, so that small loops can be unrolled?
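
(i.e. roughly the following shape, where the threshold would be a new,
currently hypothetical --param:)

  /* Sketch: keep the 2/3 reduction only for small loops, so the tiny
     abstraction-removal loops in the testsuite still unroll while large
     innermost loops no longer get an over-optimistic estimate.  */
  static long
  apply_unroll_size_reduction (long unr_insns, long small_loop_threshold)
  {
    return unr_insns <= small_loop_threshold ? unr_insns * 2 / 3 : unr_insns;
  }
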
> > > specifying a fixed surplus size?  Or we need to look at what's the problem
> > > with the testcases regressing or the one you are trying to fix.
> > >
> > > I did experiment with better estimating cleanup done at some point
> > > (see attached),
> > > but didn't get to finishing that (and as said, as we're running VN on the result
> > > we'd ideally do that as part of the estimation somehow).
> > >
> > > Richard.
> > >
> > > > +    unr_insns = unr_insns * 2 / 3;
> > > > +
> > > >    if (unr_insns <= 0)
> > > >      unr_insns = 1;
> > > >
> > > > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> > > >
> > > >           unsigned HOST_WIDE_INT ninsns = size.overall;
> > > >           unsigned HOST_WIDE_INT unr_insns
> > > > -           = estimated_unrolled_size (&size, n_unroll);
> > > > +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
> > > >           if (dump_file && (dump_flags & TDF_DETAILS))
> > > >             {
> > > >               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> > > > --
> > > > 2.31.1
> > > >
> >
> >
> >
> > --
> > BR,
> > Hongtao
Richard Biener May 21, 2024, 6:14 a.m. UTC | #6
On Tue, May 21, 2024 at 4:35 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, May 15, 2024 at 5:24 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, May 15, 2024 at 4:15 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Mon, May 13, 2024 at 3:40 PM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Mon, May 13, 2024 at 4:29 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > >
> > > > > As testcase in the PR, O3 cunrolli may prevent vectorization for the
> > > > > innermost loop and increase register pressure.
> > > > > The patch removes the 1/3 reduction of unr_insn for innermost loop for UL_ALL.
> > > > > ul != UR_ALL is needed since some small loop complete unrolling at O2 relies
> > > > > the reduction.
> > > > >
> > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > > No big impact for SPEC2017.
> > > > > Ok for trunk?
> > > >
> > > > This removes the 1/3 reduction when unrolling a loop nest (the case I was
> > > > concerned about).  Unrolling of a nest is by iterating in
> > > > tree_unroll_loops_completely
> > > > so the to be unrolled loop appears innermost.  So I think you need a new
> > > > parameter on tree_unroll_loops_completely_1 indicating whether we're in the
> > > > first iteration (or whether to assume inner most loops will "simplify").
> > > yes, it would be better.
> > > >
> > > > Few comments below
> > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > >         PR tree-optimization/112325
> > > > >         * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
> > > > >         new parameters: loop and ul, and remove unr_insns reduction
> > > > >         for innermost loop.
> > > > >         (try_unroll_loop_completely): Pass loop and ul to
> > > > >         estimated_unrolled_size.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > >         * gcc.dg/tree-ssa/pr112325.c: New test.
> > > > >         * gcc.dg/vect/pr69783.c: Add extra option --param
> > > > >         max-completely-peeled-insns=300.
> > > > > ---
> > > > >  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 ++++++++++++++++++++++++
> > > > >  gcc/testsuite/gcc.dg/vect/pr69783.c      |  2 +-
> > > > >  gcc/tree-ssa-loop-ivcanon.cc             | 16 +++++--
> > > > >  3 files changed, 71 insertions(+), 4 deletions(-)
> > > > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > >
> > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > > new file mode 100644
> > > > > index 00000000000..14208b3e7f8
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > > @@ -0,0 +1,57 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> > > > > +
> > > > > +typedef unsigned short ggml_fp16_t;
> > > > > +static float table_f32_f16[1 << 16];
> > > > > +
> > > > > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> > > > > +    unsigned short s;
> > > > > +    __builtin_memcpy(&s, &f, sizeof(unsigned short));
> > > > > +    return table_f32_f16[s];
> > > > > +}
> > > > > +
> > > > > +typedef struct {
> > > > > +    ggml_fp16_t d;
> > > > > +    ggml_fp16_t m;
> > > > > +    unsigned char qh[4];
> > > > > +    unsigned char qs[32 / 2];
> > > > > +} block_q5_1;
> > > > > +
> > > > > +typedef struct {
> > > > > +    float d;
> > > > > +    float s;
> > > > > +    char qs[32];
> > > > > +} block_q8_1;
> > > > > +
> > > > > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> > > > > +    const int qk = 32;
> > > > > +    const int nb = n / qk;
> > > > > +
> > > > > +    const block_q5_1 * restrict x = vx;
> > > > > +    const block_q8_1 * restrict y = vy;
> > > > > +
> > > > > +    float sumf = 0.0;
> > > > > +
> > > > > +    for (int i = 0; i < nb; i++) {
> > > > > +        unsigned qh;
> > > > > +        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> > > > > +
> > > > > +        int sumi = 0;
> > > > > +
> > > > > +        for (int j = 0; j < qk/2; ++j) {
> > > > > +            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> > > > > +            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> > > > > +
> > > > > +            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> > > > > +            const int x1 = (x[i].qs[j] >> 4) | xh_1;
> > > > > +
> > > > > +            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> > > > > +        }
> > > > > +
> > > > > +        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> > > > > +    }
> > > > > +
> > > > > +    *s = sumf;
> > > > > +}
> > > > > +
> > > > > +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
> > > > > diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > > index 5df95d0ce4e..a1f75514d72 100644
> > > > > --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > > +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > > > > @@ -1,6 +1,6 @@
> > > > >  /* { dg-do compile } */
> > > > >  /* { dg-require-effective-target vect_float } */
> > > > > -/* { dg-additional-options "-Ofast -funroll-loops" } */
> > > > > +/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
> > > >
> > > > If we rely on unrolling of a loop can you put #pragma unroll [N]
> > > > before the respective loop
> > > > instead?
> > > >
> > > > >  #define NXX 516
> > > > >  #define NYY 516
> > > > > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > > > > index bf017137260..5e0eca647a1 100644
> > > > > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > > > > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > > > > @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
> > > > >
> > > > >  static unsigned HOST_WIDE_INT
> > > > >  estimated_unrolled_size (struct loop_size *size,
> > > > > -                        unsigned HOST_WIDE_INT nunroll)
> > > > > +                        unsigned HOST_WIDE_INT nunroll,
> > > > > +                        enum unroll_level ul,
> > > > > +                        class loop* loop)
> > > > >  {
> > > > >    HOST_WIDE_INT unr_insns = ((nunroll)
> > > > >                              * (HOST_WIDE_INT) (size->overall
> > > > > @@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
> > > > >      unr_insns = 0;
> > > > >    unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
> > > > >
> > > > > -  unr_insns = unr_insns * 2 / 3;
> > > > > +  /* For innermost loop, loop body is not likely to be simplied as much as 1/3.
> > > > > +     and may increase a lot of register pressure.
> > > > > +     UL != UL_ALL is need to unroll small loop at O2.  */
> > > > > +  class loop *loop_father = loop_outer (loop);
> > > > > +  if (loop->inner || !loop_father
> > > >
> > > > Do we ever get here for !loop_father?  We shouldn't.
> > > >
> > > > > +      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
> > > >
> > > > This means you excempt all loops that are direct children of the loop
> > > > root tree.  That doesn't make much sense.
> > > >
> > > > > +      || ul != UL_ALL)
> > > >
> > > > This is also quite odd - we're being more optimistic for UL_NO_GROWTH
> > > > than for UL_ALL?  This doesn't make much sense.
> > > >
> > > > Overall I think this means removal of being optimistic doesn't work so well?
> > > They're mostly used to avoid testcase regressions., the regressed
> > > testcases rely on the behavior of complete unroll from the first
> > > unroll, but now it's only unrolled by the second unroll.
> > > I checked some, the codegen are the same, I need to go through all of
> > > them, if the final codegen are the same or optimal, I'll just adjust
> > > testcases?
> > >
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for warnings, line 56)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for warnings, line 66)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for warnings, line 56)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for warnings, line 66)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for warnings, line 56)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for warnings, line 66)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for warnings, line 56)
> > > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for warnings, line 66)
> >
> > This seems to expect unrolling for an init loop that rolls just once.  I don't
> > see 1/3 of the stmts vanishing, but it's definitely an interesting corner
> > case.  That's why I was thinking of maybe adding a --param specifying
> > an absolute growth we consider "no growth" - of course that's ugly as
> > well, but it would cover these small loops.
> >
> > How do the sizes play out here after your change?  Before it's
> >
> > size: 13-3, last_iteration: 2-2
> >   Loop size: 13
> >   Estimated size after unrolling: 13
> >
> > and the init is quite complex with virtual pointer inits.  We do have
> >
> >   size:   1 _14 = _5 + -1;
> >    Induction variable computation will be folded away.
> >   size:   1 _15 = _4 + 40;
> >  BB: 3, after_exit: 1
> >
> > where we don't realize the + 40 of _15 will be folded into the dereferences
> > but that would only subtract 1.
> >
> >   size:   3 C::C (_23, &MEM <const void *[8]> [(void *)&_ZTT2D1 + 48B]);
> >
> > that's the biggest cost.
> >
> > To diagnose the array bound issue we rely on early unrolling since we avoid
> > -Warray-bounds after late unrolling due to false positives.
> >
> > This is definitely not an unrolling that preserves code size.
> >
> > > gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
> > >
> > > gcc: gcc.dg/graphite/interchange-8.c execution test
> >
> > An execute fail is bad ... can we avoid this by placing #pragma GCC unroll
> > before the innermost loop (but please file a bugreport!)?  We should
> > probably honor that in early unrolling (not sure if we do).
> >
> > > gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized "Invalid sum"
> > > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last iteration exit edge was proved true."
> > > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2 iterations completely unrolled"
> >
> > Again, the current estimate is the same before/after unrolling; here we
> > expect to retain one compare & branch.
> >
> > > gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM <unsigned long> \\[\\(char \\*\\)\\&a8] = "
> > > gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"
> >
> > Again, the 2/3 scaling is difficult to warrant.  The goal of the early
> > unrolling pass was abstraction penalty removal, which works for low
> > trip-count loops.  So maybe that new --param for allowed growth should
> > scale, but instead of scaling with the loop size as the 2/3 factor does,
> > it should scale with the number of times we peel, which means offsetting
> > the body size estimate by a constant.
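As a rough sketch of that shape, following the existing estimated_unrolled_size
in tree-ssa-loop-ivcanon.cc (the per-copy allowance below is purely
illustrative, not an existing --param):

  /* Hypothetical variant: instead of scaling the whole unrolled body by
     2/3, credit a small fixed number of insns per peeled copy that we
     expect later cleanup (CSE/VN, DCE) to remove.  */
  static unsigned HOST_WIDE_INT
  estimated_unrolled_size_offset (struct loop_size *size,
				  unsigned HOST_WIDE_INT nunroll)
  {
    const HOST_WIDE_INT cleanup_per_copy = 2;	/* illustrative value */
    HOST_WIDE_INT unr_insns
      = (HOST_WIDE_INT) nunroll * (size->overall
				   - size->eliminated_by_peeling);
    if (unr_insns <= 0)
      unr_insns = 0;
    unr_insns += size->last_iteration
		 - size->last_iteration_eliminated_by_peeling;

    /* Offset by a constant per peeled copy rather than by a fraction
       of the body size.  */
    unr_insns -= (HOST_WIDE_INT) nunroll * cleanup_per_copy;
    if (unr_insns <= 0)
      unr_insns = 1;
    return unr_insns;
  }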
> >
> > Honza?  Any idea how to go forward here?
> >
> > Thanks,
> > Richard.
> >
> > > gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
> > > gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized "Invalid sum" 0
> > > gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
> > > gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect "vectorized 3 loops" 1
> > >
> > > >
> > > > If we need some extra leeway for UL_NO_GROWTH for what we expect
> > > > to unroll it might be better to add sth like --param
> > > > nogrowth-completely-peeled-insns
> It's hard to find a default value that satisfies all testcases:
> some require unrolling with a 7-insn increase, while others don't want
> unrolling with a 5-insn increase.
> The original 2/3 reduction happened to satisfy all of them (or the
> testcases were written around the old 2/3 behavior).
> Can we instead define the parameter as a loop-size threshold below which
> we still apply the reduction, so small loops can still be unrolled?
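Something along these lines, with the threshold name and value invented purely
for illustration (not an existing --param):

  /* In estimated_unrolled_size: only be optimistic about post-unroll
     simplification for small bodies, so tiny init-style loops keep
     completely unrolling while large innermost loops do not get the
     blanket 1/3 discount.  */
  const HOST_WIDE_INT small_unrolled_body = 32;	/* illustrative */
  if (unr_insns <= small_unrolled_body)
    unr_insns = unr_insns * 2 / 3;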

Yeah, that's also a sensible possibility.  Does it work to have a parameter
for the unrolled body size?  Thus, amend the existing
--param max-completely-peeled-insns with a --param
max-completely-peeled-insns-nogrowth?
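Roughly, in try_unroll_loop_completely that could take the shape below;
param_max_completely_peeled_insns_nogrowth is only the knob proposed here and
does not exist yet (the existing one is param_max_completely_peeled_insns):

  /* Sketch: under UL_NO_GROWTH, instead of requiring the unrolled size
     not to exceed the rolled size at all, accept it up to an absolute
     budget given by the proposed --param.  */
  if (ul == UL_NO_GROWTH
      && unr_insns > ninsns
      && unr_insns > (unsigned HOST_WIDE_INT)
		       param_max_completely_peeled_insns_nogrowth)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "Not unrolling loop %d: "
		 "size growth exceeds the no-growth budget.\n",
		 loop->num);
      return false;
    }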

Thanks,
Richard.

> > > > specifying a fixed surplus size?  Or we need to look at what the problem is
> > > > with the regressing testcases, or with the one you are trying to fix.
> > > >
> > > > I did experiment at some point with better estimating the cleanup that is
> > > > done (see attached), but didn't get to finishing that (and as said, since
> > > > we're running VN on the result we'd ideally do that as part of the
> > > > estimation somehow).
> > > >
> > > > Richard.
> > > >
> > > > > +    unr_insns = unr_insns * 2 / 3;
> > > > > +
> > > > >    if (unr_insns <= 0)
> > > > >      unr_insns = 1;
> > > > >
> > > > > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> > > > >
> > > > >           unsigned HOST_WIDE_INT ninsns = size.overall;
> > > > >           unsigned HOST_WIDE_INT unr_insns
> > > > > -           = estimated_unrolled_size (&size, n_unroll);
> > > > > +           = estimated_unrolled_size (&size, n_unroll, ul, loop);
> > > > >           if (dump_file && (dump_flags & TDF_DETAILS))
> > > > >             {
> > > > >               fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
> > > > > --
> > > > > 2.31.1
> > > > >
> > >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
>
>
>
> --
> BR,
> Hongtao
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
new file mode 100644
index 00000000000..14208b3e7f8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
@@ -0,0 +1,57 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    unsigned short s;
+    __builtin_memcpy(&s, &f, sizeof(unsigned short));
+    return table_f32_f16[s];
+}
+
+typedef struct {
+    ggml_fp16_t d;
+    ggml_fp16_t m;
+    unsigned char qh[4];
+    unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+    float d;
+    float s;
+    char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int qk = 32;
+    const int nb = n / qk;
+
+    const block_q5_1 * restrict x = vx;
+    const block_q8_1 * restrict y = vy;
+
+    float sumf = 0.0;
+
+    for (int i = 0; i < nb; i++) {
+        unsigned qh;
+        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+        int sumi = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+            const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+        }
+
+        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
+}
+
+/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
index 5df95d0ce4e..a1f75514d72 100644
--- a/gcc/testsuite/gcc.dg/vect/pr69783.c
+++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-require-effective-target vect_float } */
-/* { dg-additional-options "-Ofast -funroll-loops" } */
+/* { dg-additional-options "-Ofast -funroll-loops --param max-completely-peeled-insns=300" } */
 
 #define NXX 516
 #define NYY 516
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf017137260..5e0eca647a1 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -444,7 +444,9 @@  tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 
 static unsigned HOST_WIDE_INT
 estimated_unrolled_size (struct loop_size *size,
-			 unsigned HOST_WIDE_INT nunroll)
+			 unsigned HOST_WIDE_INT nunroll,
+			 enum unroll_level ul,
+			 class loop* loop)
 {
   HOST_WIDE_INT unr_insns = ((nunroll)
   			     * (HOST_WIDE_INT) (size->overall
@@ -453,7 +455,15 @@  estimated_unrolled_size (struct loop_size *size,
     unr_insns = 0;
   unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
 
-  unr_insns = unr_insns * 2 / 3;
+  /* For the innermost loop, the loop body is not likely to be simplified
+     by as much as 1/3, and unrolling may increase register pressure a lot.
+     UL != UL_ALL is needed to still unroll small loops at O2.  */
+  class loop *loop_father = loop_outer (loop);
+  if (loop->inner || !loop_father
+      || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
+      || ul != UL_ALL)
+    unr_insns = unr_insns * 2 / 3;
+
   if (unr_insns <= 0)
     unr_insns = 1;
 
@@ -837,7 +847,7 @@  try_unroll_loop_completely (class loop *loop,
 
 	  unsigned HOST_WIDE_INT ninsns = size.overall;
 	  unsigned HOST_WIDE_INT unr_insns
-	    = estimated_unrolled_size (&size, n_unroll);
+	    = estimated_unrolled_size (&size, n_unroll, ul, loop);
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    {
 	      fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);