diff mbox series

rs6000: Refine implicit funroll-loops with unroll_adjust hook.

Message ID 1572338524-29442-1-git-send-email-guojiufu@linux.ibm.com
State New
Headers show
Series rs6000: Refine implicit funroll-loops with unroll_adjust hook. | expand

Commit Message

Jiufu Guo Oct. 29, 2019, 8:42 a.m. UTC
Hi,

In previous patch r277501, which is changing PARAM_MAX_UNROLL_TIMES
and PARAM_MAX_UNROLLED_INSNS values during option overriding, while
it would better to use target loop unroll adjust hook. The hook can
also help to do further target related hueristic adjust.
This patch is adding target loop unroll adjust hook for rs6000 to
impliment previous behavior.

Bootstrapped and regtested on powerpc64le. Is this ok for trunk?

A combined patch is listed at the end of this mail for this and r277501.
If you want to review it as a whole, it can be referenced.

Jiufu Guo
BR


gcc/
2019-10-29  Jiufu Guo  <guojiufu@linux.ibm.com>	    

	PR tree-optimization/88760
	* config/rs6000/rs6000.c (rs6000_option_override_internal): Remove
	changes to PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS.
	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
	small loop 2 times if funroll-loops enabled implicitly.

	
gcc.testsuite/
2019-10-29  Jiufu Guo  <guojiufu@linux.ibm.com>

	PR tree-optimization/88760
	* gcc.dg/pr59643.c: Update back to r277550.

Comments

Jiufu Guo Oct. 29, 2019, 8:51 a.m. UTC | #1
Jiufu Guo <guojiufu@linux.ibm.com> writes:

> Hi,
>
> In previous patch r277501, which is changing PARAM_MAX_UNROLL_TIMES
> and PARAM_MAX_UNROLLED_INSNS values during option overriding, while
> it would better to use target loop unroll adjust hook. The hook can
> also help to do further target related hueristic adjust.
> This patch is adding target loop unroll adjust hook for rs6000 to
> impliment previous behavior.
>
> Bootstrapped and regtested on powerpc64le. Is this ok for trunk?
>
> A combined patch is listed at the end of this mail for this and r277501.
> If you want to review it as a whole, it can be referenced.
>
> Jiufu Guo
> BR
>
>
> gcc/
> 2019-10-29  Jiufu Guo  <guojiufu@linux.ibm.com>	    
>
> 	PR tree-optimization/88760
> 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Remove
> 	changes to PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS.
> 	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
> 	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
> 	small loop 2 times if funroll-loops enabled implicitly.
>
> 	
> gcc.testsuite/
> 2019-10-29  Jiufu Guo  <guojiufu@linux.ibm.com>
>
> 	PR tree-optimization/88760
> 	* gcc.dg/pr59643.c: Update back to r277550.
>
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c	(revision 277550)
> +++ gcc/config/rs6000/rs6000.c	(working copy)
> @@ -1428,6 +1428,9 @@ static const struct attribute_spec rs6000_attribut
>  #undef TARGET_VECTORIZE_DESTROY_COST_DATA
>  #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
>  
> +#undef TARGET_LOOP_UNROLL_ADJUST
> +#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
> +
>  #undef TARGET_INIT_BUILTINS
>  #define TARGET_INIT_BUILTINS rs6000_init_builtins
>  #undef TARGET_BUILTIN_DECL
> @@ -4540,20 +4543,11 @@ rs6000_option_override_internal (bool global_init_
>  			     global_options.x_param_values,
>  			     global_options_set.x_param_values);
>  
> -      /* unroll very small loops 2 time if no -funroll-loops.  */
> +      /* If funroll-loops is implicitly enabled, do not turn fweb or
> +	 frename-registers on implicitly.  */
>        if (!global_options_set.x_flag_unroll_loops
>  	  && !global_options_set.x_flag_unroll_all_loops)
>  	{
> -	  maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2,
> -				 global_options.x_param_values,
> -				 global_options_set.x_param_values);
> -
> -	  maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20,
> -				 global_options.x_param_values,
> -				 global_options_set.x_param_values);
> -
> -	  /* If fweb or frename-registers are not specificed in command-line,
> -	     do not turn them on implicitly.  */
>  	  if (!global_options_set.x_flag_web)
>  	    global_options.x_flag_web = 0;
>  	  if (!global_options_set.x_flag_rename_registers)
> @@ -5101,6 +5095,29 @@ rs6000_destroy_cost_data (void *data)
>    free (data);
>  }
>  
> +/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
> +   a new heristic number struct loop *loop should be unrolled.  */
> +
> +static unsigned
> +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
> +{
> +  /* If funroll-loops was enabled implicitly, like at -O2, we only unroll
> +     those small loops with small unroll factor.  */
> +  if (!global_options_set.x_flag_unroll_loops
> +      && !global_options_set.x_flag_unroll_all_loops)
Here, still checking global option setting: x_flag_unroll_loops to avoid
changing the behavior of -funroll-loops which come from command line.

And then only enhancing the behavior for implicit funroll-loops (by -O2/3
level).

We would remove this checking after heuristic refine when it can benefits
explicitly -funroll-loops.

> +    {
> +      /* If the loop contains few insns, treated it as small loops.
> +	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
> +	 if loop constians only 1-2 insns, we may unroll more times(4).
> +	 And we may use PARAM to control kinds of loop size.  */
> +      if (loop->ninsns <= 10)
> +	return 2;
> +      else
> +	return 0;
> +    }
> +  return nunroll;
> +}
> +
For this patch at -flto, loop-unrolling is affected by command line
besides optimize level per function. 
-flto -c t.c -O2 -funroll-loops
-flto  t.o -O2 or -flto t.o,
this linking will not be treated as explicit -funroll-loops. 

>  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
>     library with vectorized intrinsics.  */
>  
> Index: gcc/testsuite/gcc.dg/pr59643.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/pr59643.c	(revision 277550)
> +++ gcc/testsuite/gcc.dg/pr59643.c	(working copy)
> @@ -1,9 +1,6 @@
>  /* PR tree-optimization/59643 */
>  /* { dg-do compile } */
>  /* { dg-options "-O3 -fdump-tree-pcom-details" } */
> -/* { dg-additional-options "--param max-unrolled-insns=400" { target { powerpc*-*-* } } } */
> -/* Implicit threashold of max-unrolled-insn on ppc at O3 is too small for the
> -   loop of this case.  */
>  
>  void
>  foo (double *a, double *b, double *c, double d, double e, int n)
>
>
> -------- combined patch---------
> gcc/ChangeLog
> 2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>	    
>
> 	PR tree-optimization/88760
> 	* config/rs6000/rs6000-common.c (rs6000_option_optimization_table):
> 	Enable -funroll-loops for -O2 and above.
> 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Avoid turn
> 	on web and rnreg implicitly.
> 	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
> 	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
> 	small loop 2 times if funroll-loops enabled implicitly.
>
>
> gcc/testsuite/ChangeLog
> 2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>
>
> 	PR tree-optimization/88760
> 	* gcc.target/powerpc/small-loop-unroll.c: New test.
> 	* c-c++-common/tsan/thread_leak2.c: Update test.
> 	* gcc.target/powerpc/loop_align.c: Update test.
> 	* gcc.target/powerpc/ppc-fma-1.c: Update test.
> 	* gcc.target/powerpc/ppc-fma-2.c: Update test.
> 	* gcc.target/powerpc/ppc-fma-3.c: Update test.
> 	* gcc.target/powerpc/ppc-fma-4.c: Update test.
> 	* gcc.target/powerpc/pr78604.c: Update test.
>
>
> diff --git a/gcc/common/config/rs6000/rs6000-common.c b/gcc/common/config/rs6000/rs6000-common.c
> index 4b0c205..b947196 100644
> --- a/gcc/common/config/rs6000/rs6000-common.c
> +++ b/gcc/common/config/rs6000/rs6000-common.c
> @@ -35,6 +35,7 @@ static const struct default_options rs6000_option_optimization_table[] =
>      { OPT_LEVELS_ALL, OPT_fsplit_wide_types_early, NULL, 1 },
>      /* Enable -fsched-pressure for first pass instruction scheduling.  */
>      { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
> +    { OPT_LEVELS_2_PLUS, OPT_funroll_loops, NULL, 1 },
>      { OPT_LEVELS_NONE, 0, NULL, 0 }
>    };
>  
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 1399221..28ffa15 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -1428,6 +1428,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
>  #undef TARGET_VECTORIZE_DESTROY_COST_DATA
>  #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
>  
> +#undef TARGET_LOOP_UNROLL_ADJUST
> +#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
> +
>  #undef TARGET_INIT_BUILTINS
>  #define TARGET_INIT_BUILTINS rs6000_init_builtins
>  #undef TARGET_BUILTIN_DECL
> @@ -4540,6 +4543,17 @@ rs6000_option_override_internal (bool global_init_p)
>  			     global_options.x_param_values,
>  			     global_options_set.x_param_values);
>  
> +      /* If funroll-loops is implicitly enabled, do not turn fweb or
> +	 frename-registers on implicitly.  */
> +      if (!global_options_set.x_flag_unroll_loops
> +	  && !global_options_set.x_flag_unroll_all_loops)
> +	{
> +	  if (!global_options_set.x_flag_web)
> +	    global_options.x_flag_web = 0;
> +	  if (!global_options_set.x_flag_rename_registers)
> +	    global_options.x_flag_rename_registers = 0;
> +	}
> +
>        /* If using typedef char *va_list, signal that
>  	 __builtin_va_start (&ap, 0) can be optimized to
>  	 ap = __builtin_next_arg (0).  */
> @@ -5081,6 +5095,29 @@ rs6000_destroy_cost_data (void *data)
>    free (data);
>  }
>  
> +/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
> +   a new heristic number struct loop *loop should be unrolled.  */
> +
> +static unsigned
> +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
> +{
> +  /* If funroll-loops was enabled implicitly, like at -O2, we only unroll
> +     those small loops with small unroll factor.  */
> +  if (!global_options_set.x_flag_unroll_loops
> +      && !global_options_set.x_flag_unroll_all_loops)
> +    {
> +      /* If the loop contains few insns, treated it as small loops.
> +	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
> +	 if loop constians only 1-2 insns, we may unroll more times(4).
> +	 And we may use PARAM to control kinds of loop size.  */
> +      if (loop->ninsns <= 10)
> +	return 2;
> +      else
> +	return 0;
> +    }
> +  return nunroll;
> +}
> +
>  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
>     library with vectorized intrinsics.  */
>  
> diff --git a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
> index c9b8046..082f2aa 100644
> --- a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
> +++ b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
> @@ -1,5 +1,9 @@
>  /* { dg-shouldfail "tsan" } */
>  
> +/* { dg-additional-options "-fno-unroll-loops" { target { powerpc*-*-* } } } */
> +/* -fno-unroll-loops help to avoid ThreadSanitizer reporting multi-times
> +   message for pthread_create at difference calling addresses.  */
> +
>  #include <pthread.h>
>  #include <unistd.h>
>  
> diff --git a/gcc/testsuite/gcc.target/powerpc/loop_align.c b/gcc/testsuite/gcc.target/powerpc/loop_align.c
> index ebe3782..ef67f77 100644
> --- a/gcc/testsuite/gcc.target/powerpc/loop_align.c
> +++ b/gcc/testsuite/gcc.target/powerpc/loop_align.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
>  /* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */
> -/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16" } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16 -fno-unroll-loops" } */
>  /* { dg-final { scan-assembler ".p2align 5" } } */
>  
>  void f(double *a, double *b, double *c, unsigned long n) {
> diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
> index b4945e6..2a5b92c 100644
> --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
> +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
>  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>  /* { dg-require-effective-target powerpc_vsx_ok } */
> -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math" } */
> +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -fno-unroll-loops" } */
>  /* { dg-final { scan-assembler-times "xvmadd" 4 } } */
>  /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */
>  /* { dg-final { scan-assembler-times "fmadds" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
> index 5ed630a..bf2c67f 100644
> --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
> +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
>  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>  /* { dg-require-effective-target powerpc_vsx_ok } */
> -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off" } */
> +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off -fno-unroll-loops" } */
>  /* { dg-final { scan-assembler-times "xvmadd" 2 } } */
>  /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */
>  /* { dg-final { scan-assembler-times "fmadds" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
> index ef252b3..8608116 100644
> --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
> +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
> @@ -2,7 +2,7 @@
>  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>  /* { dg-require-effective-target powerpc_altivec_ok } */
>  /* { dg-require-effective-target powerpc_fprs } */
> -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math" } */
> +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -fno-unroll-loops" } */
>  /* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
>  /* { dg-final { scan-assembler-times "fmadd " 2 } } */
>  /* { dg-final { scan-assembler-times "fmadds" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
> index c2eaf1a..291c2ee 100644
> --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
> +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
> @@ -2,7 +2,7 @@
>  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>  /* { dg-require-effective-target powerpc_altivec_ok } */
>  /* { dg-require-effective-target powerpc_fprs } */
> -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off" } */
> +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off -fno-unroll-loops" } */
>  /* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
>  /* { dg-final { scan-assembler-times "fmadd " 1 } } */
>  /* { dg-final { scan-assembler-times "fmadds" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr78604.c b/gcc/testsuite/gcc.target/powerpc/pr78604.c
> index 76d8945..35bfdb3 100644
> --- a/gcc/testsuite/gcc.target/powerpc/pr78604.c
> +++ b/gcc/testsuite/gcc.target/powerpc/pr78604.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
>  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>  /* { dg-require-effective-target powerpc_p8vector_ok } */
> -/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details" } */
> +/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details -fno-unroll-loops" } */
>  
>  #ifndef SIZE
>  #define SIZE 1024
> diff --git a/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
> new file mode 100644
> index 0000000..fec5ae9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-loop2_unroll" } */
> +
> +void __attribute__ ((noinline)) foo(int n, int *arr)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    arr[i] = arr[i] - 10;
> +}
> +/* { dg-final { scan-rtl-dump-times "Unrolled loop 1 times" 1 "loop2_unroll" } } */
> +/* { dg-final { scan-assembler-times {\mlwz\M} 3 } } */
> +/* { dg-final { scan-assembler-times {\mstw\M} 3 } } */
> +
Richard Biener Oct. 29, 2019, 9:02 a.m. UTC | #2
On Tue, 29 Oct 2019, Jiufu Guo wrote:

> Jiufu Guo <guojiufu@linux.ibm.com> writes:
> 
> > Hi,
> >
> > In previous patch r277501, which is changing PARAM_MAX_UNROLL_TIMES
> > and PARAM_MAX_UNROLLED_INSNS values during option overriding, while
> > it would better to use target loop unroll adjust hook. The hook can
> > also help to do further target related hueristic adjust.
> > This patch is adding target loop unroll adjust hook for rs6000 to
> > impliment previous behavior.
> >
> > Bootstrapped and regtested on powerpc64le. Is this ok for trunk?
> >
> > A combined patch is listed at the end of this mail for this and r277501.
> > If you want to review it as a whole, it can be referenced.
> >
> > Jiufu Guo
> > BR
> >
> >
> > gcc/
> > 2019-10-29  Jiufu Guo  <guojiufu@linux.ibm.com>	    
> >
> > 	PR tree-optimization/88760
> > 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Remove
> > 	changes to PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS.
> > 	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
> > 	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
> > 	small loop 2 times if funroll-loops enabled implicitly.
> >
> > 	
> > gcc.testsuite/
> > 2019-10-29  Jiufu Guo  <guojiufu@linux.ibm.com>
> >
> > 	PR tree-optimization/88760
> > 	* gcc.dg/pr59643.c: Update back to r277550.
> >
> > Index: gcc/config/rs6000/rs6000.c
> > ===================================================================
> > --- gcc/config/rs6000/rs6000.c	(revision 277550)
> > +++ gcc/config/rs6000/rs6000.c	(working copy)
> > @@ -1428,6 +1428,9 @@ static const struct attribute_spec rs6000_attribut
> >  #undef TARGET_VECTORIZE_DESTROY_COST_DATA
> >  #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
> >  
> > +#undef TARGET_LOOP_UNROLL_ADJUST
> > +#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
> > +
> >  #undef TARGET_INIT_BUILTINS
> >  #define TARGET_INIT_BUILTINS rs6000_init_builtins
> >  #undef TARGET_BUILTIN_DECL
> > @@ -4540,20 +4543,11 @@ rs6000_option_override_internal (bool global_init_
> >  			     global_options.x_param_values,
> >  			     global_options_set.x_param_values);
> >  
> > -      /* unroll very small loops 2 time if no -funroll-loops.  */
> > +      /* If funroll-loops is implicitly enabled, do not turn fweb or
> > +	 frename-registers on implicitly.  */
> >        if (!global_options_set.x_flag_unroll_loops
> >  	  && !global_options_set.x_flag_unroll_all_loops)
> >  	{
> > -	  maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2,
> > -				 global_options.x_param_values,
> > -				 global_options_set.x_param_values);
> > -
> > -	  maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20,
> > -				 global_options.x_param_values,
> > -				 global_options_set.x_param_values);
> > -
> > -	  /* If fweb or frename-registers are not specificed in command-line,
> > -	     do not turn them on implicitly.  */
> >  	  if (!global_options_set.x_flag_web)
> >  	    global_options.x_flag_web = 0;
> >  	  if (!global_options_set.x_flag_rename_registers)
> > @@ -5101,6 +5095,29 @@ rs6000_destroy_cost_data (void *data)
> >    free (data);
> >  }
> >  
> > +/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
> > +   a new heristic number struct loop *loop should be unrolled.  */
> > +
> > +static unsigned
> > +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
> > +{
> > +  /* If funroll-loops was enabled implicitly, like at -O2, we only unroll
> > +     those small loops with small unroll factor.  */
> > +  if (!global_options_set.x_flag_unroll_loops
> > +      && !global_options_set.x_flag_unroll_all_loops)
> Here, still checking global option setting: x_flag_unroll_loops to avoid
> changing the behavior of -funroll-loops which come from command line.
> 
> And then only enhancing the behavior for implicit funroll-loops (by -O2/3
> level).
> 
> We would remove this checking after heuristic refine when it can benefits
> explicitly -funroll-loops.
>
> > +    {
> > +      /* If the loop contains few insns, treated it as small loops.
> > +	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
> > +	 if loop constians only 1-2 insns, we may unroll more times(4).
> > +	 And we may use PARAM to control kinds of loop size.  */
> > +      if (loop->ninsns <= 10)
> > +	return 2;
> > +      else
> > +	return 0;
> > +    }
> > +  return nunroll;
> > +}
> > +
> For this patch at -flto, loop-unrolling is affected by command line
> besides optimize level per function. 
> -flto -c t.c -O2 -funroll-loops
> -flto  t.o -O2 or -flto t.o,
> this linking will not be treated as explicit -funroll-loops. 

That's because you check global_options_set - you should never do this
in hooks invoked by optimization passes.  IIRC some discussion in
a thread elsewhere that even with -flto you can manage to see whether
flag_unroll_loop was enabled explicitely or implicitely - not sure
if it was for power, s390 or aarch64.

Richard.
 
> >  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
> >     library with vectorized intrinsics.  */
> >  
> > Index: gcc/testsuite/gcc.dg/pr59643.c
> > ===================================================================
> > --- gcc/testsuite/gcc.dg/pr59643.c	(revision 277550)
> > +++ gcc/testsuite/gcc.dg/pr59643.c	(working copy)
> > @@ -1,9 +1,6 @@
> >  /* PR tree-optimization/59643 */
> >  /* { dg-do compile } */
> >  /* { dg-options "-O3 -fdump-tree-pcom-details" } */
> > -/* { dg-additional-options "--param max-unrolled-insns=400" { target { powerpc*-*-* } } } */
> > -/* Implicit threashold of max-unrolled-insn on ppc at O3 is too small for the
> > -   loop of this case.  */
> >  
> >  void
> >  foo (double *a, double *b, double *c, double d, double e, int n)
> >
> >
> > -------- combined patch---------
> > gcc/ChangeLog
> > 2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>	    
> >
> > 	PR tree-optimization/88760
> > 	* config/rs6000/rs6000-common.c (rs6000_option_optimization_table):
> > 	Enable -funroll-loops for -O2 and above.
> > 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Avoid turn
> > 	on web and rnreg implicitly.
> > 	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
> > 	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
> > 	small loop 2 times if funroll-loops enabled implicitly.
> >
> >
> > gcc/testsuite/ChangeLog
> > 2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>
> >
> > 	PR tree-optimization/88760
> > 	* gcc.target/powerpc/small-loop-unroll.c: New test.
> > 	* c-c++-common/tsan/thread_leak2.c: Update test.
> > 	* gcc.target/powerpc/loop_align.c: Update test.
> > 	* gcc.target/powerpc/ppc-fma-1.c: Update test.
> > 	* gcc.target/powerpc/ppc-fma-2.c: Update test.
> > 	* gcc.target/powerpc/ppc-fma-3.c: Update test.
> > 	* gcc.target/powerpc/ppc-fma-4.c: Update test.
> > 	* gcc.target/powerpc/pr78604.c: Update test.
> >
> >
> > diff --git a/gcc/common/config/rs6000/rs6000-common.c b/gcc/common/config/rs6000/rs6000-common.c
> > index 4b0c205..b947196 100644
> > --- a/gcc/common/config/rs6000/rs6000-common.c
> > +++ b/gcc/common/config/rs6000/rs6000-common.c
> > @@ -35,6 +35,7 @@ static const struct default_options rs6000_option_optimization_table[] =
> >      { OPT_LEVELS_ALL, OPT_fsplit_wide_types_early, NULL, 1 },
> >      /* Enable -fsched-pressure for first pass instruction scheduling.  */
> >      { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
> > +    { OPT_LEVELS_2_PLUS, OPT_funroll_loops, NULL, 1 },
> >      { OPT_LEVELS_NONE, 0, NULL, 0 }
> >    };
> >  
> > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> > index 1399221..28ffa15 100644
> > --- a/gcc/config/rs6000/rs6000.c
> > +++ b/gcc/config/rs6000/rs6000.c
> > @@ -1428,6 +1428,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
> >  #undef TARGET_VECTORIZE_DESTROY_COST_DATA
> >  #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
> >  
> > +#undef TARGET_LOOP_UNROLL_ADJUST
> > +#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
> > +
> >  #undef TARGET_INIT_BUILTINS
> >  #define TARGET_INIT_BUILTINS rs6000_init_builtins
> >  #undef TARGET_BUILTIN_DECL
> > @@ -4540,6 +4543,17 @@ rs6000_option_override_internal (bool global_init_p)
> >  			     global_options.x_param_values,
> >  			     global_options_set.x_param_values);
> >  
> > +      /* If funroll-loops is implicitly enabled, do not turn fweb or
> > +	 frename-registers on implicitly.  */
> > +      if (!global_options_set.x_flag_unroll_loops
> > +	  && !global_options_set.x_flag_unroll_all_loops)
> > +	{
> > +	  if (!global_options_set.x_flag_web)
> > +	    global_options.x_flag_web = 0;
> > +	  if (!global_options_set.x_flag_rename_registers)
> > +	    global_options.x_flag_rename_registers = 0;
> > +	}
> > +
> >        /* If using typedef char *va_list, signal that
> >  	 __builtin_va_start (&ap, 0) can be optimized to
> >  	 ap = __builtin_next_arg (0).  */
> > @@ -5081,6 +5095,29 @@ rs6000_destroy_cost_data (void *data)
> >    free (data);
> >  }
> >  
> > +/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
> > +   a new heristic number struct loop *loop should be unrolled.  */
> > +
> > +static unsigned
> > +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
> > +{
> > +  /* If funroll-loops was enabled implicitly, like at -O2, we only unroll
> > +     those small loops with small unroll factor.  */
> > +  if (!global_options_set.x_flag_unroll_loops
> > +      && !global_options_set.x_flag_unroll_all_loops)
> > +    {
> > +      /* If the loop contains few insns, treated it as small loops.
> > +	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
> > +	 if loop constians only 1-2 insns, we may unroll more times(4).
> > +	 And we may use PARAM to control kinds of loop size.  */
> > +      if (loop->ninsns <= 10)
> > +	return 2;
> > +      else
> > +	return 0;
> > +    }
> > +  return nunroll;
> > +}
> > +
> >  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
> >     library with vectorized intrinsics.  */
> >  
> > diff --git a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
> > index c9b8046..082f2aa 100644
> > --- a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
> > +++ b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
> > @@ -1,5 +1,9 @@
> >  /* { dg-shouldfail "tsan" } */
> >  
> > +/* { dg-additional-options "-fno-unroll-loops" { target { powerpc*-*-* } } } */
> > +/* -fno-unroll-loops help to avoid ThreadSanitizer reporting multi-times
> > +   message for pthread_create at difference calling addresses.  */
> > +
> >  #include <pthread.h>
> >  #include <unistd.h>
> >  
> > diff --git a/gcc/testsuite/gcc.target/powerpc/loop_align.c b/gcc/testsuite/gcc.target/powerpc/loop_align.c
> > index ebe3782..ef67f77 100644
> > --- a/gcc/testsuite/gcc.target/powerpc/loop_align.c
> > +++ b/gcc/testsuite/gcc.target/powerpc/loop_align.c
> > @@ -1,6 +1,6 @@
> >  /* { dg-do compile { target { powerpc*-*-* } } } */
> >  /* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */
> > -/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16" } */
> > +/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16 -fno-unroll-loops" } */
> >  /* { dg-final { scan-assembler ".p2align 5" } } */
> >  
> >  void f(double *a, double *b, double *c, unsigned long n) {
> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
> > index b4945e6..2a5b92c 100644
> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
> > @@ -1,7 +1,7 @@
> >  /* { dg-do compile { target { powerpc*-*-* } } } */
> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
> >  /* { dg-require-effective-target powerpc_vsx_ok } */
> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math" } */
> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -fno-unroll-loops" } */
> >  /* { dg-final { scan-assembler-times "xvmadd" 4 } } */
> >  /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */
> >  /* { dg-final { scan-assembler-times "fmadds" 2 } } */
> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
> > index 5ed630a..bf2c67f 100644
> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
> > @@ -1,7 +1,7 @@
> >  /* { dg-do compile { target { powerpc*-*-* } } } */
> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
> >  /* { dg-require-effective-target powerpc_vsx_ok } */
> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off" } */
> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off -fno-unroll-loops" } */
> >  /* { dg-final { scan-assembler-times "xvmadd" 2 } } */
> >  /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */
> >  /* { dg-final { scan-assembler-times "fmadds" 1 } } */
> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
> > index ef252b3..8608116 100644
> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
> > @@ -2,7 +2,7 @@
> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
> >  /* { dg-require-effective-target powerpc_altivec_ok } */
> >  /* { dg-require-effective-target powerpc_fprs } */
> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math" } */
> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -fno-unroll-loops" } */
> >  /* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
> >  /* { dg-final { scan-assembler-times "fmadd " 2 } } */
> >  /* { dg-final { scan-assembler-times "fmadds" 2 } } */
> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
> > index c2eaf1a..291c2ee 100644
> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
> > @@ -2,7 +2,7 @@
> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
> >  /* { dg-require-effective-target powerpc_altivec_ok } */
> >  /* { dg-require-effective-target powerpc_fprs } */
> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off" } */
> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off -fno-unroll-loops" } */
> >  /* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
> >  /* { dg-final { scan-assembler-times "fmadd " 1 } } */
> >  /* { dg-final { scan-assembler-times "fmadds" 1 } } */
> > diff --git a/gcc/testsuite/gcc.target/powerpc/pr78604.c b/gcc/testsuite/gcc.target/powerpc/pr78604.c
> > index 76d8945..35bfdb3 100644
> > --- a/gcc/testsuite/gcc.target/powerpc/pr78604.c
> > +++ b/gcc/testsuite/gcc.target/powerpc/pr78604.c
> > @@ -1,7 +1,7 @@
> >  /* { dg-do compile { target { powerpc*-*-* } } } */
> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
> >  /* { dg-require-effective-target powerpc_p8vector_ok } */
> > -/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details" } */
> > +/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details -fno-unroll-loops" } */
> >  
> >  #ifndef SIZE
> >  #define SIZE 1024
> > diff --git a/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
> > new file mode 100644
> > index 0000000..fec5ae9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
> > @@ -0,0 +1,13 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-rtl-loop2_unroll" } */
> > +
> > +void __attribute__ ((noinline)) foo(int n, int *arr)
> > +{
> > +  int i;
> > +  for (i = 0; i < n; i++)
> > +    arr[i] = arr[i] - 10;
> > +}
> > +/* { dg-final { scan-rtl-dump-times "Unrolled loop 1 times" 1 "loop2_unroll" } } */
> > +/* { dg-final { scan-assembler-times {\mlwz\M} 3 } } */
> > +/* { dg-final { scan-assembler-times {\mstw\M} 3 } } */
> > +
>
Jiufu Guo Oct. 29, 2019, 9:08 a.m. UTC | #3
Richard Biener <rguenther@suse.de> writes:

> On Tue, 29 Oct 2019, Jiufu Guo wrote:
>
>> Jiufu Guo <guojiufu@linux.ibm.com> writes:
>> 
>> > Hi,
>> >
>> > In previous patch r277501, which is changing PARAM_MAX_UNROLL_TIMES
>> > and PARAM_MAX_UNROLLED_INSNS values during option overriding, while
>> > it would better to use target loop unroll adjust hook. The hook can
>> > also help to do further target related hueristic adjust.
>> > This patch is adding target loop unroll adjust hook for rs6000 to
>> > impliment previous behavior.
>> >
>> > Bootstrapped and regtested on powerpc64le. Is this ok for trunk?
>> >
>> > A combined patch is listed at the end of this mail for this and r277501.
>> > If you want to review it as a whole, it can be referenced.
>> >
>> > Jiufu Guo
>> > BR
>> >
>> >
>> > gcc/
>> > 2019-10-29  Jiufu Guo  <guojiufu@linux.ibm.com>	    
>> >
>> > 	PR tree-optimization/88760
>> > 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Remove
>> > 	changes to PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS.
>> > 	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
>> > 	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
>> > 	small loop 2 times if funroll-loops enabled implicitly.
>> >
>> > 	
>> > gcc.testsuite/
>> > 2019-10-29  Jiufu Guo  <guojiufu@linux.ibm.com>
>> >
>> > 	PR tree-optimization/88760
>> > 	* gcc.dg/pr59643.c: Update back to r277550.
>> >
>> > Index: gcc/config/rs6000/rs6000.c
>> > ===================================================================
>> > --- gcc/config/rs6000/rs6000.c	(revision 277550)
>> > +++ gcc/config/rs6000/rs6000.c	(working copy)
>> > @@ -1428,6 +1428,9 @@ static const struct attribute_spec rs6000_attribut
>> >  #undef TARGET_VECTORIZE_DESTROY_COST_DATA
>> >  #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
>> >  
>> > +#undef TARGET_LOOP_UNROLL_ADJUST
>> > +#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
>> > +
>> >  #undef TARGET_INIT_BUILTINS
>> >  #define TARGET_INIT_BUILTINS rs6000_init_builtins
>> >  #undef TARGET_BUILTIN_DECL
>> > @@ -4540,20 +4543,11 @@ rs6000_option_override_internal (bool global_init_
>> >  			     global_options.x_param_values,
>> >  			     global_options_set.x_param_values);
>> >  
>> > -      /* unroll very small loops 2 time if no -funroll-loops.  */
>> > +      /* If funroll-loops is implicitly enabled, do not turn fweb or
>> > +	 frename-registers on implicitly.  */
>> >        if (!global_options_set.x_flag_unroll_loops
>> >  	  && !global_options_set.x_flag_unroll_all_loops)
>> >  	{
>> > -	  maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2,
>> > -				 global_options.x_param_values,
>> > -				 global_options_set.x_param_values);
>> > -
>> > -	  maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20,
>> > -				 global_options.x_param_values,
>> > -				 global_options_set.x_param_values);
>> > -
>> > -	  /* If fweb or frename-registers are not specificed in command-line,
>> > -	     do not turn them on implicitly.  */
>> >  	  if (!global_options_set.x_flag_web)
>> >  	    global_options.x_flag_web = 0;
>> >  	  if (!global_options_set.x_flag_rename_registers)
>> > @@ -5101,6 +5095,29 @@ rs6000_destroy_cost_data (void *data)
>> >    free (data);
>> >  }
>> >  
>> > +/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
>> > +   a new heristic number struct loop *loop should be unrolled.  */
>> > +
>> > +static unsigned
>> > +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
>> > +{
>> > +  /* If funroll-loops was enabled implicitly, like at -O2, we only unroll
>> > +     those small loops with small unroll factor.  */
>> > +  if (!global_options_set.x_flag_unroll_loops
>> > +      && !global_options_set.x_flag_unroll_all_loops)
>> Here, still checking global option setting: x_flag_unroll_loops to avoid
>> changing the behavior of -funroll-loops which come from command line.
>> 
>> And then only enhancing the behavior for implicit funroll-loops (by -O2/3
>> level).
>> 
>> We would remove this checking after heuristic refine when it can benefits
>> explicitly -funroll-loops.
>>
>> > +    {
>> > +      /* If the loop contains few insns, treated it as small loops.
>> > +	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
>> > +	 if loop constians only 1-2 insns, we may unroll more times(4).
>> > +	 And we may use PARAM to control kinds of loop size.  */
>> > +      if (loop->ninsns <= 10)
>> > +	return 2;
>> > +      else
>> > +	return 0;
>> > +    }
>> > +  return nunroll;
>> > +}
>> > +
>> For this patch at -flto, loop-unrolling is affected by command line
>> besides optimize level per function. 
>> -flto -c t.c -O2 -funroll-loops
>> -flto  t.o -O2 or -flto t.o,
>> this linking will not be treated as explicit -funroll-loops. 
>
> That's because you check global_options_set - you should never do this
> in hooks invoked by optimization passes.  IIRC some discussion in
> a thread elsewhere that even with -flto you can manage to see whether
> flag_unroll_loop was enabled explicitely or implicitely - not sure
> if it was for power, s390 or aarch64.
Thanks for your so quick reply! 
>
> Richard.
>  
>> >  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
>> >     library with vectorized intrinsics.  */
>> >  
>> > Index: gcc/testsuite/gcc.dg/pr59643.c
>> > ===================================================================
>> > --- gcc/testsuite/gcc.dg/pr59643.c	(revision 277550)
>> > +++ gcc/testsuite/gcc.dg/pr59643.c	(working copy)
>> > @@ -1,9 +1,6 @@
>> >  /* PR tree-optimization/59643 */
>> >  /* { dg-do compile } */
>> >  /* { dg-options "-O3 -fdump-tree-pcom-details" } */
>> > -/* { dg-additional-options "--param max-unrolled-insns=400" { target { powerpc*-*-* } } } */
>> > -/* Implicit threashold of max-unrolled-insn on ppc at O3 is too small for the
>> > -   loop of this case.  */
>> >  
>> >  void
>> >  foo (double *a, double *b, double *c, double d, double e, int n)
>> >
>> >
>> > -------- combined patch---------
>> > gcc/ChangeLog
>> > 2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>	    
>> >
>> > 	PR tree-optimization/88760
>> > 	* config/rs6000/rs6000-common.c (rs6000_option_optimization_table):
>> > 	Enable -funroll-loops for -O2 and above.
>> > 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Avoid turn
>> > 	on web and rnreg implicitly.
>> > 	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
>> > 	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
>> > 	small loop 2 times if funroll-loops enabled implicitly.
>> >
>> >
>> > gcc/testsuite/ChangeLog
>> > 2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>
>> >
>> > 	PR tree-optimization/88760
>> > 	* gcc.target/powerpc/small-loop-unroll.c: New test.
>> > 	* c-c++-common/tsan/thread_leak2.c: Update test.
>> > 	* gcc.target/powerpc/loop_align.c: Update test.
>> > 	* gcc.target/powerpc/ppc-fma-1.c: Update test.
>> > 	* gcc.target/powerpc/ppc-fma-2.c: Update test.
>> > 	* gcc.target/powerpc/ppc-fma-3.c: Update test.
>> > 	* gcc.target/powerpc/ppc-fma-4.c: Update test.
>> > 	* gcc.target/powerpc/pr78604.c: Update test.
>> >
>> >
>> > diff --git a/gcc/common/config/rs6000/rs6000-common.c b/gcc/common/config/rs6000/rs6000-common.c
>> > index 4b0c205..b947196 100644
>> > --- a/gcc/common/config/rs6000/rs6000-common.c
>> > +++ b/gcc/common/config/rs6000/rs6000-common.c
>> > @@ -35,6 +35,7 @@ static const struct default_options rs6000_option_optimization_table[] =
>> >      { OPT_LEVELS_ALL, OPT_fsplit_wide_types_early, NULL, 1 },
>> >      /* Enable -fsched-pressure for first pass instruction scheduling.  */
>> >      { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
>> > +    { OPT_LEVELS_2_PLUS, OPT_funroll_loops, NULL, 1 },
>> >      { OPT_LEVELS_NONE, 0, NULL, 0 }
>> >    };
>> >  
>> > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>> > index 1399221..28ffa15 100644
>> > --- a/gcc/config/rs6000/rs6000.c
>> > +++ b/gcc/config/rs6000/rs6000.c
>> > @@ -1428,6 +1428,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
>> >  #undef TARGET_VECTORIZE_DESTROY_COST_DATA
>> >  #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
>> >  
>> > +#undef TARGET_LOOP_UNROLL_ADJUST
>> > +#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
>> > +
>> >  #undef TARGET_INIT_BUILTINS
>> >  #define TARGET_INIT_BUILTINS rs6000_init_builtins
>> >  #undef TARGET_BUILTIN_DECL
>> > @@ -4540,6 +4543,17 @@ rs6000_option_override_internal (bool global_init_p)
>> >  			     global_options.x_param_values,
>> >  			     global_options_set.x_param_values);
>> >  
>> > +      /* If funroll-loops is implicitly enabled, do not turn fweb or
>> > +	 frename-registers on implicitly.  */
>> > +      if (!global_options_set.x_flag_unroll_loops
>> > +	  && !global_options_set.x_flag_unroll_all_loops)
>> > +	{
>> > +	  if (!global_options_set.x_flag_web)
>> > +	    global_options.x_flag_web = 0;
>> > +	  if (!global_options_set.x_flag_rename_registers)
>> > +	    global_options.x_flag_rename_registers = 0;
>> > +	}
>> > +
>> >        /* If using typedef char *va_list, signal that
>> >  	 __builtin_va_start (&ap, 0) can be optimized to
>> >  	 ap = __builtin_next_arg (0).  */
>> > @@ -5081,6 +5095,29 @@ rs6000_destroy_cost_data (void *data)
>> >    free (data);
>> >  }
>> >  
>> > +/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
>> > +   a new heristic number struct loop *loop should be unrolled.  */
>> > +
>> > +static unsigned
>> > +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
>> > +{
>> > +  /* If funroll-loops was enabled implicitly, like at -O2, we only unroll
>> > +     those small loops with small unroll factor.  */
>> > +  if (!global_options_set.x_flag_unroll_loops
>> > +      && !global_options_set.x_flag_unroll_all_loops)
>> > +    {
>> > +      /* If the loop contains few insns, treated it as small loops.
>> > +	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
>> > +	 if loop constians only 1-2 insns, we may unroll more times(4).
>> > +	 And we may use PARAM to control kinds of loop size.  */
>> > +      if (loop->ninsns <= 10)
>> > +	return 2;
>> > +      else
>> > +	return 0;
>> > +    }
>> > +  return nunroll;
>> > +}
>> > +
>> >  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
>> >     library with vectorized intrinsics.  */
>> >  
>> > diff --git a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
>> > index c9b8046..082f2aa 100644
>> > --- a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
>> > +++ b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
>> > @@ -1,5 +1,9 @@
>> >  /* { dg-shouldfail "tsan" } */
>> >  
>> > +/* { dg-additional-options "-fno-unroll-loops" { target { powerpc*-*-* } } } */
>> > +/* -fno-unroll-loops help to avoid ThreadSanitizer reporting multi-times
>> > +   message for pthread_create at difference calling addresses.  */
>> > +
>> >  #include <pthread.h>
>> >  #include <unistd.h>
>> >  
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/loop_align.c b/gcc/testsuite/gcc.target/powerpc/loop_align.c
>> > index ebe3782..ef67f77 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/loop_align.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/loop_align.c
>> > @@ -1,6 +1,6 @@
>> >  /* { dg-do compile { target { powerpc*-*-* } } } */
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */
>> > -/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16" } */
>> > +/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16 -fno-unroll-loops" } */
>> >  /* { dg-final { scan-assembler ".p2align 5" } } */
>> >  
>> >  void f(double *a, double *b, double *c, unsigned long n) {
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
>> > index b4945e6..2a5b92c 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
>> > @@ -1,7 +1,7 @@
>> >  /* { dg-do compile { target { powerpc*-*-* } } } */
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> >  /* { dg-require-effective-target powerpc_vsx_ok } */
>> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math" } */
>> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -fno-unroll-loops" } */
>> >  /* { dg-final { scan-assembler-times "xvmadd" 4 } } */
>> >  /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */
>> >  /* { dg-final { scan-assembler-times "fmadds" 2 } } */
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
>> > index 5ed630a..bf2c67f 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
>> > @@ -1,7 +1,7 @@
>> >  /* { dg-do compile { target { powerpc*-*-* } } } */
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> >  /* { dg-require-effective-target powerpc_vsx_ok } */
>> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off" } */
>> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off -fno-unroll-loops" } */
>> >  /* { dg-final { scan-assembler-times "xvmadd" 2 } } */
>> >  /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */
>> >  /* { dg-final { scan-assembler-times "fmadds" 1 } } */
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
>> > index ef252b3..8608116 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
>> > @@ -2,7 +2,7 @@
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> >  /* { dg-require-effective-target powerpc_altivec_ok } */
>> >  /* { dg-require-effective-target powerpc_fprs } */
>> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math" } */
>> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -fno-unroll-loops" } */
>> >  /* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
>> >  /* { dg-final { scan-assembler-times "fmadd " 2 } } */
>> >  /* { dg-final { scan-assembler-times "fmadds" 2 } } */
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
>> > index c2eaf1a..291c2ee 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
>> > @@ -2,7 +2,7 @@
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> >  /* { dg-require-effective-target powerpc_altivec_ok } */
>> >  /* { dg-require-effective-target powerpc_fprs } */
>> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off" } */
>> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off -fno-unroll-loops" } */
>> >  /* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
>> >  /* { dg-final { scan-assembler-times "fmadd " 1 } } */
>> >  /* { dg-final { scan-assembler-times "fmadds" 1 } } */
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/pr78604.c b/gcc/testsuite/gcc.target/powerpc/pr78604.c
>> > index 76d8945..35bfdb3 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/pr78604.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/pr78604.c
>> > @@ -1,7 +1,7 @@
>> >  /* { dg-do compile { target { powerpc*-*-* } } } */
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> >  /* { dg-require-effective-target powerpc_p8vector_ok } */
>> > -/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details" } */
>> > +/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details -fno-unroll-loops" } */
>> >  
>> >  #ifndef SIZE
>> >  #define SIZE 1024
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
>> > new file mode 100644
>> > index 0000000..fec5ae9
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
>> > @@ -0,0 +1,13 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O2 -fdump-rtl-loop2_unroll" } */
>> > +
>> > +void __attribute__ ((noinline)) foo(int n, int *arr)
>> > +{
>> > +  int i;
>> > +  for (i = 0; i < n; i++)
>> > +    arr[i] = arr[i] - 10;
>> > +}
>> > +/* { dg-final { scan-rtl-dump-times "Unrolled loop 1 times" 1 "loop2_unroll" } } */
>> > +/* { dg-final { scan-assembler-times {\mlwz\M} 3 } } */
>> > +/* { dg-final { scan-assembler-times {\mstw\M} 3 } } */
>> > +
>>
Jiufu Guo Oct. 30, 2019, 3:12 a.m. UTC | #4
Richard Biener <rguenther@suse.de> writes:

> On Tue, 29 Oct 2019, Jiufu Guo wrote:
>
>> For this patch at -flto, loop-unrolling is affected by command line
>> besides optimize level per function. 
>> -flto -c t.c -O2 -funroll-loops
>> -flto  t.o -O2 or -flto t.o,
>> this linking will not be treated as explicit -funroll-loops. 
>
> That's because you check global_options_set - you should never do this
> in hooks invoked by optimization passes.  IIRC some discussion in
> a thread elsewhere that even with -flto you can manage to see whether
> flag_unroll_loop was enabled explicitely or implicitely - not sure
> if it was for power, s390 or aarch64.

Thanks Richard for your comments and sugguestions!

What we do in the hook is tunning out better unroll factor. This work
would not need to care about if flag_unroll_loop is explicit or
implicit.

We more care about: for O2 we could unroll for small loops. For O3, as we
checked, the result of -funroll-loops is fine with default unroll factor.

I would send a new patch for review.

Jiufu
BR.

>
> Richard.
>  
>> >  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
>> >     library with vectorized intrinsics.  */
>> >  
>> > Index: gcc/testsuite/gcc.dg/pr59643.c
>> > ===================================================================
>> > --- gcc/testsuite/gcc.dg/pr59643.c	(revision 277550)
>> > +++ gcc/testsuite/gcc.dg/pr59643.c	(working copy)
>> > @@ -1,9 +1,6 @@
>> >  /* PR tree-optimization/59643 */
>> >  /* { dg-do compile } */
>> >  /* { dg-options "-O3 -fdump-tree-pcom-details" } */
>> > -/* { dg-additional-options "--param max-unrolled-insns=400" { target { powerpc*-*-* } } } */
>> > -/* Implicit threashold of max-unrolled-insn on ppc at O3 is too small for the
>> > -   loop of this case.  */
>> >  
>> >  void
>> >  foo (double *a, double *b, double *c, double d, double e, int n)
>> >
>> >
>> > -------- combined patch---------
>> > gcc/ChangeLog
>> > 2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>	    
>> >
>> > 	PR tree-optimization/88760
>> > 	* config/rs6000/rs6000-common.c (rs6000_option_optimization_table):
>> > 	Enable -funroll-loops for -O2 and above.
>> > 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Avoid turn
>> > 	on web and rnreg implicitly.
>> > 	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
>> > 	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
>> > 	small loop 2 times if funroll-loops enabled implicitly.
>> >
>> >
>> > gcc/testsuite/ChangeLog
>> > 2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>
>> >
>> > 	PR tree-optimization/88760
>> > 	* gcc.target/powerpc/small-loop-unroll.c: New test.
>> > 	* c-c++-common/tsan/thread_leak2.c: Update test.
>> > 	* gcc.target/powerpc/loop_align.c: Update test.
>> > 	* gcc.target/powerpc/ppc-fma-1.c: Update test.
>> > 	* gcc.target/powerpc/ppc-fma-2.c: Update test.
>> > 	* gcc.target/powerpc/ppc-fma-3.c: Update test.
>> > 	* gcc.target/powerpc/ppc-fma-4.c: Update test.
>> > 	* gcc.target/powerpc/pr78604.c: Update test.
>> >
>> >
>> > diff --git a/gcc/common/config/rs6000/rs6000-common.c b/gcc/common/config/rs6000/rs6000-common.c
>> > index 4b0c205..b947196 100644
>> > --- a/gcc/common/config/rs6000/rs6000-common.c
>> > +++ b/gcc/common/config/rs6000/rs6000-common.c
>> > @@ -35,6 +35,7 @@ static const struct default_options rs6000_option_optimization_table[] =
>> >      { OPT_LEVELS_ALL, OPT_fsplit_wide_types_early, NULL, 1 },
>> >      /* Enable -fsched-pressure for first pass instruction scheduling.  */
>> >      { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
>> > +    { OPT_LEVELS_2_PLUS, OPT_funroll_loops, NULL, 1 },
>> >      { OPT_LEVELS_NONE, 0, NULL, 0 }
>> >    };
>> >  
>> > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>> > index 1399221..28ffa15 100644
>> > --- a/gcc/config/rs6000/rs6000.c
>> > +++ b/gcc/config/rs6000/rs6000.c
>> > @@ -1428,6 +1428,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
>> >  #undef TARGET_VECTORIZE_DESTROY_COST_DATA
>> >  #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
>> >  
>> > +#undef TARGET_LOOP_UNROLL_ADJUST
>> > +#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
>> > +
>> >  #undef TARGET_INIT_BUILTINS
>> >  #define TARGET_INIT_BUILTINS rs6000_init_builtins
>> >  #undef TARGET_BUILTIN_DECL
>> > @@ -4540,6 +4543,17 @@ rs6000_option_override_internal (bool global_init_p)
>> >  			     global_options.x_param_values,
>> >  			     global_options_set.x_param_values);
>> >  
>> > +      /* If funroll-loops is implicitly enabled, do not turn fweb or
>> > +	 frename-registers on implicitly.  */
>> > +      if (!global_options_set.x_flag_unroll_loops
>> > +	  && !global_options_set.x_flag_unroll_all_loops)
>> > +	{
>> > +	  if (!global_options_set.x_flag_web)
>> > +	    global_options.x_flag_web = 0;
>> > +	  if (!global_options_set.x_flag_rename_registers)
>> > +	    global_options.x_flag_rename_registers = 0;
>> > +	}
>> > +
>> >        /* If using typedef char *va_list, signal that
>> >  	 __builtin_va_start (&ap, 0) can be optimized to
>> >  	 ap = __builtin_next_arg (0).  */
>> > @@ -5081,6 +5095,29 @@ rs6000_destroy_cost_data (void *data)
>> >    free (data);
>> >  }
>> >  
>> > +/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
>> > +   a new heristic number struct loop *loop should be unrolled.  */
>> > +
>> > +static unsigned
>> > +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
>> > +{
>> > +  /* If funroll-loops was enabled implicitly, like at -O2, we only unroll
>> > +     those small loops with small unroll factor.  */
>> > +  if (!global_options_set.x_flag_unroll_loops
>> > +      && !global_options_set.x_flag_unroll_all_loops)
>> > +    {
>> > +      /* If the loop contains few insns, treated it as small loops.
>> > +	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
>> > +	 if loop constians only 1-2 insns, we may unroll more times(4).
>> > +	 And we may use PARAM to control kinds of loop size.  */
>> > +      if (loop->ninsns <= 10)
>> > +	return 2;
>> > +      else
>> > +	return 0;
>> > +    }
>> > +  return nunroll;
>> > +}
>> > +
>> >  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
>> >     library with vectorized intrinsics.  */
>> >  
>> > diff --git a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
>> > index c9b8046..082f2aa 100644
>> > --- a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
>> > +++ b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
>> > @@ -1,5 +1,9 @@
>> >  /* { dg-shouldfail "tsan" } */
>> >  
>> > +/* { dg-additional-options "-fno-unroll-loops" { target { powerpc*-*-* } } } */
>> > +/* -fno-unroll-loops help to avoid ThreadSanitizer reporting multi-times
>> > +   message for pthread_create at difference calling addresses.  */
>> > +
>> >  #include <pthread.h>
>> >  #include <unistd.h>
>> >  
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/loop_align.c b/gcc/testsuite/gcc.target/powerpc/loop_align.c
>> > index ebe3782..ef67f77 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/loop_align.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/loop_align.c
>> > @@ -1,6 +1,6 @@
>> >  /* { dg-do compile { target { powerpc*-*-* } } } */
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */
>> > -/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16" } */
>> > +/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16 -fno-unroll-loops" } */
>> >  /* { dg-final { scan-assembler ".p2align 5" } } */
>> >  
>> >  void f(double *a, double *b, double *c, unsigned long n) {
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
>> > index b4945e6..2a5b92c 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
>> > @@ -1,7 +1,7 @@
>> >  /* { dg-do compile { target { powerpc*-*-* } } } */
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> >  /* { dg-require-effective-target powerpc_vsx_ok } */
>> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math" } */
>> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -fno-unroll-loops" } */
>> >  /* { dg-final { scan-assembler-times "xvmadd" 4 } } */
>> >  /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */
>> >  /* { dg-final { scan-assembler-times "fmadds" 2 } } */
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
>> > index 5ed630a..bf2c67f 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
>> > @@ -1,7 +1,7 @@
>> >  /* { dg-do compile { target { powerpc*-*-* } } } */
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> >  /* { dg-require-effective-target powerpc_vsx_ok } */
>> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off" } */
>> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off -fno-unroll-loops" } */
>> >  /* { dg-final { scan-assembler-times "xvmadd" 2 } } */
>> >  /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */
>> >  /* { dg-final { scan-assembler-times "fmadds" 1 } } */
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
>> > index ef252b3..8608116 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
>> > @@ -2,7 +2,7 @@
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> >  /* { dg-require-effective-target powerpc_altivec_ok } */
>> >  /* { dg-require-effective-target powerpc_fprs } */
>> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math" } */
>> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -fno-unroll-loops" } */
>> >  /* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
>> >  /* { dg-final { scan-assembler-times "fmadd " 2 } } */
>> >  /* { dg-final { scan-assembler-times "fmadds" 2 } } */
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
>> > index c2eaf1a..291c2ee 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
>> > @@ -2,7 +2,7 @@
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> >  /* { dg-require-effective-target powerpc_altivec_ok } */
>> >  /* { dg-require-effective-target powerpc_fprs } */
>> > -/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off" } */
>> > +/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off -fno-unroll-loops" } */
>> >  /* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
>> >  /* { dg-final { scan-assembler-times "fmadd " 1 } } */
>> >  /* { dg-final { scan-assembler-times "fmadds" 1 } } */
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/pr78604.c b/gcc/testsuite/gcc.target/powerpc/pr78604.c
>> > index 76d8945..35bfdb3 100644
>> > --- a/gcc/testsuite/gcc.target/powerpc/pr78604.c
>> > +++ b/gcc/testsuite/gcc.target/powerpc/pr78604.c
>> > @@ -1,7 +1,7 @@
>> >  /* { dg-do compile { target { powerpc*-*-* } } } */
>> >  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> >  /* { dg-require-effective-target powerpc_p8vector_ok } */
>> > -/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details" } */
>> > +/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details -fno-unroll-loops" } */
>> >  
>> >  #ifndef SIZE
>> >  #define SIZE 1024
>> > diff --git a/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
>> > new file mode 100644
>> > index 0000000..fec5ae9
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
>> > @@ -0,0 +1,13 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O2 -fdump-rtl-loop2_unroll" } */
>> > +
>> > +void __attribute__ ((noinline)) foo(int n, int *arr)
>> > +{
>> > +  int i;
>> > +  for (i = 0; i < n; i++)
>> > +    arr[i] = arr[i] - 10;
>> > +}
>> > +/* { dg-final { scan-rtl-dump-times "Unrolled loop 1 times" 1 "loop2_unroll" } } */
>> > +/* { dg-final { scan-assembler-times {\mlwz\M} 3 } } */
>> > +/* { dg-final { scan-assembler-times {\mstw\M} 3 } } */
>> > +
>>
diff mbox series

Patch

Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 277550)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -1428,6 +1428,9 @@  static const struct attribute_spec rs6000_attribut
 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
 #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
+
 #undef TARGET_INIT_BUILTINS
 #define TARGET_INIT_BUILTINS rs6000_init_builtins
 #undef TARGET_BUILTIN_DECL
@@ -4540,20 +4543,11 @@  rs6000_option_override_internal (bool global_init_
 			     global_options.x_param_values,
 			     global_options_set.x_param_values);
 
-      /* unroll very small loops 2 time if no -funroll-loops.  */
+      /* If funroll-loops is implicitly enabled, do not turn fweb or
+	 frename-registers on implicitly.  */
       if (!global_options_set.x_flag_unroll_loops
 	  && !global_options_set.x_flag_unroll_all_loops)
 	{
-	  maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2,
-				 global_options.x_param_values,
-				 global_options_set.x_param_values);
-
-	  maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20,
-				 global_options.x_param_values,
-				 global_options_set.x_param_values);
-
-	  /* If fweb or frename-registers are not specificed in command-line,
-	     do not turn them on implicitly.  */
 	  if (!global_options_set.x_flag_web)
 	    global_options.x_flag_web = 0;
 	  if (!global_options_set.x_flag_rename_registers)
@@ -5101,6 +5095,29 @@  rs6000_destroy_cost_data (void *data)
   free (data);
 }
 
+/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
+   a new heristic number struct loop *loop should be unrolled.  */
+
+static unsigned
+rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
+{
+  /* If funroll-loops was enabled implicitly, like at -O2, we only unroll
+     those small loops with small unroll factor.  */
+  if (!global_options_set.x_flag_unroll_loops
+      && !global_options_set.x_flag_unroll_all_loops)
+    {
+      /* If the loop contains few insns, treated it as small loops.
+	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
+	 if loop constians only 1-2 insns, we may unroll more times(4).
+	 And we may use PARAM to control kinds of loop size.  */
+      if (loop->ninsns <= 10)
+	return 2;
+      else
+	return 0;
+    }
+  return nunroll;
+}
+
 /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
    library with vectorized intrinsics.  */
 
Index: gcc/testsuite/gcc.dg/pr59643.c
===================================================================
--- gcc/testsuite/gcc.dg/pr59643.c	(revision 277550)
+++ gcc/testsuite/gcc.dg/pr59643.c	(working copy)
@@ -1,9 +1,6 @@ 
 /* PR tree-optimization/59643 */
 /* { dg-do compile } */
 /* { dg-options "-O3 -fdump-tree-pcom-details" } */
-/* { dg-additional-options "--param max-unrolled-insns=400" { target { powerpc*-*-* } } } */
-/* Implicit threashold of max-unrolled-insn on ppc at O3 is too small for the
-   loop of this case.  */
 
 void
 foo (double *a, double *b, double *c, double d, double e, int n)


-------- combined patch---------
gcc/ChangeLog
2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>	    

	PR tree-optimization/88760
	* config/rs6000/rs6000-common.c (rs6000_option_optimization_table):
	Enable -funroll-loops for -O2 and above.
	* config/rs6000/rs6000.c (rs6000_option_override_internal): Avoid turn
	on web and rnreg implicitly.
	(TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook.
	(rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling
	small loop 2 times if funroll-loops enabled implicitly.


gcc/testsuite/ChangeLog
2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>

	PR tree-optimization/88760
	* gcc.target/powerpc/small-loop-unroll.c: New test.
	* c-c++-common/tsan/thread_leak2.c: Update test.
	* gcc.target/powerpc/loop_align.c: Update test.
	* gcc.target/powerpc/ppc-fma-1.c: Update test.
	* gcc.target/powerpc/ppc-fma-2.c: Update test.
	* gcc.target/powerpc/ppc-fma-3.c: Update test.
	* gcc.target/powerpc/ppc-fma-4.c: Update test.
	* gcc.target/powerpc/pr78604.c: Update test.


diff --git a/gcc/common/config/rs6000/rs6000-common.c b/gcc/common/config/rs6000/rs6000-common.c
index 4b0c205..b947196 100644
--- a/gcc/common/config/rs6000/rs6000-common.c
+++ b/gcc/common/config/rs6000/rs6000-common.c
@@ -35,6 +35,7 @@  static const struct default_options rs6000_option_optimization_table[] =
     { OPT_LEVELS_ALL, OPT_fsplit_wide_types_early, NULL, 1 },
     /* Enable -fsched-pressure for first pass instruction scheduling.  */
     { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_funroll_loops, NULL, 1 },
     { OPT_LEVELS_NONE, 0, NULL, 0 }
   };
 
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 1399221..28ffa15 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1428,6 +1428,9 @@  static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
 #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
+
 #undef TARGET_INIT_BUILTINS
 #define TARGET_INIT_BUILTINS rs6000_init_builtins
 #undef TARGET_BUILTIN_DECL
@@ -4540,6 +4543,17 @@  rs6000_option_override_internal (bool global_init_p)
 			     global_options.x_param_values,
 			     global_options_set.x_param_values);
 
+      /* If funroll-loops is implicitly enabled, do not turn fweb or
+	 frename-registers on implicitly.  */
+      if (!global_options_set.x_flag_unroll_loops
+	  && !global_options_set.x_flag_unroll_all_loops)
+	{
+	  if (!global_options_set.x_flag_web)
+	    global_options.x_flag_web = 0;
+	  if (!global_options_set.x_flag_rename_registers)
+	    global_options.x_flag_rename_registers = 0;
+	}
+
       /* If using typedef char *va_list, signal that
 	 __builtin_va_start (&ap, 0) can be optimized to
 	 ap = __builtin_next_arg (0).  */
@@ -5081,6 +5095,29 @@  rs6000_destroy_cost_data (void *data)
   free (data);
 }
 
+/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
+   a new heristic number struct loop *loop should be unrolled.  */
+
+static unsigned
+rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop)
+{
+  /* If funroll-loops was enabled implicitly, like at -O2, we only unroll
+     those small loops with small unroll factor.  */
+  if (!global_options_set.x_flag_unroll_loops
+      && !global_options_set.x_flag_unroll_all_loops)
+    {
+      /* If the loop contains few insns, treated it as small loops.
+	 TODO: Uing 10 hard code for now.  Continue to refine, For example,
+	 if loop constians only 1-2 insns, we may unroll more times(4).
+	 And we may use PARAM to control kinds of loop size.  */
+      if (loop->ninsns <= 10)
+	return 2;
+      else
+	return 0;
+    }
+  return nunroll;
+}
+
 /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
    library with vectorized intrinsics.  */
 
diff --git a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
index c9b8046..082f2aa 100644
--- a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
+++ b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
@@ -1,5 +1,9 @@ 
 /* { dg-shouldfail "tsan" } */
 
+/* { dg-additional-options "-fno-unroll-loops" { target { powerpc*-*-* } } } */
+/* -fno-unroll-loops help to avoid ThreadSanitizer reporting multi-times
+   message for pthread_create at difference calling addresses.  */
+
 #include <pthread.h>
 #include <unistd.h>
 
diff --git a/gcc/testsuite/gcc.target/powerpc/loop_align.c b/gcc/testsuite/gcc.target/powerpc/loop_align.c
index ebe3782..ef67f77 100644
--- a/gcc/testsuite/gcc.target/powerpc/loop_align.c
+++ b/gcc/testsuite/gcc.target/powerpc/loop_align.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */
-/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16" } */
+/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16 -fno-unroll-loops" } */
 /* { dg-final { scan-assembler ".p2align 5" } } */
 
 void f(double *a, double *b, double *c, unsigned long n) {
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
index b4945e6..2a5b92c 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
@@ -1,7 +1,7 @@ 
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "xvmadd" 4 } } */
 /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */
 /* { dg-final { scan-assembler-times "fmadds" 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
index 5ed630a..bf2c67f 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
@@ -1,7 +1,7 @@ 
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "xvmadd" 2 } } */
 /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */
 /* { dg-final { scan-assembler-times "fmadds" 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
index ef252b3..8608116 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
@@ -2,7 +2,7 @@ 
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_altivec_ok } */
 /* { dg-require-effective-target powerpc_fprs } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
 /* { dg-final { scan-assembler-times "fmadd " 2 } } */
 /* { dg-final { scan-assembler-times "fmadds" 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
index c2eaf1a..291c2ee 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
@@ -2,7 +2,7 @@ 
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_altivec_ok } */
 /* { dg-require-effective-target powerpc_fprs } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
 /* { dg-final { scan-assembler-times "fmadd " 1 } } */
 /* { dg-final { scan-assembler-times "fmadds" 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr78604.c b/gcc/testsuite/gcc.target/powerpc/pr78604.c
index 76d8945..35bfdb3 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr78604.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr78604.c
@@ -1,7 +1,7 @@ 
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_p8vector_ok } */
-/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details -fno-unroll-loops" } */
 
 #ifndef SIZE
 #define SIZE 1024
diff --git a/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
new file mode 100644
index 0000000..fec5ae9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-loop2_unroll" } */
+
+void __attribute__ ((noinline)) foo(int n, int *arr)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    arr[i] = arr[i] - 10;
+}
+/* { dg-final { scan-rtl-dump-times "Unrolled loop 1 times" 1 "loop2_unroll" } } */
+/* { dg-final { scan-assembler-times {\mlwz\M} 3 } } */
+/* { dg-final { scan-assembler-times {\mstw\M} 3 } } */
+