diff mbox

[Loop,optimizer] : Add logic to disable certain loop optimizations on pre-/post-loops

Message ID D4C76825A6780047854A11E93CDE84D004C1768411@SAUSEXMBP01.amd.com
State New
Headers show

Commit Message

Fang, Changpeng Dec. 17, 2010, 12:13 a.m. UTC
Hi,

Based on previous discussions, I modified the patch as such.

If a loop is marked as non-rolling, optimize_loop_for_size_p returns TRUE
and optimize_loop_for_speed_p returns FALSE. All users of these two 
functions will be affected.

After applying the modified patch, pb05 compilation time decreases 29%, binary
size decreases 20%, while a small (0.5%) performance increase was found which may
be just noise.

Modified patch passed bootstrapping and gcc regression tests on x86_64-unknown-linux-gnu.

Is it OK to commit to trunk?

Thanks,

Changpeng

Comments

Jack Howarth Dec. 17, 2010, 5:31 a.m. UTC | #1
On Thu, Dec 16, 2010 at 06:13:52PM -0600, Fang, Changpeng wrote:
> Hi,
> 
> Based on previous discussions, I modified the patch as such.
> 
> If a loop is marked as non-rolling, optimize_loop_for_size_p returns TRUE
> and optimize_loop_for_speed_p returns FALSE. All users of these two 
> functions will be affected.
> 
> After applying the modified patch, pb05 compilation time decreases 29%, binary
> size decreases 20%, while a small (0.5%) performance increase was found which may
> be just noise.
> 
> Modified patch passed bootstrapping and gcc regression tests on x86_64-unknown-linux-gnu.
> 
> Is it OK to commit to trunk?

Changpeng,
    On x86_64-apple-darwin10, I am finding a more severe penalty for this patch with
the pb05 benchmarks. Using a profiledbootstrap BOOT_CFLAGS="-g -O3" build with...

Configured with: ../gcc-4.6-20101216/configure --prefix=/sw --prefix=/sw/lib/gcc4.6 --mandir=/sw/share/man --infodir=/sw/lib/gcc4.6/info --enable-languages=c,c++,fortran,objc,obj-c++,java --with-gmp=/sw --with-libiconv-prefix=/sw --with-ppl=/sw --with-cloog=/sw --with-mpc=/sw --with-system-zlib --x-includes=/usr/X11R6/include --x-libraries=/usr/X11R6/lib --program-suffix=-fsf-4.6 --enable-checking=yes --enable-cloog-backend=isl --enable-build-with-cxx

I get without the patch...

================================================================================
Date & Time     : 16 Dec 2010 23:36:27
Test Name       : gfortran_lin_O3
Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
Maximum Times   :     2000.0
Target Error %  :      0.100
Minimum Repeats :    10
Maximum Repeats :   100

   Benchmark   Compile  Executable   Ave Run  Number   Estim
        Name    (secs)     (bytes)    (secs) Repeats   Err %
   ---------   -------  ----------   ------- -------  ------
          ac      1.76       10000      8.78      12  0.0081
      aermod     54.94       10000     17.28      10  0.0307
         air      3.44       10000      5.53      13  0.0734
    capacita      2.64       10000     32.65      10  0.0096
     channel      0.89       10000      1.84      20  0.0977
       doduc      8.12       10000     27.00      10  0.0132
     fatigue      3.06       10000      8.36      10  0.0104
     gas_dyn      5.00       10000      4.30      17  0.0915
      induct      6.24       10000     12.42      10  0.0100
       linpk      1.02       10000     15.50      12  0.0542
        mdbx      2.55       10000     11.24      10  0.0256
          nf      2.90       10000     30.16      20  0.0989
     protein      7.98       10000     33.72      10  0.0070
      rnflow      9.34       10000     23.21      10  0.0551
    test_fpu      6.72       10000      8.05      10  0.0426
        tfft      0.76       10000      1.87      10  0.0597

Geometric Mean Execution Time =      10.87 seconds

and with the patch...

================================================================================
Date & Time     : 16 Dec 2010 21:31:06
Test Name       : gfortran_lin_O3
Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
Maximum Times   :     2000.0
Target Error %  :      0.100
Minimum Repeats :    10
Maximum Repeats :   100

   Benchmark   Compile  Executable   Ave Run  Number   Estim
        Name    (secs)     (bytes)    (secs) Repeats   Err %
   ---------   -------  ----------   ------- -------  ------
          ac      1.19       10000      8.78      10  0.0099
      aermod     47.91       10000     16.95      10  0.0123
         air      2.85       10000      5.34      12  0.0715
    capacita      1.63       10000     33.10      10  0.0361
     channel      0.67       10000      1.87      10  0.0884
       doduc      6.42       10000     27.35      10  0.0206
     fatigue      2.10       10000      8.32      10  0.0194
     gas_dyn      2.07       10000      4.30      17  0.0843
      induct      5.38       10000     12.58      10  0.0088
       linpk      0.71       10000     15.69      18  0.0796
        mdbx      1.95       10000     11.41      10  0.0238
          nf      1.24       10000     31.34      12  0.0991
     protein      3.88       10000     35.13      10  0.0659
      rnflow      4.73       10000     25.97      10  0.0629
    test_fpu      3.66       10000      8.88      11  0.0989
        tfft      0.52       10000      1.89      10  0.0403

Geometric Mean Execution Time =      11.09 seconds

This shows about a 2.0% performance reduction in the Geometric
Mean Execution Time. I would note that intel darwin now defaults
to -mtune=core2 and always has defaulted to -fPIC.
           Jack

> 
> Thanks,
> 
> Changpeng
> 
> 
> 
> 
> 
> 
> 
> 
> 
> ________________________________________
> From: Zdenek Dvorak [rakdver@kam.mff.cuni.cz]
> Sent: Thursday, December 16, 2010 12:47 PM
> To: Fang, Changpeng
> Cc: Richard Guenther; Xinliang David Li; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH, Loop optimizer]: Add logic to disable certain loop optimizations on pre-/post-loops
> 
> Hi,
> 
> > For prefetching of prologue or epilogue loops, we have two choices (1) prefetching not
> > not unrolling, (2) not prefetching.  Which one do you prefer?
> 
> it is better not to prefetch (the current placement of prefetches is not good for non-rolling
> loops),
> 
> Zdenek
> 

Content-Description: polyhedron1.txt
> Comparison of Polyhedron (2005) Compile Time, Binary Size and Performance After Applying the NON-ROLLING Marking Patch
> 
> gfortran -Ofast -funroll-loops -march=amdfam10 %n.f90 -o %n
>     ============================================================================================================================							
> ||	         |  Before Patch		     |  After Patch			  | Changes                       ||
> ||========================================================================================================================||		
> ||  Benchmark    |  Compile    Binary Size  Run Time |  Compile   Binary Size	Run Time  | Compile  Binary   Performance ||
> ||        Name   |  Time (s)   (bytes)	   (s)       |	Time(s)	    (bytes)	 (secs)	  | Time (%) Size(%)	(%)       ||
> ||========================================================================================================================||
> ||          ac   |    3.36       41976	    13.26    |	  2.52	     34424	13.21	  |  -25.00   -17.99	0.38	  ||
> ||      aermod   |  103.45     1412221	    44.36    |	 86.55	   1268861	43.08	  |  -16.34   -10.15	2.97	  ||
> ||         air   |    6.11       75186	    11.73    |	  5.72	     71090      11.56	  |   -6.38    -5.45	1.47	  ||
> ||    capacita   |    6.83       91257	    88.40    |	  4.70	     74585      88.01	  |  -31.19   -18.27	0.44	  ||
> ||     channel   |    2.14       39984	     6.65    |	  1.84	     35888	 6.69	  |  -14.02   -10.24   -0.60	  ||
> ||       doduc   |   12.78      198624	    38.59    |	 12.20	    186336	38.18	  |   -4.54    -6.19	1.07	  ||
> ||     fatigue   |    9.11      110008	    10.15    |	  5.93	     92472	10.12	  |  -34.91   -15.94	0.30	  ||
> ||     gas_dyn   |   15.69      149726	     7.14    |	  8.45	    109342	 6.96	  |  -46.14   -26.97	2.59	  ||
> ||      induct   |   10.98      191800	    20.66    |	 10.62	    188168	20.61	  |   -3.28    -1.89	0.24	  ||
> ||       linpk   |    2.27       46073	    19.03    |	  1.68	     33401	19.03	  |  -25.99   -27.50	0.00	  ||
> ||        mdbx   |    5.63      103731	    21.41    |	  4.24	     83251	21.33	  |  -24.69   -19.74	0.38	  ||
> ||          nf   |   14.18      118451	    22.88    |	  5.55	     60499	23.14	  |  -60.86   -48.92   -1.12	  ||
> ||     protein   |   34.20      177700	    47.04    |	 19.16	    135012	46.94	  |  -43.98   -24.02	0.21	  ||
> ||      rnflow   |   42.13      283645	    40.30    |	 20.92	    178477	40.65	  |  -50.34   -37.08   -0.86	  ||
> ||    test_fpu   |   30.17      252080	    14.46    |	 14.44	    149136	14.32	  |  -52.14   -40.84	0.98	  ||
> ||        tfft   |    1.50       32450	     7.71    |	  1.12	     26546	 7.67	  |  -25.33   -18.19	0.52	  ||
> ||========================================================================================================================||
> ||       average   |			    19.57    |				19.46	  |  -29.07   -20.59	0.56	  ||
> ============================================================================================================================

Content-Description: 0001-Don-t-perform-certain-loop-optimizations-on-pre-post.patch
> From cd8b85bba1b39e108235f44d9d07918179ff3d79 Mon Sep 17 00:00:00 2001
> From: Changpeng Fang <chfang@houghton.(none)>
> Date: Mon, 13 Dec 2010 12:01:49 -0800
> Subject: [PATCH] Don't perform certain loop optimizations on pre/post loops
> 
> 	* basic-block.h (bb_flags): Add a new flag BB_HEADER_OF_NONROLLING_LOOP.
> 	* cfg.c (clear_bb_flags): Keep BB_HEADER_OF_NONROLLING marker.
> 	* cfgloop.h (mark_non_rolling_loop): New function declaration.
> 	  (non_rolling_loop_p): New function declaration.
> 	* predict.c (optimize_loop_for_size_p): Return true if the loop was marked
> 	  NON-ROLLING.  (optimize_loop_for_speed_p): Return false if the loop was
> 	  marked NON-ROLLING.
> 	* tree-ssa-loop-manip.c (tree_transform_and_unroll_loop): Mark the
> 	  non-rolling loop.
> 	* tree-ssa-loop-niter.c (mark_non_rolling_loop): Implement the new
> 	  function.  (non_rolling_loop_p): Implement the new function.
> 	* tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Mark the
> 	  non-rolling loop.  (vect_do_peeling_for_alignment): Mark the non-rolling
> 	  loop.
> ---
>  gcc/basic-block.h          |    6 +++++-
>  gcc/cfg.c                  |    7 ++++---
>  gcc/cfgloop.h              |    2 ++
>  gcc/predict.c              |    6 ++++++
>  gcc/tree-ssa-loop-manip.c  |    3 +++
>  gcc/tree-ssa-loop-niter.c  |   20 ++++++++++++++++++++
>  gcc/tree-vect-loop-manip.c |    8 ++++++++
>  7 files changed, 48 insertions(+), 4 deletions(-)
> 
> diff --git a/gcc/basic-block.h b/gcc/basic-block.h
> index be0a1d1..850472d 100644
> --- a/gcc/basic-block.h
> +++ b/gcc/basic-block.h
> @@ -245,7 +245,11 @@ enum bb_flags
>  
>    /* Set on blocks that cannot be threaded through.
>       Only used in cfgcleanup.c.  */
> -  BB_NONTHREADABLE_BLOCK = 1 << 11
> +  BB_NONTHREADABLE_BLOCK = 1 << 11,
> +
> +  /* Set on blocks that are headers of non-rolling loops.  */
> +  BB_HEADER_OF_NONROLLING_LOOP = 1 << 12
> +
>  };
>  
>  /* Dummy flag for convenience in the hot/cold partitioning code.  */
> diff --git a/gcc/cfg.c b/gcc/cfg.c
> index c8ef799..e59a637 100644
> --- a/gcc/cfg.c
> +++ b/gcc/cfg.c
> @@ -425,8 +425,8 @@ redirect_edge_pred (edge e, basic_block new_pred)
>    connect_src (e);
>  }
>  
> -/* Clear all basic block flags, with the exception of partitioning and
> -   setjmp_target.  */
> +/* Clear all basic block flags, with the exception of partitioning,
> +   setjmp_target, and the non-rolling loop marker.  */
>  void
>  clear_bb_flags (void)
>  {
> @@ -434,7 +434,8 @@ clear_bb_flags (void)
>  
>    FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR, NULL, next_bb)
>      bb->flags = (BB_PARTITION (bb)
> -		 | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET)));
> +		 | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET
> +				 + BB_HEADER_OF_NONROLLING_LOOP)));
>  }
>  
>  /* Check the consistency of profile information.  We can't do that
> diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
> index bf2614e..e856a78 100644
> --- a/gcc/cfgloop.h
> +++ b/gcc/cfgloop.h
> @@ -279,6 +279,8 @@ extern rtx doloop_condition_get (rtx);
>  void estimate_numbers_of_iterations_loop (struct loop *, bool);
>  HOST_WIDE_INT estimated_loop_iterations_int (struct loop *, bool);
>  bool estimated_loop_iterations (struct loop *, bool, double_int *);
> +void mark_non_rolling_loop (struct loop *);
> +bool non_rolling_loop_p (struct loop *);
>  
>  /* Loop manipulation.  */
>  extern bool can_duplicate_loop_p (const struct loop *loop);
> diff --git a/gcc/predict.c b/gcc/predict.c
> index c691990..bf729f8 100644
> --- a/gcc/predict.c
> +++ b/gcc/predict.c
> @@ -279,6 +279,9 @@ optimize_insn_for_speed_p (void)
>  bool
>  optimize_loop_for_size_p (struct loop *loop)
>  {
> +  /* Loops marked NON-ROLLING are not likely to be hot.  */
> +  if (non_rolling_loop_p (loop))
> +    return true;
>    return optimize_bb_for_size_p (loop->header);
>  }
>  
> @@ -287,6 +290,9 @@ optimize_loop_for_size_p (struct loop *loop)
>  bool
>  optimize_loop_for_speed_p (struct loop *loop)
>  {
> +  /* Loops marked NON-ROLLING are not likely to be hot.  */
> +  if (non_rolling_loop_p (loop))
> +    return false;
>    return optimize_bb_for_speed_p (loop->header);
>  }
>  
> diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c
> index 87b2c0d..bc977bb 100644
> --- a/gcc/tree-ssa-loop-manip.c
> +++ b/gcc/tree-ssa-loop-manip.c
> @@ -931,6 +931,9 @@ tree_transform_and_unroll_loop (struct loop *loop, unsigned factor,
>    gcc_assert (new_loop != NULL);
>    update_ssa (TODO_update_ssa);
>  
> +  /* NEW_LOOP is a non-rolling loop.  */
> +  mark_non_rolling_loop (new_loop);
> +
>    /* Determine the probability of the exit edge of the unrolled loop.  */
>    new_est_niter = est_niter / factor;
>  
> diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c
> index ee85f6f..1e2e4b2 100644
> --- a/gcc/tree-ssa-loop-niter.c
> +++ b/gcc/tree-ssa-loop-niter.c
> @@ -3011,6 +3011,26 @@ estimate_numbers_of_iterations (bool use_undefined_p)
>    fold_undefer_and_ignore_overflow_warnings ();
>  }
>  
> +/* Mark LOOP as a non-rolling loop.  */
> +
> +void
> +mark_non_rolling_loop (struct loop *loop)
> +{
> +  gcc_assert (loop && loop->header);
> +  loop->header->flags |= BB_HEADER_OF_NONROLLING_LOOP;
> +}
> +
> +/* Return true if LOOP is a non-rolling loop.  */
> +
> +bool
> +non_rolling_loop_p (struct loop *loop)
> +{
> +  int masked_flags;
> +  gcc_assert (loop && loop->header);
> +  masked_flags = (loop->header->flags & BB_HEADER_OF_NONROLLING_LOOP);
> +  return (masked_flags != 0);
> +}
> +
>  /* Returns true if statement S1 dominates statement S2.  */
>  
>  bool
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 6ecd304..216de78 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -1938,6 +1938,10 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio,
>  					    cond_expr, cond_expr_stmt_list);
>    gcc_assert (new_loop);
>    gcc_assert (loop_num == loop->num);
> +  
> +  /* NEW_LOOP is a non-rolling loop.  */
> +  mark_non_rolling_loop (new_loop);
> +
>  #ifdef ENABLE_CHECKING
>    slpeel_verify_cfg_after_peeling (loop, new_loop);
>  #endif
> @@ -2191,6 +2195,10 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
>  				   th, true, NULL_TREE, NULL);
>  
>    gcc_assert (new_loop);
> +
> +  /* NEW_LOOP is a non-rolling loop.  */
> +  mark_non_rolling_loop (new_loop);
> +  
>  #ifdef ENABLE_CHECKING
>    slpeel_verify_cfg_after_peeling (new_loop, loop);
>  #endif
> -- 
> 1.6.3.3
>
Fang, Changpeng Dec. 17, 2010, 7:14 a.m. UTC | #2
Hi, Jack:

Thanks for the testing.

This patch is not supposed to slow down a program by 10% (rnflow and test_fpu).
It would be helpful if you can provide analysis why they are slowed down.

We did see a significant compilation time reduction for most pb05 programs.
(I don't know why you do not have executable size data).

>I would note that intel darwin now defaults
>to -mtune=core2 and always has defaulted to -fPIC.

I could not understand these default for darwin. My understanding is that,
for x86_64, the default should be -mtune=generic.

Thanks,

Changpeng
Jack Howarth Dec. 17, 2010, 3:12 p.m. UTC | #3
On Fri, Dec 17, 2010 at 01:14:49AM -0600, Fang, Changpeng wrote:
> Hi, Jack:
> 
> Thanks for the testing.
> 
> This patch is not supposed to slow down a program by 10% (rnflow and test_fpu).
> It would be helpful if you can provide analysis why they are slowed down.

Unfortunately gprof is pretty broken on darwin10 but I can try to do profiling
in Shark with -fno-omit-frame-pointer. Could you try rebenchmarking with the flags...

-ffast-math -funroll-loops -O3 -fPIC -mtune=generic

I'll try benchmarking again with '-Ofast -funroll-loops -mtune=generic' and
see if the performance loss is reduced.

> 
> We did see a significant compilation time reduction for most pb05 programs.
> (I don't know why you do not have executable size data).

  The sources I have for pb05 aren't fixed for detecting the code size properly
under darwin and I never spent the the time trying to fix that.

> 
> >I would note that intel darwin now defaults
> >to -mtune=core2 and always has defaulted to -fPIC.
> 
> I could not understand these default for darwin. My understanding is that,
> for x86_64, the default should be -mtune=generic.

This is a recent change.

Author: mrs
Date: Wed Dec  8 23:32:27 2010
New Revision: 167611

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=167611
Log:
2010-12-08  Iain Sandoe <iains@gcc.gnu.org>

	gcc/config.gcc (with_cpu): Default i[34567]86-*-darwin* and
	x86_64-*-darwin* to with_cpu:-core2.
	gcc/config/i386/mmx.md (*mov<mode>_internal_rex64): Replace movq
	with movd for darwin assembler.
	gcc/config/i386/sse.md (*vec_concatv2di_rex64_sse4_1): Ditto.
	(*vec_concatv2di_rex64_sse): Ditto.

Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/config.gcc
    trunk/gcc/config/i386/mmx.md
    trunk/gcc/config/i386/sse.md

We had intended to do this awhile back (since all darwin hardware is core2
or recent Xeon), but held off until the core2 costs tables were improved to
match those of generic.

http://gcc.gnu.org/ml/gcc-patches/2010-12/msg00000.html

> 
> Thanks,
> 
> Changpeng
> 
> ________________________________________
> From: Jack Howarth [howarth@bromo.med.uc.edu]
> Sent: Thursday, December 16, 2010 11:31 PM
> To: Fang, Changpeng
> Cc: Zdenek Dvorak; Richard Guenther; Xinliang David Li; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH, Loop optimizer]: Add logic to disable certain loop optimizations on pre-/post-loops
> 
> On Thu, Dec 16, 2010 at 06:13:52PM -0600, Fang, Changpeng wrote:
> > Hi,
> >
> > Based on previous discussions, I modified the patch as such.
> >
> > If a loop is marked as non-rolling, optimize_loop_for_size_p returns TRUE
> > and optimize_loop_for_speed_p returns FALSE. All users of these two
> > functions will be affected.
> >
> > After applying the modified patch, pb05 compilation time decreases 29%, binary
> > size decreases 20%, while a small (0.5%) performance increase was found which may
> > be just noise.
> >
> > Modified patch passed bootstrapping and gcc regression tests on x86_64-unknown-linux-gnu.
> >
> > Is it OK to commit to trunk?
> 
> Changpeng,
>     On x86_64-apple-darwin10, I am finding a more severe penalty for this patch with
> the pb05 benchmarks. Using a profiledbootstrap BOOT_CFLAGS="-g -O3" build with...
> 
> Configured with: ../gcc-4.6-20101216/configure --prefix=/sw --prefix=/sw/lib/gcc4.6 --mandir=/sw/share/man --infodir=/sw/lib/gcc4.6/info --enable-languages=c,c++,fortran,objc,obj-c++,java --with-gmp=/sw --with-libiconv-prefix=/sw --with-ppl=/sw --with-cloog=/sw --with-mpc=/sw --with-system-zlib --x-includes=/usr/X11R6/include --x-libraries=/usr/X11R6/lib --program-suffix=-fsf-4.6 --enable-checking=yes --enable-cloog-backend=isl --enable-build-with-cxx
> 
> I get without the patch...
> 
> ================================================================================
> Date & Time     : 16 Dec 2010 23:36:27
> Test Name       : gfortran_lin_O3
> Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
> Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
> Maximum Times   :     2000.0
> Target Error %  :      0.100
> Minimum Repeats :    10
> Maximum Repeats :   100
> 
>    Benchmark   Compile  Executable   Ave Run  Number   Estim
>         Name    (secs)     (bytes)    (secs) Repeats   Err %
>    ---------   -------  ----------   ------- -------  ------
>           ac      1.76       10000      8.78      12  0.0081
>       aermod     54.94       10000     17.28      10  0.0307
>          air      3.44       10000      5.53      13  0.0734
>     capacita      2.64       10000     32.65      10  0.0096
>      channel      0.89       10000      1.84      20  0.0977
>        doduc      8.12       10000     27.00      10  0.0132
>      fatigue      3.06       10000      8.36      10  0.0104
>      gas_dyn      5.00       10000      4.30      17  0.0915
>       induct      6.24       10000     12.42      10  0.0100
>        linpk      1.02       10000     15.50      12  0.0542
>         mdbx      2.55       10000     11.24      10  0.0256
>           nf      2.90       10000     30.16      20  0.0989
>      protein      7.98       10000     33.72      10  0.0070
>       rnflow      9.34       10000     23.21      10  0.0551
>     test_fpu      6.72       10000      8.05      10  0.0426
>         tfft      0.76       10000      1.87      10  0.0597
> 
> Geometric Mean Execution Time =      10.87 seconds
> 
> and with the patch...
> 
> ================================================================================
> Date & Time     : 16 Dec 2010 21:31:06
> Test Name       : gfortran_lin_O3
> Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
> Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
> Maximum Times   :     2000.0
> Target Error %  :      0.100
> Minimum Repeats :    10
> Maximum Repeats :   100
> 
>    Benchmark   Compile  Executable   Ave Run  Number   Estim
>         Name    (secs)     (bytes)    (secs) Repeats   Err %
>    ---------   -------  ----------   ------- -------  ------
>           ac      1.19       10000      8.78      10  0.0099
>       aermod     47.91       10000     16.95      10  0.0123
>          air      2.85       10000      5.34      12  0.0715
>     capacita      1.63       10000     33.10      10  0.0361
>      channel      0.67       10000      1.87      10  0.0884
>        doduc      6.42       10000     27.35      10  0.0206
>      fatigue      2.10       10000      8.32      10  0.0194
>      gas_dyn      2.07       10000      4.30      17  0.0843
>       induct      5.38       10000     12.58      10  0.0088
>        linpk      0.71       10000     15.69      18  0.0796
>         mdbx      1.95       10000     11.41      10  0.0238
>           nf      1.24       10000     31.34      12  0.0991
>      protein      3.88       10000     35.13      10  0.0659
>       rnflow      4.73       10000     25.97      10  0.0629
>     test_fpu      3.66       10000      8.88      11  0.0989
>         tfft      0.52       10000      1.89      10  0.0403
> 
> Geometric Mean Execution Time =      11.09 seconds
> 
> This shows about a 2.0% performance reduction in the Geometric
> Mean Execution Time. I would note that intel darwin now defaults
> to -mtune=core2 and always has defaulted to -fPIC.
>            Jack
> 
> >
> > Thanks,
> >
> > Changpeng
> >
> >
> >
> >
> >
> >
> >
> >
> >
> > ________________________________________
> > From: Zdenek Dvorak [rakdver@kam.mff.cuni.cz]
> > Sent: Thursday, December 16, 2010 12:47 PM
> > To: Fang, Changpeng
> > Cc: Richard Guenther; Xinliang David Li; gcc-patches@gcc.gnu.org
> > Subject: Re: [PATCH, Loop optimizer]: Add logic to disable certain loop optimizations on pre-/post-loops
> >
> > Hi,
> >
> > > For prefetching of prologue or epilogue loops, we have two choices (1) prefetching not
> > > not unrolling, (2) not prefetching.  Which one do you prefer?
> >
> > it is better not to prefetch (the current placement of prefetches is not good for non-rolling
> > loops),
> >
> > Zdenek
> >
> 
> Content-Description: polyhedron1.txt
> > Comparison of Polyhedron (2005) Compile Time, Binary Size and Performance After Applying the NON-ROLLING Marking Patch
> >
> > gfortran -Ofast -funroll-loops -march=amdfam10 %n.f90 -o %n
> >     ============================================================================================================================
> > ||             |  Before Patch                     |  After Patch                       | Changes                       ||
> > ||========================================================================================================================||
> > ||  Benchmark    |  Compile    Binary Size  Run Time |  Compile   Binary Size Run Time  | Compile  Binary   Performance ||
> > ||        Name   |  Time (s)   (bytes)           (s)       |  Time(s)     (bytes)      (secs)   | Time (%) Size(%)    (%)       ||
> > ||========================================================================================================================||
> > ||          ac   |    3.36       41976            13.26    |    2.52       34424      13.21     |  -25.00   -17.99    0.38      ||
> > ||      aermod   |  103.45     1412221            44.36    |   86.55     1268861      43.08     |  -16.34   -10.15    2.97      ||
> > ||         air   |    6.11       75186            11.73    |    5.72       71090      11.56     |   -6.38    -5.45    1.47      ||
> > ||    capacita   |    6.83       91257            88.40    |    4.70       74585      88.01     |  -31.19   -18.27    0.44      ||
> > ||     channel   |    2.14       39984             6.65    |    1.84       35888       6.69     |  -14.02   -10.24   -0.60      ||
> > ||       doduc   |   12.78      198624            38.59    |   12.20      186336      38.18     |   -4.54    -6.19    1.07      ||
> > ||     fatigue   |    9.11      110008            10.15    |    5.93       92472      10.12     |  -34.91   -15.94    0.30      ||
> > ||     gas_dyn   |   15.69      149726             7.14    |    8.45      109342       6.96     |  -46.14   -26.97    2.59      ||
> > ||      induct   |   10.98      191800            20.66    |   10.62      188168      20.61     |   -3.28    -1.89    0.24      ||
> > ||       linpk   |    2.27       46073            19.03    |    1.68       33401      19.03     |  -25.99   -27.50    0.00      ||
> > ||        mdbx   |    5.63      103731            21.41    |    4.24       83251      21.33     |  -24.69   -19.74    0.38      ||
> > ||          nf   |   14.18      118451            22.88    |    5.55       60499      23.14     |  -60.86   -48.92   -1.12      ||
> > ||     protein   |   34.20      177700            47.04    |   19.16      135012      46.94     |  -43.98   -24.02    0.21      ||
> > ||      rnflow   |   42.13      283645            40.30    |   20.92      178477      40.65     |  -50.34   -37.08   -0.86      ||
> > ||    test_fpu   |   30.17      252080            14.46    |   14.44      149136      14.32     |  -52.14   -40.84    0.98      ||
> > ||        tfft   |    1.50       32450             7.71    |    1.12       26546       7.67     |  -25.33   -18.19    0.52      ||
> > ||========================================================================================================================||
> > ||       average   |                      19.57    |                          19.46     |  -29.07   -20.59    0.56      ||
> > ============================================================================================================================
> 
> Content-Description: 0001-Don-t-perform-certain-loop-optimizations-on-pre-post.patch
> > From cd8b85bba1b39e108235f44d9d07918179ff3d79 Mon Sep 17 00:00:00 2001
> > From: Changpeng Fang <chfang@houghton.(none)>
> > Date: Mon, 13 Dec 2010 12:01:49 -0800
> > Subject: [PATCH] Don't perform certain loop optimizations on pre/post loops
> >
> >       * basic-block.h (bb_flags): Add a new flag BB_HEADER_OF_NONROLLING_LOOP.
> >       * cfg.c (clear_bb_flags): Keep BB_HEADER_OF_NONROLLING marker.
> >       * cfgloop.h (mark_non_rolling_loop): New function declaration.
> >         (non_rolling_loop_p): New function declaration.
> >       * predict.c (optimize_loop_for_size_p): Return true if the loop was marked
> >         NON-ROLLING.  (optimize_loop_for_speed_p): Return false if the loop was
> >         marked NON-ROLLING.
> >       * tree-ssa-loop-manip.c (tree_transform_and_unroll_loop): Mark the
> >         non-rolling loop.
> >       * tree-ssa-loop-niter.c (mark_non_rolling_loop): Implement the new
> >         function.  (non_rolling_loop_p): Implement the new function.
> >       * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Mark the
> >         non-rolling loop.  (vect_do_peeling_for_alignment): Mark the non-rolling
> >         loop.
> > ---
> >  gcc/basic-block.h          |    6 +++++-
> >  gcc/cfg.c                  |    7 ++++---
> >  gcc/cfgloop.h              |    2 ++
> >  gcc/predict.c              |    6 ++++++
> >  gcc/tree-ssa-loop-manip.c  |    3 +++
> >  gcc/tree-ssa-loop-niter.c  |   20 ++++++++++++++++++++
> >  gcc/tree-vect-loop-manip.c |    8 ++++++++
> >  7 files changed, 48 insertions(+), 4 deletions(-)
> >
> > diff --git a/gcc/basic-block.h b/gcc/basic-block.h
> > index be0a1d1..850472d 100644
> > --- a/gcc/basic-block.h
> > +++ b/gcc/basic-block.h
> > @@ -245,7 +245,11 @@ enum bb_flags
> >
> >    /* Set on blocks that cannot be threaded through.
> >       Only used in cfgcleanup.c.  */
> > -  BB_NONTHREADABLE_BLOCK = 1 << 11
> > +  BB_NONTHREADABLE_BLOCK = 1 << 11,
> > +
> > +  /* Set on blocks that are headers of non-rolling loops.  */
> > +  BB_HEADER_OF_NONROLLING_LOOP = 1 << 12
> > +
> >  };
> >
> >  /* Dummy flag for convenience in the hot/cold partitioning code.  */
> > diff --git a/gcc/cfg.c b/gcc/cfg.c
> > index c8ef799..e59a637 100644
> > --- a/gcc/cfg.c
> > +++ b/gcc/cfg.c
> > @@ -425,8 +425,8 @@ redirect_edge_pred (edge e, basic_block new_pred)
> >    connect_src (e);
> >  }
> >
> > -/* Clear all basic block flags, with the exception of partitioning and
> > -   setjmp_target.  */
> > +/* Clear all basic block flags, with the exception of partitioning,
> > +   setjmp_target, and the non-rolling loop marker.  */
> >  void
> >  clear_bb_flags (void)
> >  {
> > @@ -434,7 +434,8 @@ clear_bb_flags (void)
> >
> >    FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR, NULL, next_bb)
> >      bb->flags = (BB_PARTITION (bb)
> > -              | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET)));
> > +              | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET
> > +                              + BB_HEADER_OF_NONROLLING_LOOP)));
> >  }
> >
> >  /* Check the consistency of profile information.  We can't do that
> > diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
> > index bf2614e..e856a78 100644
> > --- a/gcc/cfgloop.h
> > +++ b/gcc/cfgloop.h
> > @@ -279,6 +279,8 @@ extern rtx doloop_condition_get (rtx);
> >  void estimate_numbers_of_iterations_loop (struct loop *, bool);
> >  HOST_WIDE_INT estimated_loop_iterations_int (struct loop *, bool);
> >  bool estimated_loop_iterations (struct loop *, bool, double_int *);
> > +void mark_non_rolling_loop (struct loop *);
> > +bool non_rolling_loop_p (struct loop *);
> >
> >  /* Loop manipulation.  */
> >  extern bool can_duplicate_loop_p (const struct loop *loop);
> > diff --git a/gcc/predict.c b/gcc/predict.c
> > index c691990..bf729f8 100644
> > --- a/gcc/predict.c
> > +++ b/gcc/predict.c
> > @@ -279,6 +279,9 @@ optimize_insn_for_speed_p (void)
> >  bool
> >  optimize_loop_for_size_p (struct loop *loop)
> >  {
> > +  /* Loops marked NON-ROLLING are not likely to be hot.  */
> > +  if (non_rolling_loop_p (loop))
> > +    return true;
> >    return optimize_bb_for_size_p (loop->header);
> >  }
> >
> > @@ -287,6 +290,9 @@ optimize_loop_for_size_p (struct loop *loop)
> >  bool
> >  optimize_loop_for_speed_p (struct loop *loop)
> >  {
> > +  /* Loops marked NON-ROLLING are not likely to be hot.  */
> > +  if (non_rolling_loop_p (loop))
> > +    return false;
> >    return optimize_bb_for_speed_p (loop->header);
> >  }
> >
> > diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c
> > index 87b2c0d..bc977bb 100644
> > --- a/gcc/tree-ssa-loop-manip.c
> > +++ b/gcc/tree-ssa-loop-manip.c
> > @@ -931,6 +931,9 @@ tree_transform_and_unroll_loop (struct loop *loop, unsigned factor,
> >    gcc_assert (new_loop != NULL);
> >    update_ssa (TODO_update_ssa);
> >
> > +  /* NEW_LOOP is a non-rolling loop.  */
> > +  mark_non_rolling_loop (new_loop);
> > +
> >    /* Determine the probability of the exit edge of the unrolled loop.  */
> >    new_est_niter = est_niter / factor;
> >
> > diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c
> > index ee85f6f..1e2e4b2 100644
> > --- a/gcc/tree-ssa-loop-niter.c
> > +++ b/gcc/tree-ssa-loop-niter.c
> > @@ -3011,6 +3011,26 @@ estimate_numbers_of_iterations (bool use_undefined_p)
> >    fold_undefer_and_ignore_overflow_warnings ();
> >  }
> >
> > +/* Mark LOOP as a non-rolling loop.  */
> > +
> > +void
> > +mark_non_rolling_loop (struct loop *loop)
> > +{
> > +  gcc_assert (loop && loop->header);
> > +  loop->header->flags |= BB_HEADER_OF_NONROLLING_LOOP;
> > +}
> > +
> > +/* Return true if LOOP is a non-rolling loop.  */
> > +
> > +bool
> > +non_rolling_loop_p (struct loop *loop)
> > +{
> > +  int masked_flags;
> > +  gcc_assert (loop && loop->header);
> > +  masked_flags = (loop->header->flags & BB_HEADER_OF_NONROLLING_LOOP);
> > +  return (masked_flags != 0);
> > +}
> > +
> >  /* Returns true if statement S1 dominates statement S2.  */
> >
> >  bool
> > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> > index 6ecd304..216de78 100644
> > --- a/gcc/tree-vect-loop-manip.c
> > +++ b/gcc/tree-vect-loop-manip.c
> > @@ -1938,6 +1938,10 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio,
> >                                           cond_expr, cond_expr_stmt_list);
> >    gcc_assert (new_loop);
> >    gcc_assert (loop_num == loop->num);
> > +
> > +  /* NEW_LOOP is a non-rolling loop.  */
> > +  mark_non_rolling_loop (new_loop);
> > +
> >  #ifdef ENABLE_CHECKING
> >    slpeel_verify_cfg_after_peeling (loop, new_loop);
> >  #endif
> > @@ -2191,6 +2195,10 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
> >                                  th, true, NULL_TREE, NULL);
> >
> >    gcc_assert (new_loop);
> > +
> > +  /* NEW_LOOP is a non-rolling loop.  */
> > +  mark_non_rolling_loop (new_loop);
> > +
> >  #ifdef ENABLE_CHECKING
> >    slpeel_verify_cfg_after_peeling (new_loop, loop);
> >  #endif
> > --
> > 1.6.3.3
> >
Fang, Changpeng Dec. 17, 2010, 4:13 p.m. UTC | #4
Hi, Jack:

Have you tested my original patch (I remembered you bootstrapped it) in this thread?
That is with the same logic which only applies to prefetch, unswitch and unrolling?

It helps a lot if you have data about that?

Thanks,

Changpeng
Jack Howarth Dec. 17, 2010, 4:41 p.m. UTC | #5
On Fri, Dec 17, 2010 at 10:13:24AM -0600, Fang, Changpeng wrote:
> 
> Hi, Jack:
> 
> Have you tested my original patch (I remembered you bootstrapped it) in this thread?
> That is with the same logic which only applies to prefetch, unswitch and unrolling?
> 
> It helps a lot if you have data about that?

Not for pb05 benchmarking but I try that patch and benchmark against it as well.
    Jack

> 
> Thanks,
> 
> Changpeng
> 
> 
> ________________________________________
> From: Jack Howarth [howarth@bromo.med.uc.edu]
> Sent: Friday, December 17, 2010 9:12 AM
> To: Fang, Changpeng
> Cc: Zdenek Dvorak; Richard Guenther; Xinliang David Li; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH, Loop optimizer]: Add logic to disable certain loop optimizations on pre-/post-loops
> 
> On Fri, Dec 17, 2010 at 01:14:49AM -0600, Fang, Changpeng wrote:
> > Hi, Jack:
> >
> > Thanks for the testing.
> >
> > This patch is not supposed to slow down a program by 10% (rnflow and test_fpu).
> > It would be helpful if you can provide analysis why they are slowed down.
> 
> Unfortunately gprof is pretty broken on darwin10 but I can try to do profiling
> in Shark with -fno-omit-frame-pointer. Could you try rebenchmarking with the flags...
> 
> -ffast-math -funroll-loops -O3 -fPIC -mtune=generic
> 
> I'll try benchmarking again with '-Ofast -funroll-loops -mtune=generic' and
> see if the performance loss is reduced.
> 
> >
> > We did see a significant compilation time reduction for most pb05 programs.
> > (I don't know why you do not have executable size data).
> 
>   The sources I have for pb05 aren't fixed for detecting the code size properly
> under darwin and I never spent the the time trying to fix that.
> 
> >
> > >I would note that intel darwin now defaults
> > >to -mtune=core2 and always has defaulted to -fPIC.
> >
> > I could not understand these default for darwin. My understanding is that,
> > for x86_64, the default should be -mtune=generic.
> 
> This is a recent change.
> 
> Author: mrs
> Date: Wed Dec  8 23:32:27 2010
> New Revision: 167611
> 
> URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=167611
> Log:
> 2010-12-08  Iain Sandoe <iains@gcc.gnu.org>
> 
>         gcc/config.gcc (with_cpu): Default i[34567]86-*-darwin* and
>         x86_64-*-darwin* to with_cpu:-core2.
>         gcc/config/i386/mmx.md (*mov<mode>_internal_rex64): Replace movq
>         with movd for darwin assembler.
>         gcc/config/i386/sse.md (*vec_concatv2di_rex64_sse4_1): Ditto.
>         (*vec_concatv2di_rex64_sse): Ditto.
> 
> Modified:
>     trunk/gcc/ChangeLog
>     trunk/gcc/config.gcc
>     trunk/gcc/config/i386/mmx.md
>     trunk/gcc/config/i386/sse.md
> 
> We had intended to do this awhile back (since all darwin hardware is core2
> or recent Xeon), but held off until the core2 costs tables were improved to
> match those of generic.
> 
> http://gcc.gnu.org/ml/gcc-patches/2010-12/msg00000.html
> 
> >
> > Thanks,
> >
> > Changpeng
> >
> > ________________________________________
> > From: Jack Howarth [howarth@bromo.med.uc.edu]
> > Sent: Thursday, December 16, 2010 11:31 PM
> > To: Fang, Changpeng
> > Cc: Zdenek Dvorak; Richard Guenther; Xinliang David Li; gcc-patches@gcc.gnu.org
> > Subject: Re: [PATCH, Loop optimizer]: Add logic to disable certain loop optimizations on pre-/post-loops
> >
> > On Thu, Dec 16, 2010 at 06:13:52PM -0600, Fang, Changpeng wrote:
> > > Hi,
> > >
> > > Based on previous discussions, I modified the patch as such.
> > >
> > > If a loop is marked as non-rolling, optimize_loop_for_size_p returns TRUE
> > > and optimize_loop_for_speed_p returns FALSE. All users of these two
> > > functions will be affected.
> > >
> > > After applying the modified patch, pb05 compilation time decreases 29%, binary
> > > size decreases 20%, while a small (0.5%) performance increase was found which may
> > > be just noise.
> > >
> > > Modified patch passed bootstrapping and gcc regression tests on x86_64-unknown-linux-gnu.
> > >
> > > Is it OK to commit to trunk?
> >
> > Changpeng,
> >     On x86_64-apple-darwin10, I am finding a more severe penalty for this patch with
> > the pb05 benchmarks. Using a profiledbootstrap BOOT_CFLAGS="-g -O3" build with...
> >
> > Configured with: ../gcc-4.6-20101216/configure --prefix=/sw --prefix=/sw/lib/gcc4.6 --mandir=/sw/share/man --infodir=/sw/lib/gcc4.6/info --enable-languages=c,c++,fortran,objc,obj-c++,java --with-gmp=/sw --with-libiconv-prefix=/sw --with-ppl=/sw --with-cloog=/sw --with-mpc=/sw --with-system-zlib --x-includes=/usr/X11R6/include --x-libraries=/usr/X11R6/lib --program-suffix=-fsf-4.6 --enable-checking=yes --enable-cloog-backend=isl --enable-build-with-cxx
> >
> > I get without the patch...
> >
> > ================================================================================
> > Date & Time     : 16 Dec 2010 23:36:27
> > Test Name       : gfortran_lin_O3
> > Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
> > Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
> > Maximum Times   :     2000.0
> > Target Error %  :      0.100
> > Minimum Repeats :    10
> > Maximum Repeats :   100
> >
> >    Benchmark   Compile  Executable   Ave Run  Number   Estim
> >         Name    (secs)     (bytes)    (secs) Repeats   Err %
> >    ---------   -------  ----------   ------- -------  ------
> >           ac      1.76       10000      8.78      12  0.0081
> >       aermod     54.94       10000     17.28      10  0.0307
> >          air      3.44       10000      5.53      13  0.0734
> >     capacita      2.64       10000     32.65      10  0.0096
> >      channel      0.89       10000      1.84      20  0.0977
> >        doduc      8.12       10000     27.00      10  0.0132
> >      fatigue      3.06       10000      8.36      10  0.0104
> >      gas_dyn      5.00       10000      4.30      17  0.0915
> >       induct      6.24       10000     12.42      10  0.0100
> >        linpk      1.02       10000     15.50      12  0.0542
> >         mdbx      2.55       10000     11.24      10  0.0256
> >           nf      2.90       10000     30.16      20  0.0989
> >      protein      7.98       10000     33.72      10  0.0070
> >       rnflow      9.34       10000     23.21      10  0.0551
> >     test_fpu      6.72       10000      8.05      10  0.0426
> >         tfft      0.76       10000      1.87      10  0.0597
> >
> > Geometric Mean Execution Time =      10.87 seconds
> >
> > and with the patch...
> >
> > ================================================================================
> > Date & Time     : 16 Dec 2010 21:31:06
> > Test Name       : gfortran_lin_O3
> > Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
> > Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
> > Maximum Times   :     2000.0
> > Target Error %  :      0.100
> > Minimum Repeats :    10
> > Maximum Repeats :   100
> >
> >    Benchmark   Compile  Executable   Ave Run  Number   Estim
> >         Name    (secs)     (bytes)    (secs) Repeats   Err %
> >    ---------   -------  ----------   ------- -------  ------
> >           ac      1.19       10000      8.78      10  0.0099
> >       aermod     47.91       10000     16.95      10  0.0123
> >          air      2.85       10000      5.34      12  0.0715
> >     capacita      1.63       10000     33.10      10  0.0361
> >      channel      0.67       10000      1.87      10  0.0884
> >        doduc      6.42       10000     27.35      10  0.0206
> >      fatigue      2.10       10000      8.32      10  0.0194
> >      gas_dyn      2.07       10000      4.30      17  0.0843
> >       induct      5.38       10000     12.58      10  0.0088
> >        linpk      0.71       10000     15.69      18  0.0796
> >         mdbx      1.95       10000     11.41      10  0.0238
> >           nf      1.24       10000     31.34      12  0.0991
> >      protein      3.88       10000     35.13      10  0.0659
> >       rnflow      4.73       10000     25.97      10  0.0629
> >     test_fpu      3.66       10000      8.88      11  0.0989
> >         tfft      0.52       10000      1.89      10  0.0403
> >
> > Geometric Mean Execution Time =      11.09 seconds
> >
> > This shows about a 2.0% performance reduction in the Geometric
> > Mean Execution Time. I would note that intel darwin now defaults
> > to -mtune=core2 and always has defaulted to -fPIC.
> >            Jack
> >
> > >
> > > Thanks,
> > >
> > > Changpeng
> > >
> > >
> > >
> > >
> > >
> > >
> > >
> > >
> > >
> > > ________________________________________
> > > From: Zdenek Dvorak [rakdver@kam.mff.cuni.cz]
> > > Sent: Thursday, December 16, 2010 12:47 PM
> > > To: Fang, Changpeng
> > > Cc: Richard Guenther; Xinliang David Li; gcc-patches@gcc.gnu.org
> > > Subject: Re: [PATCH, Loop optimizer]: Add logic to disable certain loop optimizations on pre-/post-loops
> > >
> > > Hi,
> > >
> > > > For prefetching of prologue or epilogue loops, we have two choices (1) prefetching not
> > > > not unrolling, (2) not prefetching.  Which one do you prefer?
> > >
> > > it is better not to prefetch (the current placement of prefetches is not good for non-rolling
> > > loops),
> > >
> > > Zdenek
> > >
> >
> > Content-Description: polyhedron1.txt
> > > Comparison of Polyhedron (2005) Compile Time, Binary Size and Performance After Applying the NON-ROLLING Marking Patch
> > >
> > > gfortran -Ofast -funroll-loops -march=amdfam10 %n.f90 -o %n
> > >     ============================================================================================================================
> > > ||             |  Before Patch                     |  After Patch                       | Changes                       ||
> > > ||========================================================================================================================||
> > > ||  Benchmark    |  Compile    Binary Size  Run Time |  Compile   Binary Size Run Time  | Compile  Binary   Performance ||
> > > ||        Name   |  Time (s)   (bytes)           (s)       |  Time(s)     (bytes)      (secs)   | Time (%) Size(%)    (%)       ||
> > > ||========================================================================================================================||
> > > ||          ac   |    3.36       41976            13.26    |    2.52       34424      13.21     |  -25.00   -17.99    0.38      ||
> > > ||      aermod   |  103.45     1412221            44.36    |   86.55     1268861      43.08     |  -16.34   -10.15    2.97      ||
> > > ||         air   |    6.11       75186            11.73    |    5.72       71090      11.56     |   -6.38    -5.45    1.47      ||
> > > ||    capacita   |    6.83       91257            88.40    |    4.70       74585      88.01     |  -31.19   -18.27    0.44      ||
> > > ||     channel   |    2.14       39984             6.65    |    1.84       35888       6.69     |  -14.02   -10.24   -0.60      ||
> > > ||       doduc   |   12.78      198624            38.59    |   12.20      186336      38.18     |   -4.54    -6.19    1.07      ||
> > > ||     fatigue   |    9.11      110008            10.15    |    5.93       92472      10.12     |  -34.91   -15.94    0.30      ||
> > > ||     gas_dyn   |   15.69      149726             7.14    |    8.45      109342       6.96     |  -46.14   -26.97    2.59      ||
> > > ||      induct   |   10.98      191800            20.66    |   10.62      188168      20.61     |   -3.28    -1.89    0.24      ||
> > > ||       linpk   |    2.27       46073            19.03    |    1.68       33401      19.03     |  -25.99   -27.50    0.00      ||
> > > ||        mdbx   |    5.63      103731            21.41    |    4.24       83251      21.33     |  -24.69   -19.74    0.38      ||
> > > ||          nf   |   14.18      118451            22.88    |    5.55       60499      23.14     |  -60.86   -48.92   -1.12      ||
> > > ||     protein   |   34.20      177700            47.04    |   19.16      135012      46.94     |  -43.98   -24.02    0.21      ||
> > > ||      rnflow   |   42.13      283645            40.30    |   20.92      178477      40.65     |  -50.34   -37.08   -0.86      ||
> > > ||    test_fpu   |   30.17      252080            14.46    |   14.44      149136      14.32     |  -52.14   -40.84    0.98      ||
> > > ||        tfft   |    1.50       32450             7.71    |    1.12       26546       7.67     |  -25.33   -18.19    0.52      ||
> > > ||========================================================================================================================||
> > > ||       average   |                      19.57    |                          19.46     |  -29.07   -20.59    0.56      ||
> > > ============================================================================================================================
> >
> > Content-Description: 0001-Don-t-perform-certain-loop-optimizations-on-pre-post.patch
> > > From cd8b85bba1b39e108235f44d9d07918179ff3d79 Mon Sep 17 00:00:00 2001
> > > From: Changpeng Fang <chfang@houghton.(none)>
> > > Date: Mon, 13 Dec 2010 12:01:49 -0800
> > > Subject: [PATCH] Don't perform certain loop optimizations on pre/post loops
> > >
> > >       * basic-block.h (bb_flags): Add a new flag BB_HEADER_OF_NONROLLING_LOOP.
> > >       * cfg.c (clear_bb_flags): Keep BB_HEADER_OF_NONROLLING marker.
> > >       * cfgloop.h (mark_non_rolling_loop): New function declaration.
> > >         (non_rolling_loop_p): New function declaration.
> > >       * predict.c (optimize_loop_for_size_p): Return true if the loop was marked
> > >         NON-ROLLING.  (optimize_loop_for_speed_p): Return false if the loop was
> > >         marked NON-ROLLING.
> > >       * tree-ssa-loop-manip.c (tree_transform_and_unroll_loop): Mark the
> > >         non-rolling loop.
> > >       * tree-ssa-loop-niter.c (mark_non_rolling_loop): Implement the new
> > >         function.  (non_rolling_loop_p): Implement the new function.
> > >       * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Mark the
> > >         non-rolling loop.  (vect_do_peeling_for_alignment): Mark the non-rolling
> > >         loop.
> > > ---
> > >  gcc/basic-block.h          |    6 +++++-
> > >  gcc/cfg.c                  |    7 ++++---
> > >  gcc/cfgloop.h              |    2 ++
> > >  gcc/predict.c              |    6 ++++++
> > >  gcc/tree-ssa-loop-manip.c  |    3 +++
> > >  gcc/tree-ssa-loop-niter.c  |   20 ++++++++++++++++++++
> > >  gcc/tree-vect-loop-manip.c |    8 ++++++++
> > >  7 files changed, 48 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/gcc/basic-block.h b/gcc/basic-block.h
> > > index be0a1d1..850472d 100644
> > > --- a/gcc/basic-block.h
> > > +++ b/gcc/basic-block.h
> > > @@ -245,7 +245,11 @@ enum bb_flags
> > >
> > >    /* Set on blocks that cannot be threaded through.
> > >       Only used in cfgcleanup.c.  */
> > > -  BB_NONTHREADABLE_BLOCK = 1 << 11
> > > +  BB_NONTHREADABLE_BLOCK = 1 << 11,
> > > +
> > > +  /* Set on blocks that are headers of non-rolling loops.  */
> > > +  BB_HEADER_OF_NONROLLING_LOOP = 1 << 12
> > > +
> > >  };
> > >
> > >  /* Dummy flag for convenience in the hot/cold partitioning code.  */
> > > diff --git a/gcc/cfg.c b/gcc/cfg.c
> > > index c8ef799..e59a637 100644
> > > --- a/gcc/cfg.c
> > > +++ b/gcc/cfg.c
> > > @@ -425,8 +425,8 @@ redirect_edge_pred (edge e, basic_block new_pred)
> > >    connect_src (e);
> > >  }
> > >
> > > -/* Clear all basic block flags, with the exception of partitioning and
> > > -   setjmp_target.  */
> > > +/* Clear all basic block flags, with the exception of partitioning,
> > > +   setjmp_target, and the non-rolling loop marker.  */
> > >  void
> > >  clear_bb_flags (void)
> > >  {
> > > @@ -434,7 +434,8 @@ clear_bb_flags (void)
> > >
> > >    FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR, NULL, next_bb)
> > >      bb->flags = (BB_PARTITION (bb)
> > > -              | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET)));
> > > +              | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET
> > > +                              + BB_HEADER_OF_NONROLLING_LOOP)));
> > >  }
> > >
> > >  /* Check the consistency of profile information.  We can't do that
> > > diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
> > > index bf2614e..e856a78 100644
> > > --- a/gcc/cfgloop.h
> > > +++ b/gcc/cfgloop.h
> > > @@ -279,6 +279,8 @@ extern rtx doloop_condition_get (rtx);
> > >  void estimate_numbers_of_iterations_loop (struct loop *, bool);
> > >  HOST_WIDE_INT estimated_loop_iterations_int (struct loop *, bool);
> > >  bool estimated_loop_iterations (struct loop *, bool, double_int *);
> > > +void mark_non_rolling_loop (struct loop *);
> > > +bool non_rolling_loop_p (struct loop *);
> > >
> > >  /* Loop manipulation.  */
> > >  extern bool can_duplicate_loop_p (const struct loop *loop);
> > > diff --git a/gcc/predict.c b/gcc/predict.c
> > > index c691990..bf729f8 100644
> > > --- a/gcc/predict.c
> > > +++ b/gcc/predict.c
> > > @@ -279,6 +279,9 @@ optimize_insn_for_speed_p (void)
> > >  bool
> > >  optimize_loop_for_size_p (struct loop *loop)
> > >  {
> > > +  /* Loops marked NON-ROLLING are not likely to be hot.  */
> > > +  if (non_rolling_loop_p (loop))
> > > +    return true;
> > >    return optimize_bb_for_size_p (loop->header);
> > >  }
> > >
> > > @@ -287,6 +290,9 @@ optimize_loop_for_size_p (struct loop *loop)
> > >  bool
> > >  optimize_loop_for_speed_p (struct loop *loop)
> > >  {
> > > +  /* Loops marked NON-ROLLING are not likely to be hot.  */
> > > +  if (non_rolling_loop_p (loop))
> > > +    return false;
> > >    return optimize_bb_for_speed_p (loop->header);
> > >  }
> > >
> > > diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c
> > > index 87b2c0d..bc977bb 100644
> > > --- a/gcc/tree-ssa-loop-manip.c
> > > +++ b/gcc/tree-ssa-loop-manip.c
> > > @@ -931,6 +931,9 @@ tree_transform_and_unroll_loop (struct loop *loop, unsigned factor,
> > >    gcc_assert (new_loop != NULL);
> > >    update_ssa (TODO_update_ssa);
> > >
> > > +  /* NEW_LOOP is a non-rolling loop.  */
> > > +  mark_non_rolling_loop (new_loop);
> > > +
> > >    /* Determine the probability of the exit edge of the unrolled loop.  */
> > >    new_est_niter = est_niter / factor;
> > >
> > > diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c
> > > index ee85f6f..1e2e4b2 100644
> > > --- a/gcc/tree-ssa-loop-niter.c
> > > +++ b/gcc/tree-ssa-loop-niter.c
> > > @@ -3011,6 +3011,26 @@ estimate_numbers_of_iterations (bool use_undefined_p)
> > >    fold_undefer_and_ignore_overflow_warnings ();
> > >  }
> > >
> > > +/* Mark LOOP as a non-rolling loop.  */
> > > +
> > > +void
> > > +mark_non_rolling_loop (struct loop *loop)
> > > +{
> > > +  gcc_assert (loop && loop->header);
> > > +  loop->header->flags |= BB_HEADER_OF_NONROLLING_LOOP;
> > > +}
> > > +
> > > +/* Return true if LOOP is a non-rolling loop.  */
> > > +
> > > +bool
> > > +non_rolling_loop_p (struct loop *loop)
> > > +{
> > > +  int masked_flags;
> > > +  gcc_assert (loop && loop->header);
> > > +  masked_flags = (loop->header->flags & BB_HEADER_OF_NONROLLING_LOOP);
> > > +  return (masked_flags != 0);
> > > +}
> > > +
> > >  /* Returns true if statement S1 dominates statement S2.  */
> > >
> > >  bool
> > > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> > > index 6ecd304..216de78 100644
> > > --- a/gcc/tree-vect-loop-manip.c
> > > +++ b/gcc/tree-vect-loop-manip.c
> > > @@ -1938,6 +1938,10 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio,
> > >                                           cond_expr, cond_expr_stmt_list);
> > >    gcc_assert (new_loop);
> > >    gcc_assert (loop_num == loop->num);
> > > +
> > > +  /* NEW_LOOP is a non-rolling loop.  */
> > > +  mark_non_rolling_loop (new_loop);
> > > +
> > >  #ifdef ENABLE_CHECKING
> > >    slpeel_verify_cfg_after_peeling (loop, new_loop);
> > >  #endif
> > > @@ -2191,6 +2195,10 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
> > >                                  th, true, NULL_TREE, NULL);
> > >
> > >    gcc_assert (new_loop);
> > > +
> > > +  /* NEW_LOOP is a non-rolling loop.  */
> > > +  mark_non_rolling_loop (new_loop);
> > > +
> > >  #ifdef ENABLE_CHECKING
> > >    slpeel_verify_cfg_after_peeling (new_loop, loop);
> > >  #endif
> > > --
> > > 1.6.3.3
> > >
>
Jack Howarth Dec. 17, 2010, 4:55 p.m. UTC | #6
On Fri, Dec 17, 2010 at 10:13:24AM -0600, Fang, Changpeng wrote:
> 
> Hi, Jack:
> 
> Have you tested my original patch (I remembered you bootstrapped it) in this thread?
> That is with the same logic which only applies to prefetch, unswitch and unrolling?
> 
> It helps a lot if you have data about that?
> 
> Thanks,
> 
> Changpeng
> 
> 

Changpeng,
   I noticed that neither of your patches cleanly apply to recent gcc trunk due to
the context change in gcc/basic-block.h around...

  /* Set on blocks that cannot be threaded through.
     Only used in cfgcleanup.c.  */
  BB_NONTHREADABLE_BLOCK = 1 << 11,

  /* Set on blocks that were modified in some way.  This bit is set in
     df_set_bb_dirty, but not cleared by df_analyze, so it can be used
     to test whether a block has been modified prior to a df_analyze
     call.  */
  BB_MODIFIED = 1 << 12
};

You might want to rebenchmark pb05 against current gcc trunk to make sure
that r167779 didn't impact the results.
         Jack
Fang, Changpeng Dec. 17, 2010, 5 p.m. UTC | #7
Thanks, Jack:

I am going to update the patch against the current trunk!

Thanks,

Changpeng
Jack Howarth Dec. 17, 2010, 9:04 p.m. UTC | #8
On Fri, Dec 17, 2010 at 12:31:15AM -0500, Jack Howarth wrote:
> On Thu, Dec 16, 2010 at 06:13:52PM -0600, Fang, Changpeng wrote:
> > Hi,
> > 
> > Based on previous discussions, I modified the patch as such.
> > 
> > If a loop is marked as non-rolling, optimize_loop_for_size_p returns TRUE
> > and optimize_loop_for_speed_p returns FALSE. All users of these two 
> > functions will be affected.
> > 
> > After applying the modified patch, pb05 compilation time decreases 29%, binary
> > size decreases 20%, while a small (0.5%) performance increase was found which may
> > be just noise.
> > 
> > Modified patch passed bootstrapping and gcc regression tests on x86_64-unknown-linux-gnu.
> > 
> > Is it OK to commit to trunk?
> 
> Changpeng,
>     On x86_64-apple-darwin10, I am finding a more severe penalty for this patch with
> the pb05 benchmarks. Using a profiledbootstrap BOOT_CFLAGS="-g -O3" build with...
> 
> Configured with: ../gcc-4.6-20101216/configure --prefix=/sw --prefix=/sw/lib/gcc4.6 --mandir=/sw/share/man --infodir=/sw/lib/gcc4.6/info --enable-languages=c,c++,fortran,objc,obj-c++,java --with-gmp=/sw --with-libiconv-prefix=/sw --with-ppl=/sw --with-cloog=/sw --with-mpc=/sw --with-system-zlib --x-includes=/usr/X11R6/include --x-libraries=/usr/X11R6/lib --program-suffix=-fsf-4.6 --enable-checking=yes --enable-cloog-backend=isl --enable-build-with-cxx
> 
> I get without the patch...
> 
> ================================================================================
> Date & Time     : 16 Dec 2010 23:36:27
> Test Name       : gfortran_lin_O3
> Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
> Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
> Maximum Times   :     2000.0
> Target Error %  :      0.100
> Minimum Repeats :    10
> Maximum Repeats :   100
> 
>    Benchmark   Compile  Executable   Ave Run  Number   Estim
>         Name    (secs)     (bytes)    (secs) Repeats   Err %
>    ---------   -------  ----------   ------- -------  ------
>           ac      1.76       10000      8.78      12  0.0081
>       aermod     54.94       10000     17.28      10  0.0307
>          air      3.44       10000      5.53      13  0.0734
>     capacita      2.64       10000     32.65      10  0.0096
>      channel      0.89       10000      1.84      20  0.0977
>        doduc      8.12       10000     27.00      10  0.0132
>      fatigue      3.06       10000      8.36      10  0.0104
>      gas_dyn      5.00       10000      4.30      17  0.0915
>       induct      6.24       10000     12.42      10  0.0100
>        linpk      1.02       10000     15.50      12  0.0542
>         mdbx      2.55       10000     11.24      10  0.0256
>           nf      2.90       10000     30.16      20  0.0989
>      protein      7.98       10000     33.72      10  0.0070
>       rnflow      9.34       10000     23.21      10  0.0551
>     test_fpu      6.72       10000      8.05      10  0.0426
>         tfft      0.76       10000      1.87      10  0.0597
> 
> Geometric Mean Execution Time =      10.87 seconds
> 
> and with the patch...
> 
> ================================================================================
> Date & Time     : 16 Dec 2010 21:31:06
> Test Name       : gfortran_lin_O3
> Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
> Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
> Maximum Times   :     2000.0
> Target Error %  :      0.100
> Minimum Repeats :    10
> Maximum Repeats :   100
> 
>    Benchmark   Compile  Executable   Ave Run  Number   Estim
>         Name    (secs)     (bytes)    (secs) Repeats   Err %
>    ---------   -------  ----------   ------- -------  ------
>           ac      1.19       10000      8.78      10  0.0099
>       aermod     47.91       10000     16.95      10  0.0123
>          air      2.85       10000      5.34      12  0.0715
>     capacita      1.63       10000     33.10      10  0.0361
>      channel      0.67       10000      1.87      10  0.0884
>        doduc      6.42       10000     27.35      10  0.0206
>      fatigue      2.10       10000      8.32      10  0.0194
>      gas_dyn      2.07       10000      4.30      17  0.0843
>       induct      5.38       10000     12.58      10  0.0088
>        linpk      0.71       10000     15.69      18  0.0796
>         mdbx      1.95       10000     11.41      10  0.0238
>           nf      1.24       10000     31.34      12  0.0991
>      protein      3.88       10000     35.13      10  0.0659
>       rnflow      4.73       10000     25.97      10  0.0629
>     test_fpu      3.66       10000      8.88      11  0.0989
>         tfft      0.52       10000      1.89      10  0.0403
> 
> Geometric Mean Execution Time =      11.09 seconds
> 
> This shows about a 2.0% performance reduction in the Geometric
> Mean Execution Time. I would note that intel darwin now defaults
> to -mtune=core2 and always has defaulted to -fPIC.
>            Jack
> 

Changpeng,
   Using your original patch from http://gcc.gnu.org/ml/gcc-patches/2010-12/msg01024.html
I get...

================================================================================
Date & Time     : 17 Dec 2010 14:52:35
Test Name       : gfortran_lin_O3
Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
Maximum Times   :     2000.0
Target Error %  :      0.100
Minimum Repeats :    10
Maximum Repeats :   100

   Benchmark   Compile  Executable   Ave Run  Number   Estim
        Name    (secs)     (bytes)    (secs) Repeats   Err %
   ---------   -------  ----------   ------- -------  ------
          ac      1.24       10000      8.76      10  0.0118
      aermod     63.85       10000     17.23      10  0.0061
         air      2.92       10000      5.39      10  0.0577
    capacita      1.66       10000     33.06      10  0.0259
     channel      0.82       10000      1.87      12  0.0223
       doduc      6.51       10000     27.39      10  0.0064
     fatigue      2.14       10000      8.31      10  0.0228
     gas_dyn      2.33       10000      4.29      18  0.0889
      induct      5.66       10000     12.57      10  0.0099
       linpk      0.76       10000     15.54      10  0.0560
        mdbx      2.01       10000     11.45      10  0.0239
          nf      1.29       10000     31.32      17  0.0929
     protein      4.06       10000     33.54      10  0.0081
      rnflow      4.89       10000     25.93      10  0.0152
    test_fpu      3.77       10000      8.83      10  0.0279
        tfft      0.54       10000      1.89      10  0.0266

Geometric Mean Execution Time =      11.06 seconds

which is only a slightly lower performance penalty on the
Geometric Mean Execution Time (1.7%).
         Jack

> > 
> > Thanks,
> > 
> > Changpeng
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > ________________________________________
> > From: Zdenek Dvorak [rakdver@kam.mff.cuni.cz]
> > Sent: Thursday, December 16, 2010 12:47 PM
> > To: Fang, Changpeng
> > Cc: Richard Guenther; Xinliang David Li; gcc-patches@gcc.gnu.org
> > Subject: Re: [PATCH, Loop optimizer]: Add logic to disable certain loop optimizations on pre-/post-loops
> > 
> > Hi,
> > 
> > > For prefetching of prologue or epilogue loops, we have two choices (1) prefetching not
> > > not unrolling, (2) not prefetching.  Which one do you prefer?
> > 
> > it is better not to prefetch (the current placement of prefetches is not good for non-rolling
> > loops),
> > 
> > Zdenek
> > 
> 
> Content-Description: polyhedron1.txt
> > Comparison of Polyhedron (2005) Compile Time, Binary Size and Performance After Applying the NON-ROLLING Marking Patch
> > 
> > gfortran -Ofast -funroll-loops -march=amdfam10 %n.f90 -o %n
> >     ============================================================================================================================							
> > ||	         |  Before Patch		     |  After Patch			  | Changes                       ||
> > ||========================================================================================================================||		
> > ||  Benchmark    |  Compile    Binary Size  Run Time |  Compile   Binary Size	Run Time  | Compile  Binary   Performance ||
> > ||        Name   |  Time (s)   (bytes)	   (s)       |	Time(s)	    (bytes)	 (secs)	  | Time (%) Size(%)	(%)       ||
> > ||========================================================================================================================||
> > ||          ac   |    3.36       41976	    13.26    |	  2.52	     34424	13.21	  |  -25.00   -17.99	0.38	  ||
> > ||      aermod   |  103.45     1412221	    44.36    |	 86.55	   1268861	43.08	  |  -16.34   -10.15	2.97	  ||
> > ||         air   |    6.11       75186	    11.73    |	  5.72	     71090      11.56	  |   -6.38    -5.45	1.47	  ||
> > ||    capacita   |    6.83       91257	    88.40    |	  4.70	     74585      88.01	  |  -31.19   -18.27	0.44	  ||
> > ||     channel   |    2.14       39984	     6.65    |	  1.84	     35888	 6.69	  |  -14.02   -10.24   -0.60	  ||
> > ||       doduc   |   12.78      198624	    38.59    |	 12.20	    186336	38.18	  |   -4.54    -6.19	1.07	  ||
> > ||     fatigue   |    9.11      110008	    10.15    |	  5.93	     92472	10.12	  |  -34.91   -15.94	0.30	  ||
> > ||     gas_dyn   |   15.69      149726	     7.14    |	  8.45	    109342	 6.96	  |  -46.14   -26.97	2.59	  ||
> > ||      induct   |   10.98      191800	    20.66    |	 10.62	    188168	20.61	  |   -3.28    -1.89	0.24	  ||
> > ||       linpk   |    2.27       46073	    19.03    |	  1.68	     33401	19.03	  |  -25.99   -27.50	0.00	  ||
> > ||        mdbx   |    5.63      103731	    21.41    |	  4.24	     83251	21.33	  |  -24.69   -19.74	0.38	  ||
> > ||          nf   |   14.18      118451	    22.88    |	  5.55	     60499	23.14	  |  -60.86   -48.92   -1.12	  ||
> > ||     protein   |   34.20      177700	    47.04    |	 19.16	    135012	46.94	  |  -43.98   -24.02	0.21	  ||
> > ||      rnflow   |   42.13      283645	    40.30    |	 20.92	    178477	40.65	  |  -50.34   -37.08   -0.86	  ||
> > ||    test_fpu   |   30.17      252080	    14.46    |	 14.44	    149136	14.32	  |  -52.14   -40.84	0.98	  ||
> > ||        tfft   |    1.50       32450	     7.71    |	  1.12	     26546	 7.67	  |  -25.33   -18.19	0.52	  ||
> > ||========================================================================================================================||
> > ||       average   |			    19.57    |				19.46	  |  -29.07   -20.59	0.56	  ||
> > ============================================================================================================================
> 
> Content-Description: 0001-Don-t-perform-certain-loop-optimizations-on-pre-post.patch
> > From cd8b85bba1b39e108235f44d9d07918179ff3d79 Mon Sep 17 00:00:00 2001
> > From: Changpeng Fang <chfang@houghton.(none)>
> > Date: Mon, 13 Dec 2010 12:01:49 -0800
> > Subject: [PATCH] Don't perform certain loop optimizations on pre/post loops
> > 
> > 	* basic-block.h (bb_flags): Add a new flag BB_HEADER_OF_NONROLLING_LOOP.
> > 	* cfg.c (clear_bb_flags): Keep BB_HEADER_OF_NONROLLING marker.
> > 	* cfgloop.h (mark_non_rolling_loop): New function declaration.
> > 	  (non_rolling_loop_p): New function declaration.
> > 	* predict.c (optimize_loop_for_size_p): Return true if the loop was marked
> > 	  NON-ROLLING.  (optimize_loop_for_speed_p): Return false if the loop was
> > 	  marked NON-ROLLING.
> > 	* tree-ssa-loop-manip.c (tree_transform_and_unroll_loop): Mark the
> > 	  non-rolling loop.
> > 	* tree-ssa-loop-niter.c (mark_non_rolling_loop): Implement the new
> > 	  function.  (non_rolling_loop_p): Implement the new function.
> > 	* tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Mark the
> > 	  non-rolling loop.  (vect_do_peeling_for_alignment): Mark the non-rolling
> > 	  loop.
> > ---
> >  gcc/basic-block.h          |    6 +++++-
> >  gcc/cfg.c                  |    7 ++++---
> >  gcc/cfgloop.h              |    2 ++
> >  gcc/predict.c              |    6 ++++++
> >  gcc/tree-ssa-loop-manip.c  |    3 +++
> >  gcc/tree-ssa-loop-niter.c  |   20 ++++++++++++++++++++
> >  gcc/tree-vect-loop-manip.c |    8 ++++++++
> >  7 files changed, 48 insertions(+), 4 deletions(-)
> > 
> > diff --git a/gcc/basic-block.h b/gcc/basic-block.h
> > index be0a1d1..850472d 100644
> > --- a/gcc/basic-block.h
> > +++ b/gcc/basic-block.h
> > @@ -245,7 +245,11 @@ enum bb_flags
> >  
> >    /* Set on blocks that cannot be threaded through.
> >       Only used in cfgcleanup.c.  */
> > -  BB_NONTHREADABLE_BLOCK = 1 << 11
> > +  BB_NONTHREADABLE_BLOCK = 1 << 11,
> > +
> > +  /* Set on blocks that are headers of non-rolling loops.  */
> > +  BB_HEADER_OF_NONROLLING_LOOP = 1 << 12
> > +
> >  };
> >  
> >  /* Dummy flag for convenience in the hot/cold partitioning code.  */
> > diff --git a/gcc/cfg.c b/gcc/cfg.c
> > index c8ef799..e59a637 100644
> > --- a/gcc/cfg.c
> > +++ b/gcc/cfg.c
> > @@ -425,8 +425,8 @@ redirect_edge_pred (edge e, basic_block new_pred)
> >    connect_src (e);
> >  }
> >  
> > -/* Clear all basic block flags, with the exception of partitioning and
> > -   setjmp_target.  */
> > +/* Clear all basic block flags, with the exception of partitioning,
> > +   setjmp_target, and the non-rolling loop marker.  */
> >  void
> >  clear_bb_flags (void)
> >  {
> > @@ -434,7 +434,8 @@ clear_bb_flags (void)
> >  
> >    FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR, NULL, next_bb)
> >      bb->flags = (BB_PARTITION (bb)
> > -		 | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET)));
> > +		 | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET
> > +				 + BB_HEADER_OF_NONROLLING_LOOP)));
> >  }
> >  
> >  /* Check the consistency of profile information.  We can't do that
> > diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
> > index bf2614e..e856a78 100644
> > --- a/gcc/cfgloop.h
> > +++ b/gcc/cfgloop.h
> > @@ -279,6 +279,8 @@ extern rtx doloop_condition_get (rtx);
> >  void estimate_numbers_of_iterations_loop (struct loop *, bool);
> >  HOST_WIDE_INT estimated_loop_iterations_int (struct loop *, bool);
> >  bool estimated_loop_iterations (struct loop *, bool, double_int *);
> > +void mark_non_rolling_loop (struct loop *);
> > +bool non_rolling_loop_p (struct loop *);
> >  
> >  /* Loop manipulation.  */
> >  extern bool can_duplicate_loop_p (const struct loop *loop);
> > diff --git a/gcc/predict.c b/gcc/predict.c
> > index c691990..bf729f8 100644
> > --- a/gcc/predict.c
> > +++ b/gcc/predict.c
> > @@ -279,6 +279,9 @@ optimize_insn_for_speed_p (void)
> >  bool
> >  optimize_loop_for_size_p (struct loop *loop)
> >  {
> > +  /* Loops marked NON-ROLLING are not likely to be hot.  */
> > +  if (non_rolling_loop_p (loop))
> > +    return true;
> >    return optimize_bb_for_size_p (loop->header);
> >  }
> >  
> > @@ -287,6 +290,9 @@ optimize_loop_for_size_p (struct loop *loop)
> >  bool
> >  optimize_loop_for_speed_p (struct loop *loop)
> >  {
> > +  /* Loops marked NON-ROLLING are not likely to be hot.  */
> > +  if (non_rolling_loop_p (loop))
> > +    return false;
> >    return optimize_bb_for_speed_p (loop->header);
> >  }
> >  
> > diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c
> > index 87b2c0d..bc977bb 100644
> > --- a/gcc/tree-ssa-loop-manip.c
> > +++ b/gcc/tree-ssa-loop-manip.c
> > @@ -931,6 +931,9 @@ tree_transform_and_unroll_loop (struct loop *loop, unsigned factor,
> >    gcc_assert (new_loop != NULL);
> >    update_ssa (TODO_update_ssa);
> >  
> > +  /* NEW_LOOP is a non-rolling loop.  */
> > +  mark_non_rolling_loop (new_loop);
> > +
> >    /* Determine the probability of the exit edge of the unrolled loop.  */
> >    new_est_niter = est_niter / factor;
> >  
> > diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c
> > index ee85f6f..1e2e4b2 100644
> > --- a/gcc/tree-ssa-loop-niter.c
> > +++ b/gcc/tree-ssa-loop-niter.c
> > @@ -3011,6 +3011,26 @@ estimate_numbers_of_iterations (bool use_undefined_p)
> >    fold_undefer_and_ignore_overflow_warnings ();
> >  }
> >  
> > +/* Mark LOOP as a non-rolling loop.  */
> > +
> > +void
> > +mark_non_rolling_loop (struct loop *loop)
> > +{
> > +  gcc_assert (loop && loop->header);
> > +  loop->header->flags |= BB_HEADER_OF_NONROLLING_LOOP;
> > +}
> > +
> > +/* Return true if LOOP is a non-rolling loop.  */
> > +
> > +bool
> > +non_rolling_loop_p (struct loop *loop)
> > +{
> > +  int masked_flags;
> > +  gcc_assert (loop && loop->header);
> > +  masked_flags = (loop->header->flags & BB_HEADER_OF_NONROLLING_LOOP);
> > +  return (masked_flags != 0);
> > +}
> > +
> >  /* Returns true if statement S1 dominates statement S2.  */
> >  
> >  bool
> > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> > index 6ecd304..216de78 100644
> > --- a/gcc/tree-vect-loop-manip.c
> > +++ b/gcc/tree-vect-loop-manip.c
> > @@ -1938,6 +1938,10 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio,
> >  					    cond_expr, cond_expr_stmt_list);
> >    gcc_assert (new_loop);
> >    gcc_assert (loop_num == loop->num);
> > +  
> > +  /* NEW_LOOP is a non-rolling loop.  */
> > +  mark_non_rolling_loop (new_loop);
> > +
> >  #ifdef ENABLE_CHECKING
> >    slpeel_verify_cfg_after_peeling (loop, new_loop);
> >  #endif
> > @@ -2191,6 +2195,10 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
> >  				   th, true, NULL_TREE, NULL);
> >  
> >    gcc_assert (new_loop);
> > +
> > +  /* NEW_LOOP is a non-rolling loop.  */
> > +  mark_non_rolling_loop (new_loop);
> > +  
> >  #ifdef ENABLE_CHECKING
> >    slpeel_verify_cfg_after_peeling (new_loop, loop);
> >  #endif
> > -- 
> > 1.6.3.3
> >
Fang, Changpeng Dec. 17, 2010, 9:30 p.m. UTC | #9
Hi, Jack:

Is prefetch default on at -O3 on your systems?

Can you do an additional test to test
-O3 -ffast-math with/without the original patch?

This way, we can know that whether it is the rtl-loop unrolling problem.

Thanks,

Changpeng
Jack Howarth Dec. 17, 2010, 9:45 p.m. UTC | #10
On Fri, Dec 17, 2010 at 03:30:37PM -0600, Fang, Changpeng wrote:
> Hi, Jack:
> 
> Is prefetch default on at -O3 on your systems?
> 

Changpeng,
    On x86_64-apple-darwin10, from 'touch t.f90; gfortran -fverbose-asm -O3 t.f90 -S',
I get...

# GNU Fortran (GCC) version 4.6.0 20101217 (experimental) (x86_64-apple-darwin10.5.0)
#       compiled by GNU C version 4.6.0 20101217 (experimental), GMP version 4.3.2, MPFR version 2.4.2-p3, MPC version 0.8.2
# GGC heuristics: --param ggc-min-expand=30 --param ggc-min-heapsize=4096
# options passed:  t.f90 -fPIC -mmacosx-version-min=10.6.5 -mtune=core2 -O3
# -fverbose-asm
# -fintrinsic-modules-path /sw/lib/gcc4.6/lib/gcc/x86_64-apple-darwin10.5.0/4.6.0/finclude
# options enabled:  -Wnonportable-cfstrings -fPIC
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fcombine-stack-adjustments -fcommon -fcprop-registers
# -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fearly-inlining
# -feliminate-unused-debug-types -fexpensive-optimizations
# -fforward-propagate -ffunction-cse -fgcse -fgcse-after-reload -fgcse-lm
# -fguess-branch-probability -fident -fif-conversion -fif-conversion2
# -findirect-inlining -finline -finline-functions
# -finline-functions-called-once -finline-small-functions -fipa-cp
# -fipa-cp-clone -fipa-profile -fipa-pure-const -fipa-reference -fipa-sra
# -fira-share-save-slots -fira-share-spill-slots -fivopts
# -fkeep-static-consts -fleading-underscore -fmath-errno -fmerge-constants
# -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer
# -foptimize-register-move -foptimize-sibling-calls -fpartial-inlining
# -fpeephole -fpeephole2 -fpredictive-commoning -fprefetch-loop-arrays
# -freg-struct-return -fregmove -freorder-blocks -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-insns2
# -fshow-column -fsigned-zeros -fsplit-ivs-in-unroller -fsplit-wide-types
# -fstrict-aliasing -fstrict-overflow -fstrict-volatile-bitfields
# -fthread-jumps -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp
# -ftree-builtin-call-dce -ftree-ccp -ftree-ch -ftree-copy-prop
# -ftree-copyrename -ftree-cselim -ftree-dce -ftree-dominator-opts
# -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-distribute-patterns
# -ftree-loop-if-convert -ftree-loop-im -ftree-loop-ivcanon
# -ftree-loop-optimize -ftree-parallelize-loops= -ftree-phiprop -ftree-pre
# -ftree-pta -ftree-reassoc -ftree-scev-cprop -ftree-sink
# -ftree-slp-vectorize -ftree-sra -ftree-switch-conversion -ftree-ter
# -ftree-vect-loop-version -ftree-vectorize -ftree-vrp -funit-at-a-time
# -funswitch-loops -funwind-tables -fvect-cost-model -fverbose-asm -fzee
# -fzero-initialized-in-bss -gstrict-dwarf -m128bit-long-double -m64
# -m80387 -maccumulate-outgoing-args -malign-stringops -matt-stubs
# -mconstant-cfstrings -mfancy-math-387 -mfp-ret-in-387 -mieee-fp -mmmx
# -mno-sse4 -mpush-args -mred-zone -msse -msse2 -msse3

        .subsections_via_symbols

> Can you do an additional test to test
> -O3 -ffast-math with/without the original patch?
> 
> This way, we can know that whether it is the rtl-loop unrolling problem.

I'll do this later tonight.

> 
> Thanks,
> 
> Changpeng
> 
> 
> 
> ________________________________________
> From: Jack Howarth [howarth@bromo.med.uc.edu]
> Sent: Friday, December 17, 2010 3:04 PM
> To: Fang, Changpeng
> Cc: Zdenek Dvorak; Richard Guenther; Xinliang David Li; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH, Loop optimizer]: Add logic to disable certain loop optimizations on pre-/post-loops
> 
> On Fri, Dec 17, 2010 at 12:31:15AM -0500, Jack Howarth wrote:
> > On Thu, Dec 16, 2010 at 06:13:52PM -0600, Fang, Changpeng wrote:
> > > Hi,
> > >
> > > Based on previous discussions, I modified the patch as such.
> > >
> > > If a loop is marked as non-rolling, optimize_loop_for_size_p returns TRUE
> > > and optimize_loop_for_speed_p returns FALSE. All users of these two
> > > functions will be affected.
> > >
> > > After applying the modified patch, pb05 compilation time decreases 29%, binary
> > > size decreases 20%, while a small (0.5%) performance increase was found which may
> > > be just noise.
> > >
> > > Modified patch passed bootstrapping and gcc regression tests on x86_64-unknown-linux-gnu.
> > >
> > > Is it OK to commit to trunk?
> >
> > Changpeng,
> >     On x86_64-apple-darwin10, I am finding a more severe penalty for this patch with
> > the pb05 benchmarks. Using a profiledbootstrap BOOT_CFLAGS="-g -O3" build with...
> >
> > Configured with: ../gcc-4.6-20101216/configure --prefix=/sw --prefix=/sw/lib/gcc4.6 --mandir=/sw/share/man --infodir=/sw/lib/gcc4.6/info --enable-languages=c,c++,fortran,objc,obj-c++,java --with-gmp=/sw --with-libiconv-prefix=/sw --with-ppl=/sw --with-cloog=/sw --with-mpc=/sw --with-system-zlib --x-includes=/usr/X11R6/include --x-libraries=/usr/X11R6/lib --program-suffix=-fsf-4.6 --enable-checking=yes --enable-cloog-backend=isl --enable-build-with-cxx
> >
> > I get without the patch...
> >
> > ================================================================================
> > Date & Time     : 16 Dec 2010 23:36:27
> > Test Name       : gfortran_lin_O3
> > Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
> > Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
> > Maximum Times   :     2000.0
> > Target Error %  :      0.100
> > Minimum Repeats :    10
> > Maximum Repeats :   100
> >
> >    Benchmark   Compile  Executable   Ave Run  Number   Estim
> >         Name    (secs)     (bytes)    (secs) Repeats   Err %
> >    ---------   -------  ----------   ------- -------  ------
> >           ac      1.76       10000      8.78      12  0.0081
> >       aermod     54.94       10000     17.28      10  0.0307
> >          air      3.44       10000      5.53      13  0.0734
> >     capacita      2.64       10000     32.65      10  0.0096
> >      channel      0.89       10000      1.84      20  0.0977
> >        doduc      8.12       10000     27.00      10  0.0132
> >      fatigue      3.06       10000      8.36      10  0.0104
> >      gas_dyn      5.00       10000      4.30      17  0.0915
> >       induct      6.24       10000     12.42      10  0.0100
> >        linpk      1.02       10000     15.50      12  0.0542
> >         mdbx      2.55       10000     11.24      10  0.0256
> >           nf      2.90       10000     30.16      20  0.0989
> >      protein      7.98       10000     33.72      10  0.0070
> >       rnflow      9.34       10000     23.21      10  0.0551
> >     test_fpu      6.72       10000      8.05      10  0.0426
> >         tfft      0.76       10000      1.87      10  0.0597
> >
> > Geometric Mean Execution Time =      10.87 seconds
> >
> > and with the patch...
> >
> > ================================================================================
> > Date & Time     : 16 Dec 2010 21:31:06
> > Test Name       : gfortran_lin_O3
> > Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
> > Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
> > Maximum Times   :     2000.0
> > Target Error %  :      0.100
> > Minimum Repeats :    10
> > Maximum Repeats :   100
> >
> >    Benchmark   Compile  Executable   Ave Run  Number   Estim
> >         Name    (secs)     (bytes)    (secs) Repeats   Err %
> >    ---------   -------  ----------   ------- -------  ------
> >           ac      1.19       10000      8.78      10  0.0099
> >       aermod     47.91       10000     16.95      10  0.0123
> >          air      2.85       10000      5.34      12  0.0715
> >     capacita      1.63       10000     33.10      10  0.0361
> >      channel      0.67       10000      1.87      10  0.0884
> >        doduc      6.42       10000     27.35      10  0.0206
> >      fatigue      2.10       10000      8.32      10  0.0194
> >      gas_dyn      2.07       10000      4.30      17  0.0843
> >       induct      5.38       10000     12.58      10  0.0088
> >        linpk      0.71       10000     15.69      18  0.0796
> >         mdbx      1.95       10000     11.41      10  0.0238
> >           nf      1.24       10000     31.34      12  0.0991
> >      protein      3.88       10000     35.13      10  0.0659
> >       rnflow      4.73       10000     25.97      10  0.0629
> >     test_fpu      3.66       10000      8.88      11  0.0989
> >         tfft      0.52       10000      1.89      10  0.0403
> >
> > Geometric Mean Execution Time =      11.09 seconds
> >
> > This shows about a 2.0% performance reduction in the Geometric
> > Mean Execution Time. I would note that intel darwin now defaults
> > to -mtune=core2 and always has defaulted to -fPIC.
> >            Jack
> >
> 
> Changpeng,
>    Using your original patch from http://gcc.gnu.org/ml/gcc-patches/2010-12/msg01024.html
> I get...
> 
> ================================================================================
> Date & Time     : 17 Dec 2010 14:52:35
> Test Name       : gfortran_lin_O3
> Compile Command : gfortran -ffast-math -funroll-loops -O3 %n.f90 -o %n
> Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
> Maximum Times   :     2000.0
> Target Error %  :      0.100
> Minimum Repeats :    10
> Maximum Repeats :   100
> 
>    Benchmark   Compile  Executable   Ave Run  Number   Estim
>         Name    (secs)     (bytes)    (secs) Repeats   Err %
>    ---------   -------  ----------   ------- -------  ------
>           ac      1.24       10000      8.76      10  0.0118
>       aermod     63.85       10000     17.23      10  0.0061
>          air      2.92       10000      5.39      10  0.0577
>     capacita      1.66       10000     33.06      10  0.0259
>      channel      0.82       10000      1.87      12  0.0223
>        doduc      6.51       10000     27.39      10  0.0064
>      fatigue      2.14       10000      8.31      10  0.0228
>      gas_dyn      2.33       10000      4.29      18  0.0889
>       induct      5.66       10000     12.57      10  0.0099
>        linpk      0.76       10000     15.54      10  0.0560
>         mdbx      2.01       10000     11.45      10  0.0239
>           nf      1.29       10000     31.32      17  0.0929
>      protein      4.06       10000     33.54      10  0.0081
>       rnflow      4.89       10000     25.93      10  0.0152
>     test_fpu      3.77       10000      8.83      10  0.0279
>         tfft      0.54       10000      1.89      10  0.0266
> 
> Geometric Mean Execution Time =      11.06 seconds
> 
> which is only a slightly lower performance penalty on the
> Geometric Mean Execution Time (1.7%).
>          Jack
> 
> > >
> > > Thanks,
> > >
> > > Changpeng
> > >
> > >
> > >
> > >
> > >
> > >
> > >
> > >
> > >
> > > ________________________________________
> > > From: Zdenek Dvorak [rakdver@kam.mff.cuni.cz]
> > > Sent: Thursday, December 16, 2010 12:47 PM
> > > To: Fang, Changpeng
> > > Cc: Richard Guenther; Xinliang David Li; gcc-patches@gcc.gnu.org
> > > Subject: Re: [PATCH, Loop optimizer]: Add logic to disable certain loop optimizations on pre-/post-loops
> > >
> > > Hi,
> > >
> > > > For prefetching of prologue or epilogue loops, we have two choices (1) prefetching not
> > > > not unrolling, (2) not prefetching.  Which one do you prefer?
> > >
> > > it is better not to prefetch (the current placement of prefetches is not good for non-rolling
> > > loops),
> > >
> > > Zdenek
> > >
> >
> > Content-Description: polyhedron1.txt
> > > Comparison of Polyhedron (2005) Compile Time, Binary Size and Performance After Applying the NON-ROLLING Marking Patch
> > >
> > > gfortran -Ofast -funroll-loops -march=amdfam10 %n.f90 -o %n
> > >     ============================================================================================================================
> > > ||           |  Before Patch                     |  After Patch                       | Changes                       ||
> > > ||========================================================================================================================||
> > > ||  Benchmark    |  Compile    Binary Size  Run Time |  Compile   Binary Size       Run Time  | Compile  Binary   Performance ||
> > > ||        Name   |  Time (s)   (bytes)         (s)       |  Time(s)     (bytes)      (secs)   | Time (%) Size(%)    (%)       ||
> > > ||========================================================================================================================||
> > > ||          ac   |    3.36       41976          13.26    |    2.52       34424      13.21     |  -25.00   -17.99    0.38      ||
> > > ||      aermod   |  103.45     1412221          44.36    |   86.55     1268861      43.08     |  -16.34   -10.15    2.97      ||
> > > ||         air   |    6.11       75186          11.73    |    5.72       71090      11.56     |   -6.38    -5.45    1.47      ||
> > > ||    capacita   |    6.83       91257          88.40    |    4.70       74585      88.01     |  -31.19   -18.27    0.44      ||
> > > ||     channel   |    2.14       39984           6.65    |    1.84       35888       6.69     |  -14.02   -10.24   -0.60      ||
> > > ||       doduc   |   12.78      198624          38.59    |   12.20      186336      38.18     |   -4.54    -6.19    1.07      ||
> > > ||     fatigue   |    9.11      110008          10.15    |    5.93       92472      10.12     |  -34.91   -15.94    0.30      ||
> > > ||     gas_dyn   |   15.69      149726           7.14    |    8.45      109342       6.96     |  -46.14   -26.97    2.59      ||
> > > ||      induct   |   10.98      191800          20.66    |   10.62      188168      20.61     |   -3.28    -1.89    0.24      ||
> > > ||       linpk   |    2.27       46073          19.03    |    1.68       33401      19.03     |  -25.99   -27.50    0.00      ||
> > > ||        mdbx   |    5.63      103731          21.41    |    4.24       83251      21.33     |  -24.69   -19.74    0.38      ||
> > > ||          nf   |   14.18      118451          22.88    |    5.55       60499      23.14     |  -60.86   -48.92   -1.12      ||
> > > ||     protein   |   34.20      177700          47.04    |   19.16      135012      46.94     |  -43.98   -24.02    0.21      ||
> > > ||      rnflow   |   42.13      283645          40.30    |   20.92      178477      40.65     |  -50.34   -37.08   -0.86      ||
> > > ||    test_fpu   |   30.17      252080          14.46    |   14.44      149136      14.32     |  -52.14   -40.84    0.98      ||
> > > ||        tfft   |    1.50       32450           7.71    |    1.12       26546       7.67     |  -25.33   -18.19    0.52      ||
> > > ||========================================================================================================================||
> > > ||       average   |                            19.57    |                          19.46     |  -29.07   -20.59    0.56      ||
> > > ============================================================================================================================
> >
> > Content-Description: 0001-Don-t-perform-certain-loop-optimizations-on-pre-post.patch
> > > From cd8b85bba1b39e108235f44d9d07918179ff3d79 Mon Sep 17 00:00:00 2001
> > > From: Changpeng Fang <chfang@houghton.(none)>
> > > Date: Mon, 13 Dec 2010 12:01:49 -0800
> > > Subject: [PATCH] Don't perform certain loop optimizations on pre/post loops
> > >
> > >     * basic-block.h (bb_flags): Add a new flag BB_HEADER_OF_NONROLLING_LOOP.
> > >     * cfg.c (clear_bb_flags): Keep BB_HEADER_OF_NONROLLING marker.
> > >     * cfgloop.h (mark_non_rolling_loop): New function declaration.
> > >       (non_rolling_loop_p): New function declaration.
> > >     * predict.c (optimize_loop_for_size_p): Return true if the loop was marked
> > >       NON-ROLLING.  (optimize_loop_for_speed_p): Return false if the loop was
> > >       marked NON-ROLLING.
> > >     * tree-ssa-loop-manip.c (tree_transform_and_unroll_loop): Mark the
> > >       non-rolling loop.
> > >     * tree-ssa-loop-niter.c (mark_non_rolling_loop): Implement the new
> > >       function.  (non_rolling_loop_p): Implement the new function.
> > >     * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Mark the
> > >       non-rolling loop.  (vect_do_peeling_for_alignment): Mark the non-rolling
> > >       loop.
> > > ---
> > >  gcc/basic-block.h          |    6 +++++-
> > >  gcc/cfg.c                  |    7 ++++---
> > >  gcc/cfgloop.h              |    2 ++
> > >  gcc/predict.c              |    6 ++++++
> > >  gcc/tree-ssa-loop-manip.c  |    3 +++
> > >  gcc/tree-ssa-loop-niter.c  |   20 ++++++++++++++++++++
> > >  gcc/tree-vect-loop-manip.c |    8 ++++++++
> > >  7 files changed, 48 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/gcc/basic-block.h b/gcc/basic-block.h
> > > index be0a1d1..850472d 100644
> > > --- a/gcc/basic-block.h
> > > +++ b/gcc/basic-block.h
> > > @@ -245,7 +245,11 @@ enum bb_flags
> > >
> > >    /* Set on blocks that cannot be threaded through.
> > >       Only used in cfgcleanup.c.  */
> > > -  BB_NONTHREADABLE_BLOCK = 1 << 11
> > > +  BB_NONTHREADABLE_BLOCK = 1 << 11,
> > > +
> > > +  /* Set on blocks that are headers of non-rolling loops.  */
> > > +  BB_HEADER_OF_NONROLLING_LOOP = 1 << 12
> > > +
> > >  };
> > >
> > >  /* Dummy flag for convenience in the hot/cold partitioning code.  */
> > > diff --git a/gcc/cfg.c b/gcc/cfg.c
> > > index c8ef799..e59a637 100644
> > > --- a/gcc/cfg.c
> > > +++ b/gcc/cfg.c
> > > @@ -425,8 +425,8 @@ redirect_edge_pred (edge e, basic_block new_pred)
> > >    connect_src (e);
> > >  }
> > >
> > > -/* Clear all basic block flags, with the exception of partitioning and
> > > -   setjmp_target.  */
> > > +/* Clear all basic block flags, with the exception of partitioning,
> > > +   setjmp_target, and the non-rolling loop marker.  */
> > >  void
> > >  clear_bb_flags (void)
> > >  {
> > > @@ -434,7 +434,8 @@ clear_bb_flags (void)
> > >
> > >    FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR, NULL, next_bb)
> > >      bb->flags = (BB_PARTITION (bb)
> > > -            | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET)));
> > > +            | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET
> > > +                            + BB_HEADER_OF_NONROLLING_LOOP)));
> > >  }
> > >
> > >  /* Check the consistency of profile information.  We can't do that
> > > diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
> > > index bf2614e..e856a78 100644
> > > --- a/gcc/cfgloop.h
> > > +++ b/gcc/cfgloop.h
> > > @@ -279,6 +279,8 @@ extern rtx doloop_condition_get (rtx);
> > >  void estimate_numbers_of_iterations_loop (struct loop *, bool);
> > >  HOST_WIDE_INT estimated_loop_iterations_int (struct loop *, bool);
> > >  bool estimated_loop_iterations (struct loop *, bool, double_int *);
> > > +void mark_non_rolling_loop (struct loop *);
> > > +bool non_rolling_loop_p (struct loop *);
> > >
> > >  /* Loop manipulation.  */
> > >  extern bool can_duplicate_loop_p (const struct loop *loop);
> > > diff --git a/gcc/predict.c b/gcc/predict.c
> > > index c691990..bf729f8 100644
> > > --- a/gcc/predict.c
> > > +++ b/gcc/predict.c
> > > @@ -279,6 +279,9 @@ optimize_insn_for_speed_p (void)
> > >  bool
> > >  optimize_loop_for_size_p (struct loop *loop)
> > >  {
> > > +  /* Loops marked NON-ROLLING are not likely to be hot.  */
> > > +  if (non_rolling_loop_p (loop))
> > > +    return true;
> > >    return optimize_bb_for_size_p (loop->header);
> > >  }
> > >
> > > @@ -287,6 +290,9 @@ optimize_loop_for_size_p (struct loop *loop)
> > >  bool
> > >  optimize_loop_for_speed_p (struct loop *loop)
> > >  {
> > > +  /* Loops marked NON-ROLLING are not likely to be hot.  */
> > > +  if (non_rolling_loop_p (loop))
> > > +    return false;
> > >    return optimize_bb_for_speed_p (loop->header);
> > >  }
> > >
> > > diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c
> > > index 87b2c0d..bc977bb 100644
> > > --- a/gcc/tree-ssa-loop-manip.c
> > > +++ b/gcc/tree-ssa-loop-manip.c
> > > @@ -931,6 +931,9 @@ tree_transform_and_unroll_loop (struct loop *loop, unsigned factor,
> > >    gcc_assert (new_loop != NULL);
> > >    update_ssa (TODO_update_ssa);
> > >
> > > +  /* NEW_LOOP is a non-rolling loop.  */
> > > +  mark_non_rolling_loop (new_loop);
> > > +
> > >    /* Determine the probability of the exit edge of the unrolled loop.  */
> > >    new_est_niter = est_niter / factor;
> > >
> > > diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c
> > > index ee85f6f..1e2e4b2 100644
> > > --- a/gcc/tree-ssa-loop-niter.c
> > > +++ b/gcc/tree-ssa-loop-niter.c
> > > @@ -3011,6 +3011,26 @@ estimate_numbers_of_iterations (bool use_undefined_p)
> > >    fold_undefer_and_ignore_overflow_warnings ();
> > >  }
> > >
> > > +/* Mark LOOP as a non-rolling loop.  */
> > > +
> > > +void
> > > +mark_non_rolling_loop (struct loop *loop)
> > > +{
> > > +  gcc_assert (loop && loop->header);
> > > +  loop->header->flags |= BB_HEADER_OF_NONROLLING_LOOP;
> > > +}
> > > +
> > > +/* Return true if LOOP is a non-rolling loop.  */
> > > +
> > > +bool
> > > +non_rolling_loop_p (struct loop *loop)
> > > +{
> > > +  int masked_flags;
> > > +  gcc_assert (loop && loop->header);
> > > +  masked_flags = (loop->header->flags & BB_HEADER_OF_NONROLLING_LOOP);
> > > +  return (masked_flags != 0);
> > > +}
> > > +
> > >  /* Returns true if statement S1 dominates statement S2.  */
> > >
> > >  bool
> > > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> > > index 6ecd304..216de78 100644
> > > --- a/gcc/tree-vect-loop-manip.c
> > > +++ b/gcc/tree-vect-loop-manip.c
> > > @@ -1938,6 +1938,10 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio,
> > >                                         cond_expr, cond_expr_stmt_list);
> > >    gcc_assert (new_loop);
> > >    gcc_assert (loop_num == loop->num);
> > > +
> > > +  /* NEW_LOOP is a non-rolling loop.  */
> > > +  mark_non_rolling_loop (new_loop);
> > > +
> > >  #ifdef ENABLE_CHECKING
> > >    slpeel_verify_cfg_after_peeling (loop, new_loop);
> > >  #endif
> > > @@ -2191,6 +2195,10 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
> > >                                th, true, NULL_TREE, NULL);
> > >
> > >    gcc_assert (new_loop);
> > > +
> > > +  /* NEW_LOOP is a non-rolling loop.  */
> > > +  mark_non_rolling_loop (new_loop);
> > > +
> > >  #ifdef ENABLE_CHECKING
> > >    slpeel_verify_cfg_after_peeling (new_loop, loop);
> > >  #endif
> > > --
> > > 1.6.3.3
> > >
>
Jack Howarth Dec. 18, 2010, 3:42 a.m. UTC | #11
On Fri, Dec 17, 2010 at 03:30:37PM -0600, Fang, Changpeng wrote:
> Hi, Jack:
> 
> Is prefetch default on at -O3 on your systems?
> 
> Can you do an additional test to test
> -O3 -ffast-math with/without the original patch?
> 
> This way, we can know that whether it is the rtl-loop unrolling problem.

unpatched r168001 

================================================================================
Date & Time     : 17 Dec 2010 19:24:47
Test Name       : gfortran_lin_O3_nounroll
Compile Command : gfortran -ffast-math -O3 %n.f90 -o %n
Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
Maximum Times   :     2000.0
Target Error %  :      0.100
Minimum Repeats :    10
Maximum Repeats :   100

   Benchmark   Compile  Executable   Ave Run  Number   Estim
        Name    (secs)     (bytes)    (secs) Repeats   Err %
   ---------   -------  ----------   ------- -------  ------
          ac      1.01       10000      8.76      10  0.0129
      aermod     46.64       10000     17.23      10  0.0154
         air      2.57       10000      5.45      10  0.0579
    capacita      1.63       10000     33.17      10  0.0407
     channel      0.62       10000      1.87      10  0.0166
       doduc      6.21       10000     27.33      10  0.0067
     fatigue      2.02       10000      7.95      10  0.0339
     gas_dyn      2.18       10000      4.36      19  0.0871
      induct      5.08       10000     12.44      10  0.0054
       linpk      0.70       10000     15.52      10  0.0638
        mdbx      1.87       10000     11.46      10  0.0080
          nf      1.25       10000     31.27      19  0.0955
     protein      3.66       10000     35.68      10  0.0062
      rnflow      4.54       10000     26.09      10  0.0104
    test_fpu      3.46       10000      8.81      10  0.0443
        tfft      0.50       10000      1.90      10  0.0442

Geometric Mean Execution Time =      11.09 seconds

================================================================================

r168001 with original patch

================================================================================
Date & Time     : 17 Dec 2010 21:49:34
Test Name       : gfortran_lin_O3_nounroll
Compile Command : gfortran -ffast-math -O3 %n.f90 -o %n
Benchmarks      : ac aermod air capacita channel doduc fatigue gas_dyn induct linpk mdbx nf protein rnflow test_fpu tfft
Maximum Times   :     2000.0
Target Error %  :      0.100
Minimum Repeats :    10
Maximum Repeats :   100

   Benchmark   Compile  Executable   Ave Run  Number   Estim
        Name    (secs)     (bytes)    (secs) Repeats   Err %
   ---------   -------  ----------   ------- -------  ------
          ac      0.89       10000      8.76      10  0.0146
      aermod     45.65       10000     17.23      10  0.0068
         air      2.48       10000      5.44      12  0.0345
    capacita      1.50       10000     33.20      10  0.0661
     channel      0.60       10000      1.87      10  0.0331
       doduc      6.07       10000     27.34      10  0.0118
     fatigue      1.84       10000      7.96      10  0.0169
     gas_dyn      1.95       10000      4.36      17  0.0911
      induct      4.97       10000     12.44      10  0.0068
       linpk      0.69       10000     15.54      10  0.0884
        mdbx      1.83       10000     11.46      10  0.0258
          nf      1.09       10000     31.38      20  0.0899
     protein      3.09       10000     35.64      10  0.0141
      rnflow      4.22       10000     26.06      10  0.0883
    test_fpu      3.24       10000      8.82      12  0.0866
        tfft      0.50       10000      1.91      10  0.0475

Geometric Mean Execution Time =      11.09 seconds

================================================================================

So the performance regression with the patch only manifests itself with
-funroll-loops.
              Jack

> 
> Thanks,
> 
> Changpeng
> 
> 
> 
>
diff mbox

Patch

From cd8b85bba1b39e108235f44d9d07918179ff3d79 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <chfang@houghton.(none)>
Date: Mon, 13 Dec 2010 12:01:49 -0800
Subject: [PATCH] Don't perform certain loop optimizations on pre/post loops

	* basic-block.h (bb_flags): Add a new flag BB_HEADER_OF_NONROLLING_LOOP.
	* cfg.c (clear_bb_flags): Keep BB_HEADER_OF_NONROLLING marker.
	* cfgloop.h (mark_non_rolling_loop): New function declaration.
	  (non_rolling_loop_p): New function declaration.
	* predict.c (optimize_loop_for_size_p): Return true if the loop was marked
	  NON-ROLLING.  (optimize_loop_for_speed_p): Return false if the loop was
	  marked NON-ROLLING.
	* tree-ssa-loop-manip.c (tree_transform_and_unroll_loop): Mark the
	  non-rolling loop.
	* tree-ssa-loop-niter.c (mark_non_rolling_loop): Implement the new
	  function.  (non_rolling_loop_p): Implement the new function.
	* tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Mark the
	  non-rolling loop.  (vect_do_peeling_for_alignment): Mark the non-rolling
	  loop.
---
 gcc/basic-block.h          |    6 +++++-
 gcc/cfg.c                  |    7 ++++---
 gcc/cfgloop.h              |    2 ++
 gcc/predict.c              |    6 ++++++
 gcc/tree-ssa-loop-manip.c  |    3 +++
 gcc/tree-ssa-loop-niter.c  |   20 ++++++++++++++++++++
 gcc/tree-vect-loop-manip.c |    8 ++++++++
 7 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/gcc/basic-block.h b/gcc/basic-block.h
index be0a1d1..850472d 100644
--- a/gcc/basic-block.h
+++ b/gcc/basic-block.h
@@ -245,7 +245,11 @@  enum bb_flags
 
   /* Set on blocks that cannot be threaded through.
      Only used in cfgcleanup.c.  */
-  BB_NONTHREADABLE_BLOCK = 1 << 11
+  BB_NONTHREADABLE_BLOCK = 1 << 11,
+
+  /* Set on blocks that are headers of non-rolling loops.  */
+  BB_HEADER_OF_NONROLLING_LOOP = 1 << 12
+
 };
 
 /* Dummy flag for convenience in the hot/cold partitioning code.  */
diff --git a/gcc/cfg.c b/gcc/cfg.c
index c8ef799..e59a637 100644
--- a/gcc/cfg.c
+++ b/gcc/cfg.c
@@ -425,8 +425,8 @@  redirect_edge_pred (edge e, basic_block new_pred)
   connect_src (e);
 }
 
-/* Clear all basic block flags, with the exception of partitioning and
-   setjmp_target.  */
+/* Clear all basic block flags, with the exception of partitioning,
+   setjmp_target, and the non-rolling loop marker.  */
 void
 clear_bb_flags (void)
 {
@@ -434,7 +434,8 @@  clear_bb_flags (void)
 
   FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR, NULL, next_bb)
     bb->flags = (BB_PARTITION (bb)
-		 | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET)));
+		 | (bb->flags & (BB_DISABLE_SCHEDULE + BB_RTL + BB_NON_LOCAL_GOTO_TARGET
+				 + BB_HEADER_OF_NONROLLING_LOOP)));
 }
 
 /* Check the consistency of profile information.  We can't do that
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index bf2614e..e856a78 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -279,6 +279,8 @@  extern rtx doloop_condition_get (rtx);
 void estimate_numbers_of_iterations_loop (struct loop *, bool);
 HOST_WIDE_INT estimated_loop_iterations_int (struct loop *, bool);
 bool estimated_loop_iterations (struct loop *, bool, double_int *);
+void mark_non_rolling_loop (struct loop *);
+bool non_rolling_loop_p (struct loop *);
 
 /* Loop manipulation.  */
 extern bool can_duplicate_loop_p (const struct loop *loop);
diff --git a/gcc/predict.c b/gcc/predict.c
index c691990..bf729f8 100644
--- a/gcc/predict.c
+++ b/gcc/predict.c
@@ -279,6 +279,9 @@  optimize_insn_for_speed_p (void)
 bool
 optimize_loop_for_size_p (struct loop *loop)
 {
+  /* Loops marked NON-ROLLING are not likely to be hot.  */
+  if (non_rolling_loop_p (loop))
+    return true;
   return optimize_bb_for_size_p (loop->header);
 }
 
@@ -287,6 +290,9 @@  optimize_loop_for_size_p (struct loop *loop)
 bool
 optimize_loop_for_speed_p (struct loop *loop)
 {
+  /* Loops marked NON-ROLLING are not likely to be hot.  */
+  if (non_rolling_loop_p (loop))
+    return false;
   return optimize_bb_for_speed_p (loop->header);
 }
 
diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c
index 87b2c0d..bc977bb 100644
--- a/gcc/tree-ssa-loop-manip.c
+++ b/gcc/tree-ssa-loop-manip.c
@@ -931,6 +931,9 @@  tree_transform_and_unroll_loop (struct loop *loop, unsigned factor,
   gcc_assert (new_loop != NULL);
   update_ssa (TODO_update_ssa);
 
+  /* NEW_LOOP is a non-rolling loop.  */
+  mark_non_rolling_loop (new_loop);
+
   /* Determine the probability of the exit edge of the unrolled loop.  */
   new_est_niter = est_niter / factor;
 
diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c
index ee85f6f..1e2e4b2 100644
--- a/gcc/tree-ssa-loop-niter.c
+++ b/gcc/tree-ssa-loop-niter.c
@@ -3011,6 +3011,26 @@  estimate_numbers_of_iterations (bool use_undefined_p)
   fold_undefer_and_ignore_overflow_warnings ();
 }
 
+/* Mark LOOP as a non-rolling loop.  */
+
+void
+mark_non_rolling_loop (struct loop *loop)
+{
+  gcc_assert (loop && loop->header);
+  loop->header->flags |= BB_HEADER_OF_NONROLLING_LOOP;
+}
+
+/* Return true if LOOP is a non-rolling loop.  */
+
+bool
+non_rolling_loop_p (struct loop *loop)
+{
+  int masked_flags;
+  gcc_assert (loop && loop->header);
+  masked_flags = (loop->header->flags & BB_HEADER_OF_NONROLLING_LOOP);
+  return (masked_flags != 0);
+}
+
 /* Returns true if statement S1 dominates statement S2.  */
 
 bool
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 6ecd304..216de78 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -1938,6 +1938,10 @@  vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio,
 					    cond_expr, cond_expr_stmt_list);
   gcc_assert (new_loop);
   gcc_assert (loop_num == loop->num);
+  
+  /* NEW_LOOP is a non-rolling loop.  */
+  mark_non_rolling_loop (new_loop);
+
 #ifdef ENABLE_CHECKING
   slpeel_verify_cfg_after_peeling (loop, new_loop);
 #endif
@@ -2191,6 +2195,10 @@  vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
 				   th, true, NULL_TREE, NULL);
 
   gcc_assert (new_loop);
+
+  /* NEW_LOOP is a non-rolling loop.  */
+  mark_non_rolling_loop (new_loop);
+  
 #ifdef ENABLE_CHECKING
   slpeel_verify_cfg_after_peeling (new_loop, loop);
 #endif
-- 
1.6.3.3