Patchwork [35/44] New pass: loop flattening.

login
register
mail settings
Submitter Sebastian Pop
Date Sept. 30, 2010, 6:01 p.m.
Message ID <1285869696-10915-36-git-send-email-sebpop@gmail.com>
Download mbox | patch
Permalink /patch/66227/
State New
Headers show

Comments

Sebastian Pop - Sept. 30, 2010, 6:01 p.m.
From: spop <spop@138bc75d-0d04-0410-961f-82ee72b054a4>

2010-09-09  Sebastian Pop  <sebastian.pop@amd.com>

	* Makefile.in (OBJS-common): Add graphite-flattening.o.
	(graphite-flattening.o): New rule.
	* common.opt (floop-flatten): New flag.
	* doc/invoke.texi (-floop-flatten): Documented.
	* graphite-flattening.c: New.
	* graphite-poly.c (apply_poly_transforms): Call flatten_all_loops.
	* graphite-poly.h (flatten_all_loops): Declared.
	(lst_remove_loop_and_inline_stmts_in_loop_father): New.
	* tree-ssa-loop.c (gate_graphite_transforms): When flag_loop_flatten
	is set, also set flag_graphite.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/graphite@164128 138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/ChangeLog             |   13 ++
 gcc/ChangeLog.graphite    |   13 ++
 gcc/Makefile.in           |    7 +
 gcc/common.opt            |    4 +
 gcc/doc/invoke.texi       |   12 +-
 gcc/graphite-flattening.c |  442 +++++++++++++++++++++++++++++++++++++++++++++
 gcc/graphite-poly.c       |    6 +-
 gcc/graphite-poly.h       |   27 +++-
 gcc/tree-ssa-loop.c       |    8 +-
 9 files changed, 525 insertions(+), 7 deletions(-)
 create mode 100644 gcc/graphite-flattening.c

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d4ff450..ff91fff 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,18 @@ 
 2010-09-30  Sebastian Pop  <sebastian.pop@amd.com>
 
+	* Makefile.in (OBJS-common): Add graphite-flattening.o.
+	(graphite-flattening.o): New rule.
+	* common.opt (floop-flatten): New flag.
+	* doc/invoke.texi (-floop-flatten): Documented.
+	* graphite-flattening.c: New.
+	* graphite-poly.c (apply_poly_transforms): Call flatten_all_loops.
+	* graphite-poly.h (flatten_all_loops): Declared.
+	(lst_remove_loop_and_inline_stmts_in_loop_father): New.
+	* tree-ssa-loop.c (gate_graphite_transforms): When flag_loop_flatten
+	is set, also set flag_graphite.
+
+2010-09-30  Sebastian Pop  <sebastian.pop@amd.com>
+
 	* graphite-poly.c (cloog_checksum): New.
 	* graphite-poly.h (cloog_checksum): Declared.
 
diff --git a/gcc/ChangeLog.graphite b/gcc/ChangeLog.graphite
index 368d77d..7e0e887 100644
--- a/gcc/ChangeLog.graphite
+++ b/gcc/ChangeLog.graphite
@@ -1,5 +1,18 @@ 
 2010-09-09  Sebastian Pop  <sebastian.pop@amd.com>
 
+	* Makefile.in (OBJS-common): Add graphite-flattening.o.
+	(graphite-flattening.o): New rule.
+	* common.opt (floop-flatten): New flag.
+	* doc/invoke.texi (-floop-flatten): Documented.
+	* graphite-flattening.c: New.
+	* graphite-poly.c (apply_poly_transforms): Call flatten_all_loops.
+	* graphite-poly.h (flatten_all_loops): Declared.
+	(lst_remove_loop_and_inline_stmts_in_loop_father): New.
+	* tree-ssa-loop.c (gate_graphite_transforms): When flag_loop_flatten
+	is set, also set flag_graphite.
+
+2010-09-09  Sebastian Pop  <sebastian.pop@amd.com>
+
 	* graphite-poly.c (cloog_checksum): New.
 	* graphite-poly.h (cloog_checksum): Declared.
 
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index dc10c58..26cd529 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1244,6 +1244,7 @@  OBJS-common = \
 	graphite-clast-to-gimple.o \
 	graphite-cloog-util.o \
 	graphite-dependences.o \
+	graphite-flattening.o \
 	graphite-interchange.o \
 	graphite-poly.o \
 	graphite-ppl.o \
@@ -2695,6 +2696,12 @@  graphite-dependences.o: graphite-dependences.c $(CONFIG_H) $(SYSTEM_H) \
    $(TOPLEV_H) $(DIAGNOSTIC_CORE_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
    $(GIMPLE_H) $(TREE_DATA_REF_H) tree-pass.h domwalk.h \
    graphite.h graphite-poly.h graphite-ppl.h graphite-dependences.h
+graphite-flattening.o: graphite-flattening.c $(CONFIG_H) $(SYSTEM_H)	\
+   coretypes.h $(TM_H) $(GGC_H) $(TREE_H) $(RTL_H) output.h		\
+   $(BASIC_BLOCK_H) $(DIAGNOSTIC_H) $(TOPLEV_H) $(TREE_FLOW_H)		\
+   $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) $(GIMPLE_H)			\
+   $(TREE_DATA_REF_H) tree-pass.h domwalk.h value-prof.h graphite.h	\
+   graphite-poly.h graphite-ppl.h
 graphite-interchange.o: graphite-interchange.c $(CONFIG_H) $(SYSTEM_H) \
    coretypes.h \
    $(TM_H) $(GGC_H) $(TREE_H) $(RTL_H) output.h $(BASIC_BLOCK_H) $(DIAGNOSTIC_H) \
diff --git a/gcc/common.opt b/gcc/common.opt
index 2b64366..938b76e 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -870,6 +870,10 @@  floop-block
 Common Report Var(flag_loop_block) Optimization
 Enable Loop Blocking transformation
 
+floop-flatten
+Common Report Var(flag_loop_flatten) Optimization
+Enable Loop Flattening transformation
+
 fstrict-volatile-bitfields
 Common Report Var(flag_strict_volatile_bitfields) Init(-1)
 Force bitfield accesses to match their type width
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 51ce647..606706b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -352,7 +352,7 @@  Objective-C and Objective-C++ Dialects}.
 -fira-loop-pressure -fno-ira-share-save-slots @gol
 -fno-ira-share-spill-slots -fira-verbose=@var{n} @gol
 -fivopts -fkeep-inline-functions -fkeep-static-consts @gol
--floop-block -floop-interchange -floop-strip-mine @gol
+-floop-block -floop-flatten -floop-interchange -floop-strip-mine @gol
 -floop-parallelize-all -flto -flto-compression-level -flto-report @gol
 -fltrans -fltrans-output-list -fmerge-all-constants -fmerge-constants @gol
 -fmodulo-sched -fmodulo-sched-allow-regmoves -fmove-loop-invariants @gol
@@ -6788,6 +6788,7 @@  Perform linear loop transformations on tree.  This flag can improve cache
 performance and allow further loop optimizations to take place.
 
 @item -floop-interchange
+@opindex floop-interchange
 Perform loop interchange transformations on loops.  Interchanging two
 nested loops switches the inner and outer loops.  For example, given a
 loop like:
@@ -6816,6 +6817,7 @@  with @option{--with-ppl} and @option{--with-cloog} to enable the
 Graphite loop transformation infrastructure.
 
 @item -floop-strip-mine
+@opindex floop-strip-mine
 Perform loop strip mining transformations on loops.  Strip mining
 splits a loop into two nested loops.  The outer loop has strides
 equal to the strip size and the inner loop has strides of the
@@ -6841,6 +6843,7 @@  be configured with @option{--with-ppl} and @option{--with-cloog} to
 enable the Graphite loop transformation infrastructure.
 
 @item -floop-block
+@opindex floop-block
 Perform loop blocking transformations on loops.  Blocking strip mines
 each loop in the loop nest such that the memory accesses of the
 element loops fit inside caches.  The strip length can be changed
@@ -6882,7 +6885,14 @@  GIMPLE -> GRAPHITE -> GIMPLE transformation.  Some minimal optimizations
 are also performed by the code generator CLooG, like index splitting and
 dead code elimination in loops.
 
+@item -floop-flatten
+@opindex floop-flatten
+Removes the loop nesting structure: transforms the loop nest into a
+single loop.  This transformation can be useful to vectorize all the
+levels of the loop nest.
+
 @item -floop-parallelize-all
+@opindex floop-parallelize-all
 Use the Graphite data dependence analysis to identify loops that can
 be parallelized.  Parallelize all the loops that can be analyzed to
 not contain loop carried dependences without checking that it is
diff --git a/gcc/graphite-flattening.c b/gcc/graphite-flattening.c
new file mode 100644
index 0000000..0f98337
--- /dev/null
+++ b/gcc/graphite-flattening.c
@@ -0,0 +1,442 @@ 
+/* Loop flattening for Graphite.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Sebastian Pop <sebastian.pop@amd.com>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "ggc.h"
+#include "tree.h"
+#include "rtl.h"
+#include "output.h"
+#include "basic-block.h"
+#include "diagnostic.h"
+#include "tree-flow.h"
+#include "toplev.h"
+#include "tree-dump.h"
+#include "timevar.h"
+#include "cfgloop.h"
+#include "tree-chrec.h"
+#include "tree-data-ref.h"
+#include "tree-scalar-evolution.h"
+#include "tree-pass.h"
+#include "domwalk.h"
+#include "value-prof.h"
+#include "pointer-set.h"
+#include "gimple.h"
+#include "params.h"
+
+#ifdef HAVE_cloog
+#include "ppl_c.h"
+#include "sese.h"
+#include "graphite-ppl.h"
+#include "graphite.h"
+#include "graphite-poly.h"
+
+/* The loop flattening pass transforms loop nests into a single loop,
+   removing the loop nesting structure.  The auto-vectorization can
+   then apply on the full loop body, without needing the outer-loop
+   vectorization.
+
+   The canonical example is as follows: suppose that we have a loop
+   nest with known iteration counts
+
+   | for (i = 1; i <= 6; i++)
+   |   for (j = 1; j <= 6; j++)
+   |     S1(i,j);
+
+   The loop flattening is performed by linearizing the iteration space
+   using the function "f (x) = 6 * i + j".  In this case, CLooG would
+   produce this code:
+
+   | for (c1=7;c1<=42;c1++) {
+   |   i = floord(c1-1,6);
+   |   S1(i,c1-6*i);
+   | }
+
+   There are several limitations for loop flattening that are linked
+   to the expressivity of the polyhedral model.  One has to take an
+   upper bound approximation to deal with the parametric case of loop
+   flattening.  For example, in the loop nest:
+
+   | for (i = 1; i <= N; i++)
+   |   for (j = 1; j <= M; j++)
+   |     S1(i,j);
+
+   One would like to flatten this loop using a linearization function
+   like this "f (x) = M * i + j".  However CLooG's schedules are not
+   expressive enough to deal with this case, and so the parameter M
+   has to be replaced by an integer upper bound approximation.  If we
+   further know in the context of the scop that "M <= 6", then it is
+   possible to linearize the loop with "f (x) = 6 * i + j".  In this
+   case, CLooG would produce this code:
+
+   | for (c1=7;c1<=6*N+M;c1++) {
+   |   i = ceild(c1-M,6);
+   |   if (i <= floord(c1-1,6)) {
+   |     S1(i,c1-6*i);
+   |   }
+   | }
+
+   For arbitrarily complex loop nests the algorithm proceeds in two
+   steps.  First, the LST is flattened by removing the loops structure
+   and by inserting the statements in the order they appear in
+   depth-first order.  Then, the scattering of each statement is
+   transformed such that it corresponds to the flattened structure.
+
+   Supposing that the original program is represented by the following
+   LST:
+
+   | (loop_1
+   |  stmt_1
+   |  (loop_2 stmt_3
+   |   (loop_3 stmt_4)
+   |   (loop_4 stmt_5 stmt_6)
+   |   stmt_7
+   |  )
+   |  stmt_2
+   | )
+
+   Loop flattening traverses the LST in depth-first order, and
+   flattens pairs of loops successively by projecting the inner loops
+   in the iteration domain of the outer loops:
+
+   lst_project_loop (loop_2, loop_3, stride)
+
+   | (loop_1
+   |  stmt_1
+   |  (loop_2 stmt_3 stmt_4
+   |   (loop_4 stmt_5 stmt_6)
+   |   stmt_7
+   |  )
+   |  stmt_2
+   | )
+
+   lst_project_loop (loop_2, loop_4, stride)
+
+   | (loop_1
+   |  stmt_1
+   |  (loop_2 stmt_3 stmt_4 stmt_5 stmt_6 stmt_7)
+   |  stmt_2
+   | )
+
+   lst_project_loop (loop_1, loop_2, stride)
+
+   | (loop_1
+   |  stmt_1 stmt_3 stmt_4 stmt_5 stmt_6 stmt_7 stmt_2
+   | )
+
+   At each step, the iteration domain of the outer loop is enlarged to
+   contain enough points to iterate over the inner loop domain.  */
+
+/* Sets RES, already initialized by the caller, to the cardinal of the
+   iteration domain of LST, i.e. the niter of the linearized loop.  */
+
+static void
+lst_linearized_niter (lst_p lst, mpz_t res)
+{
+  int i;
+  lst_p l;
+  mpz_t n;
+
+  mpz_init (n);
+  mpz_set_si (res, 0);
+
+  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), i, l)
+    if (LST_LOOP_P (l))
+      {
+	lst_linearized_niter (l, n);
+	mpz_add (res, res, n);
+      }
+
+  if (LST_LOOP_P (lst))
+    {
+      lst_niter_for_loop (lst, n);
+      /* Multiply the sum for the inner loops by N; use N when it is 0.  */
+      if (mpz_cmp_si (res, 0) != 0)
+	mpz_mul (res, res, n);
+      else
+	mpz_set (res, n);
+    }
+
+  mpz_clear (n);
+}
+
+/* Applies the translation "f (x) = x + OFFSET" to the dynamic schedule
+   of the loop containing STMT, in the transformed scattering of STMT.  */
+
+static void
+lst_offset (lst_p stmt, mpz_t offset)
+{
+  lst_p inner = LST_LOOP_FATHER (stmt);
+  poly_bb_p pbb = LST_PBB (stmt);
+  ppl_Polyhedron_t poly = PBB_TRANSFORMED_SCATTERING (pbb);
+  int inner_depth = lst_depth (inner);
+  ppl_dimension_type inner_dim = psct_dynamic_dim (pbb, inner_depth);
+  ppl_Linear_Expression_t expr;
+  ppl_dimension_type dim;
+  ppl_Coefficient_t one;
+  mpz_t x;
+
+  mpz_init_set_si (x, 1);
+  ppl_new_Coefficient (&one);
+  ppl_assign_Coefficient_from_mpz_t (one, x);
+
+  ppl_Polyhedron_space_dimension (poly, &dim);
+  ppl_new_Linear_Expression_with_dimension (&expr, dim);
+
+  ppl_set_coef (expr, inner_dim, 1);
+  ppl_set_inhomogeneous_gmp (expr, offset);
+  ppl_Polyhedron_affine_image (poly, inner_dim, expr, one);
+  ppl_delete_Linear_Expression (expr);
+  mpz_clear (x);
+  ppl_delete_Coefficient (one);
+}
+
+/* Scale by FACTOR the schedule of loop LST in the scattering of STMT.  */
+
+static void
+lst_scale (lst_p lst, lst_p stmt, mpz_t factor)
+{
+  mpz_t x;
+  ppl_Coefficient_t one;
+  int outer_depth = lst_depth (lst);
+  poly_bb_p pbb = LST_PBB (stmt);
+  ppl_Polyhedron_t poly = PBB_TRANSFORMED_SCATTERING (pbb);
+  ppl_dimension_type outer_dim = psct_dynamic_dim (pbb, outer_depth);
+  ppl_Linear_Expression_t expr;
+  ppl_dimension_type dim;
+
+  mpz_init (x);
+  mpz_set_si (x, 1);
+  ppl_new_Coefficient (&one);
+  ppl_assign_Coefficient_from_mpz_t (one, x);
+
+  ppl_Polyhedron_space_dimension (poly, &dim);
+  ppl_new_Linear_Expression_with_dimension (&expr, dim);
+
+  /* outer_dim = (factor * outer_dim) / one.  */
+  ppl_set_coef_gmp (expr, outer_dim, factor);
+  ppl_Polyhedron_affine_image (poly, outer_dim, expr, one);
+  ppl_delete_Linear_Expression (expr);
+
+  mpz_clear (x);
+  ppl_delete_Coefficient (one);
+}
+
+/* Project the INNER loop into the iteration domain of the OUTER loop,
+   by rewriting the transformed scattering of each statement of INNER.
+   STRIDE is the number of iterations between two iterations of the
+   outer loop.  INNER is expected to contain only statements.  */
+
+static void
+lst_project_loop (lst_p outer, lst_p inner, mpz_t stride)
+{
+  int i;
+  lst_p stmt;
+  mpz_t x;
+  ppl_Coefficient_t one;
+  int outer_depth = lst_depth (outer);
+  int inner_depth = lst_depth (inner);
+
+  mpz_init (x);
+  mpz_set_si (x, 1);
+  ppl_new_Coefficient (&one);
+  ppl_assign_Coefficient_from_mpz_t (one, x);
+
+  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (inner), i, stmt)
+    {
+      poly_bb_p pbb = LST_PBB (stmt);
+      ppl_Polyhedron_t poly = PBB_TRANSFORMED_SCATTERING (pbb);
+      ppl_dimension_type outer_dim = psct_dynamic_dim (pbb, outer_depth);
+      ppl_dimension_type inner_dim = psct_dynamic_dim (pbb, inner_depth);
+      ppl_Linear_Expression_t expr;
+      ppl_dimension_type dim;
+      ppl_dimension_type ds[2];
+
+      /* There should be no loops under INNER.  */
+      gcc_assert (!LST_LOOP_P (stmt));
+      ppl_Polyhedron_space_dimension (poly, &dim);
+      ppl_new_Linear_Expression_with_dimension (&expr, dim);
+
+      /* outer_dim = outer_dim * stride + inner_dim.  */
+      ppl_set_coef (expr, inner_dim, 1);
+      ppl_set_coef_gmp (expr, outer_dim, stride);
+      ppl_Polyhedron_affine_image (poly, outer_dim, expr, one);
+      ppl_delete_Linear_Expression (expr);
+
+      /* Project on inner_dim: no coefficient is set in EXPR before it
+	 is used as the image of inner_dim.  */
+      ppl_new_Linear_Expression_with_dimension (&expr, dim - 1);
+      ppl_Polyhedron_affine_image (poly, inner_dim, expr, one);
+      ppl_delete_Linear_Expression (expr);
+
+      /* Remove inner loop and the static schedule of its body.  */
+      ds[0] = inner_dim;
+      ds[1] = inner_dim + 1;
+      ppl_Polyhedron_remove_space_dimensions (poly, ds, 2);
+      PBB_NB_SCATTERING_TRANSFORM (pbb) -= 2;
+    }
+
+  mpz_clear (x);
+  ppl_delete_Coefficient (one);
+}
+
+/* Flattens the loop nest LST.  Return true when something changed.
+   INIT_OFFSET is used to compute the number of iterations of the
+   outermost loop before the current LST is executed.  */
+
+static bool
+lst_flatten_loop (lst_p lst, mpz_t init_offset)
+{
+  int i;
+  lst_p l;
+  bool res = false;
+  mpz_t n, one, offset, stride;
+
+  mpz_init (n);
+  mpz_init (one);
+  mpz_init (offset);
+  mpz_init (stride);
+  mpz_set (offset, init_offset);
+  mpz_set_si (one, 1);
+
+  /* STRIDE is the linearized niter of LST divided by the niter of LST.  */
+  lst_linearized_niter (lst, stride);
+  lst_niter_for_loop (lst, n);
+  mpz_tdiv_q (stride, stride, n);
+
+  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), i, l)
+    if (LST_LOOP_P (l))
+      {
+	res = true;
+	/* Flatten L first: lst_project_loop expects no loops under L.  */
+	lst_flatten_loop (l, offset);
+	lst_niter_for_loop (l, n);
+	lst_project_loop (lst, l, stride);
+
+	/* Advance OFFSET by niter - 1: the next statements execute at the
+	   same iteration as the last iteration of the loop L.  */
+	mpz_sub (n, n, one);
+	mpz_add (offset, offset, n);
+      }
+    else
+      {
+	lst_scale (lst, l, stride);
+	if (mpz_cmp_si (offset, 0) != 0)
+	  lst_offset (l, offset);
+      }
+
+  /* NOTE(review): LST_SEQ (lst) is modified while being iterated below.  */
+  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), i, l)
+    if (LST_LOOP_P (l))
+      lst_remove_loop_and_inline_stmts_in_loop_father (l);
+
+  mpz_clear (n);
+  mpz_clear (one);
+  mpz_clear (offset);
+  mpz_clear (stride);
+  return res;
+}
+
+/* Remove all but the first 3 dimensions of the transformed scattering
+   of every statement in the sequence of the flattened loop LST:
+   - dim0: the static schedule for the loop
+   - dim1: the dynamic schedule of the loop
+   - dim2: the static schedule for the loop body.
+
+   LST must not contain loops anymore: every element of its sequence
+   is expected to be a statement.  PBB_NB_SCATTERING_TRANSFORM is
+   decreased by the number of dimensions removed.  */
+
+static void
+remove_unused_scattering_dimensions (lst_p lst)
+{
+  int i;
+  lst_p stmt;
+
+  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), i, stmt)
+    {
+      poly_bb_p pbb = LST_PBB (stmt);
+      ppl_Polyhedron_t poly = PBB_TRANSFORMED_SCATTERING (pbb);
+      int j, nb_dims_to_remove = PBB_NB_SCATTERING_TRANSFORM (pbb) - 3;
+      ppl_dimension_type *ds;
+
+      /* There should be no loops inside LST after flattening.  */
+      gcc_assert (!LST_LOOP_P (stmt));
+
+      /* A negative count would request a huge allocation below.  */
+      gcc_assert (nb_dims_to_remove >= 0);
+
+      /* Nothing to do when only the 3 dimensions to keep are left.  */
+      if (!nb_dims_to_remove)
+	continue;
+
+      /* DS enumerates the dimensions to drop: 3, 4, ...  */
+      ds = XNEWVEC (ppl_dimension_type, nb_dims_to_remove);
+      for (j = 0; j < nb_dims_to_remove; j++)
+	ds[j] = j + 3;
+
+      ppl_Polyhedron_remove_space_dimensions (poly, ds, nb_dims_to_remove);
+      PBB_NB_SCATTERING_TRANSFORM (pbb) -= nb_dims_to_remove;
+      free (ds);
+    }
+}
+
+/* Flattens each loop nest in the sequence of LST.  Returns true when
+   something changed.  Does nothing when LST is NULL or not a loop.  */
+
+static bool
+lst_do_flatten (lst_p lst)
+{
+  int i;
+  lst_p l;
+  bool res = false;
+  mpz_t zero;
+
+  if (!lst
+      || !LST_LOOP_P (lst))
+    return false;
+
+  mpz_init (zero);
+  mpz_set_si (zero, 0);
+
+  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), i, l)
+    if (LST_LOOP_P (l))
+      {
+	res |= lst_flatten_loop (l, zero);
+	remove_unused_scattering_dimensions (l);
+      }
+
+  lst_update_scattering (lst);
+  mpz_clear (zero);
+  return res;
+}
+
+/* Flatten all the loop nests in the transformed schedule of SCOP.
+   Returns true when something changed.  */
+
+bool
+flatten_all_loops (scop_p scop)
+{
+  return lst_do_flatten (SCOP_TRANSFORMED_SCHEDULE (scop));
+}
+
+#endif
diff --git a/gcc/graphite-poly.c b/gcc/graphite-poly.c
index 4214f9e..e09b570 100644
--- a/gcc/graphite-poly.c
+++ b/gcc/graphite-poly.c
@@ -783,6 +783,9 @@  apply_poly_transforms (scop_p scop)
 	transform_done |= scop_do_interchange (scop);
     }
 
+  if (flag_loop_flatten)
+    transform_done |= flatten_all_loops (scop);
+
   /* This feature is only enabled in the Graphite branch.  */
   if (0)
     {
@@ -1688,7 +1691,8 @@  pbb_number_of_iterations_at_time (poly_bb_p pbb,
       ppl_delete_Constraint_System (cs);
     }
 
-  /* Compute the lower bound on the original iteration domain.  */
+  /* Compute the lower bound on the original iteration domain and add
+     it to the scattering.  */
   ppl_new_Pointset_Powerset_C_Polyhedron_from_C_Polyhedron
     (&sctr_lb, PBB_TRANSFORMED_SCATTERING (pbb));
   for (i = 0; i < (int) domain_dim; i++)
diff --git a/gcc/graphite-poly.h b/gcc/graphite-poly.h
index b9bf1ed..8b950a4 100644
--- a/gcc/graphite-poly.h
+++ b/gcc/graphite-poly.h
@@ -414,6 +414,7 @@  extern void debug_iteration_domains (scop_p, int);
 extern bool scop_do_interchange (scop_p);
 extern bool scop_do_strip_mine (scop_p);
 extern bool scop_do_block (scop_p);
+extern bool flatten_all_loops (scop_p);
 extern void pbb_number_of_iterations_at_time (poly_bb_p, graphite_dim_t, mpz_t);
 extern void pbb_remove_duplicate_pdrs (poly_bb_p);
 
@@ -944,7 +945,7 @@  find_lst_loop (lst_p stmt, int loop_depth)
   return loop;
 }
 
-/* Return the first lst representing a PBB statement in LST.  */
+/* Return the first LST representing a PBB statement in LST.  */
 
 static inline lst_p
 lst_find_first_pbb (lst_p lst)
@@ -968,7 +969,7 @@  lst_find_first_pbb (lst_p lst)
   return NULL;
 }
 
-/* Returns true when LST is a loop that does not contains
+/* Returns true when LST is a loop that does not contain
    statements.  */
 
 static inline bool
@@ -977,7 +978,7 @@  lst_empty_p (lst_p lst)
   return !lst_find_first_pbb (lst);
 }
 
-/* Return the last lst representing a PBB statement in LST.  */
+/* Return the last LST representing a PBB statement in LST.  */
 
 static inline lst_p
 lst_find_last_pbb (lst_p lst)
@@ -1061,6 +1062,26 @@  lst_remove_from_sequence (lst_p lst)
   LST_LOOP_FATHER (lst) = NULL;
 }
 
+/* Removes the loop LST and inlines its body in the father loop.  */
+
+static inline void
+lst_remove_loop_and_inline_stmts_in_loop_father (lst_p lst)
+{
+  lst_p l, father = LST_LOOP_FATHER (lst);
+  int i, dewey = lst_dewey_number (lst);
+
+  gcc_assert (lst && father && dewey >= 0);
+
+  VEC_ordered_remove (lst_p, LST_SEQ (father), dewey);
+  LST_LOOP_FATHER (lst) = NULL;
+  /* Insert the elements of LST where LST was, keeping their order.  */
+  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), i, l)
+    {
+      VEC_safe_insert (lst_p, heap, LST_SEQ (father), dewey + i, l);
+      LST_LOOP_FATHER (l) = father;
+    }
+}
+
 /* Sets NITER to the upper bound approximation of the number of
    iterations of loop LST.  */
 
diff --git a/gcc/tree-ssa-loop.c b/gcc/tree-ssa-loop.c
index a62098a..7aa78f8 100644
--- a/gcc/tree-ssa-loop.c
+++ b/gcc/tree-ssa-loop.c
@@ -303,8 +303,12 @@  gate_graphite_transforms (void)
 {
   /* Enable -fgraphite pass if any one of the graphite optimization flags
      is turned on.  */
-  if (flag_loop_block || flag_loop_interchange || flag_loop_strip_mine
-      || flag_graphite_identity || flag_loop_parallelize_all)
+  if (flag_loop_block
+      || flag_loop_interchange
+      || flag_loop_strip_mine
+      || flag_graphite_identity
+      || flag_loop_parallelize_all
+      || flag_loop_flatten)
     flag_graphite = 1;
 
   return flag_graphite != 0;