Patchwork [google,4.6,i386] Support autocloning for corei7 with -mvarch= option to remove LCP stalls in loops (issue5865043)

login
register
mail settings
Submitter Sriraman Tallam
Date March 20, 2012, 9:04 p.m.
Message ID <20120320210457.B03C8B21D0@azwildcat.mtv.corp.google.com>
Download mbox | patch
Permalink /patch/147842/
State New
Headers show

Comments

Sriraman Tallam - March 20, 2012, 9:04 p.m.
This patch adds support to version for corei7 with -mvarch option. The versioning supported is in the case where a loop generates a LCP stalling instruction in corei7. In such cases, on corei7, limiting the unroll factor to try to keep the unrolled loop body small enough to fit in the Corei7's loop stream detector can hide LCP stalls in loops. With mvarch, the function containing the loop is multi-versioned and one version is tagged with "tune=corei7" so that the unroll factor can be limited on this version.

Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01230.html for discussion on mvarch option.
Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg00123.html for discussion  on LCP stalls in corei7.


The autocloning framework is only avaiable in google/gcc-4_6 branch. I am working on porting this to trunk.

	* config/i386/i386.c (find_himode_assigns): New function.
	(mversionable_for_core2_p): Add new param version_number.
	(mversionable_for_corei7_p): New function.
	(ix86_mversion_function): Check for corei7 versioning.
	* params.def (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING): Bump
	allowed limit to 5000.
	*  mversn-dispatch.c (do_auto_clone): Reverse fn_ver_addr_chain.


--
This patch is available for review at http://codereview.appspot.com/5865043
Teresa Johnson - March 20, 2012, 10:59 p.m.
On Tue, Mar 20, 2012 at 2:04 PM, Sriraman Tallam <tmsriram@google.com> wrote:
> This patch adds support to version for corei7 with -mvarch option. The versioning supported is in the case where a loop generates a LCP stalling instruction in corei7. In such cases, on corei7, limiting the unroll factor to try to keep the unrolled loop body small enough to fit in the Corei7's loop stream detector can hide LCP stalls in loops. With mvarch, the function containing the loop is multi-versioned and one version is tagged with "tune=corei7" so that the unroll factor can be limited on this version.
>
> Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01230.html for discussion on mvarch option.
> Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg00123.html for discussion  on LCP stalls in corei7.
>
>
> The autocloning framework is only avaiable in google/gcc-4_6 branch. I am working on porting this to trunk.
>
>        * config/i386/i386.c (find_himode_assigns): New function.
>        (mversionable_for_core2_p): Add new param version_number.
>        (mversionable_for_corei7_p): New function.
>        (ix86_mversion_function): Check for corei7 versioning.
>        * params.def (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING): Bump
>        allowed limit to 5000.
>        *  mversn-dispatch.c (do_auto_clone): Reverse fn_ver_addr_chain.
>
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 185514)
> +++ config/i386/i386.c  (working copy)
> @@ -26507,6 +26507,132 @@ any_loops_vectorizable_with_load_store (void)
>   return vectorizable_loop_found;
>  }
>
> +/* Returns true if this function finds a loop that contains a possible LCP
> +   stalling instruction on corei7.   This is used to multiversion functions
> +   for corei7.
> +
> +   This function looks for instructions that store a constant into
> +   HImode (16-bit) memory. These require a length-changing prefix and on
> +   corei7 are prone to LCP stalls. These stalls can be avoided if the loop
> +   is streamed from the loop stream detector.  */
> +
> +static bool
> +find_himode_assigns (void)
> +{
> +  gimple_stmt_iterator gsi;
> +  gimple stmt;
> +  enum gimple_code code;
> +  tree lhs/*, rhs*/;

Can rhs be removed?

> +  enum machine_mode mode;
> +  basic_block *body;
> +  unsigned i;
> +  loop_iterator li;
> +  struct loop *loop;
> +  bool found = false;
> +  location_t locus = 0;

locus is dead (assigned but not read).

> +  int stmt_count;
> +  unsigned HOST_WIDE_INT n_unroll, max_unroll;
> +
> +  if (!flag_unroll_loops)
> +    return false;
> +
> +  loop_optimizer_init (LOOPS_NORMAL
> +                       | LOOPS_HAVE_RECORDED_EXITS);
> +  if (number_of_loops () < 1)
> +    return false;
> +
> +  scev_initialize();
> +
> +  if (profile_status == PROFILE_READ)
> +    max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES_FEEDBACK);
> +  else
> +    max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);

It might be clearer to rename max_unroll to max_peel_times or
something like that to be clearer.

> +
> +  FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
> +    {
> +      tree niter;
> +
> +      /* Will not peel/unroll cold areas.  */
> +      if (optimize_loop_for_size_p (loop))
> +        continue;
> +
> +      /* Can the loop be manipulated?  */
> +      if (!can_duplicate_loop_p (loop))
> +        continue;
> +
> +      niter = number_of_latch_executions (loop);
> +      if (host_integerp (niter, 1))
> +       {
> +         n_unroll = tree_low_cst (niter, 1);
> +         if (n_unroll <= max_unroll)
> +           continue;
> +       }
> +
> +      body = get_loop_body (loop);
> +      found = false;
> +      stmt_count = 0;
> +
> +      for (i = 0; i < loop->num_nodes; i++)
> +       {
> +         for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
> +           {
> +             stmt = gsi_stmt (gsi);
> +             stmt_count++;
> +             if (found)
> +               continue;
> +             code = gimple_code (stmt);
> +             if (code != GIMPLE_ASSIGN)
> +               continue;
> +             lhs = gimple_assign_lhs (stmt);
> +             if (TREE_CODE (lhs) != MEM_REF &&
> +                 TREE_CODE (lhs) != COMPONENT_REF &&
> +                 TREE_CODE (lhs) != ARRAY_REF)
> +               continue;
> +             if (gimple_assign_rhs_code(stmt) != INTEGER_CST)
> +               continue;
> +             mode = TYPE_MODE (TREE_TYPE (lhs));
> +             if (mode == HImode)
> +               {
> +                 locus = gimple_location (stmt);
> +                 found = true;
> +               }
> +          }
> +       }
> +      /* Don't worry about large loops that won't be unrolled anyway. In fact,
> +       * don't worry about unrolling loops that are already over the size of the
> +       * LSD (28 insts). Since instruction counts may be a little off at this
> +       * point, due to downstream transformations, include loops a little bigger
> +       * than the LSD size.
> +       */
> +      if (found && stmt_count < 40)
> +       {
> +         n_unroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS)/stmt_count;
> +         /* Check for a simple peel candidate */
> +         if (!(loop->header->count
> +               && expected_loop_iterations (loop) < 2 * n_unroll))
> +           {
> +             location_t locus2;

locus2 is dead (assigned but not read) and can be removed.

> +             edge exit;
> +             if ((exit = single_exit(loop)) != NULL)
> +               locus2 = gimple_location(last_stmt(exit->src));
> +             else
> +               locus2 = gimple_location(first_stmt(loop->latch));
> +           }
> +         else
> +           found = false;
> +       }
> +      else
> +       found = false;
> +      free (body);
> +    }
> +
> +  scev_finalize();
> +  loop_optimizer_finalize();
> +  return found;
> +}
> +
>  /* This makes the function that chooses the version to execute based
>    on the condition.  This condition function will decide which version
>    of the function to execute.  It should look like this:
> @@ -26601,7 +26727,7 @@ create_mtune_target_opt_node (const char *arch_tun
>    the newly created version, that is tag "tune=core2" on the new version.  */
>
>  static bool
> -mversionable_for_core2_p (tree *optimization_node,
> +mversionable_for_core2_p (int version_number, tree *optimization_node,
>                          tree *cond_func_decl, basic_block *new_bb)
>  {
>   tree predicate_decl;
> @@ -26635,11 +26761,60 @@ static bool
>   *optimization_node = create_mtune_target_opt_node ("core2");
>
>   predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS_INTEL_CORE2];
> -  *new_bb = add_condition_to_bb (*cond_func_decl, 0, *new_bb,
> +  *new_bb = add_condition_to_bb (*cond_func_decl, version_number, *new_bb,
>                                 predicate_decl, false);
>   return true;
>  }
>
> +/* Should a version of this function be specially optimized for core2?
> +
> +   This function should have checks to see if there are any opportunities for
> +   core2 specific optimizations, otherwise do not create a clone.  The
> +   following opportunities are checked.

The above 2 references to core2 should be changed to corei7?

> +
> +   * Check if there are any loops that contain LCP stalling instruction on
> +     corei7.
> +
> +   This versioning is triggered only when multi-versioning for corei7 is
> +   requested using -mvarch=corei7.
> +
> +   Return false if no versioning is required.  Return true if a version must
> +   be created.  Generate the *OPTIMIZATION_NODE that must be used to optimize
> +   the newly created version, that is tag "tune=corei7" on the new version.  */
> +
> +static bool
> +mversionable_for_corei7_p (int version_number, tree *optimization_node,
> +                          tree *cond_func_decl, basic_block *new_bb)
> +{
> +  tree predicate_decl;
> +  bool is_mversion_target_corei7 = false;
> +  bool create_version = false;
> +
> +  if (ix86_varch_specified
> +      && (ix86_varch[PROCESSOR_COREI7_64]
> +         || ix86_varch[PROCESSOR_COREI7_32]))
> +    is_mversion_target_corei7 = true;
> +
> +  if (is_mversion_target_corei7 && find_himode_assigns ())
> +    create_version = true;
> +  /* else if XXX: Add more criteria to version for corei7.  */
> +
> +  if (!create_version)
> +    return false;
> +
> +  /* If the condition function's body has not been created, create it now.  */
> +  if (*cond_func_decl == NULL)
> +    *cond_func_decl = make_condition_function (new_bb);
> +
> +  *optimization_node = create_mtune_target_opt_node ("corei7");
> +
> +  predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS_INTEL_COREI7];
> +  *new_bb = add_condition_to_bb (*cond_func_decl, version_number, *new_bb,
> +                                predicate_decl, false);
> +  return true;
> +}
> +
> +
>  /* Should this function CURRENT_FUNCTION_DECL be multi-versioned, if so
>    the number of versions to be created (other than the original) is
>    returned.  The outcome of COND_FUNC_DECL will decide the version to be
> @@ -26654,19 +26829,33 @@ ix86_mversion_function (tree fndecl ATTRIBUTE_UNUS
>   basic_block new_bb;
>   tree optimization_node;
>   int num_versions_created = 0;
> -
> +
>   if (ix86_mv_arch_string == NULL)
>     return 0;
>
> -  if (mversionable_for_core2_p (&optimization_node, cond_func_decl, &new_bb))
> -    num_versions_created++;
> +  if (mversionable_for_core2_p (num_versions_created, &optimization_node,
> +                               cond_func_decl, &new_bb))
> +    {
> +      num_versions_created++;
> +      *optimization_node_chain = tree_cons (optimization_node,
> +                                           NULL_TREE,
> +                                           *optimization_node_chain);
> +    }
>
> +  if (mversionable_for_corei7_p (num_versions_created, &optimization_node,
> +                                cond_func_decl, &new_bb))
> +    {
> +      num_versions_created++;
> +      *optimization_node_chain = tree_cons (optimization_node,
> +                                           NULL_TREE,
> +                                           *optimization_node_chain);
> +    }
> +
> +  *optimization_node_chain = nreverse (*optimization_node_chain);

What is the nreverse here and below needed for?

Thanks,
Teresa

> +
>   if (!num_versions_created)
>     return 0;
>
> -  *optimization_node_chain = tree_cons (optimization_node,
> -                                       NULL_TREE, *optimization_node_chain);
> -
>   /* Return the default version as the last stmt in cond_func_decl.  */
>   if (*cond_func_decl != NULL)
>     new_bb = add_condition_to_bb (*cond_func_decl, num_versions_created,
> Index: params.def
> ===================================================================
> --- params.def  (revision 185514)
> +++ params.def  (working copy)
> @@ -1040,7 +1040,7 @@ DEFPARAM (PARAM_PMU_PROFILE_N_ADDRESS,
>  DEFPARAM (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING,
>          "autoclone-function-size-limit",
>          "Do not auto clone functions beyond this size.",
> -         450, 0, 100000)
> +         5000, 0, 100000)
>
>  /*
>  Local variables:
> Index: mversn-dispatch.c
> ===================================================================
> --- mversn-dispatch.c   (revision 185514)
> +++ mversn-dispatch.c   (working copy)
> @@ -2423,6 +2423,8 @@ do_auto_clone (void)
>       opt_node = TREE_CHAIN (opt_node);
>     }
>
> +  fn_ver_addr_chain = nreverse (fn_ver_addr_chain);
> +
>   /* The current function is replaced by an ifunc call to the right version.
>      Make another clone for the default.  */
>   default_decl = clone_function (current_function_decl, "autoclone.original");
> @@ -2432,6 +2434,7 @@ do_auto_clone (void)
>   default_ver = build_fold_addr_expr (default_decl);
>   cond_func_addr = build_fold_addr_expr (cond_func_decl);
>
> +
>   /* Get the gimple sequence to replace the current function's body with a
>      ifunc dispatch call to the right version.  */
>   gseq = dispatch_using_ifunc (num_versions, current_function_decl,
>
> --
> This patch is available for review at http://codereview.appspot.com/5865043

Patch

Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 185514)
+++ config/i386/i386.c	(working copy)
@@ -26507,6 +26507,132 @@  any_loops_vectorizable_with_load_store (void)
   return vectorizable_loop_found;
 }
 
+/* Returns true if this function finds a loop that contains a possible LCP
+   stalling instruction on corei7.   This is used to multiversion functions
+   for corei7.  
+
+   This function looks for instructions that store a constant into
+   HImode (16-bit) memory. These require a length-changing prefix and on
+   corei7 are prone to LCP stalls. These stalls can be avoided if the loop
+   is streamed from the loop stream detector.  */
+
+static bool
+find_himode_assigns (void)
+{
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+  enum gimple_code code;
+  tree lhs/*, rhs*/;
+  enum machine_mode mode;
+  basic_block *body;
+  unsigned i;
+  loop_iterator li;
+  struct loop *loop;
+  bool found = false;
+  location_t locus = 0;
+  int stmt_count;
+  unsigned HOST_WIDE_INT n_unroll, max_unroll;
+
+  if (!flag_unroll_loops)
+    return false;
+
+  loop_optimizer_init (LOOPS_NORMAL
+                       | LOOPS_HAVE_RECORDED_EXITS);
+  if (number_of_loops () < 1)
+    return false;
+
+  scev_initialize();
+
+  if (profile_status == PROFILE_READ)
+    max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES_FEEDBACK);
+  else
+    max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
+
+  FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
+    {
+      tree niter;
+
+      /* Will not peel/unroll cold areas.  */
+      if (optimize_loop_for_size_p (loop))
+        continue;
+
+      /* Can the loop be manipulated?  */
+      if (!can_duplicate_loop_p (loop))
+        continue;
+
+      niter = number_of_latch_executions (loop);
+      if (host_integerp (niter, 1))
+	{
+	  n_unroll = tree_low_cst (niter, 1);
+	  if (n_unroll <= max_unroll)
+	    continue;
+	}
+
+      body = get_loop_body (loop);
+      found = false;
+      stmt_count = 0;
+
+      for (i = 0; i < loop->num_nodes; i++)
+	{
+	  for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+	    {
+	      stmt = gsi_stmt (gsi);
+	      stmt_count++;
+	      if (found)
+		continue;
+	      code = gimple_code (stmt);
+	      if (code != GIMPLE_ASSIGN)
+		continue;
+	      lhs = gimple_assign_lhs (stmt);
+	      if (TREE_CODE (lhs) != MEM_REF &&
+	          TREE_CODE (lhs) != COMPONENT_REF &&
+	          TREE_CODE (lhs) != ARRAY_REF)
+		continue;
+	      if (gimple_assign_rhs_code(stmt) != INTEGER_CST)
+		continue;
+	      mode = TYPE_MODE (TREE_TYPE (lhs));
+	      if (mode == HImode)
+		{
+		  locus = gimple_location (stmt);
+		  found = true;
+		}
+	   }
+	}
+      /* Don't worry about large loops that won't be unrolled anyway. In fact,
+       * don't worry about unrolling loops that are already over the size of the
+       * LSD (28 insts). Since instruction counts may be a little off at this
+       * point, due to downstream transformations, include loops a little bigger
+       * than the LSD size.
+       */
+      if (found && stmt_count < 40)
+	{
+	  n_unroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS)/stmt_count;
+	  /* Check for a simple peel candidate */
+	  if (!(loop->header->count
+		&& expected_loop_iterations (loop) < 2 * n_unroll))
+	    {
+	      location_t locus2;
+	      edge exit;
+	      if ((exit = single_exit(loop)) != NULL)
+	        locus2 = gimple_location(last_stmt(exit->src));
+	      else
+	        locus2 = gimple_location(first_stmt(loop->latch));
+	    }
+	  else
+	    found = false;
+	}
+      else
+	found = false;
+      free (body);
+    }
+
+  scev_finalize();
+  loop_optimizer_finalize();
+  return found;
+}
+
 /* This makes the function that chooses the version to execute based
    on the condition.  This condition function will decide which version
    of the function to execute.  It should look like this:
@@ -26601,7 +26727,7 @@  create_mtune_target_opt_node (const char *arch_tun
    the newly created version, that is tag "tune=core2" on the new version.  */
 
 static bool
-mversionable_for_core2_p (tree *optimization_node,
+mversionable_for_core2_p (int version_number, tree *optimization_node,
 			  tree *cond_func_decl, basic_block *new_bb)
 {
   tree predicate_decl;
@@ -26635,11 +26761,60 @@  static bool
   *optimization_node = create_mtune_target_opt_node ("core2");
 
   predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS_INTEL_CORE2];
-  *new_bb = add_condition_to_bb (*cond_func_decl, 0, *new_bb,
+  *new_bb = add_condition_to_bb (*cond_func_decl, version_number, *new_bb,
 				 predicate_decl, false);
   return true;
 }
 
+/* Should a version of this function be specially optimized for core2?
+
+   This function should have checks to see if there are any opportunities for
+   core2 specific optimizations, otherwise do not create a clone.  The
+   following opportunities are checked.
+
+   * Check if there are any loops that contain LCP stalling instruction on
+     corei7.
+
+   This versioning is triggered only when multi-versioning for corei7 is
+   requested using -mvarch=corei7. 
+
+   Return false if no versioning is required.  Return true if a version must
+   be created.  Generate the *OPTIMIZATION_NODE that must be used to optimize
+   the newly created version, that is tag "tune=corei7" on the new version.  */
+
+static bool
+mversionable_for_corei7_p (int version_number, tree *optimization_node,
+			   tree *cond_func_decl, basic_block *new_bb)
+{
+  tree predicate_decl;
+  bool is_mversion_target_corei7 = false;
+  bool create_version = false;
+
+  if (ix86_varch_specified
+      && (ix86_varch[PROCESSOR_COREI7_64]
+	  || ix86_varch[PROCESSOR_COREI7_32]))
+    is_mversion_target_corei7 = true;
+
+  if (is_mversion_target_corei7 && find_himode_assigns ())
+    create_version = true;
+  /* else if XXX: Add more criteria to version for corei7.  */
+
+  if (!create_version)
+    return false;
+
+  /* If the condition function's body has not been created, create it now.  */
+  if (*cond_func_decl == NULL)
+    *cond_func_decl = make_condition_function (new_bb);
+
+  *optimization_node = create_mtune_target_opt_node ("corei7");
+
+  predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS_INTEL_COREI7];
+  *new_bb = add_condition_to_bb (*cond_func_decl, version_number, *new_bb,
+				 predicate_decl, false);
+  return true;
+}
+
+
 /* Should this function CURRENT_FUNCTION_DECL be multi-versioned, if so 
    the number of versions to be created (other than the original) is
    returned.  The outcome of COND_FUNC_DECL will decide the version to be
@@ -26654,19 +26829,33 @@  ix86_mversion_function (tree fndecl ATTRIBUTE_UNUS
   basic_block new_bb;
   tree optimization_node;
   int num_versions_created = 0;
-
+   
   if (ix86_mv_arch_string == NULL)
     return 0;
 
-  if (mversionable_for_core2_p (&optimization_node, cond_func_decl, &new_bb))
-    num_versions_created++;
+  if (mversionable_for_core2_p (num_versions_created, &optimization_node,
+				cond_func_decl, &new_bb))
+    {
+      num_versions_created++;
+      *optimization_node_chain = tree_cons (optimization_node,
+					    NULL_TREE,
+					    *optimization_node_chain);
+    }
 
+  if (mversionable_for_corei7_p (num_versions_created, &optimization_node,
+				 cond_func_decl, &new_bb))
+    {
+      num_versions_created++;
+      *optimization_node_chain = tree_cons (optimization_node,
+					    NULL_TREE,
+					    *optimization_node_chain);
+    }
+
+  *optimization_node_chain = nreverse (*optimization_node_chain);
+
   if (!num_versions_created)
     return 0;
 
-  *optimization_node_chain = tree_cons (optimization_node,
-					NULL_TREE, *optimization_node_chain);
-
   /* Return the default version as the last stmt in cond_func_decl.  */
   if (*cond_func_decl != NULL)
     new_bb = add_condition_to_bb (*cond_func_decl, num_versions_created,
Index: params.def
===================================================================
--- params.def	(revision 185514)
+++ params.def	(working copy)
@@ -1040,7 +1040,7 @@  DEFPARAM (PARAM_PMU_PROFILE_N_ADDRESS,
 DEFPARAM (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING,
 	  "autoclone-function-size-limit",
 	  "Do not auto clone functions beyond this size.",
-	  450, 0, 100000)
+	  5000, 0, 100000)
 
 /*
 Local variables:
Index: mversn-dispatch.c
===================================================================
--- mversn-dispatch.c	(revision 185514)
+++ mversn-dispatch.c	(working copy)
@@ -2423,6 +2423,8 @@  do_auto_clone (void)
       opt_node = TREE_CHAIN (opt_node);
     }
 
+  fn_ver_addr_chain = nreverse (fn_ver_addr_chain);
+
   /* The current function is replaced by an ifunc call to the right version.
      Make another clone for the default.  */
   default_decl = clone_function (current_function_decl, "autoclone.original");
@@ -2432,6 +2434,7 @@  do_auto_clone (void)
   default_ver = build_fold_addr_expr (default_decl);
   cond_func_addr = build_fold_addr_expr (cond_func_decl);
 
+
   /* Get the gimple sequence to replace the current function's body with a
      ifunc dispatch call to the right version.  */
   gseq = dispatch_using_ifunc (num_versions, current_function_decl,