diff mbox

[PR50527] Don't assume alignment of vla-related allocas.

Message ID 4E846F6F.7070002@mentor.com
State New
Headers show

Commit Message

Tom de Vries Sept. 29, 2011, 1:15 p.m. UTC
On 09/28/2011 11:53 AM, Richard Guenther wrote:
> On Wed, Sep 28, 2011 at 11:34 AM, Tom de Vries <Tom_deVries@mentor.com> wrote:
>> Richard,
>>
>> I got a patch for PR50527.
>>
>> The patch prevents the alignment of vla-related allocas to be set to
>> BIGGEST_ALIGNMENT in ccp. The alignment may turn out smaller after folding
>> the alloca.
>>
>> Bootstrapped and regtested on x86_64.
>>
>> OK for trunk?
> 
> Hmm.  As gfortran with -fstack-arrays uses VLAs it's probably bad that
> the vectorizer then will no longer see that the arrays are properly aligned.
> 
> I'm not sure what the best thing to do is here, other than trying to record
> the alignment requirement of the VLA somewhere.
> 
> Forcing the alignment of the alloca replacement decl to BIGGEST_ALIGNMENT
> has the issue that it will force stack-realignment which isn't free (and the
> point was to make the decl cheaper than the alloca).  But that might
> possibly be the better choice.
> 
> Any other thoughts?

How about the approach in this (untested) patch? Using the DECL_ALIGN of the vla
for the new array prevents stack realignment for folded vla-allocas, also for
large vlas.

This will not help in vectorizing large folded vla-allocas, but I think it's not
reasonable to expect BIGGEST_ALIGNMENT when writing a vla (although that has
been the case up until we started to fold). If you want to trigger vectorization
for a vla, you can still use the aligned attribute on the declaration.

Still, the unfolded vla-allocas will have BIGGEST_ALIGNMENT, also without using
an attribute on the decl. This patch exploits this by setting it at the end of
the 3rd pass_ccp, renamed to pass_ccp_last. This is not very effective in
propagation though, because although the ptr_info of the lhs is propagated via
copy_prop afterwards, it's not propagated anymore via ccp.

Another way to do this would be to set BIGGEST_ALIGNMENT at the end of ccp2 and
not fold during ccp3.

Thanks,
- Tom

Comments

Richard Biener Sept. 30, 2011, 1:29 p.m. UTC | #1
On Thu, Sep 29, 2011 at 3:15 PM, Tom de Vries <Tom_deVries@mentor.com> wrote:
> On 09/28/2011 11:53 AM, Richard Guenther wrote:
>> On Wed, Sep 28, 2011 at 11:34 AM, Tom de Vries <Tom_deVries@mentor.com> wrote:
>>> Richard,
>>>
>>> I got a patch for PR50527.
>>>
>>> The patch prevents the alignment of vla-related allocas to be set to
>>> BIGGEST_ALIGNMENT in ccp. The alignment may turn out smaller after folding
>>> the alloca.
>>>
>>> Bootstrapped and regtested on x86_64.
>>>
>>> OK for trunk?
>>
>> Hmm.  As gfortran with -fstack-arrays uses VLAs it's probably bad that
>> the vectorizer then will no longer see that the arrays are properly aligned.
>>
>> I'm not sure what the best thing to do is here, other than trying to record
>> the alignment requirement of the VLA somewhere.
>>
>> Forcing the alignment of the alloca replacement decl to BIGGEST_ALIGNMENT
>> has the issue that it will force stack-realignment which isn't free (and the
>> point was to make the decl cheaper than the alloca).  But that might
>> possibly be the better choice.
>>
>> Any other thoughts?
>
> How about the approach in this (untested) patch? Using the DECL_ALIGN of the vla
> for the new array prevents stack realignment for folded vla-allocas, also for
> large vlas.
>
> This will not help in vectorizing large folded vla-allocas, but I think it's not
> reasonable to expect BIGGEST_ALIGNMENT when writing a vla (although that has
> been the case up until we started to fold). If you want to trigger vectorization
> for a vla, you can still use the aligned attribute on the declaration.
>
> Still, the unfolded vla-allocas will have BIGGEST_ALIGNMENT, also without using
> an attribute on the decl. This patch exploits this by setting it at the end of
> the 3rd pass_ccp, renamed to pass_ccp_last. This is not very effective in
> propagation though, because although the ptr_info of the lhs is propagated via
> copy_prop afterwards, it's not propagated anymore via ccp.
>
> Another way to do this would be to set BIGGEST_ALIGNMENT at the end of ccp2 and
> not fold during ccp3.

Ugh, somehow I like this the least ;)

How about lowering VLAs to

  p = __builtin_alloca (...);
  p = __builtin_assume_aligned (p, DECL_ALIGN (vla));

and not assume anything for alloca itself if it feeds a
__builtin_assume_aligned?

Or rather introduce a __builtin_alloca_with_align () and for VLAs do

 p = __builtin_alloca_with_align (..., DECL_ALIGN (vla));

that's less awkward to use?

Sorry for not having a clear plan here ;)

Richard.

> Thanks,
> - Tom
>
>
diff mbox

Patch

Index: gcc/tree-pass.h
===================================================================
--- gcc/tree-pass.h (revision 179043)
+++ gcc/tree-pass.h (working copy)
@@ -389,6 +389,7 @@  extern struct gimple_opt_pass pass_iv_op
 extern struct gimple_opt_pass pass_tree_loop_done;
 extern struct gimple_opt_pass pass_ch;
 extern struct gimple_opt_pass pass_ccp;
+extern struct gimple_opt_pass pass_ccp_last;
 extern struct gimple_opt_pass pass_phi_only_cprop;
 extern struct gimple_opt_pass pass_build_ssa;
 extern struct gimple_opt_pass pass_build_alias;
Index: gcc/tree-ssa-ccp.c
===================================================================
--- gcc/tree-ssa-ccp.c (revision 179210)
+++ gcc/tree-ssa-ccp.c (working copy)
@@ -167,8 +167,12 @@  typedef struct prop_value_d prop_value_t
    doing the store).  */
 static prop_value_t *const_val;
 
+/* Indicates whether we're in pass_ccp_last.  */
+static bool ccp_last;
+
 static void canonicalize_float_value (prop_value_t *);
 static bool ccp_fold_stmt (gimple_stmt_iterator *);
+static bool fold_builtin_alloca_for_var_p (gimple);
 
 /* Dump constant propagation value VAL to file OUTF prefixed by PREFIX.  */
 
@@ -813,6 +817,15 @@  ccp_finalize (void)
 	  || !POINTER_TYPE_P (TREE_TYPE (name)))
 	continue;
 
+      if (ccp_last && !SSA_NAME_IS_DEFAULT_DEF (name)
+	  && gimple_call_alloca_for_var_p (SSA_NAME_DEF_STMT (name))
+	  && !fold_builtin_alloca_for_var_p (SSA_NAME_DEF_STMT (name)))
+	{
+	  pi = get_ptr_info (name);
+	  pi->align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+	  continue;
+	}
+
       val = get_value (name);
       if (val->lattice_val != CONSTANT
 	  || TREE_CODE (val->value) != INTEGER_CST)
@@ -1492,6 +1505,7 @@  evaluate_stmt (gimple stmt)
   tree simplified = NULL_TREE;
   ccp_lattice_t likelyvalue = likely_value (stmt);
   bool is_constant = false;
+  unsigned int align;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
@@ -1632,10 +1646,13 @@  evaluate_stmt (gimple stmt)
 	      break;
 
 	    case BUILT_IN_ALLOCA:
+	      align = (gimple_call_alloca_for_var_p (stmt)
+		       ? TYPE_ALIGN (TREE_TYPE (gimple_get_lhs (stmt)))
+		       : BIGGEST_ALIGNMENT);
 	      val.lattice_val = CONSTANT;
 	      val.value = build_int_cst (TREE_TYPE (gimple_get_lhs (stmt)), 0);
 	      val.mask = shwi_to_double_int
-		  	   (~(((HOST_WIDE_INT) BIGGEST_ALIGNMENT)
+		  	   (~(((HOST_WIDE_INT) align)
 			      / BITS_PER_UNIT - 1));
 	      break;
 
@@ -1685,27 +1702,25 @@  evaluate_stmt (gimple stmt)
   return val;
 }
 
-/* Detects a vla-related alloca with a constant argument.  Declares fixed-size
-   array and return the address, if found, otherwise returns NULL_TREE.  */
-
-static tree
-fold_builtin_alloca_for_var (gimple stmt)
+static bool
+fold_builtin_alloca_for_var_p (gimple stmt)
 {
-  unsigned HOST_WIDE_INT size, threshold, n_elem;
-  tree lhs, arg, block, var, elem_type, array_type;
-  unsigned int align;
+  unsigned HOST_WIDE_INT size, threshold;
+  tree lhs, arg, block;
+
+  gcc_checking_assert (gimple_call_alloca_for_var_p (stmt));
 
   /* Get lhs.  */
   lhs = gimple_call_lhs (stmt);
   if (lhs == NULL_TREE)
-    return NULL_TREE;
+    return false;
 
   /* Detect constant argument.  */
   arg = get_constant_value (gimple_call_arg (stmt, 0));
   if (arg == NULL_TREE
       || TREE_CODE (arg) != INTEGER_CST
       || !host_integerp (arg, 1))
-    return NULL_TREE;
+    return false;
 
   size = TREE_INT_CST_LOW (arg);
 
@@ -1718,17 +1733,33 @@  fold_builtin_alloca_for_var (gimple stmt
         && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
     threshold /= 10;
   if (size > threshold)
-    return NULL_TREE;
+    return false;
+
+  return true;
+}
+
+/* Detects a vla-related alloca with a constant argument.  Declares fixed-size
+   array and return the address, if found, otherwise returns NULL_TREE.  */
+
+static tree
+fold_builtin_alloca_for_var (gimple stmt)
+{
+  unsigned HOST_WIDE_INT size, n_elem;
+  tree lhs, arg, var, elem_type, array_type;
+
+  if (!fold_builtin_alloca_for_var_p (stmt))
+    return NULL;
+
+  lhs = gimple_call_lhs (stmt);
+  arg = get_constant_value (gimple_call_arg (stmt, 0));
+  size = TREE_INT_CST_LOW (arg);
 
   /* Declare array.  */
   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
   n_elem = size * 8 / BITS_PER_UNIT;
-  align = MIN (size * 8, BIGGEST_ALIGNMENT);
-  if (align < BITS_PER_UNIT)
-    align = BITS_PER_UNIT;
   array_type = build_array_type_nelts (elem_type, n_elem);
   var = create_tmp_var (array_type, NULL);
-  DECL_ALIGN (var) = align;
+  DECL_ALIGN (var) = TYPE_ALIGN (TREE_TYPE (TREE_TYPE (SSA_NAME_VAR (lhs))));
   {
     struct ptr_info_def *pi = SSA_NAME_PTR_INFO (lhs);
     if (pi != NULL && !pi->pt.anything)
@@ -2030,6 +2061,19 @@  do_ssa_ccp (void)
     return 0;
 }
 
+/* Last invocation of SSA Conditional Constant Propagation.
+   At the end of the last invocation, we know which vla-allocas will remain
+   allocas, and we can assume BIGGEST_ALIGNMENT for those.  */
+
+static unsigned int
+do_ssa_ccp_last (void)
+{
+  unsigned int res;
+  ccp_last = true;
+  res = do_ssa_ccp ();
+  ccp_last = false;
+  return res;
+}
 
 static bool
 gate_ccp (void)
@@ -2058,6 +2102,25 @@  struct gimple_opt_pass pass_ccp =
  }
 };
 
+struct gimple_opt_pass pass_ccp_last =
+{
+ {
+  GIMPLE_PASS,
+  "ccp3",				/* name */
+  gate_ccp,				/* gate */
+  do_ssa_ccp_last,			/* execute */
+  NULL,					/* sub */
+  NULL,					/* next */
+  0,					/* static_pass_number */
+  TV_TREE_CCP,				/* tv_id */
+  PROP_cfg | PROP_ssa,			/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_verify_ssa
+  | TODO_verify_stmts | TODO_ggc_collect/* todo_flags_finish */
+ }
+};
 
 
 /* Try to optimize out __builtin_stack_restore.  Optimize it out
Index: gcc/gimplify.c
===================================================================
--- gcc/gimplify.c (revision 179043)
+++ gcc/gimplify.c (working copy)
@@ -1321,7 +1321,8 @@  gimplify_vla_decl (tree decl, gimple_seq
      things: First, it lets the rest of the gimplifier know what
      replacement to use.  Second, it lets the debug info know
      where to find the value.  */
-  ptr_type = build_pointer_type (TREE_TYPE (decl));
+  ptr_type = build_pointer_type (build_aligned_type (TREE_TYPE (decl),
+						     DECL_ALIGN (decl)));
   addr = create_tmp_var (ptr_type, get_name (decl));
   DECL_IGNORED_P (addr) = 0;
   t = build_fold_indirect_ref (addr);
Index: gcc/passes.c
===================================================================
--- gcc/passes.c (revision 179043)
+++ gcc/passes.c (working copy)
@@ -1321,7 +1321,7 @@  init_optimization_passes (void)
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_phiopt);
       NEXT_PASS (pass_object_sizes);
-      NEXT_PASS (pass_ccp);
+      NEXT_PASS (pass_ccp_last);
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_cse_sincos);
       NEXT_PASS (pass_optimize_bswap);