Use get_nonzero_bits to improve vectorization

Message ID 20131025173844.GK30970@tucnak.zalov.cz
State New

Commit Message

Jakub Jelinek Oct. 25, 2013, 5:38 p.m. UTC
Hi!

The following patch makes use of the computed nonzero_bits preserved
in the SSA_NAME_RANGE_INFO.
I chose to write a new routine instead of improving the current
highest_pow2_factor, because that routine didn't care about overflows,
and by working with trailing zero counts instead of powers of two in UHWI
we can handle even larger constants; highest_pow2_factor could very well
overflow to zero.  So, the patch introduces a new tree_ctz routine and
reimplements highest_pow2_factor on top of it, plus uses tree_ctz also in
get_object_alignment_2 and in the vectorizer to determine whether it can
avoid the scalar loop for the bound (and indirectly also in the analysis
of whether peeling for alignment is needed).
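
To illustrate the overflow (the numbers are made up for this example, not
taken from the patch): if each operand of a MULT_EXPR is known to be a
multiple of 1 << 40, the old code computes c0 * c1 = (1 << 40) * (1 << 40)
in unsigned HOST_WIDE_INT, which wraps around to zero on a 64-bit host,
while the ctz based code computes MIN (40 + 40, prec) and so saturates at
the type precision instead.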

With this patch, e.g.
int a[1024];

void
foo (int x, int y)
{
  int i;
  x &= -32;
  y &= -32;
  for (i = x + 32; i < y; i++)
    a[i]++;
}
can be vectorized without any peeling for alignment or a scalar epilogue
loop afterwards.
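
Roughly the reasoning, sketched with an assumed vectorization factor of 4
for int: after the two masks get_nonzero_bits records that the low 5 bits
of x and y are zero, so tree_ctz (x) >= 5 and tree_ctz (y) >= 5; the
PLUS_EXPR case gives tree_ctz (x + 32) = MIN (5, ctz (32)) = 5 and the
MINUS_EXPR case gives 5 for a niter expression equivalent to y - (x + 32).
Since 5 >= exact_log2 (4), the number of iterations is known to be a
multiple of the vectorization factor, and i starting at a multiple of 32
keeps the accesses to the (suitably aligned) global array aligned.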

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2013-10-25  Jakub Jelinek  <jakub@redhat.com>

	* tree.c (tree_ctz): New function.
	* tree.h (tree_ctz): New prototype.
	* tree-ssanames.h (get_range_info, get_nonzero_bits): Change
	first argument from tree to const_tree.
	* tree-ssanames.c (get_range_info, get_nonzero_bits): Likewise.
	* tree-vectorizer.h (vect_generate_tmps_on_preheader): New prototype.
	* tree-vect-loop-manip.c (vect_generate_tmps_on_preheader): No longer
	static.
	* expr.c (highest_pow2_factor): Reimplemented using tree_ctz.
	* tree-vect-loop.c (vect_analyze_loop_operations,
	vect_transform_loop): Don't force scalar loop for bound just because
	number of iterations is unknown, only do it if it is not known to be
	a multiple of vectorization_factor.
	* builtins.c (get_object_alignment_2): Use tree_ctz on offset.



	Jakub

Comments

Richard Biener Oct. 29, 2013, 12:11 p.m. UTC | #1
On Fri, 25 Oct 2013, Jakub Jelinek wrote:

> Hi!
> 
> The following patch makes use of the computed nonzero_bits preserved
> in the SSA_NAME_RANGE_INFO.
> I chose to write a new routine instead of improving current
> highest_pow2_factor, because that routine didn't care about overflows etc.
> and by working on ctz numbers instead of powers of two in UHWI we can handle
> even larger constants etc.  highest_pow2_factor could very well overflow to
> zero etc.  So, the patch introduces a new tree_ctz routine and reimplements
> highest_pow2_factor on top of that, plus uses tree_ctz also in
> get_object_alignment_2 and in the vectorizer to determine if it can avoid
> scalar loop for bound (and indirectly also in the analysis whether peeling
> for alignment is needed).
> 
> With this patch, e.g.
> int a[1024];
> 
> void
> foo (int x, int y)
> {
>   int i;
>   x &= -32;
>   y &= -32;
>   for (i = x + 32; i < y; i++)
>     a[i]++;
> }
> can be vectorized without any peeling for alignment or scalar loop
> afterwards.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> 2013-10-25  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* tree.c (tree_ctz): New function.
> 	* tree.h (tree_ctz): New prototype.
> 	* tree-ssanames.h (get_range_info, get_nonzero_bits): Change
> 	first argument from tree to const_tree.
> 	* tree-ssanames.c (get_range_info, get_nonzero_bits): Likewise.
> 	* tree-vectorizer.h (vect_generate_tmps_on_preheader): New prototype.
> 	* tree-vect-loop-manip.c (vect_generate_tmps_on_preheader): No longer
> 	static.
> 	* expr.c (highest_pow2_factor): Reimplemented using tree_ctz.
> 	* tree-vect-loop.c (vect_analyze_loop_operations,
> 	vect_transform_loop): Don't force scalar loop for bound just because
> 	number of iterations is unknown, only do it if it is not known to be
> 	a multiple of vectorization_factor.
> 	* builtins.c (get_object_alignment_2): Use tree_ctz on offset.
> 
> --- gcc/tree.c.jj	2013-10-23 14:43:12.000000000 +0200
> +++ gcc/tree.c	2013-10-25 15:00:55.296178794 +0200
> @@ -2213,6 +2213,110 @@ tree_floor_log2 (const_tree expr)
>  	  : floor_log2 (low));
>  }
>  
> +/* Return number of known trailing zero bits in EXPR, or, if the value of
> +   EXPR is known to be zero, the precision of its type.  */
> +
> +int

unsigned int?

> +tree_ctz (const_tree expr)
> +{
> +  if (!INTEGRAL_TYPE_P (TREE_TYPE (expr))
> +      && !POINTER_TYPE_P (TREE_TYPE (expr)))
> +    return 0;
> +
> +  int ret1, ret2, prec = TYPE_PRECISION (TREE_TYPE (expr));
> +  switch (TREE_CODE (expr))
> +    {
> +    case INTEGER_CST:
> +      ret1 = tree_to_double_int (expr).trailing_zeros ();
> +      return MIN (ret1, prec);
> +    case SSA_NAME:
> +      ret1 = get_nonzero_bits (expr).trailing_zeros ();
> +      return MIN (ret1, prec);
> +    case PLUS_EXPR:
> +    case MINUS_EXPR:
> +    case BIT_IOR_EXPR:
> +    case BIT_XOR_EXPR:
> +    case MIN_EXPR:
> +    case MAX_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));

This first recurses but if either one returns 0 you don't have
to (that cuts down the recursion from exponential to linear in
the common case?).  Thus, early out on ret == 0?

> +      return MIN (ret1, ret2);
> +    case POINTER_PLUS_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> +      ret2 = MIN (ret2, prec);

Why do you need that here but not elsewhere when processing
binary ops?

> +      return MIN (ret1, ret2);
> +    case BIT_AND_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> +      return MAX (ret1, ret2);
> +    case MULT_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> +      return MIN (ret1 + ret2, prec);
> +    case LSHIFT_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      if (host_integerp (TREE_OPERAND (expr, 1), 1)

check that first before recursing for op0 - if op1 is negative
you simply return ret1 which looks wrong, too.

> +	  && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1)
> +	      < (unsigned HOST_WIDE_INT) prec))

This check is to avoid overflowing ret1 + ret2?  If so, why not add

> +	{
> +	  ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1);

  ret2 = MIN (ret2, prec);

instead?

> +	  return MIN (ret1 + ret2, prec);
> +	}
> +      return ret1;
> +    case RSHIFT_EXPR:
> +      if (host_integerp (TREE_OPERAND (expr, 1), 1)
> +	  && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1)
> +	      < (unsigned HOST_WIDE_INT) prec))
> +	{
> +	  ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +	  ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1);
> +	  if (ret1 > ret2)
> +	    return ret1 - ret2;
> +	}
> +      return 0;

This case seems to be slightly better structured.  Looks like you assume
only positive shift amounts exist in the LSHIFT_EXPR case; I'm not sure
that's a valid assumption (see the constant folding code dealing with
that).

> +    case TRUNC_DIV_EXPR:
> +    case CEIL_DIV_EXPR:
> +    case FLOOR_DIV_EXPR:
> +    case ROUND_DIV_EXPR:
> +    case EXACT_DIV_EXPR:
> +      if (TREE_CODE (TREE_OPERAND (expr, 1)) == INTEGER_CST)
> +	{
> +	  ret2 = tree_log2 (TREE_OPERAND (expr, 1));
> +	  if (ret2 >= 0 && tree_int_cst_sgn (TREE_OPERAND (expr, 1)) == 1)

cheaper to test the sign first.

> +	    {
> +	      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +	      if (ret1 > ret2)
> +		return ret1 - ret2;
> +	    }
> +	}
> +      return 0;
> +    CASE_CONVERT:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      if (ret1 && ret1 == TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (expr, 0))))
> +	ret1 = prec;
> +      return MIN (ret1, prec);
> +    case SAVE_EXPR:
> +      return tree_ctz (TREE_OPERAND (expr, 0));
> +    case COND_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 1));
> +      ret2 = tree_ctz (TREE_OPERAND (expr, 2));
> +      return MIN (ret1, ret2);
> +    case COMPOUND_EXPR:
> +      return tree_ctz (TREE_OPERAND (expr, 1));
> +    case ADDR_EXPR:
> +      ret1 = get_object_alignment (TREE_OPERAND (expr, 0));

Use get_pointer_alignment, this isn't a memory reference so type
alignment rules don't apply.

The rest looks ok to me.

Thanks,
Richard.

> +      if (ret1 > BITS_PER_UNIT)
> +	{
> +	  ret1 = ctz_hwi (ret1 / BITS_PER_UNIT);
> +	  return MIN (ret1, prec);
> +	}
> +      return 0;
> +    default:
> +      return 0;
> +    }
> +}
> +
>  /* Return 1 if EXPR is the real constant zero.  Trailing zeroes matter for
>     decimal float constants, so don't return 1 for them.  */
>  
> --- gcc/tree.h.jj	2013-10-17 22:30:45.000000000 +0200
> +++ gcc/tree.h	2013-10-25 12:20:05.473673186 +0200
> @@ -4546,6 +4546,7 @@ extern void get_type_static_bounds (cons
>  extern bool variably_modified_type_p (tree, tree);
>  extern int tree_log2 (const_tree);
>  extern int tree_floor_log2 (const_tree);
> +extern int tree_ctz (const_tree);
>  extern int simple_cst_equal (const_tree, const_tree);
>  extern hashval_t iterative_hash_expr (const_tree, hashval_t);
>  extern hashval_t iterative_hash_exprs_commutative (const_tree,
> --- gcc/tree-ssanames.h.jj	2013-10-24 15:52:53.000000000 +0200
> +++ gcc/tree-ssanames.h	2013-10-25 14:09:21.227015919 +0200
> @@ -72,9 +72,10 @@ enum value_range_type { VR_UNDEFINED, VR
>  /* Sets the value range to SSA.  */
>  extern void set_range_info (tree, double_int, double_int);
>  /* Gets the value range from SSA.  */
> -extern enum value_range_type get_range_info (tree, double_int *, double_int *);
> +extern enum value_range_type get_range_info (const_tree, double_int *,
> +					     double_int *);
>  extern void set_nonzero_bits (tree, double_int);
> -extern double_int get_nonzero_bits (tree);
> +extern double_int get_nonzero_bits (const_tree);
>  extern void init_ssanames (struct function *, int);
>  extern void fini_ssanames (void);
>  extern void ssanames_print_statistics (void);
> --- gcc/tree-ssanames.c.jj	2013-10-24 17:32:22.000000000 +0200
> +++ gcc/tree-ssanames.c	2013-10-25 14:08:46.218187581 +0200
> @@ -221,7 +221,7 @@ set_range_info (tree name, double_int mi
>     is used to determine if MIN and MAX are valid values.  */
>  
>  enum value_range_type
> -get_range_info (tree name, double_int *min, double_int *max)
> +get_range_info (const_tree name, double_int *min, double_int *max)
>  {
>    enum value_range_type range_type;
>    gcc_assert (!POINTER_TYPE_P (TREE_TYPE (name)));
> @@ -271,7 +271,7 @@ set_nonzero_bits (tree name, double_int
>     NAME, or double_int_minus_one if unknown.  */
>  
>  double_int
> -get_nonzero_bits (tree name)
> +get_nonzero_bits (const_tree name)
>  {
>    if (POINTER_TYPE_P (TREE_TYPE (name)))
>      {
> --- gcc/tree-vectorizer.h.jj	2013-10-24 10:19:20.000000000 +0200
> +++ gcc/tree-vectorizer.h	2013-10-25 14:02:58.198999063 +0200
> @@ -901,6 +901,8 @@ extern void slpeel_make_loop_iterate_nti
>  extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
>  struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge);
>  extern void vect_loop_versioning (loop_vec_info, unsigned int, bool);
> +extern void vect_generate_tmps_on_preheader (loop_vec_info, tree *, tree *,
> +					     tree *, gimple_seq);
>  extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *,
>  					    unsigned int, bool);
>  extern void vect_do_peeling_for_alignment (loop_vec_info, unsigned int, bool);
> --- gcc/tree-vect-loop-manip.c.jj	2013-10-24 10:19:22.000000000 +0200
> +++ gcc/tree-vect-loop-manip.c	2013-10-25 14:02:00.544284058 +0200
> @@ -1437,7 +1437,7 @@ vect_build_loop_niters (loop_vec_info lo
>   and places them at the loop preheader edge or in COND_EXPR_STMT_LIST
>   if that is non-NULL.  */
>  
> -static void
> +void
>  vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
>  				 tree *ni_name_ptr,
>  				 tree *ratio_mult_vf_name_ptr,
> --- gcc/expr.c.jj	2013-10-23 14:43:15.000000000 +0200
> +++ gcc/expr.c	2013-10-25 15:05:23.893781676 +0200
> @@ -7282,74 +7282,14 @@ safe_from_p (const_rtx x, tree exp, int
>  unsigned HOST_WIDE_INT
>  highest_pow2_factor (const_tree exp)
>  {
> -  unsigned HOST_WIDE_INT c0, c1;
> -
> -  switch (TREE_CODE (exp))
> -    {
> -    case INTEGER_CST:
> -      /* We can find the lowest bit that's a one.  If the low
> -	 HOST_BITS_PER_WIDE_INT bits are zero, return BIGGEST_ALIGNMENT.
> -	 We need to handle this case since we can find it in a COND_EXPR,
> -	 a MIN_EXPR, or a MAX_EXPR.  If the constant overflows, we have an
> -	 erroneous program, so return BIGGEST_ALIGNMENT to avoid any
> -	 later ICE.  */
> -      if (TREE_OVERFLOW (exp))
> -	return BIGGEST_ALIGNMENT;
> -      else
> -	{
> -	  /* Note: tree_low_cst is intentionally not used here,
> -	     we don't care about the upper bits.  */
> -	  c0 = TREE_INT_CST_LOW (exp);
> -	  c0 &= -c0;
> -	  return c0 ? c0 : BIGGEST_ALIGNMENT;
> -	}
> -      break;
> -
> -    case PLUS_EXPR:  case MINUS_EXPR:  case MIN_EXPR:  case MAX_EXPR:
> -      c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
> -      c1 = highest_pow2_factor (TREE_OPERAND (exp, 1));
> -      return MIN (c0, c1);
> -
> -    case MULT_EXPR:
> -      c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
> -      c1 = highest_pow2_factor (TREE_OPERAND (exp, 1));
> -      return c0 * c1;
> -
> -    case ROUND_DIV_EXPR:  case TRUNC_DIV_EXPR:  case FLOOR_DIV_EXPR:
> -    case CEIL_DIV_EXPR:
> -      if (integer_pow2p (TREE_OPERAND (exp, 1))
> -	  && host_integerp (TREE_OPERAND (exp, 1), 1))
> -	{
> -	  c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
> -	  c1 = tree_low_cst (TREE_OPERAND (exp, 1), 1);
> -	  return MAX (1, c0 / c1);
> -	}
> -      break;
> -
> -    case BIT_AND_EXPR:
> -      /* The highest power of two of a bit-and expression is the maximum of
> -	 that of its operands.  We typically get here for a complex LHS and
> -	 a constant negative power of two on the RHS to force an explicit
> -	 alignment, so don't bother looking at the LHS.  */
> -      return highest_pow2_factor (TREE_OPERAND (exp, 1));
> -
> -    CASE_CONVERT:
> -    case SAVE_EXPR:
> -      return highest_pow2_factor (TREE_OPERAND (exp, 0));
> -
> -    case COMPOUND_EXPR:
> -      return highest_pow2_factor (TREE_OPERAND (exp, 1));
> -
> -    case COND_EXPR:
> -      c0 = highest_pow2_factor (TREE_OPERAND (exp, 1));
> -      c1 = highest_pow2_factor (TREE_OPERAND (exp, 2));
> -      return MIN (c0, c1);
> -
> -    default:
> -      break;
> -    }
> -
> -  return 1;
> +  unsigned HOST_WIDE_INT ret;
> +  int trailing_zeros = tree_ctz (exp);
> +  if (trailing_zeros >= HOST_BITS_PER_WIDE_INT)
> +    return BIGGEST_ALIGNMENT;
> +  ret = (unsigned HOST_WIDE_INT) 1 << trailing_zeros;
> +  if (ret > BIGGEST_ALIGNMENT)
> +    return BIGGEST_ALIGNMENT;
> +  return ret;
>  }
>  
>  /* Similar, except that the alignment requirements of TARGET are
> --- gcc/tree-vect-loop.c.jj	2013-10-24 10:19:23.000000000 +0200
> +++ gcc/tree-vect-loop.c	2013-10-25 14:01:35.968407222 +0200
> @@ -1586,9 +1586,9 @@ vect_analyze_loop_operations (loop_vec_i
>        return false;
>      }
>  
> -  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> -      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
> -      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
> +  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)
> +      || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
> +	  < exact_log2 (vectorization_factor)))
>      {
>        if (dump_enabled_p ())
>          dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.\n");
> @@ -5656,15 +5656,20 @@ vect_transform_loop (loop_vec_info loop_
>       will remain scalar and will compute the remaining (n%VF) iterations.
>       (VF is the vectorization factor).  */
>  
> -  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> -       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> -	   && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
> -       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
> +  if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
> +      < exact_log2 (vectorization_factor)
> +      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
>      vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
>  				    th, check_profitability);
> -  else
> +  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
>      ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
>  		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
> +  else
> +    {
> +      tree ni_name, ratio_mult_vf;
> +      vect_generate_tmps_on_preheader (loop_vinfo, &ni_name, &ratio_mult_vf,
> +				       &ratio, NULL);
> +    }
>  
>    /* 1) Make sure the loop header has exactly two entries
>       2) Make sure we have a preheader basic block.  */
> --- gcc/builtins.c.jj	2013-10-23 14:43:12.000000000 +0200
> +++ gcc/builtins.c	2013-10-25 12:31:49.426022284 +0200
> @@ -309,7 +309,7 @@ get_object_alignment_2 (tree exp, unsign
>    tree offset;
>    enum machine_mode mode;
>    int unsignedp, volatilep;
> -  unsigned int inner, align = BITS_PER_UNIT;
> +  unsigned int align = BITS_PER_UNIT;
>    bool known_alignment = false;
>  
>    /* Get the innermost object and the constant (bitpos) and possibly
> @@ -418,50 +418,16 @@ get_object_alignment_2 (tree exp, unsign
>  
>    /* If there is a non-constant offset part extract the maximum
>       alignment that can prevail.  */
> -  inner = ~0U;
> -  while (offset)
> +  if (offset)
>      {
> -      tree next_offset;
> -
> -      if (TREE_CODE (offset) == PLUS_EXPR)
> -	{
> -	  next_offset = TREE_OPERAND (offset, 0);
> -	  offset = TREE_OPERAND (offset, 1);
> -	}
> -      else
> -	next_offset = NULL;
> -      if (host_integerp (offset, 1))
> -	{
> -	  /* Any overflow in calculating offset_bits won't change
> -	     the alignment.  */
> -	  unsigned offset_bits
> -	    = ((unsigned) tree_low_cst (offset, 1) * BITS_PER_UNIT);
> -
> -	  if (offset_bits)
> -	    inner = MIN (inner, (offset_bits & -offset_bits));
> -	}
> -      else if (TREE_CODE (offset) == MULT_EXPR
> -	       && host_integerp (TREE_OPERAND (offset, 1), 1))
> -	{
> -	  /* Any overflow in calculating offset_factor won't change
> -	     the alignment.  */
> -	  unsigned offset_factor
> -	    = ((unsigned) tree_low_cst (TREE_OPERAND (offset, 1), 1)
> -	       * BITS_PER_UNIT);
> -
> -	  if (offset_factor)
> -	    inner = MIN (inner, (offset_factor & -offset_factor));
> -	}
> -      else
> +      int trailing_zeros = tree_ctz (offset);
> +      if (trailing_zeros < HOST_BITS_PER_INT)
>  	{
> -	  inner = MIN (inner, BITS_PER_UNIT);
> -	  break;
> +	  unsigned int inner = (1U << trailing_zeros) * BITS_PER_UNIT;
> +	  if (inner)
> +	    align = MIN (align, inner);
>  	}
> -      offset = next_offset;
>      }
> -  /* Alignment is innermost object alignment adjusted by the constant
> -     and non-constant offset parts.  */
> -  align = MIN (align, inner);
>  
>    *alignp = align;
>    *bitposp = bitpos & (*alignp - 1);
> 
> 
> 	Jakub
> 
>
Jakub Jelinek Oct. 29, 2013, 3:59 p.m. UTC | #2
On Tue, Oct 29, 2013 at 01:11:53PM +0100, Richard Biener wrote:
> > +/* Return number of known trailing zero bits in EXPR, or, if the value of
> > +   EXPR is known to be zero, the precision of its type.  */
> > +
> > +int
> 
> unsigned int?

Ok.

> > +    case PLUS_EXPR:
> > +    case MINUS_EXPR:
> > +    case BIT_IOR_EXPR:
> > +    case BIT_XOR_EXPR:
> > +    case MIN_EXPR:
> > +    case MAX_EXPR:
> > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> 
> This first recurses but if either one returns 0 you don't have
> to (that cuts down the recursion from exponential to linear in
> the common case?).  Thus, early out on ret == 0?

Yeah, that is reasonable.  We usually use this during expansion etc.;
trees won't be arbitrarily large, and for SSA_NAMEs we don't recurse on
their definitions, so I think we are unlikely to ever see very large
expressions there anyway.  I've added a similar early out to the COND_EXPR
case.
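
(Something along these lines, as a sketch rather than the exact committed
change:

      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
      if (ret1 == 0)
        return ret1;
      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
      return MIN (ret1, ret2);

i.e. the second operand is only visited when the first one still has some
known trailing zeros.)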

> > +      return MIN (ret1, ret2);
> > +    case POINTER_PLUS_EXPR:
> > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> > +      ret2 = MIN (ret2, prec);
> 
> Why do you need that here but not elsewhere when processing
> binary ops?

Because POINTER_PLUS_EXPR (well, also the shifts, but for those we don't
call tree_ctz on the second argument) is the only binop we handle whose
operands have different types.  For the first argument we know it has the
same precision as the result, but what if sizetype has bigger precision
than the TYPE_PRECISION of pointers?  I know it typically doesn't, I just
wanted to make sure we never return an out-of-range value; the tree_ctz
result should always be in [0, prec].
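
(E.g. if sizetype were 64-bit while pointers only had 32-bit precision, an
offset operand that is a multiple of 1 << 40 would give ret2 = 40 > prec
= 32 from the recursion; the MIN clamps that back so the intermediate
values stay within [0, prec].)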

> > +      return MIN (ret1, ret2);
> > +    case BIT_AND_EXPR:
> > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> > +      return MAX (ret1, ret2);
> > +    case MULT_EXPR:
> > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> > +      return MIN (ret1 + ret2, prec);
> > +    case LSHIFT_EXPR:
> > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > +      if (host_integerp (TREE_OPERAND (expr, 1), 1)
> 
> check that first before recursing for op0 - if op1 is negative
> you simply return ret1 which looks wrong, too.

If op1 is negative, then it is undefined behavior, so assuming in that case
the same thing as when op1 is not constant (i.e. that in the worst case we
shift left by 0 and thus don't increase the number of trailing zeros, or
shift left by > 0 and do increase it) is IMHO correct.  I don't think we
treat a left shift by a negative value as a right shift anywhere, do we?

> > +	  && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1)
> > +	      < (unsigned HOST_WIDE_INT) prec))
> 
> This check is to avoid overflowing ret1 + ret2?  If so, why not add
> 
> > +	{
> > +	  ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1);
> 
>   ret2 = MIN (ret2, prec);
> 
> instead?

Because ret{1,2} are int (well, unsigned int with the change you've
suggested) while tree_low_cst is UHWI, and if the shift count is say
UINT_MAX + 1 (yeah, surely undefined behavior) I just didn't want to lose
any of the upper bits.  Sure, alternatively I could have a UHWI temporary
variable, assign tree_low_cst to it and do the MIN of that temporary and
prec, but I still think it is better to handle out-of-range constants as a
variable shift count rather than as a shift count of, say, prec - 1.
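
(Concretely, to spell out the narrowing concern: a bogus shift count of,
say, UINT_MAX + 6 narrowed to a 32-bit unsigned int becomes 5, and the MIN
variant would then let us claim five extra trailing zeros, whereas falling
back to the variable-shift handling for anything >= prec never relies on
such counts.)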

> > +    case TRUNC_DIV_EXPR:
> > +    case CEIL_DIV_EXPR:
> > +    case FLOOR_DIV_EXPR:
> > +    case ROUND_DIV_EXPR:
> > +    case EXACT_DIV_EXPR:
> > +      if (TREE_CODE (TREE_OPERAND (expr, 1)) == INTEGER_CST)
> > +	{
> > +	  ret2 = tree_log2 (TREE_OPERAND (expr, 1));
> > +	  if (ret2 >= 0 && tree_int_cst_sgn (TREE_OPERAND (expr, 1)) == 1)
> 
> cheaper to test the sign first.

Ok.
> 
> > +	    {
> > +	      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > +	      if (ret1 > ret2)
> > +		return ret1 - ret2;
> > +	    }
> > +	}
> > +      return 0;
> > +    CASE_CONVERT:
> > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > +      if (ret1 && ret1 == TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (expr, 0))))
> > +	ret1 = prec;
> > +      return MIN (ret1, prec);
> > +    case SAVE_EXPR:
> > +      return tree_ctz (TREE_OPERAND (expr, 0));
> > +    case COND_EXPR:
> > +      ret1 = tree_ctz (TREE_OPERAND (expr, 1));
> > +      ret2 = tree_ctz (TREE_OPERAND (expr, 2));
> > +      return MIN (ret1, ret2);
> > +    case COMPOUND_EXPR:
> > +      return tree_ctz (TREE_OPERAND (expr, 1));
> > +    case ADDR_EXPR:
> > +      ret1 = get_object_alignment (TREE_OPERAND (expr, 0));
> 
> Use get_pointer_alignment, this isn't a memory reference so type
> alignment rules don't apply.

Ok.

	Jakub
Richard Biener Oct. 30, 2013, 11:10 a.m. UTC | #3
On Tue, 29 Oct 2013, Jakub Jelinek wrote:

> On Tue, Oct 29, 2013 at 01:11:53PM +0100, Richard Biener wrote:
> > > +/* Return number of known trailing zero bits in EXPR, or, if the value of
> > > +   EXPR is known to be zero, the precision of its type.  */
> > > +
> > > +int
> > 
> > unsigned int?
> 
> Ok.
> 
> > > +    case PLUS_EXPR:
> > > +    case MINUS_EXPR:
> > > +    case BIT_IOR_EXPR:
> > > +    case BIT_XOR_EXPR:
> > > +    case MIN_EXPR:
> > > +    case MAX_EXPR:
> > > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > > +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> > 
> > This first recurses but if either one returns 0 you don't have
> > to (that cuts down the recursion from exponential to linear in
> > the common case?).  Thus, early out on ret == 0?
> 
> Yeah, that is reasonable.  Usually we use this during expansion etc.,
> trees won't be arbitrarily large and for SSA_NAMEs we don't recurse
> on definitions, so I think we are unlikely to ever see very large
> expressions there though.  I've added similar early out to the COND_EXPR
> case.
> 
> > > +      return MIN (ret1, ret2);
> > > +    case POINTER_PLUS_EXPR:
> > > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > > +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> > > +      ret2 = MIN (ret2, prec);
> > 
> > Why do you need that here but not elsewhere when processing
> > binary ops?
> 
> Because POINTER_PLUS_EXPR (well, also shifts, but for those we
> don't call tree_ctz on the second argument) is the only
> binop we handle that uses different types for the operands,
> for the first argument we know it has the same precision as the result, but
> what if sizetype has bigger precision than TYPE_PRECISION of pointers?
> I know it typically doesn't, just wanted to make sure we never return
> an out of range return value, tree_ctz result should be in [0, prec]
> always.

Ah, indeed.  Maybe worth a comment.

> > > +      return MIN (ret1, ret2);
> > > +    case BIT_AND_EXPR:
> > > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > > +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> > > +      return MAX (ret1, ret2);
> > > +    case MULT_EXPR:
> > > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > > +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> > > +      return MIN (ret1 + ret2, prec);
> > > +    case LSHIFT_EXPR:
> > > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > > +      if (host_integerp (TREE_OPERAND (expr, 1), 1)
> > 
> > check that first before recursing for op0 - if op1 is negative
> > you simply return ret1 which looks wrong, too.
> 
> If op1 is negative, then it is undefined behavior, so assuming
> in that case the same thing as when op1 is not constant (i.e.
> that we worst case shift left by 0 and thus not increase number
> of trailing zeros, or shift left by > 0 and increase it) is IMHO
> correct.  I don't think we treat left shift by negative value as
> right shift anywhere, do we?

We do during constant folding I think.

> > > +	  && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1)
> > > +	      < (unsigned HOST_WIDE_INT) prec))
> > 
> > This check is to avoid overflowing ret1 + ret2?  If so, why not add
> > 
> > > +	{
> > > +	  ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1);
> > 
> >   ret2 = MIN (ret2, prec);
> > 
> > instead?
> 
> Because ret{1,2} are int (well, with the change you've suggested unsigned
> int), but tree_low_cst is UHWI, so if the shift count is say UINT_MAX + 1
> (yeah, surely undefined behavior), I just didn't want to lose any of the upper
> bits.  Sure, alternatively I could have an UHWI temporary variable and
> assign tree_low_cst to it and do the MIN on that temporary and prec, but
> still I think it is better to handle out of range constants as variable
> shift count rather than say shift count up by prec - 1.

Ok.

Thanks,
Richard.

> > > +    case TRUNC_DIV_EXPR:
> > > +    case CEIL_DIV_EXPR:
> > > +    case FLOOR_DIV_EXPR:
> > > +    case ROUND_DIV_EXPR:
> > > +    case EXACT_DIV_EXPR:
> > > +      if (TREE_CODE (TREE_OPERAND (expr, 1)) == INTEGER_CST)
> > > +	{
> > > +	  ret2 = tree_log2 (TREE_OPERAND (expr, 1));
> > > +	  if (ret2 >= 0 && tree_int_cst_sgn (TREE_OPERAND (expr, 1)) == 1)
> > 
> > cheaper to test the sign first.
> 
> Ok.
> > 
> > > +	    {
> > > +	      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > > +	      if (ret1 > ret2)
> > > +		return ret1 - ret2;
> > > +	    }
> > > +	}
> > > +      return 0;
> > > +    CASE_CONVERT:
> > > +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> > > +      if (ret1 && ret1 == TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (expr, 0))))
> > > +	ret1 = prec;
> > > +      return MIN (ret1, prec);
> > > +    case SAVE_EXPR:
> > > +      return tree_ctz (TREE_OPERAND (expr, 0));
> > > +    case COND_EXPR:
> > > +      ret1 = tree_ctz (TREE_OPERAND (expr, 1));
> > > +      ret2 = tree_ctz (TREE_OPERAND (expr, 2));
> > > +      return MIN (ret1, ret2);
> > > +    case COMPOUND_EXPR:
> > > +      return tree_ctz (TREE_OPERAND (expr, 1));
> > > +    case ADDR_EXPR:
> > > +      ret1 = get_object_alignment (TREE_OPERAND (expr, 0));
> > 
> > Use get_pointer_alignment, this isn't a memory reference so type
> > alignment rules don't apply.
> 
> Ok.

Patch

--- gcc/tree.c.jj	2013-10-23 14:43:12.000000000 +0200
+++ gcc/tree.c	2013-10-25 15:00:55.296178794 +0200
@@ -2213,6 +2213,110 @@  tree_floor_log2 (const_tree expr)
 	  : floor_log2 (low));
 }
 
+/* Return number of known trailing zero bits in EXPR, or, if the value of
+   EXPR is known to be zero, the precision of its type.  */
+
+int
+tree_ctz (const_tree expr)
+{
+  if (!INTEGRAL_TYPE_P (TREE_TYPE (expr))
+      && !POINTER_TYPE_P (TREE_TYPE (expr)))
+    return 0;
+
+  int ret1, ret2, prec = TYPE_PRECISION (TREE_TYPE (expr));
+  switch (TREE_CODE (expr))
+    {
+    case INTEGER_CST:
+      ret1 = tree_to_double_int (expr).trailing_zeros ();
+      return MIN (ret1, prec);
+    case SSA_NAME:
+      ret1 = get_nonzero_bits (expr).trailing_zeros ();
+      return MIN (ret1, prec);
+    case PLUS_EXPR:
+    case MINUS_EXPR:
+    case BIT_IOR_EXPR:
+    case BIT_XOR_EXPR:
+    case MIN_EXPR:
+    case MAX_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
+      return MIN (ret1, ret2);
+    case POINTER_PLUS_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
+      ret2 = MIN (ret2, prec);
+      return MIN (ret1, ret2);
+    case BIT_AND_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
+      return MAX (ret1, ret2);
+    case MULT_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
+      return MIN (ret1 + ret2, prec);
+    case LSHIFT_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      if (host_integerp (TREE_OPERAND (expr, 1), 1)
+	  && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1)
+	      < (unsigned HOST_WIDE_INT) prec))
+	{
+	  ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1);
+	  return MIN (ret1 + ret2, prec);
+	}
+      return ret1;
+    case RSHIFT_EXPR:
+      if (host_integerp (TREE_OPERAND (expr, 1), 1)
+	  && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1)
+	      < (unsigned HOST_WIDE_INT) prec))
+	{
+	  ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+	  ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1);
+	  if (ret1 > ret2)
+	    return ret1 - ret2;
+	}
+      return 0;
+    case TRUNC_DIV_EXPR:
+    case CEIL_DIV_EXPR:
+    case FLOOR_DIV_EXPR:
+    case ROUND_DIV_EXPR:
+    case EXACT_DIV_EXPR:
+      if (TREE_CODE (TREE_OPERAND (expr, 1)) == INTEGER_CST)
+	{
+	  ret2 = tree_log2 (TREE_OPERAND (expr, 1));
+	  if (ret2 >= 0 && tree_int_cst_sgn (TREE_OPERAND (expr, 1)) == 1)
+	    {
+	      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+	      if (ret1 > ret2)
+		return ret1 - ret2;
+	    }
+	}
+      return 0;
+    CASE_CONVERT:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      if (ret1 && ret1 == TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (expr, 0))))
+	ret1 = prec;
+      return MIN (ret1, prec);
+    case SAVE_EXPR:
+      return tree_ctz (TREE_OPERAND (expr, 0));
+    case COND_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 1));
+      ret2 = tree_ctz (TREE_OPERAND (expr, 2));
+      return MIN (ret1, ret2);
+    case COMPOUND_EXPR:
+      return tree_ctz (TREE_OPERAND (expr, 1));
+    case ADDR_EXPR:
+      ret1 = get_object_alignment (TREE_OPERAND (expr, 0));
+      if (ret1 > BITS_PER_UNIT)
+	{
+	  ret1 = ctz_hwi (ret1 / BITS_PER_UNIT);
+	  return MIN (ret1, prec);
+	}
+      return 0;
+    default:
+      return 0;
+    }
+}
+
 /* Return 1 if EXPR is the real constant zero.  Trailing zeroes matter for
    decimal float constants, so don't return 1 for them.  */
 
--- gcc/tree.h.jj	2013-10-17 22:30:45.000000000 +0200
+++ gcc/tree.h	2013-10-25 12:20:05.473673186 +0200
@@ -4546,6 +4546,7 @@  extern void get_type_static_bounds (cons
 extern bool variably_modified_type_p (tree, tree);
 extern int tree_log2 (const_tree);
 extern int tree_floor_log2 (const_tree);
+extern int tree_ctz (const_tree);
 extern int simple_cst_equal (const_tree, const_tree);
 extern hashval_t iterative_hash_expr (const_tree, hashval_t);
 extern hashval_t iterative_hash_exprs_commutative (const_tree,
--- gcc/tree-ssanames.h.jj	2013-10-24 15:52:53.000000000 +0200
+++ gcc/tree-ssanames.h	2013-10-25 14:09:21.227015919 +0200
@@ -72,9 +72,10 @@  enum value_range_type { VR_UNDEFINED, VR
 /* Sets the value range to SSA.  */
 extern void set_range_info (tree, double_int, double_int);
 /* Gets the value range from SSA.  */
-extern enum value_range_type get_range_info (tree, double_int *, double_int *);
+extern enum value_range_type get_range_info (const_tree, double_int *,
+					     double_int *);
 extern void set_nonzero_bits (tree, double_int);
-extern double_int get_nonzero_bits (tree);
+extern double_int get_nonzero_bits (const_tree);
 extern void init_ssanames (struct function *, int);
 extern void fini_ssanames (void);
 extern void ssanames_print_statistics (void);
--- gcc/tree-ssanames.c.jj	2013-10-24 17:32:22.000000000 +0200
+++ gcc/tree-ssanames.c	2013-10-25 14:08:46.218187581 +0200
@@ -221,7 +221,7 @@  set_range_info (tree name, double_int mi
    is used to determine if MIN and MAX are valid values.  */
 
 enum value_range_type
-get_range_info (tree name, double_int *min, double_int *max)
+get_range_info (const_tree name, double_int *min, double_int *max)
 {
   enum value_range_type range_type;
   gcc_assert (!POINTER_TYPE_P (TREE_TYPE (name)));
@@ -271,7 +271,7 @@  set_nonzero_bits (tree name, double_int
    NAME, or double_int_minus_one if unknown.  */
 
 double_int
-get_nonzero_bits (tree name)
+get_nonzero_bits (const_tree name)
 {
   if (POINTER_TYPE_P (TREE_TYPE (name)))
     {
--- gcc/tree-vectorizer.h.jj	2013-10-24 10:19:20.000000000 +0200
+++ gcc/tree-vectorizer.h	2013-10-25 14:02:58.198999063 +0200
@@ -901,6 +901,8 @@  extern void slpeel_make_loop_iterate_nti
 extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
 struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge);
 extern void vect_loop_versioning (loop_vec_info, unsigned int, bool);
+extern void vect_generate_tmps_on_preheader (loop_vec_info, tree *, tree *,
+					     tree *, gimple_seq);
 extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *,
 					    unsigned int, bool);
 extern void vect_do_peeling_for_alignment (loop_vec_info, unsigned int, bool);
--- gcc/tree-vect-loop-manip.c.jj	2013-10-24 10:19:22.000000000 +0200
+++ gcc/tree-vect-loop-manip.c	2013-10-25 14:02:00.544284058 +0200
@@ -1437,7 +1437,7 @@  vect_build_loop_niters (loop_vec_info lo
  and places them at the loop preheader edge or in COND_EXPR_STMT_LIST
  if that is non-NULL.  */
 
-static void
+void
 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
 				 tree *ni_name_ptr,
 				 tree *ratio_mult_vf_name_ptr,
--- gcc/expr.c.jj	2013-10-23 14:43:15.000000000 +0200
+++ gcc/expr.c	2013-10-25 15:05:23.893781676 +0200
@@ -7282,74 +7282,14 @@  safe_from_p (const_rtx x, tree exp, int
 unsigned HOST_WIDE_INT
 highest_pow2_factor (const_tree exp)
 {
-  unsigned HOST_WIDE_INT c0, c1;
-
-  switch (TREE_CODE (exp))
-    {
-    case INTEGER_CST:
-      /* We can find the lowest bit that's a one.  If the low
-	 HOST_BITS_PER_WIDE_INT bits are zero, return BIGGEST_ALIGNMENT.
-	 We need to handle this case since we can find it in a COND_EXPR,
-	 a MIN_EXPR, or a MAX_EXPR.  If the constant overflows, we have an
-	 erroneous program, so return BIGGEST_ALIGNMENT to avoid any
-	 later ICE.  */
-      if (TREE_OVERFLOW (exp))
-	return BIGGEST_ALIGNMENT;
-      else
-	{
-	  /* Note: tree_low_cst is intentionally not used here,
-	     we don't care about the upper bits.  */
-	  c0 = TREE_INT_CST_LOW (exp);
-	  c0 &= -c0;
-	  return c0 ? c0 : BIGGEST_ALIGNMENT;
-	}
-      break;
-
-    case PLUS_EXPR:  case MINUS_EXPR:  case MIN_EXPR:  case MAX_EXPR:
-      c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
-      c1 = highest_pow2_factor (TREE_OPERAND (exp, 1));
-      return MIN (c0, c1);
-
-    case MULT_EXPR:
-      c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
-      c1 = highest_pow2_factor (TREE_OPERAND (exp, 1));
-      return c0 * c1;
-
-    case ROUND_DIV_EXPR:  case TRUNC_DIV_EXPR:  case FLOOR_DIV_EXPR:
-    case CEIL_DIV_EXPR:
-      if (integer_pow2p (TREE_OPERAND (exp, 1))
-	  && host_integerp (TREE_OPERAND (exp, 1), 1))
-	{
-	  c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
-	  c1 = tree_low_cst (TREE_OPERAND (exp, 1), 1);
-	  return MAX (1, c0 / c1);
-	}
-      break;
-
-    case BIT_AND_EXPR:
-      /* The highest power of two of a bit-and expression is the maximum of
-	 that of its operands.  We typically get here for a complex LHS and
-	 a constant negative power of two on the RHS to force an explicit
-	 alignment, so don't bother looking at the LHS.  */
-      return highest_pow2_factor (TREE_OPERAND (exp, 1));
-
-    CASE_CONVERT:
-    case SAVE_EXPR:
-      return highest_pow2_factor (TREE_OPERAND (exp, 0));
-
-    case COMPOUND_EXPR:
-      return highest_pow2_factor (TREE_OPERAND (exp, 1));
-
-    case COND_EXPR:
-      c0 = highest_pow2_factor (TREE_OPERAND (exp, 1));
-      c1 = highest_pow2_factor (TREE_OPERAND (exp, 2));
-      return MIN (c0, c1);
-
-    default:
-      break;
-    }
-
-  return 1;
+  unsigned HOST_WIDE_INT ret;
+  int trailing_zeros = tree_ctz (exp);
+  if (trailing_zeros >= HOST_BITS_PER_WIDE_INT)
+    return BIGGEST_ALIGNMENT;
+  ret = (unsigned HOST_WIDE_INT) 1 << trailing_zeros;
+  if (ret > BIGGEST_ALIGNMENT)
+    return BIGGEST_ALIGNMENT;
+  return ret;
 }
 
 /* Similar, except that the alignment requirements of TARGET are
--- gcc/tree-vect-loop.c.jj	2013-10-24 10:19:23.000000000 +0200
+++ gcc/tree-vect-loop.c	2013-10-25 14:01:35.968407222 +0200
@@ -1586,9 +1586,9 @@  vect_analyze_loop_operations (loop_vec_i
       return false;
     }
 
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
-      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
+  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)
+      || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+	  < exact_log2 (vectorization_factor)))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.\n");
@@ -5656,15 +5656,20 @@  vect_transform_loop (loop_vec_info loop_
      will remain scalar and will compute the remaining (n%VF) iterations.
      (VF is the vectorization factor).  */
 
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	   && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
-       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+  if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+      < exact_log2 (vectorization_factor)
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
 				    th, check_profitability);
-  else
+  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
 		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
+  else
+    {
+      tree ni_name, ratio_mult_vf;
+      vect_generate_tmps_on_preheader (loop_vinfo, &ni_name, &ratio_mult_vf,
+				       &ratio, NULL);
+    }
 
   /* 1) Make sure the loop header has exactly two entries
      2) Make sure we have a preheader basic block.  */
--- gcc/builtins.c.jj	2013-10-23 14:43:12.000000000 +0200
+++ gcc/builtins.c	2013-10-25 12:31:49.426022284 +0200
@@ -309,7 +309,7 @@  get_object_alignment_2 (tree exp, unsign
   tree offset;
   enum machine_mode mode;
   int unsignedp, volatilep;
-  unsigned int inner, align = BITS_PER_UNIT;
+  unsigned int align = BITS_PER_UNIT;
   bool known_alignment = false;
 
   /* Get the innermost object and the constant (bitpos) and possibly
@@ -418,50 +418,16 @@  get_object_alignment_2 (tree exp, unsign
 
   /* If there is a non-constant offset part extract the maximum
      alignment that can prevail.  */
-  inner = ~0U;
-  while (offset)
+  if (offset)
     {
-      tree next_offset;
-
-      if (TREE_CODE (offset) == PLUS_EXPR)
-	{
-	  next_offset = TREE_OPERAND (offset, 0);
-	  offset = TREE_OPERAND (offset, 1);
-	}
-      else
-	next_offset = NULL;
-      if (host_integerp (offset, 1))
-	{
-	  /* Any overflow in calculating offset_bits won't change
-	     the alignment.  */
-	  unsigned offset_bits
-	    = ((unsigned) tree_low_cst (offset, 1) * BITS_PER_UNIT);
-
-	  if (offset_bits)
-	    inner = MIN (inner, (offset_bits & -offset_bits));
-	}
-      else if (TREE_CODE (offset) == MULT_EXPR
-	       && host_integerp (TREE_OPERAND (offset, 1), 1))
-	{
-	  /* Any overflow in calculating offset_factor won't change
-	     the alignment.  */
-	  unsigned offset_factor
-	    = ((unsigned) tree_low_cst (TREE_OPERAND (offset, 1), 1)
-	       * BITS_PER_UNIT);
-
-	  if (offset_factor)
-	    inner = MIN (inner, (offset_factor & -offset_factor));
-	}
-      else
+      int trailing_zeros = tree_ctz (offset);
+      if (trailing_zeros < HOST_BITS_PER_INT)
 	{
-	  inner = MIN (inner, BITS_PER_UNIT);
-	  break;
+	  unsigned int inner = (1U << trailing_zeros) * BITS_PER_UNIT;
+	  if (inner)
+	    align = MIN (align, inner);
 	}
-      offset = next_offset;
     }
-  /* Alignment is innermost object alignment adjusted by the constant
-     and non-constant offset parts.  */
-  align = MIN (align, inner);
 
   *alignp = align;
   *bitposp = bitpos & (*alignp - 1);