Fix vector rotate regression (PR tree-optimization/57233)

Message ID 20140626114022.GR31640@tucnak.redhat.com
State New

Commit Message

Jakub Jelinek June 26, 2014, 11:40 a.m. UTC
On Thu, Jun 26, 2014 at 01:16:41PM +0200, Marc Glisse wrote:
> >+	  if (compute_type == TREE_TYPE (type)
> >+	      && !VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
> >+	    {
> >+	      optab oplv, opl, oprv, opr, opo;
> >+	      oplv = optab_for_tree_code (LSHIFT_EXPR, type, optab_vector);
> >+	      /* Right shift always has to be logical, no matter what
> >+		 signedness type has.  */
> >+	      oprv = vlshr_optab;
> >+	      opo = optab_for_tree_code (BIT_IOR_EXPR, type, optab_default);
> >+	      opl = optab_for_tree_code (LSHIFT_EXPR, type, optab_scalar);
> >+	      oprv = lshr_optab;
> >+	      opr = optab_for_tree_code (RSHIFT_EXPR, type, optab_scalar);
> 
> Looks like there are some typos in there; you are assigning to oprv twice.

Oops, fixed thusly.
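
For reference, the corrected initialization (excerpted verbatim from the
patch below) assigns each optab exactly once:

  optab oplv = vashl_optab, opl = ashl_optab;
  optab oprv = vlshr_optab, opr = lshr_optab, opo = ior_optab;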

2014-06-25  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/57233
	PR tree-optimization/61299
	* tree-vect-generic.c (get_compute_type, count_type_subparts): New
	functions.
	(expand_vector_operations_1): Use them.  If {L,R}ROTATE_EXPR
	would be lowered to scalar shifts, check whether the
	corresponding vector shifts and a vector BIT_IOR_EXPR are
	supported, and in that case either don't lower at all or
	lower just to a narrower vector type.
	* expmed.c (expand_shift_1): Fix up handling of vector
	shifts and rotates.

	* gcc.dg/pr57233.c: New test.
	* gcc.target/i386/pr57233.c: New test.
	* gcc.target/i386/sse2-pr57233.c: New test.
	* gcc.target/i386/avx-pr57233.c: New test.
	* gcc.target/i386/avx2-pr57233.c: New test.
	* gcc.target/i386/avx512f-pr57233.c: New test.
	* gcc.target/i386/xop-pr57233.c: New test.
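
For reference, a minimal sketch of the rotate idiom at issue, modelled
on the new gcc.dg/pr57233.c test (function names here are illustrative;
compile with e.g. -O2):

typedef unsigned V4 __attribute__((vector_size (4 * sizeof (int))));
V4 a;

void
rot_const (void)
{
  /* Each 32-bit lane is rotated left by 2.  Without this patch the
     rotate was split into per-element scalar rotates; with it, the
     rotate is lowered to a vector left shift, a vector logical right
     shift and a vector BIT_IOR_EXPR when the target supports them.  */
  a = (a << 2) | (a >> 30);
}

void
rot_var (int x)
{
  /* The same lowering applies when the rotate amount is a scalar
     variable; vector-by-vector rotate amounts are still punted on
     for now.  */
  a = (a << x) | (a >> (32 - x));
}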



	Jakub

Comments

Richard Biener June 26, 2014, 12:37 p.m. UTC | #1
On Thu, 26 Jun 2014, Jakub Jelinek wrote:

> On Thu, Jun 26, 2014 at 01:16:41PM +0200, Marc Glisse wrote:
> > >+	  if (compute_type == TREE_TYPE (type)
> > >+	      && !VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
> > >+	    {
> > >+	      optab oplv, opl, oprv, opr, opo;
> > >+	      oplv = optab_for_tree_code (LSHIFT_EXPR, type, optab_vector);
> > >+	      /* Right shift always has to be logical, no matter what
> > >+		 signedness type has.  */
> > >+	      oprv = vlshr_optab;
> > >+	      opo = optab_for_tree_code (BIT_IOR_EXPR, type, optab_default);
> > >+	      opl = optab_for_tree_code (LSHIFT_EXPR, type, optab_scalar);
> > >+	      oprv = lshr_optab;
> > >+	      opr = optab_for_tree_code (RSHIFT_EXPR, type, optab_scalar);
> > 
> > Looks like there are some typos in there; you are assigning to oprv twice.
> 
> Oops, fixed thusly.

Ok.

Thanks,
Richard.


Patch

--- gcc/tree-vect-generic.c.jj	2014-06-26 11:00:00.477268305 +0200
+++ gcc/tree-vect-generic.c	2014-06-26 13:33:33.024069715 +0200
@@ -1334,15 +1334,67 @@  lower_vec_perm (gimple_stmt_iterator *gs
   update_stmt (gsi_stmt (*gsi));
 }
 
+/* Return type in which CODE operation with optab OP can be
+   computed.  */
+
+static tree
+get_compute_type (enum tree_code code, optab op, tree type)
+{
+  /* For very wide vectors, try using a smaller vector mode.  */
+  tree compute_type = type;
+  if (op
+      && (!VECTOR_MODE_P (TYPE_MODE (type))
+	  || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing))
+    {
+      tree vector_compute_type
+	= type_for_widest_vector_mode (TREE_TYPE (type), op);
+      if (vector_compute_type != NULL_TREE
+	  && (TYPE_VECTOR_SUBPARTS (vector_compute_type)
+	      < TYPE_VECTOR_SUBPARTS (compute_type))
+	  && (optab_handler (op, TYPE_MODE (vector_compute_type))
+	      != CODE_FOR_nothing))
+	compute_type = vector_compute_type;
+    }
+
+  /* If we are breaking a BLKmode vector into smaller pieces,
+     type_for_widest_vector_mode has already looked into the optab,
+     so skip these checks.  */
+  if (compute_type == type)
+    {
+      enum machine_mode compute_mode = TYPE_MODE (compute_type);
+      if (VECTOR_MODE_P (compute_mode))
+	{
+	  if (op && optab_handler (op, compute_mode) != CODE_FOR_nothing)
+	    return compute_type;
+	  if (code == MULT_HIGHPART_EXPR
+	      && can_mult_highpart_p (compute_mode,
+				      TYPE_UNSIGNED (compute_type)))
+	    return compute_type;
+	}
+      /* There is no operation in hardware, so fall back to scalars.  */
+      compute_type = TREE_TYPE (type);
+    }
+
+  return compute_type;
+}
+
+/* Helper function of expand_vector_operations_1.  Return number of
+   vector elements for vector types or 1 for other types.  */
+
+static inline int
+count_type_subparts (tree type)
+{
+  return VECTOR_TYPE_P (type) ? TYPE_VECTOR_SUBPARTS (type) : 1;
+}
+
 /* Process one statement.  If we identify a vector operation, expand it.  */
 
 static void
 expand_vector_operations_1 (gimple_stmt_iterator *gsi)
 {
   gimple stmt = gsi_stmt (*gsi);
-  tree lhs, rhs1, rhs2 = NULL, type, compute_type;
+  tree lhs, rhs1, rhs2 = NULL, type, compute_type = NULL_TREE;
   enum tree_code code;
-  enum machine_mode compute_mode;
   optab op = unknown_optab;
   enum gimple_rhs_class rhs_class;
   tree new_rhs;
@@ -1455,11 +1507,76 @@  expand_vector_operations_1 (gimple_stmt_
 	{
           op = optab_for_tree_code (code, type, optab_scalar);
 
+	  compute_type = get_compute_type (code, op, type);
+	  if (compute_type == type)
+	    return;
 	  /* The rtl expander will expand vector/scalar as vector/vector
-	     if necessary.  Don't bother converting the stmt here.  */
-	  if (optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing
-	      && optab_handler (opv, TYPE_MODE (type)) != CODE_FOR_nothing)
+	     if necessary.  Pick one with wider vector type.  */
+	  tree compute_vtype = get_compute_type (code, opv, type);
+	  if (count_type_subparts (compute_vtype)
+	      > count_type_subparts (compute_type))
+	    {
+	      compute_type = compute_vtype;
+	      op = opv;
+	    }
+	}
+
+      if (code == LROTATE_EXPR || code == RROTATE_EXPR)
+	{
+	  if (compute_type == NULL_TREE)
+	    compute_type = get_compute_type (code, op, type);
+	  if (compute_type == type)
 	    return;
+	  /* Before splitting vector rotates into scalar rotates,
+	     see if we can't use vector shifts and BIT_IOR_EXPR
+	     instead.  For vector by vector rotates we'd also
+	     need to check BIT_AND_EXPR and NEGATE_EXPR, punt there
+	     for now, fold doesn't seem to create such rotates anyway.  */
+	  if (compute_type == TREE_TYPE (type)
+	      && !VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
+	    {
+	      optab oplv = vashl_optab, opl = ashl_optab;
+	      optab oprv = vlshr_optab, opr = lshr_optab, opo = ior_optab;
+	      tree compute_lvtype = get_compute_type (LSHIFT_EXPR, oplv, type);
+	      tree compute_rvtype = get_compute_type (RSHIFT_EXPR, oprv, type);
+	      tree compute_otype = get_compute_type (BIT_IOR_EXPR, opo, type);
+	      tree compute_ltype = get_compute_type (LSHIFT_EXPR, opl, type);
+	      tree compute_rtype = get_compute_type (RSHIFT_EXPR, opr, type);
+	      /* The rtl expander will expand vector/scalar as vector/vector
+		 if necessary.  Pick one with wider vector type.  */
+	      if (count_type_subparts (compute_lvtype)
+		  > count_type_subparts (compute_ltype))
+		{
+		  compute_ltype = compute_lvtype;
+		  opl = oplv;
+		}
+	      if (count_type_subparts (compute_rvtype)
+		  > count_type_subparts (compute_rtype))
+		{
+		  compute_rtype = compute_rvtype;
+		  opr = oprv;
+		}
+	      /* Pick the narrowest type from LSHIFT_EXPR, RSHIFT_EXPR and
+		 BIT_IOR_EXPR.  */
+	      compute_type = compute_ltype;
+	      if (count_type_subparts (compute_type)
+		  > count_type_subparts (compute_rtype))
+		compute_type = compute_rtype;
+	      if (count_type_subparts (compute_type)
+		  > count_type_subparts (compute_otype))
+		compute_type = compute_otype;
+	      /* Verify all 3 operations can be performed in that type.  */
+	      if (compute_type != TREE_TYPE (type))
+		{
+		  if (optab_handler (opl, TYPE_MODE (compute_type))
+		      == CODE_FOR_nothing
+		      || optab_handler (opr, TYPE_MODE (compute_type))
+			 == CODE_FOR_nothing
+		      || optab_handler (opo, TYPE_MODE (compute_type))
+			 == CODE_FOR_nothing)
+		    compute_type = TREE_TYPE (type);
+		}
+	    }
 	}
     }
   else
@@ -1473,38 +1590,10 @@  expand_vector_operations_1 (gimple_stmt_
       && INTEGRAL_TYPE_P (TREE_TYPE (type)))
     op = optab_for_tree_code (MINUS_EXPR, type, optab_default);
 
-  /* For very wide vectors, try using a smaller vector mode.  */
-  compute_type = type;
-  if (!VECTOR_MODE_P (TYPE_MODE (type)) && op)
-    {
-      tree vector_compute_type
-        = type_for_widest_vector_mode (TREE_TYPE (type), op);
-      if (vector_compute_type != NULL_TREE
-	  && (TYPE_VECTOR_SUBPARTS (vector_compute_type)
-	      < TYPE_VECTOR_SUBPARTS (compute_type))
-	  && (optab_handler (op, TYPE_MODE (vector_compute_type))
-	      != CODE_FOR_nothing))
-	compute_type = vector_compute_type;
-    }
-
-  /* If we are breaking a BLKmode vector into smaller pieces,
-     type_for_widest_vector_mode has already looked into the optab,
-     so skip these checks.  */
+  if (compute_type == NULL_TREE)
+    compute_type = get_compute_type (code, op, type);
   if (compute_type == type)
-    {
-      compute_mode = TYPE_MODE (compute_type);
-      if (VECTOR_MODE_P (compute_mode))
-	{
-          if (op && optab_handler (op, compute_mode) != CODE_FOR_nothing)
-	    return;
-	  if (code == MULT_HIGHPART_EXPR
-	      && can_mult_highpart_p (compute_mode,
-				      TYPE_UNSIGNED (compute_type)))
-	    return;
-	}
-      /* There is no operation in hardware, so fall back to scalars.  */
-      compute_type = TREE_TYPE (type);
-    }
+    return;
 
   gcc_assert (code != VEC_LSHIFT_EXPR && code != VEC_RSHIFT_EXPR);
   new_rhs = expand_vector_operation (gsi, type, compute_type, stmt, code);
--- gcc/expmed.c.jj	2014-06-26 11:00:00.034270690 +0200
+++ gcc/expmed.c	2014-06-26 13:28:49.802581691 +0200
@@ -2128,9 +2128,12 @@  expand_shift_1 (enum tree_code code, enu
   optab lrotate_optab = rotl_optab;
   optab rrotate_optab = rotr_optab;
   enum machine_mode op1_mode;
+  enum machine_mode scalar_mode = mode;
   int attempt;
   bool speed = optimize_insn_for_speed_p ();
 
+  if (VECTOR_MODE_P (mode))
+    scalar_mode = GET_MODE_INNER (mode);
   op1 = amount;
   op1_mode = GET_MODE (op1);
 
@@ -2153,9 +2156,9 @@  expand_shift_1 (enum tree_code code, enu
     {
       if (CONST_INT_P (op1)
 	  && ((unsigned HOST_WIDE_INT) INTVAL (op1) >=
-	      (unsigned HOST_WIDE_INT) GET_MODE_BITSIZE (mode)))
+	      (unsigned HOST_WIDE_INT) GET_MODE_BITSIZE (scalar_mode)))
 	op1 = GEN_INT ((unsigned HOST_WIDE_INT) INTVAL (op1)
-		       % GET_MODE_BITSIZE (mode));
+		       % GET_MODE_BITSIZE (scalar_mode));
       else if (GET_CODE (op1) == SUBREG
 	       && subreg_lowpart_p (op1)
 	       && SCALAR_INT_MODE_P (GET_MODE (SUBREG_REG (op1)))
@@ -2169,10 +2172,10 @@  expand_shift_1 (enum tree_code code, enu
      amount instead.  */
   if (rotate
       && CONST_INT_P (op1)
-      && IN_RANGE (INTVAL (op1), GET_MODE_BITSIZE (mode) / 2 + left,
-		   GET_MODE_BITSIZE (mode) - 1))
+      && IN_RANGE (INTVAL (op1), GET_MODE_BITSIZE (scalar_mode) / 2 + left,
+		   GET_MODE_BITSIZE (scalar_mode) - 1))
     {
-      op1 = GEN_INT (GET_MODE_BITSIZE (mode) - INTVAL (op1));
+      op1 = GEN_INT (GET_MODE_BITSIZE (scalar_mode) - INTVAL (op1));
       left = !left;
       code = left ? LROTATE_EXPR : RROTATE_EXPR;
     }
@@ -2185,7 +2188,7 @@  expand_shift_1 (enum tree_code code, enu
   if (code == LSHIFT_EXPR
       && CONST_INT_P (op1)
       && INTVAL (op1) > 0
-      && INTVAL (op1) < GET_MODE_PRECISION (mode)
+      && INTVAL (op1) < GET_MODE_PRECISION (scalar_mode)
       && INTVAL (op1) < MAX_BITS_PER_WORD
       && (shift_cost (speed, mode, INTVAL (op1))
 	  > INTVAL (op1) * add_cost (speed, mode))
@@ -2240,14 +2243,14 @@  expand_shift_1 (enum tree_code code, enu
 	      if (op1 == const0_rtx)
 		return shifted;
 	      else if (CONST_INT_P (op1))
-		other_amount = GEN_INT (GET_MODE_BITSIZE (mode)
+		other_amount = GEN_INT (GET_MODE_BITSIZE (scalar_mode)
 					- INTVAL (op1));
 	      else
 		{
 		  other_amount
 		    = simplify_gen_unary (NEG, GET_MODE (op1),
 					  op1, GET_MODE (op1));
-		  HOST_WIDE_INT mask = GET_MODE_PRECISION (mode) - 1;
+		  HOST_WIDE_INT mask = GET_MODE_PRECISION (scalar_mode) - 1;
 		  other_amount
 		    = simplify_gen_binary (AND, GET_MODE (op1), other_amount,
 					   gen_int_mode (mask, GET_MODE (op1)));
--- gcc/testsuite/gcc.target/i386/sse2-pr57233.c.jj	2014-06-26 13:28:49.802581691 +0200
+++ gcc/testsuite/gcc.target/i386/sse2-pr57233.c	2014-06-26 13:28:49.802581691 +0200
@@ -0,0 +1,16 @@ 
+/* PR tree-optimization/57233 */
+/* { dg-do run { target sse2 } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "sse2-check.h"
+
+static void
+sse2_test (void)
+{
+  do_main ();
+}
+
+#undef main
+#define main() do_main ()
+
+#include "../../gcc.dg/pr57233.c"
--- gcc/testsuite/gcc.target/i386/pr57233.c.jj	2014-06-26 13:28:49.802581691 +0200
+++ gcc/testsuite/gcc.target/i386/pr57233.c	2014-06-26 13:28:49.802581691 +0200
@@ -0,0 +1,15 @@ 
+/* PR tree-optimization/57233 */
+/* { dg-do compile { target avx } } */
+/* { dg-options "-O2 -mavx -mno-xop" } */
+
+typedef unsigned V4 __attribute__((vector_size(4 * sizeof (int))));
+V4 a;
+
+__attribute__((noinline)) void
+foo (void)
+{
+  a = (a << 2) | (a >> 30);
+}
+
+/* { dg-final { scan-assembler "vpsrld\[^\n\r]*30" } } */
+/* { dg-final { scan-assembler "vpslld\[^\n\r]*2" } } */
--- gcc/testsuite/gcc.target/i386/xop-pr57233.c.jj	2014-06-26 13:28:49.802581691 +0200
+++ gcc/testsuite/gcc.target/i386/xop-pr57233.c	2014-06-26 13:28:49.802581691 +0200
@@ -0,0 +1,16 @@ 
+/* PR tree-optimization/57233 */
+/* { dg-do run { target xop } } */
+/* { dg-options "-O2 -mxop" } */
+
+#include "xop-check.h"
+
+static void
+xop_test (void)
+{
+  do_main ();
+}
+
+#undef main
+#define main() do_main ()
+
+#include "../../gcc.dg/pr57233.c"
--- gcc/testsuite/gcc.target/i386/avx2-pr57233.c.jj	2014-06-26 13:28:49.803581572 +0200
+++ gcc/testsuite/gcc.target/i386/avx2-pr57233.c	2014-06-26 13:28:49.802581691 +0200
@@ -0,0 +1,16 @@ 
+/* PR tree-optimization/57233 */
+/* { dg-do run { target avx2 } } */
+/* { dg-options "-O2 -mavx2" } */
+
+#include "avx2-check.h"
+
+static void
+avx2_test (void)
+{
+  do_main ();
+}
+
+#undef main
+#define main() do_main ()
+
+#include "../../gcc.dg/pr57233.c"
--- gcc/testsuite/gcc.target/i386/avx-pr57233.c.jj	2014-06-26 13:28:49.803581572 +0200
+++ gcc/testsuite/gcc.target/i386/avx-pr57233.c	2014-06-26 13:28:49.803581572 +0200
@@ -0,0 +1,16 @@ 
+/* PR tree-optimization/57233 */
+/* { dg-do run { target avx } } */
+/* { dg-options "-O2 -mavx" } */
+
+#include "avx-check.h"
+
+static void
+avx_test (void)
+{
+  do_main ();
+}
+
+#undef main
+#define main() do_main ()
+
+#include "../../gcc.dg/pr57233.c"
--- gcc/testsuite/gcc.target/i386/avx512f-pr57233.c.jj	2014-06-26 13:28:49.803581572 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-pr57233.c	2014-06-26 13:28:49.803581572 +0200
@@ -0,0 +1,16 @@ 
+/* PR tree-optimization/57233 */
+/* { dg-do run { target avx512f } } */
+/* { dg-options "-O2 -mavx512f" } */
+
+#include "avx512f-check.h"
+
+static void
+avx512f_test (void)
+{
+  do_main ();
+}
+
+#undef main
+#define main() do_main ()
+
+#include "../../gcc.dg/pr57233.c"
--- gcc/testsuite/gcc.dg/pr57233.c.jj	2014-06-26 13:28:49.803581572 +0200
+++ gcc/testsuite/gcc.dg/pr57233.c	2014-06-26 13:28:49.803581572 +0200
@@ -0,0 +1,171 @@ 
+/* PR tree-optimization/57233 */
+/* { dg-do run { target { ilp32 || lp64 } } } */
+/* { dg-options "-O2" } */
+
+typedef unsigned V4 __attribute__((vector_size(4 * sizeof (int))));
+typedef unsigned V8 __attribute__((vector_size(8 * sizeof (int))));
+typedef unsigned V16 __attribute__((vector_size(16 * sizeof (int))));
+V4 a, b, g;
+V8 c, d, h;
+V16 e, f, j;
+
+__attribute__((noinline)) void
+f1 (void)
+{
+  a = (a << 2) | (a >> 30);
+}
+
+__attribute__((noinline)) void
+f2 (void)
+{
+  a = (a << 30) | (a >> 2);
+}
+
+__attribute__((noinline)) void
+f3 (void)
+{
+  a = (a << b) | (a >> (32 - b));
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+  a = (a << x) | (a >> (32 - x));
+}
+
+__attribute__((noinline)) void
+f5 (void)
+{
+  c = (c << 2) | (c >> 30);
+}
+
+__attribute__((noinline)) void
+f6 (void)
+{
+  c = (c << 30) | (c >> 2);
+}
+
+__attribute__((noinline)) void
+f7 (void)
+{
+  c = (c << d) | (c >> (32 - d));
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int x)
+{
+  c = (c << x) | (c >> (32 - x));
+}
+
+__attribute__((noinline)) void
+f9 (void)
+{
+  e = (e << 2) | (e >> 30);
+}
+
+__attribute__((noinline)) void
+f10 (void)
+{
+  e = (e << 30) | (e >> 2);
+}
+
+__attribute__((noinline)) void
+f11 (void)
+{
+  e = (e << f) | (e >> (32 - f));
+}
+
+__attribute__((noinline, noclone)) void
+f12 (int x)
+{
+  e = (e << x) | (e >> (32 - x));
+}
+
+unsigned
+r (void)
+{
+  static unsigned x = 0xdeadbeefU;
+  static unsigned y = 0x12347654U;
+  static unsigned z = 0x1a2b3c4dU;
+  static unsigned w = 0x87654321U;
+  unsigned t = x ^ (x << 11);
+  x = y;
+  y = z;
+  z = w;
+  w = w ^ (w >> 19) ^ t ^ (t >> 8);
+  return w;
+}
+
+void
+init (unsigned int *p, int count, int mod)
+{
+  int i;
+  for (i = 0; i < count; i++)
+    {
+      unsigned int v = r ();
+      if (mod)
+	v = (v % 31) + 1;
+      p[i] = v;
+    }
+}
+
+void
+check (unsigned int *p, unsigned int *q, int count, unsigned int *s, int ss)
+{
+  int i;
+  for (i = 0; i < count; i++)
+    {
+      if (s)
+	ss = s[i];
+      if (p[i] != ((q[i] << ss) | (q[i] >> (32 - ss))))
+	__builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  init ((unsigned int *) &a, 4, 0);
+  init ((unsigned int *) &b, 4, 1);
+  init ((unsigned int *) &c, 8, 0);
+  init ((unsigned int *) &d, 8, 1);
+  init ((unsigned int *) &e, 16, 0);
+  init ((unsigned int *) &f, 16, 1);
+  g = a;
+  h = c;
+  j = e;
+  f1 ();
+  f5 ();
+  f9 ();
+  check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 2);
+  check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 2);
+  check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 2);
+  g = a;
+  h = c;
+  j = e;
+  f2 ();
+  f6 ();
+  f10 ();
+  check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 30);
+  check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 30);
+  check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 30);
+  g = a;
+  h = c;
+  j = e;
+  f3 ();
+  f7 ();
+  f11 ();
+  check ((unsigned int *) &a, (unsigned int *) &g, 4, (unsigned int *) &b, 0);
+  check ((unsigned int *) &c, (unsigned int *) &h, 8, (unsigned int *) &d, 0);
+  check ((unsigned int *) &e, (unsigned int *) &j, 16, (unsigned int *) &f, 0);
+  g = a;
+  h = c;
+  j = e;
+  f4 (5);
+  f8 (5);
+  f12 (5);
+  check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 5);
+  check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 5);
+  check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 5);
+  return 0;
+}