[rtl,i386] vec_merge simplification

Submitted by Marc Glisse on March 17, 2013, 1:35 a.m.

Details

Message ID alpine.DEB.2.02.1303170103230.12359@stedding.saclay.inria.fr
State New
Headers show

Commit Message

Marc Glisse March 17, 2013, 1:35 a.m.
Hello,

simplify-rtx had almost no simplification involving vec_merge, so I am 
adding a couple. I purposely did not optimize (vec_merge (vec_merge a b 
m1) b m2) to (vec_merge a b m1&m2) because m1&m2 might be a more 
complicated pattern than m1 and m2 and I don't know if that is safe (I can 
add it later if you think it is).

I had to tweak an x86 testcase that was optimized to nothing thanks to 
this patch.

That was my first time using avoid_constant_pool_reference. IIUC it is 
nice to call it before testing if something is a constant, but it is 
better to reuse the original rtx if we reuse this operand. If not, I 
probably got the patch wrong.

int is getting small to store one bit per vector element (V32QI...) so I 
switched to hwint after checking that Zadeck's patches don't touch this.

Bootstrap + testsuite on x86_64-linux-gnu.

2013-03-17  Marc Glisse  <marc.glisse@inria.fr>

gcc/
 	* simplify-rtx.c (simplify_binary_operation_1) <VEC_SELECT>:
 	Handle VEC_MERGE.
 	(simplify_ternary_operation) <VEC_MERGE>: Handle nested VEC_MERGE.
 	Handle equal arguments.

gcc/testsuite/
 	* gcc.target/i386/merge-1.c: New testcase.
 	* gcc.target/i386/avx2-vpblendd128-1.c: Make it non-trivial.

Comments

Eric Botcazou March 27, 2013, 9:46 a.m.
> int is getting small to store one bit per vector element (V32QI...) so I
> switched to hwint after checking that Zadeck's patches don't touch this.

unsigned HOST_WIDE_INT is indeed the correct type to use for mask manipulation 
but please use UINTVAL instead of INTVAL with it.  And:

+	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1 << n_elts) - 1;

can be flagged as relying on undefined behavior (we fixed a bunch of cases a 
couple of years ago) so use:

  unsigned HOST_WIDE_INT mask = ((unsigned HOST_WIDE_INT) 1 << n_elts) - 1;

> Bootstrap + testsuite on x86_64-linux-gnu.
> 
> 2013-03-17  Marc Glisse  <marc.glisse@inria.fr>
> 
> gcc/
>  	* simplify-rtx.c (simplify_binary_operation_1) <VEC_SELECT>:
>  	Handle VEC_MERGE.
>  	(simplify_ternary_operation) <VEC_MERGE>: Handle nested VEC_MERGE.
>  	Handle equal arguments.

OK, modulo a few nits:

 - in simplify_binary_operation_1, the notion of left and right is a bit 
elusive, so I'd use all_operand0 and all_operand1 instead.  And I'd use the 
same idiom as in simplify_ternary_operation: (sel & (1 << UINTVAL (j))).

 - in simplify_ternary_operation, we probably need to test that op0 doesn't 
have side-effects too before dropping one of the copies, as VEC_MERGE is 
supposed to evaluate its 2 arguments I think.
Marc Glisse April 1, 2013, 7:26 a.m.
On Wed, 27 Mar 2013, Eric Botcazou wrote:

>> int is getting small to store one bit per vector element (V32QI...) so I
>> switched to hwint after checking that Zadeck's patches don't touch this.
>
> unsigned HOST_WIDE_INT is indeed the correct type to use for mask manipulation
> but please use UINTVAL instead of INTVAL with it.  And:
>
> +	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1 << n_elts) - 1;
>
> can be flagged as relying on undefined behavior (we fixed a bunch of cases a
> couple of years ago) so use:
>
>  unsigned HOST_WIDE_INT mask = ((unsigned HOST_WIDE_INT) 1 << n_elts) - 1;

By the way, shouldn't this be:

unsigned HOST_WIDE_INT mask = ((unsigned HOST_WIDE_INT) 2 << (n_elts - 1)) - 1;

so it doesn't cause undefined behavior for V64QI?
Eric Botcazou April 2, 2013, 7:53 a.m.
> By the way, shouldn't this be:
> 
> unsigned HOST_WIDE_INT mask = ((unsigned HOST_WIDE_INT) 2 << (n_elts - 1)) -
> 1;
> 
> so it doesn't cause undefined behavior for V64QI?

You're right, but I think that we'd rather write:

  if (n_elts == HOST_BITS_PER_WIDE_INT)
    mask = -1;
  else
    mask = ((unsigned HOST_WIDE_INT) 1 << n_elts) - 1;

in that case.

Patch hide | download patch | download mbox

Index: gcc/simplify-rtx.c
===================================================================
--- gcc/simplify-rtx.c	(revision 196702)
+++ gcc/simplify-rtx.c	(working copy)
@@ -3553,20 +3553,45 @@  simplify_binary_operation_1 (enum rtx_co
 		  offset -= vec_size;
 		  vec = XEXP (vec, 1);
 		}
 	      vec = avoid_constant_pool_reference (vec);
 	    }
 
 	  if (GET_MODE (vec) == mode)
 	    return vec;
 	}
 
+      /* If we select elements in a vec_merge that all come from the same
+	 operand, select from that operand directly.  */
+      if (GET_CODE (op0) == VEC_MERGE)
+	{
+	  rtx trueop02 = avoid_constant_pool_reference (XEXP (op0, 2));
+	  if (CONST_INT_P (trueop02))
+	    {
+	      unsigned HOST_WIDE_INT sel = INTVAL (trueop02);
+	      bool all_left = true;
+	      bool all_right = true;
+	      for (int i = 0; i < XVECLEN (trueop1, 0); i++)
+		{
+		  rtx j = XVECEXP (trueop1, 0, i);
+		  if ((sel >> INTVAL (j)) & 1)
+		    all_right = false;
+		  else
+		    all_left = false;
+		}
+	      if (all_left)
+		return simplify_gen_binary (VEC_SELECT, mode, XEXP (op0, 0), op1);
+	      if (all_right)
+		return simplify_gen_binary (VEC_SELECT, mode, XEXP (op0, 1), op1);
+	    }
+	}
+
       return 0;
     case VEC_CONCAT:
       {
 	enum machine_mode op0_mode = (GET_MODE (trueop0) != VOIDmode
 				      ? GET_MODE (trueop0)
 				      : GET_MODE_INNER (mode));
 	enum machine_mode op1_mode = (GET_MODE (trueop1) != VOIDmode
 				      ? GET_MODE (trueop1)
 				      : GET_MODE_INNER (mode));
 
@@ -5214,21 +5239,21 @@  simplify_const_relational_operation (enu
    OP0, OP1, and OP2.  OP0_MODE was the mode of OP0 before it became
    a constant.  Return 0 if no simplifications is possible.  */
 
 rtx
 simplify_ternary_operation (enum rtx_code code, enum machine_mode mode,
 			    enum machine_mode op0_mode, rtx op0, rtx op1,
 			    rtx op2)
 {
   unsigned int width = GET_MODE_PRECISION (mode);
   bool any_change = false;
-  rtx tem;
+  rtx tem, trueop2;
 
   /* VOIDmode means "infinite" precision.  */
   if (width == 0)
     width = HOST_BITS_PER_WIDE_INT;
 
   switch (code)
     {
     case FMA:
       /* Simplify negations around the multiplication.  */
       /* -a * -b + c  =>  a * b + c.  */
@@ -5360,47 +5385,83 @@  simplify_ternary_operation (enum rtx_cod
 	      else if (temp)
 	        return gen_rtx_IF_THEN_ELSE (mode, temp, op1, op2);
 	    }
 	}
       break;
 
     case VEC_MERGE:
       gcc_assert (GET_MODE (op0) == mode);
       gcc_assert (GET_MODE (op1) == mode);
       gcc_assert (VECTOR_MODE_P (mode));
-      op2 = avoid_constant_pool_reference (op2);
-      if (CONST_INT_P (op2))
+      trueop2 = avoid_constant_pool_reference (op2);
+      if (CONST_INT_P (trueop2))
 	{
           int elt_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
 	  unsigned n_elts = (GET_MODE_SIZE (mode) / elt_size);
-	  int mask = (1 << n_elts) - 1;
+	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1 << n_elts) - 1;
+	  unsigned HOST_WIDE_INT sel = INTVAL (trueop2);
 
-	  if (!(INTVAL (op2) & mask))
+	  if (!(sel & mask))
 	    return op1;
-	  if ((INTVAL (op2) & mask) == mask)
+	  if ((sel & mask) == mask)
 	    return op0;
 
-	  op0 = avoid_constant_pool_reference (op0);
-	  op1 = avoid_constant_pool_reference (op1);
-	  if (GET_CODE (op0) == CONST_VECTOR
-	      && GET_CODE (op1) == CONST_VECTOR)
+	  rtx trueop0 = avoid_constant_pool_reference (op0);
+	  rtx trueop1 = avoid_constant_pool_reference (op1);
+	  if (GET_CODE (trueop0) == CONST_VECTOR
+	      && GET_CODE (trueop1) == CONST_VECTOR)
 	    {
 	      rtvec v = rtvec_alloc (n_elts);
 	      unsigned int i;
 
 	      for (i = 0; i < n_elts; i++)
-		RTVEC_ELT (v, i) = (INTVAL (op2) & (1 << i)
-				    ? CONST_VECTOR_ELT (op0, i)
-				    : CONST_VECTOR_ELT (op1, i));
+		RTVEC_ELT (v, i) = (sel & (1 << i)
+				    ? CONST_VECTOR_ELT (trueop0, i)
+				    : CONST_VECTOR_ELT (trueop1, i));
 	      return gen_rtx_CONST_VECTOR (mode, v);
 	    }
+
+	  /* Replace (vec_merge (vec_merge a b m) c n) with (vec_merge b c n)
+	     if no element from a appears in the result.  */
+	  if (GET_CODE (op0) == VEC_MERGE)
+	    {
+	      tem = avoid_constant_pool_reference (XEXP (op0, 2));
+	      if (CONST_INT_P (tem))
+		{
+		  unsigned HOST_WIDE_INT sel0 = INTVAL (tem);
+		  if ((sel & sel0 & mask) == 0)
+		    return simplify_gen_ternary (code, mode, mode,
+						 XEXP (op0, 1), op1, op2);
+		  if ((sel & ~sel0 & mask) == 0)
+		    return simplify_gen_ternary (code, mode, mode,
+						 XEXP (op0, 0), op1, op2);
+		}
+	    }
+	  if (GET_CODE (op1) == VEC_MERGE)
+	    {
+	      tem = avoid_constant_pool_reference (XEXP (op1, 2));
+	      if (CONST_INT_P (tem))
+		{
+		  unsigned HOST_WIDE_INT sel1 = INTVAL (tem);
+		  if ((~sel & sel1 & mask) == 0)
+		    return simplify_gen_ternary (code, mode, mode,
+						 op0, XEXP (op1, 1), op2);
+		  if ((~sel & ~sel1 & mask) == 0)
+		    return simplify_gen_ternary (code, mode, mode,
+						 op0, XEXP (op1, 0), op2);
+		}
+	    }
 	}
+
+      if (rtx_equal_p (op0, op1) && !side_effects_p (op2))
+	return op0;
+
       break;
 
     default:
       gcc_unreachable ();
     }
 
   return 0;
 }
 
 /* Evaluate a SUBREG of a CONST_INT or CONST_DOUBLE or CONST_FIXED
Index: gcc/testsuite/gcc.target/i386/merge-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/merge-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/merge-1.c	(revision 0)
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O1 -msse2" } */
+
+#include <x86intrin.h>
+
+void
+f (double *r, __m128d x, __m128d y, __m128d z)
+{
+  __m128d t=_mm_move_sd(x,y);
+  __m128d u=_mm_move_sd(t,z);
+  *r = u[0];
+}
+
+__m128d
+g(__m128d x, __m128d y, __m128d z)
+{
+  __m128d t=_mm_move_sd(x,y);
+  __m128d u=_mm_move_sd(t,z);
+  return u;
+}
+
+/* { dg-final { scan-assembler-times "movsd" 1 } } */

Property changes on: gcc/testsuite/gcc.target/i386/merge-1.c
___________________________________________________________________
Added: svn:eol-style
   + native
Added: svn:keywords
   + Author Date Id Revision URL

Index: gcc/testsuite/gcc.target/i386/avx2-vpblendd128-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx2-vpblendd128-1.c	(revision 196702)
+++ gcc/testsuite/gcc.target/i386/avx2-vpblendd128-1.c	(working copy)
@@ -1,13 +1,14 @@ 
 /* { dg-do compile } */
 /* { dg-options "-mavx2 -O2" } */
 /* { dg-final { scan-assembler "vpblendd\[ \\t\]+\[^\n\]*" } } */
 
 #include <immintrin.h>
 
 __m128i x;
+__m128i y;
 
 void extern
 avx2_test (void)
 {
-  x = _mm_blend_epi32 (x, x, 13);
+  x = _mm_blend_epi32 (x, y, 13);
 }