Patchwork [rtl,i386] vec_merge simplification

login
register
mail settings
Submitter Marc Glisse
Date March 17, 2013, 1:35 a.m.
Message ID <alpine.DEB.2.02.1303170103230.12359@stedding.saclay.inria.fr>
Download mbox | patch
Permalink /patch/228258/
State New
Headers show

Comments

Marc Glisse - March 17, 2013, 1:35 a.m.
Hello,

simplify-rtx had almost no simplification involving vec_merge, so I am 
adding a couple. I purposely did not optimize (vec_merge (vec_merge a b 
m1) b m2) to (vec_merge a b m1&m2) because m1&m2 might be a more 
complicated pattern than m1 and m2 and I don't know if that is safe (I can 
add it later if you think it is).

I had to tweak an x86 testcase that was optimized to nothing thanks to 
this patch.

That was my first time using avoid_constant_pool_reference. IIUC it is 
nice to call it before testing if something is a constant, but it is 
better to reuse the original rtx if we reuse this operand. If not, I 
probably got the patch wrong.

int is getting small to store one bit per vector element (V32QI...) so I 
switched to hwint after checking that Zadeck's patches don't touch this.

Bootstrap + testsuite on x86_64-linux-gnu.

2013-03-17  Marc Glisse  <marc.glisse@inria.fr>

gcc/
 	* simplify-rtx.c (simplify_binary_operation_1) <VEC_SELECT>:
 	Handle VEC_MERGE.
 	(simplify_ternary_operation) <VEC_MERGE>: Handle nested VEC_MERGE.
 	Handle equal arguments.

gcc/testsuite/
 	* gcc.target/i386/merge-1.c: New testcase.
 	* gcc.target/i386/avx2-vpblendd128-1.c: Make it non-trivial.
Eric Botcazou - March 27, 2013, 9:46 a.m.
> int is getting small to store one bit per vector element (V32QI...) so I
> switched to hwint after checking that Zadeck's patches don't touch this.

unsigned HOST_WIDE_INT is indeed the correct type to use for mask manipulation 
but please use UINTVAL instead of INTVAL with it.  And:

+	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1 << n_elts) - 1;

can be flagged as relying on undefined behavior (we fixed a bunch of cases a 
couple of years ago) so use:

  unsigned HOST_WIDE_INT mask = ((unsigned HOST_WIDE_INT) 1 << n_elts) - 1;

> Bootstrap + testsuite on x86_64-linux-gnu.
> 
> 2013-03-17  Marc Glisse  <marc.glisse@inria.fr>
> 
> gcc/
>  	* simplify-rtx.c (simplify_binary_operation_1) <VEC_SELECT>:
>  	Handle VEC_MERGE.
>  	(simplify_ternary_operation) <VEC_MERGE>: Handle nested VEC_MERGE.
>  	Handle equal arguments.

OK, modulo a few nits:

 - in simplify_binary_operation_1, the notion of left and right is a bit 
elusive, so I'd use all_operand0 and all_operand1 instead.  And I'd use the 
same idiom as in simplify_ternary_operation: (sel & (1 << UINTVAL (j))).

 - in simplify_ternary_operation, we probably need to test that op0 doesn't 
have side-effects too before dropping one of the copies, as VEC_MERGE is 
supposed to evaluate its 2 arguments I think.
Marc Glisse - April 1, 2013, 7:26 a.m.
On Wed, 27 Mar 2013, Eric Botcazou wrote:

>> int is getting small to store one bit per vector element (V32QI...) so I
>> switched to hwint after checking that Zadeck's patches don't touch this.
>
> unsigned HOST_WIDE_INT is indeed the correct type to use for mask manipulation
> but please use UINTVAL instead of INTVAL with it.  And:
>
> +	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1 << n_elts) - 1;
>
> can be flagged as relying on undefined behavior (we fixed a bunch of cases a
> couple of years ago) so use:
>
>  unsigned HOST_WIDE_INT mask = ((unsigned HOST_WIDE_INT) 1 << n_elts) - 1;

By the way, shouldn't this be:

unsigned HOST_WIDE_INT mask = ((unsigned HOST_WIDE_INT) 2 << (n_elts - 1)) - 1;

so it doesn't cause undefined behavior for V64QI?
Eric Botcazou - April 2, 2013, 7:53 a.m.
> By the way, shouldn't this be:
> 
> unsigned HOST_WIDE_INT mask = ((unsigned HOST_WIDE_INT) 2 << (n_elts - 1)) -
> 1;
> 
> so it doesn't cause undefined behavior for V64QI?

You're right, but I think that we'd rather write:

  if (n_elts == HOST_BITS_PER_WIDE_INT)
    mask = -1;
  else
    mask = ((unsigned HOST_WIDE_INT) 1 << n_elts) - 1;

in that case.

Patch

Index: gcc/simplify-rtx.c

===================================================================
--- gcc/simplify-rtx.c	(revision 196702)

+++ gcc/simplify-rtx.c	(working copy)

@@ -3553,20 +3553,45 @@  simplify_binary_operation_1 (enum rtx_co

 		  offset -= vec_size;
 		  vec = XEXP (vec, 1);
 		}
 	      vec = avoid_constant_pool_reference (vec);
 	    }
 
 	  if (GET_MODE (vec) == mode)
 	    return vec;
 	}
 
+      /* If we select elements in a vec_merge that all come from the same

+	 operand, select from that operand directly.  */

+      if (GET_CODE (op0) == VEC_MERGE)

+	{

+	  rtx trueop02 = avoid_constant_pool_reference (XEXP (op0, 2));

+	  if (CONST_INT_P (trueop02))

+	    {

+	      unsigned HOST_WIDE_INT sel = INTVAL (trueop02);

+	      bool all_left = true;

+	      bool all_right = true;

+	      for (int i = 0; i < XVECLEN (trueop1, 0); i++)

+		{

+		  rtx j = XVECEXP (trueop1, 0, i);

+		  if ((sel >> INTVAL (j)) & 1)

+		    all_right = false;

+		  else

+		    all_left = false;

+		}

+	      if (all_left)

+		return simplify_gen_binary (VEC_SELECT, mode, XEXP (op0, 0), op1);

+	      if (all_right)

+		return simplify_gen_binary (VEC_SELECT, mode, XEXP (op0, 1), op1);

+	    }

+	}

+

       return 0;
     case VEC_CONCAT:
       {
 	enum machine_mode op0_mode = (GET_MODE (trueop0) != VOIDmode
 				      ? GET_MODE (trueop0)
 				      : GET_MODE_INNER (mode));
 	enum machine_mode op1_mode = (GET_MODE (trueop1) != VOIDmode
 				      ? GET_MODE (trueop1)
 				      : GET_MODE_INNER (mode));
 
@@ -5214,21 +5239,21 @@  simplify_const_relational_operation (enu

    OP0, OP1, and OP2.  OP0_MODE was the mode of OP0 before it became
    a constant.  Return 0 if no simplifications is possible.  */
 
 rtx
 simplify_ternary_operation (enum rtx_code code, enum machine_mode mode,
 			    enum machine_mode op0_mode, rtx op0, rtx op1,
 			    rtx op2)
 {
   unsigned int width = GET_MODE_PRECISION (mode);
   bool any_change = false;
-  rtx tem;

+  rtx tem, trueop2;

 
   /* VOIDmode means "infinite" precision.  */
   if (width == 0)
     width = HOST_BITS_PER_WIDE_INT;
 
   switch (code)
     {
     case FMA:
       /* Simplify negations around the multiplication.  */
       /* -a * -b + c  =>  a * b + c.  */
@@ -5360,47 +5385,83 @@  simplify_ternary_operation (enum rtx_cod

 	      else if (temp)
 	        return gen_rtx_IF_THEN_ELSE (mode, temp, op1, op2);
 	    }
 	}
       break;
 
     case VEC_MERGE:
       gcc_assert (GET_MODE (op0) == mode);
       gcc_assert (GET_MODE (op1) == mode);
       gcc_assert (VECTOR_MODE_P (mode));
-      op2 = avoid_constant_pool_reference (op2);

-      if (CONST_INT_P (op2))

+      trueop2 = avoid_constant_pool_reference (op2);

+      if (CONST_INT_P (trueop2))

 	{
           int elt_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
 	  unsigned n_elts = (GET_MODE_SIZE (mode) / elt_size);
-	  int mask = (1 << n_elts) - 1;

+	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1 << n_elts) - 1;

+	  unsigned HOST_WIDE_INT sel = INTVAL (trueop2);

 
-	  if (!(INTVAL (op2) & mask))

+	  if (!(sel & mask))

 	    return op1;
-	  if ((INTVAL (op2) & mask) == mask)

+	  if ((sel & mask) == mask)

 	    return op0;
 
-	  op0 = avoid_constant_pool_reference (op0);

-	  op1 = avoid_constant_pool_reference (op1);

-	  if (GET_CODE (op0) == CONST_VECTOR

-	      && GET_CODE (op1) == CONST_VECTOR)

+	  rtx trueop0 = avoid_constant_pool_reference (op0);

+	  rtx trueop1 = avoid_constant_pool_reference (op1);

+	  if (GET_CODE (trueop0) == CONST_VECTOR

+	      && GET_CODE (trueop1) == CONST_VECTOR)

 	    {
 	      rtvec v = rtvec_alloc (n_elts);
 	      unsigned int i;
 
 	      for (i = 0; i < n_elts; i++)
-		RTVEC_ELT (v, i) = (INTVAL (op2) & (1 << i)

-				    ? CONST_VECTOR_ELT (op0, i)

-				    : CONST_VECTOR_ELT (op1, i));

+		RTVEC_ELT (v, i) = (sel & (1 << i)

+				    ? CONST_VECTOR_ELT (trueop0, i)

+				    : CONST_VECTOR_ELT (trueop1, i));

 	      return gen_rtx_CONST_VECTOR (mode, v);
 	    }
+

+	  /* Replace (vec_merge (vec_merge a b m) c n) with (vec_merge b c n)

+	     if no element from a appears in the result.  */

+	  if (GET_CODE (op0) == VEC_MERGE)

+	    {

+	      tem = avoid_constant_pool_reference (XEXP (op0, 2));

+	      if (CONST_INT_P (tem))

+		{

+		  unsigned HOST_WIDE_INT sel0 = INTVAL (tem);

+		  if ((sel & sel0 & mask) == 0)

+		    return simplify_gen_ternary (code, mode, mode,

+						 XEXP (op0, 1), op1, op2);

+		  if ((sel & ~sel0 & mask) == 0)

+		    return simplify_gen_ternary (code, mode, mode,

+						 XEXP (op0, 0), op1, op2);

+		}

+	    }

+	  if (GET_CODE (op1) == VEC_MERGE)

+	    {

+	      tem = avoid_constant_pool_reference (XEXP (op1, 2));

+	      if (CONST_INT_P (tem))

+		{

+		  unsigned HOST_WIDE_INT sel1 = INTVAL (tem);

+		  if ((~sel & sel1 & mask) == 0)

+		    return simplify_gen_ternary (code, mode, mode,

+						 op0, XEXP (op1, 1), op2);

+		  if ((~sel & ~sel1 & mask) == 0)

+		    return simplify_gen_ternary (code, mode, mode,

+						 op0, XEXP (op1, 0), op2);

+		}

+	    }

 	}
+

+      if (rtx_equal_p (op0, op1) && !side_effects_p (op2))

+	return op0;

+

       break;
 
     default:
       gcc_unreachable ();
     }
 
   return 0;
 }
 
 /* Evaluate a SUBREG of a CONST_INT or CONST_DOUBLE or CONST_FIXED
Index: gcc/testsuite/gcc.target/i386/merge-1.c

===================================================================
--- gcc/testsuite/gcc.target/i386/merge-1.c	(revision 0)

+++ gcc/testsuite/gcc.target/i386/merge-1.c	(revision 0)

@@ -0,0 +1,22 @@ 

+/* { dg-do compile } */

+/* { dg-options "-O1 -msse2" } */

+

+#include <x86intrin.h>

+

+void

+f (double *r, __m128d x, __m128d y, __m128d z)

+{

+  __m128d t=_mm_move_sd(x,y);

+  __m128d u=_mm_move_sd(t,z);

+  *r = u[0];

+}

+

+__m128d

+g(__m128d x, __m128d y, __m128d z)

+{

+  __m128d t=_mm_move_sd(x,y);

+  __m128d u=_mm_move_sd(t,z);

+  return u;

+}

+

+/* { dg-final { scan-assembler-times "movsd" 1 } } */


Property changes on: gcc/testsuite/gcc.target/i386/merge-1.c
___________________________________________________________________
Added: svn:eol-style
   + native
Added: svn:keywords
   + Author Date Id Revision URL

Index: gcc/testsuite/gcc.target/i386/avx2-vpblendd128-1.c

===================================================================
--- gcc/testsuite/gcc.target/i386/avx2-vpblendd128-1.c	(revision 196702)

+++ gcc/testsuite/gcc.target/i386/avx2-vpblendd128-1.c	(working copy)

@@ -1,13 +1,14 @@ 

 /* { dg-do compile } */
 /* { dg-options "-mavx2 -O2" } */
 /* { dg-final { scan-assembler "vpblendd\[ \\t\]+\[^\n\]*" } } */
 
 #include <immintrin.h>
 
 __m128i x;
+__m128i y;

 
 void extern
 avx2_test (void)
 {
-  x = _mm_blend_epi32 (x, x, 13);

+  x = _mm_blend_epi32 (x, y, 13);

 }