Patchwork Simplify a VEC_SELECT from one half of a VEC_CONCAT

login
register
mail settings
Submitter Marc Glisse
Date Dec. 2, 2012, 5:39 p.m.
Message ID <alpine.DEB.2.02.1212021809090.504@stedding.saclay.inria.fr>
Download mbox | patch
Permalink /patch/203246/
State New
Headers show

Comments

Marc Glisse - Dec. 2, 2012, 5:39 p.m.
While I was there, I decided to improve another bit of VEC_SELECT
simplification (this supersedes the patch in the father of this message).

Adding Uros in Cc: because I am touching the x86 backend. Sorry to add yet 
another insn variant, but I don't see how we can avoid it here.

The x86 and RTL parts are independent, but the testcases need both.

bootstrap+testsuite on x86_64-linux. (for next stage1 I assume)

2012-12-02  Marc Glisse  <marc.glisse@inria.fr>

 	PR target/43147
 	PR target/44551
gcc/
 	* simplify-rtx.c (simplify_binary_operation_1) <VEC_SELECT>:
 	Improve VEC_SELECT and VEC_CONCAT subcases.
 	* config/i386/sse.md (*sse_shufps_<mode>_single): New.

gcc/testsuite/
 	* gcc.target/i386/pr43147.c: New testcase.
 	* gcc.target/i386/pr44551.c: New testcase.
Uros Bizjak - Dec. 2, 2012, 7:35 p.m.
On Sun, Dec 2, 2012 at 6:39 PM, Marc Glisse <marc.glisse@inria.fr> wrote:

> While I was there, I decided to improve another bit of VEC_SELECT
> simplification (this supersedes the patch in the father of this message).
>
> Adding Uros in Cc: because I am touching the x86 backend. Sorry to add yet
> another insn variant, but I don't see how we can avoid it here.
>
> The x86 and RTL parts are independent, but the testcases need both.
>
> bootstrap+testsuite on x86_64-linux. (for next stage1 I assume)
>
> 2012-12-02  Marc Glisse  <marc.glisse@inria.fr>
>
>         PR target/43147
>         PR target/44551
> gcc/
>         * simplify-rtx.c (simplify_binary_operation_1) <VEC_SELECT>:
>         Improve VEC_SELECT and VEC_CONCAT subcases.
>         * config/i386/sse.md (*sse_shufps_<mode>_single): New.
>
> gcc/testsuite/
>         * gcc.target/i386/pr43147.c: New testcase.
>         * gcc.target/i386/pr44551.c: New testcase.

x86 part and testcases are OK, together with RTL part. I will leave to
RTL maintainer if the combined patch is still OK for 4.8.

Thanks,
Uros.

Patch

Index: simplify-rtx.c

===================================================================
--- simplify-rtx.c	(revision 194051)

+++ simplify-rtx.c	(working copy)

@@ -3317,98 +3317,20 @@  simplify_binary_operation_1 (enum rtx_co

 	  gcc_assert (VECTOR_MODE_P (GET_MODE (trueop0)));
 	  gcc_assert (mode == GET_MODE_INNER (GET_MODE (trueop0)));
 	  gcc_assert (GET_CODE (trueop1) == PARALLEL);
 	  gcc_assert (XVECLEN (trueop1, 0) == 1);
 	  gcc_assert (CONST_INT_P (XVECEXP (trueop1, 0, 0)));
 
 	  if (GET_CODE (trueop0) == CONST_VECTOR)
 	    return CONST_VECTOR_ELT (trueop0, INTVAL (XVECEXP
 						      (trueop1, 0, 0)));
 
-	  /* Extract a scalar element from a nested VEC_SELECT expression

-	     (with optional nested VEC_CONCAT expression).  Some targets

-	     (i386) extract scalar element from a vector using chain of

-	     nested VEC_SELECT expressions.  When input operand is a memory

-	     operand, this operation can be simplified to a simple scalar

-	     load from an offseted memory address.  */

-	  if (GET_CODE (trueop0) == VEC_SELECT)

-	    {

-	      rtx op0 = XEXP (trueop0, 0);

-	      rtx op1 = XEXP (trueop0, 1);

-

-	      enum machine_mode opmode = GET_MODE (op0);

-	      int elt_size = GET_MODE_SIZE (GET_MODE_INNER (opmode));

-	      int n_elts = GET_MODE_SIZE (opmode) / elt_size;

-

-	      int i = INTVAL (XVECEXP (trueop1, 0, 0));

-	      int elem;

-

-	      rtvec vec;

-	      rtx tmp_op, tmp;

-

-	      gcc_assert (GET_CODE (op1) == PARALLEL);

-	      gcc_assert (i < n_elts);

-

-	      /* Select element, pointed by nested selector.  */

-	      elem = INTVAL (XVECEXP (op1, 0, i));

-

-	      /* Handle the case when nested VEC_SELECT wraps VEC_CONCAT.  */

-	      if (GET_CODE (op0) == VEC_CONCAT)

-		{

-		  rtx op00 = XEXP (op0, 0);

-		  rtx op01 = XEXP (op0, 1);

-

-		  enum machine_mode mode00, mode01;

-		  int n_elts00, n_elts01;

-

-		  mode00 = GET_MODE (op00);

-		  mode01 = GET_MODE (op01);

-

-		  /* Find out number of elements of each operand.  */

-		  if (VECTOR_MODE_P (mode00))

-		    {

-		      elt_size = GET_MODE_SIZE (GET_MODE_INNER (mode00));

-		      n_elts00 = GET_MODE_SIZE (mode00) / elt_size;

-		    }

-		  else

-		    n_elts00 = 1;

-

-		  if (VECTOR_MODE_P (mode01))

-		    {

-		      elt_size = GET_MODE_SIZE (GET_MODE_INNER (mode01));

-		      n_elts01 = GET_MODE_SIZE (mode01) / elt_size;

-		    }

-		  else

-		    n_elts01 = 1;

-

-		  gcc_assert (n_elts == n_elts00 + n_elts01);

-

-		  /* Select correct operand of VEC_CONCAT

-		     and adjust selector. */

-		  if (elem < n_elts01)

-		    tmp_op = op00;

-		  else

-		    {

-		      tmp_op = op01;

-		      elem -= n_elts00;

-		    }

-		}

-	      else

-		tmp_op = op0;

-

-	      vec = rtvec_alloc (1);

-	      RTVEC_ELT (vec, 0) = GEN_INT (elem);

-

-	      tmp = gen_rtx_fmt_ee (code, mode,

-				    tmp_op, gen_rtx_PARALLEL (VOIDmode, vec));

-	      return tmp;

-	    }

 	  if (GET_CODE (trueop0) == VEC_DUPLICATE
 	      && GET_MODE (XEXP (trueop0, 0)) == mode)
 	    return XEXP (trueop0, 0);
 	}
       else
 	{
 	  gcc_assert (VECTOR_MODE_P (GET_MODE (trueop0)));
 	  gcc_assert (GET_MODE_INNER (mode)
 		      == GET_MODE_INNER (GET_MODE (trueop0)));
 	  gcc_assert (GET_CODE (trueop1) == PARALLEL);
@@ -3482,44 +3404,96 @@  simplify_binary_operation_1 (enum rtx_co

 	      rtx subop0, subop1;
 
 	      gcc_assert (i0 < 2 && i1 < 2);
 	      subop0 = XEXP (trueop0, i0);
 	      subop1 = XEXP (trueop0, i1);
 
 	      return simplify_gen_binary (VEC_CONCAT, mode, subop0, subop1);
 	    }
 	}
 
-      if (XVECLEN (trueop1, 0) == 1

-	  && CONST_INT_P (XVECEXP (trueop1, 0, 0))

-	  && GET_CODE (trueop0) == VEC_CONCAT)

+      /* Look through nested VEC_SELECTs.  */

+      if (GET_CODE (trueop0) == VEC_SELECT)

 	{
-	  rtx vec = trueop0;

-	  int offset = INTVAL (XVECEXP (trueop1, 0, 0)) * GET_MODE_SIZE (mode);

+	  int len = XVECLEN (trueop1, 0);

+	  rtvec vec = rtvec_alloc (len);

+	  for (int i = 0; i < len; i++)

+	    {

+	      int j = INTVAL (XVECEXP (trueop1, 0, i));

+	      RTVEC_ELT (vec, i) = XVECEXP (XEXP (trueop0, 1), 0, j);

+	    }

+	  rtx new_op0 = XEXP (trueop0, 0);

+	  rtx new_op1 = gen_rtx_PARALLEL (VOIDmode, vec);

+	  return simplify_gen_binary (VEC_SELECT, mode, new_op0, new_op1);

+	}

 
-	  /* Try to find the element in the VEC_CONCAT.  */

-	  while (GET_MODE (vec) != mode

-		 && GET_CODE (vec) == VEC_CONCAT)

-	    {

-	      HOST_WIDE_INT vec_size = GET_MODE_SIZE (GET_MODE (XEXP (vec, 0)));

-	      if (offset < vec_size)

-		vec = XEXP (vec, 0);

+      /* Detect if all the elements come from the same subpart of a concat.  */

+      if (GET_CODE (trueop0) == VEC_CONCAT)

+	{

+	  rtx new_op0 = NULL_RTX;

+	  rtx new_op1 = NULL_RTX;

+	  int first = 0;

+	  int second = 0;

+	  unsigned nelts_first_half = 1;

+	  enum machine_mode mode_first_half = GET_MODE (XEXP (trueop0, 0));

+	  if (VECTOR_MODE_P (mode_first_half))

+	    {

+	      int elt_size = GET_MODE_SIZE (GET_MODE_INNER (mode_first_half));

+	      nelts_first_half = (GET_MODE_SIZE (mode_first_half) / elt_size);

+	    }

+

+	  for (int i = 0; i < XVECLEN (trueop1, 0); i++)

+	    {

+	      rtx j = XVECEXP (trueop1, 0, i);

+	      if (!CONST_INT_P (j))

+		{

+		  first++;

+		  second++;

+		  break;

+		}

+	      if (INTVAL (j) < nelts_first_half)

+		first++;

 	      else
+		second++;

+	    }

+

+	  if (second == 0)

+	    {

+	      new_op0 = XEXP (trueop0, 0);

+	      new_op1 = trueop1;

+	    }

+	  else if (first == 0

+		   || rtx_equal_p (XEXP (trueop0, 0), XEXP (trueop0, 1)))

+	    {

+	      int len = XVECLEN (trueop1, 0);

+	      rtvec vec = rtvec_alloc (len);

+	      for (int i = 0; i < len; i++)

 		{
-		  offset -= vec_size;

-		  vec = XEXP (vec, 1);

+		  /* All vectors have a power-of-2 size, so both halves of a

+		     VEC_CONCAT must have the same size and using '%' instead

+		     of '-' is safe.  */

+		  int j = INTVAL (XVECEXP (trueop1, 0, i)) % nelts_first_half;

+		  RTVEC_ELT (vec, i) = GEN_INT (j);

 		}
-	      vec = avoid_constant_pool_reference (vec);

+	      new_op0 = XEXP (trueop0, 1);

+	      new_op1 = gen_rtx_PARALLEL (VOIDmode, vec);

 	    }
 
-	  if (GET_MODE (vec) == mode)

-	    return vec;

+	  if (new_op0)

+	    {

+	      if (VECTOR_MODE_P (GET_MODE (new_op0)))

+		return simplify_gen_binary (VEC_SELECT, mode, new_op0, new_op1);

+	      if (VECTOR_MODE_P (mode))

+		return simplify_gen_unary (VEC_DUPLICATE, mode, new_op0,

+					   GET_MODE (new_op0));

+	      return new_op0;

+	    }

 	}
 
       return 0;
     case VEC_CONCAT:
       {
 	enum machine_mode op0_mode = (GET_MODE (trueop0) != VOIDmode
 				      ? GET_MODE (trueop0)
 				      : GET_MODE_INNER (mode));
 	enum machine_mode op1_mode = (GET_MODE (trueop1) != VOIDmode
 				      ? GET_MODE (trueop1)
Index: testsuite/gcc.target/i386/pr44551.c

===================================================================
--- testsuite/gcc.target/i386/pr44551.c	(revision 0)

+++ testsuite/gcc.target/i386/pr44551.c	(revision 0)

@@ -0,0 +1,15 @@ 

+/* { dg-do compile } */

+/* { dg-options "-O -mavx" } */

+

+#include <immintrin.h>

+

+__m128i

+foo (__m256i x, __m128i y)

+{

+  __m256i r = _mm256_insertf128_si256(x, y, 1);

+  __m128i a = _mm256_extractf128_si256(r, 1);

+  return a;

+}

+

+/* { dg-final { scan-assembler-not "insert" } } */

+/* { dg-final { scan-assembler-not "extract" } } */


Property changes on: testsuite/gcc.target/i386/pr44551.c
___________________________________________________________________
Added: svn:keywords
   + Author Date Id Revision URL
Added: svn:eol-style
   + native

Index: testsuite/gcc.target/i386/pr43147.c

===================================================================
--- testsuite/gcc.target/i386/pr43147.c	(revision 0)

+++ testsuite/gcc.target/i386/pr43147.c	(revision 0)

@@ -0,0 +1,13 @@ 

+/* { dg-do compile } */

+/* { dg-options "-O -msse" } */

+

+#include <x86intrin.h>

+

+__m128 f(__m128 m)

+{

+        m = _mm_shuffle_ps(m, m, 0xC9);

+        m = _mm_shuffle_ps(m, m, 0x2D);

+        return m;

+}

+

+/* { dg-final { scan-assembler-times "shufps" 1 } } */


Property changes on: testsuite/gcc.target/i386/pr43147.c
___________________________________________________________________
Added: svn:eol-style
   + native
Added: svn:keywords
   + Author Date Id Revision URL

Index: config/i386/sse.md

===================================================================
--- config/i386/sse.md	(revision 194051)

+++ config/i386/sse.md	(working copy)

@@ -3900,20 +3900,53 @@ 

     default:
       gcc_unreachable ();
     }
 }
   [(set_attr "isa" "noavx,avx")
    (set_attr "type" "sseshuf")
    (set_attr "length_immediate" "1")
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "V4SF")])
 
+(define_insn "*sse_shufps_<mode>_single"

+  [(set (match_operand:VI4F_128 0 "register_operand" "=x,x")

+	(vec_select:VI4F_128

+	  (match_operand:VI4F_128 1 "register_operand" "0,x")

+	  (parallel [(match_operand 2 "const_0_to_3_operand")

+		     (match_operand 3 "const_0_to_3_operand")

+		     (match_operand 4 "const_0_to_3_operand")

+		     (match_operand 5 "const_0_to_3_operand")])))]

+  "TARGET_SSE"

+{

+  int mask = 0;

+  mask |= INTVAL (operands[2]) << 0;

+  mask |= INTVAL (operands[3]) << 2;

+  mask |= INTVAL (operands[4]) << 4;

+  mask |= INTVAL (operands[5]) << 6;

+  operands[2] = GEN_INT (mask);

+

+  switch (which_alternative)

+    {

+    case 0:

+      return "shufps\t{%2, %0, %0|%0, %0, %2}";

+    case 1:

+      return "vshufps\t{%2, %1, %1, %0|%0, %1, %1, %2}";

+    default:

+      gcc_unreachable ();

+    }

+}

+  [(set_attr "isa" "noavx,avx")

+   (set_attr "type" "sseshuf1")

+   (set_attr "length_immediate" "1")

+   (set_attr "prefix" "orig,vex")

+   (set_attr "mode" "V4SF")])

+

 (define_insn "sse_storehps"
   [(set (match_operand:V2SF 0 "nonimmediate_operand" "=m,x,x")
 	(vec_select:V2SF
 	  (match_operand:V4SF 1 "nonimmediate_operand" "x,x,o")
 	  (parallel [(const_int 2) (const_int 3)])))]
   "TARGET_SSE"
   "@
    %vmovhps\t{%1, %0|%0, %1}
    %vmovhlps\t{%1, %d0|%d0, %1}
    %vmovlps\t{%H1, %d0|%d0, %H1}"