diff mbox

[rs6000] Handle vec_extract and splat patterns in analyze_swaps

Message ID 1409804896.3163.55.camel@gnopaine
State New
Headers show

Commit Message

Bill Schmidt Sept. 4, 2014, 4:28 a.m. UTC
Hi,

This patch adds more special handling to analyze_swaps to allow us to
improve more computations.  Previously I had disallowed VEC_SELECT in
all cases.  This is now changed to allow a select of a single lane,
either for an extract operation or for a splat operation.  If a
computation containing such operations is optimized, the selected lane
is changed to count from the other end of the vector.  Several new tests
are added to check these opportunities are now exploited.

Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no
regressions.  Is this ok for trunk?

Thanks,
Bill


[gcc]

2014-09-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* config/rs6000/rs6000.c (special_handling_values): Add
	SH_EXTRACT.
	(rtx_is_swappable_p): Look for patterns with a VEC_SELECT, perhaps
	wrapped in a VEC_DUPLICATE, representing an extract.  Mark these
	as swappable with special handling SH_EXTRACT.  Remove
	UNSPEC_VSX_XXSPLTW from the list of disallowed unspecs for the
	optimization.
	(adjust_extract): New function.
	(handle_special_swappables): Add default to case statement; add
	case for SH_EXTRACT that calls adjust_extract.
	(dump_swap_insn_table): Handle SH_EXTRACT.

[gcc/testsuite]

2014-09-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* gcc.target/powerpc/swaps-p8-13.c: New test.
	* gcc.target/powerpc/swaps-p8-14.c: New test.
	* gcc.target/powerpc/swaps-p8-15.c: New test.

Comments

David Edelsohn Sept. 4, 2014, 2:01 p.m. UTC | #1
On Thu, Sep 4, 2014 at 12:28 AM, Bill Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> Hi,
>
> This patch adds more special handling to analyze_swaps to allow us to
> improve more computations.  Previously I had disallowed VEC_SELECT in
> all cases.  This is now changed to allow a select of a single lane,
> either for an extract operation or for a splat operation.  If a
> computation containing such operations is optimized, the selected lane
> is changed to count from the other end of the vector.  Several new tests
> are added to check these opportunities are now exploited.
>
> Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no
> regressions.  Is this ok for trunk?
>
> Thanks,
> Bill
>
>
> [gcc]
>
> 2014-09-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
>
>         * config/rs6000/rs6000.c (special_handling_values): Add
>         SH_EXTRACT.
>         (rtx_is_swappable_p): Look for patterns with a VEC_SELECT, perhaps
>         wrapped in a VEC_DUPLICATE, representing an extract.  Mark these
>         as swappable with special handling SH_EXTRACT.  Remove
>         UNSPEC_VSX_XXSPLTW from the list of disallowed unspecs for the
>         optimization.
>         (adjust_extract): New function.
>         (handle_special_swappables): Add default to case statement; add
>         case for SH_EXTRACT that calls adjust_extract.
>         (dump_swap_insn_table): Handle SH_EXTRACT.
>
> [gcc/testsuite]
>
> 2014-09-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
>
>         * gcc.target/powerpc/swaps-p8-13.c: New test.
>         * gcc.target/powerpc/swaps-p8-14.c: New test.
>         * gcc.target/powerpc/swaps-p8-15.c: New test.

Okay.

Thanks, David
diff mbox

Patch

Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 214879)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -33562,7 +33562,8 @@  enum special_handling_values {
   SH_CONST_VECTOR,
   SH_SUBREG,
   SH_NOSWAP_LD,
-  SH_NOSWAP_ST
+  SH_NOSWAP_ST,
+  SH_EXTRACT
 };
 
 /* Union INSN with all insns containing definitions that reach USE.
@@ -33704,6 +33705,7 @@  rtx_is_swappable_p (rtx op, unsigned int *special)
 {
   enum rtx_code code = GET_CODE (op);
   int i, j;
+  rtx parallel;
 
   switch (code)
     {
@@ -33714,7 +33716,6 @@  rtx_is_swappable_p (rtx op, unsigned int *special)
       return 1;
 
     case VEC_CONCAT:
-    case VEC_SELECT:
     case ASM_INPUT:
     case ASM_OPERANDS:
       return 0;
@@ -33732,9 +33733,31 @@  rtx_is_swappable_p (rtx op, unsigned int *special)
 	 handling.  */
       if (GET_CODE (XEXP (op, 0)) == CONST_INT)
 	return 1;
+      else if (GET_CODE (XEXP (op, 0)) == REG
+	       && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
+	/* This catches V2DF and V2DI splat, at a minimum.  */
+	return 1;
+      else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
+	/* If the duplicated item is from a select, defer to the select
+	   processing to see if we can change the lane for the splat.  */
+	return rtx_is_swappable_p (XEXP (op, 0), special);
       else
 	return 0;
 
+    case VEC_SELECT:
+      /* A vec_extract operation is ok if we change the lane.  */
+      if (GET_CODE (XEXP (op, 0)) == REG
+	  && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
+	  && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
+	  && XVECLEN (parallel, 0) == 1
+	  && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
+	{
+	  *special = SH_EXTRACT;
+	  return 1;
+	}
+      else
+	return 0;
+
     case UNSPEC:
       {
 	/* Various operations are unsafe for this optimization, at least
@@ -33777,7 +33800,6 @@  rtx_is_swappable_p (rtx op, unsigned int *special)
 	    || val == UNSPEC_VSX_CVSPDPN
 	    || val == UNSPEC_VSX_SET
 	    || val == UNSPEC_VSX_SLDWI
-	    || val == UNSPEC_VSX_XXSPLTW
 	    || val == UNSPEC_VUNPACK_HI_SIGN
 	    || val == UNSPEC_VUNPACK_HI_SIGN_DIRECT
 	    || val == UNSPEC_VUNPACK_LO_SIGN
@@ -34115,6 +34137,27 @@  permute_store (rtx_insn *insn)
 	     INSN_UID (insn));
 }
 
+/* INSN is a SET whose source extracts (or splats) one vector lane; move the
+   lane index to the other doubleword, where it sits once swaps are removed.  */
+static void
+adjust_extract (rtx_insn *insn)
+{
+  rtx src = SET_SRC (PATTERN (insn));
+  /* The vec_select may be wrapped in a vec_duplicate for a splat, so
+     account for that.  */
+  rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
+  rtx par = XEXP (sel, 1);
+  int nunits = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0)));
+  int lane = INTVAL (XVECEXP (par, 0, 0));
+  /* A doubleword swap shifts every lane by half the vector, wrapping.  */
+  XVECEXP (par, 0, 0) = GEN_INT ((lane + nunits / 2) % nunits);
+  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+  df_insn_rescan (insn);
+
+  if (dump_file)
+    fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
+}
+
 /* The insn described by INSN_ENTRY[I] can be swapped, but only
    with special handling.  Take care of that here.  */
 static void
@@ -34125,6 +34168,8 @@  handle_special_swappables (swap_web_entry *insn_en
 
   switch (insn_entry[i].special_handling)
     {
+    default:
+      gcc_unreachable ();
     case SH_CONST_VECTOR:
       {
 	/* A CONST_VECTOR will only show up somewhere in the RHS of a SET.  */
@@ -34151,6 +34196,9 @@  handle_special_swappables (swap_web_entry *insn_en
       /* Convert a non-permuting store to a permuting one.  */
       permute_store (insn);
       break;
+    case SH_EXTRACT:
+      /* Change the lane on an extract operation.  */
+      adjust_extract (insn);
     }
 }
 
@@ -34219,6 +34267,8 @@  dump_swap_insn_table (swap_web_entry *insn_entry)
 	      fputs ("special:load ", dump_file);
 	    else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
 	      fputs ("special:store ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_EXTRACT)
+	      fputs ("special:extract ", dump_file);
 	  }
 	if (insn_entry[i].web_not_optimizable)
 	  fputs ("unoptimizable ", dump_file);
Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c	(working copy)
@@ -0,0 +1,53 @@ 
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+long long ca[N] __attribute__((aligned(16)));
+long long cb[N] __attribute__((aligned(16)));
+long long cc[N] __attribute__((aligned(16)));
+long long cd[N] __attribute__((aligned(16)));
+long long x;
+
+__attribute__((noinline)) void foo ()
+{  /* Stores (cb + cc - cd) >> 3 into ca, two elements per iteration.  */
+  int i;
+  vector long long va, vb, vc, vd, tmp;  /* NOTE(review): va is never used.  */
+  volatile unsigned long long three = 3;
+  vector unsigned long long threes = vec_splats (three);
+  for (i = 0; i < N; i+=2) {
+    vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
+    vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
+    vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
+    tmp = vec_add (vb, vc);
+    tmp = vec_sub (tmp, vd);
+    tmp = vec_sra (tmp, threes);
+    x = vec_extract (tmp, 0);  /* Single-lane extract; x is overwritten each iteration.  */
+    vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
+  }
+}
+
+__attribute__((noinline)) void init ()
+{  /* Fill the inputs so that cb[i] + cc[i] - cd[i] == -3 * i - 1969.  */
+  int i;
+  for (i = 0; i < N; ++i) {
+    cb[i] = 3 * i - 2048;
+    cc[i] = -5 * i + 93;
+    cd[i] = i + 14;
+  }
+}
+
+int main ()
+{  /* Run foo on known inputs and abort on any mismatch.  */
+  int i;
+  init ();
+  foo ();
+  for (i = 0; i < N; ++i)
+    if (ca[i] != (-3 * i - 1969) >> 3)  /* cb + cc - cd, shifted right by 3 as in foo.  */
+      abort ();
+  if (x != ca[N-1])  /* NOTE(review): x is lane 0 of the last vector, presumably ca[N-2]; both equal -1782 here -- confirm.  */
+    abort ();
+  return 0;
+}
Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c	(working copy)
@@ -0,0 +1,42 @@ 
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "stxsdx" } } */
+/* { dg-final { scan-assembler-times "xxpermdi" 1 } } */
+
+/* The only xxpermdi expected is for the vec_splats.  */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+long long ca[N] __attribute__((aligned(16)));
+long long cb[N] __attribute__((aligned(16)));
+long long cc[N] __attribute__((aligned(16)));
+long long cd[N] __attribute__((aligned(16)));
+long long x;
+
+__attribute__((noinline)) void foo ()
+{  /* Stores (cb + cc - cd) >> 3 into ca, two elements per iteration.  */
+  int i;
+  vector long long va, vb, vc, vd, tmp;  /* NOTE(review): va is never used.  */
+  volatile unsigned long long three = 3;
+  vector unsigned long long threes = vec_splats (three);  /* Source of the one expected xxpermdi, per the note above.  */
+  for (i = 0; i < N; i+=2) {
+    vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
+    vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
+    vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
+    tmp = vec_add (vb, vc);
+    tmp = vec_sub (tmp, vd);
+    tmp = vec_sra (tmp, threes);
+    x = vec_extract (tmp, 0);  /* Single-lane extract; x is overwritten each iteration.  */
+    vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
+  }
+}
+
+int main ()
+{  /* dg-do compile test: only the generated assembly is scanned.  */
+  foo ();
+  return 0;
+}
Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c	(working copy)
@@ -0,0 +1,49 @@ 
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "xxspltw" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+#include <altivec.h>
+void abort();
+
+typedef struct xx {vector double l; vector double h;} xx;
+
+#define N 4096
+#define M 10000000
+vector float ca[N][4] = {0};
+vector float cb[N][4] = {0};
+vector float cc[N][4] = {0};
+
+__attribute__((noinline)) void foo ()
+{  /* Each statement splats one float lane of cb[i][k] and combines it with a row of ca[i].  */
+  int i;
+  for (i = 0; i < N; i++) {
+    cc[i][0] = vec_mul(vec_splats(cb[i][0][0]), ca[i][0]);
+    cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][1]), ca[i][1]);
+    cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][2]), ca[i][2]);
+    cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][3]), ca[i][3]);
+
+    cc[i][1] = vec_mul(vec_splats(cb[i][1][0]), ca[i][0]);
+    cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][1]), ca[i][1]);
+    cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][2]), ca[i][2]);
+    cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][3]), ca[i][3]);
+    /* NOTE(review): rows 1-3 pass cc[i][0], not cc[i][k], to vec_madd; confirm intent.  */
+    cc[i][2] = vec_mul(vec_splats(cb[i][2][0]), ca[i][0]);
+    cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][1]), ca[i][1]);
+    cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][2]), ca[i][2]);
+    cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][3]), ca[i][3]);
+    
+    cc[i][3] = vec_mul(vec_splats(cb[i][3][0]), ca[i][0]);
+    cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][1]), ca[i][1]);
+    cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][2]), ca[i][2]);
+    cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][3]), ca[i][3]);
+  }
+}
+
+int main ()
+{  /* dg-do compile test: only the generated assembly is scanned.  */
+  foo ();
+  return 0;
+}