diff mbox

[rs6000] Fix swap optimization to handle __builtin_vsx_xxspltd

Message ID f17a0f2d-8442-788c-0009-fa3a9653d31e@linux.vnet.ibm.com
State New
Headers show

Commit Message

Bill Schmidt Jan. 13, 2017, 4:28 p.m. UTC
Hi,

Swap optimization has a gap: it does not properly handle code generated
by __builtin_vsx_xxspltd.  This is expanded into an UNSPEC_VSX_XXSPLTD,
which is currently treated as ok to swap without any adjustment.  It
should instead be treated as ok to swap with special handling that
modifies the lane used as the source of the splat.  We have existing code
to do this for other splat forms, so the patch is quite simple.

Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no regressions.
Is this ok for trunk?  We also require backports for 5 and 6.

Thanks,
Bill


[gcc]

2017-01-13  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* config/rs6000/rs6000.c (rtx_is_swappable_p): Change
	UNSPEC_VSX_XXSPLTD to require special splat handling.

[gcc/testsuite]

2017-01-13  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* gcc.target/powerpc/swaps-p8-27.c: New.

Comments

Segher Boessenkool Jan. 14, 2017, 10:18 a.m. UTC | #1
Hi Bill,

On Fri, Jan 13, 2017 at 10:28:33AM -0600, Bill Schmidt wrote:
> There is a gap in swap optimization that does not properly handle code
> generated by __builtin_vsx_xxspltd.  This is expanded into an 
> UNSPEC_VSX_XXSPLTD, which is currently treated as ok to swap.  It should
> instead be treated as ok to swap, with special handling to modify the lane
> used as the source of the splat.  We have existing code to do this for
> other splat forms, so the patch is quite simple.
> 
> Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no regressions.
> Is this ok for trunk?  We also require backports for 5 and 6.

This is okay, thanks.  Backports are fine as well (after the usual delay).


Segher


> 2017-01-13  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	* config/rs6000/rs6000.c (rtx_is_swappable_p): Change
> 	UNSPEC_VSX__XXSPLTD to require special splat handling.
> 
> [gcc/testsuite]
> 
> 2017-01-13  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	* gcc.target/powerpc/swaps-p8-27.c: New.
diff mbox

Patch

Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 244382)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -41271,6 +41271,7 @@  rtx_is_swappable_p (rtx op, unsigned int *special)
 	  case UNSPEC_VSX_VEC_INIT:
 	    return 0;
 	  case UNSPEC_VSPLT_DIRECT:
+	  case UNSPEC_VSX_XXSPLTD:
 	    *special = SH_SPLAT;
 	    return 1;
 	  case UNSPEC_REDUC_PLUS:
Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-27.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/swaps-p8-27.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/swaps-p8-27.c	(working copy)
@@ -0,0 +1,36 @@ 
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3 " } */
+/* { dg-final { scan-assembler-times "lxvd2x" 2 } } */
+/* { dg-final { scan-assembler-times "stxvd2x" 1 } } */
+/* { dg-final { scan-assembler-times "xxpermdi" 3 } } */
+
+/* Verify that swap optimization works correctly for a VSX direct splat.
+   The three xxpermdi's that are generated correspond to two splats
+   and the __builtin_vsx_xxpermdi.  */
+
+int printf (const char *__restrict __format, ...);
+typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
+
+double s1[] = {2134.3343, 6678.346};
+double s2[] = {41124.234, 6678.346};
+long long dd[] = {1, 2}, d[2];
+union{long long l[2]; double d[2];} e;
+
+void
+foo ()
+{
+  __m128d source1, source2, dest;
+  __m128d a, b, c;
+
+  e.d[1] = s1[1];
+  e.l[0] = !__builtin_isunordered(s1[0], s2[0]) 
+    && s1[0] == s2[0] ? -1 : 0;
+  source1 = __builtin_vec_vsx_ld (0, s1);
+  source2 = __builtin_vec_vsx_ld (0, s2);
+  a = __builtin_vec_splat (source1, 0);
+  b = __builtin_vec_splat (source2, 0);
+  c = (__m128d)__builtin_vec_cmpeq (a, b);
+  dest = __builtin_vsx_xxpermdi (source1, c, 1);
+  *(__m128d *)d = dest;
+}