diff mbox

Add pre-reload splitter for low part SI/DImode extraction out of vector regs (PR target/65078)

Message ID 20150317181522.GD1746@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek March 17, 2015, 6:15 p.m. UTC
Hi!

This patch fixes a regression: since the removal of the specialized
builtin from _mm_storel_epi64, we force the extraction of the low DImode
(or SImode) value out of 16/32/64 byte vector registers through memory.
As the vector extraction is from a vector register with a different
element mode, the expander doesn't know it might be beneficial to subreg it
to a vector mode with the same size, but different element mode and do
vector extraction out of that.  This patch adds a pre-reload splitter that
will turn it into such a vector extraction.  At least for the -m32
DImode extraction directly into memory, I think teaching RA to do that would
be much harder.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2015-03-17  Jakub Jelinek  <jakub@redhat.com>

	PR target/65078
	* config/i386/sse.md (movsi/movdi -> vec_extract_*_0 splitter): New.

	* gcc.target/i386/pr65078-1.c: New test.
	* gcc.target/i386/pr65078-2.c: New test.
	* gcc.target/i386/pr65078-3.c: New test.
	* gcc.target/i386/pr65078-4.c: New test.
	* gcc.target/i386/pr65078-5.c: New test.
	* gcc.target/i386/pr65078-6.c: New test.


	Jakub

Comments

Uros Bizjak March 18, 2015, 9:42 a.m. UTC | #1
On Tue, Mar 17, 2015 at 7:15 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> This patch fixes a regression: since the removal of the specialized
> builtin from _mm_storel_epi64, we force the extraction of the low DImode
> (or SImode) value out of 16/32/64 byte vector registers through memory.
> As the vector extraction is from a vector register with a different
> element mode, the expander doesn't know it might be beneficial to subreg it
> to a vector mode with the same size, but different element mode and do
> vector extraction out of that.  This patch adds a pre-reload splitter that
> will turn it into such a vector extraction.  At least for the -m32
> DImode extraction directly into memory, I think teaching RA to do that would
> be much harder.

Agreed.

> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2015-03-17  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/65078
>         * config/i386/sse.md (movsi/movdi -> vec_extract_*_0 splitter): New.
>
>         * gcc.target/i386/pr65078-1.c: New test.
>         * gcc.target/i386/pr65078-2.c: New test.
>         * gcc.target/i386/pr65078-3.c: New test.
>         * gcc.target/i386/pr65078-4.c: New test.
>         * gcc.target/i386/pr65078-5.c: New test.
>         * gcc.target/i386/pr65078-6.c: New test.

OK for mainline.

Thanks,
Uros.
diff mbox

Patch

--- gcc/config/i386/sse.md.jj	2015-01-23 20:52:13.000000000 +0100
+++ gcc/config/i386/sse.md	2015-03-17 15:57:31.274655235 +0100
@@ -12805,6 +12805,65 @@  (define_split
   operands[1] = adjust_address (operands[1], <ssescalarmode>mode, offs);
 })
 
+;; Turn a lowpart SImode or DImode subreg of any SSE/AVX/AVX512F vector
+;; register into a vec_extract* of element 0.
+(define_split
+  [(set (match_operand:SWI48x 0 "nonimmediate_operand")
+	(match_operand:SWI48x 1 "register_operand"))]
+  "can_create_pseudo_p ()
+   && GET_CODE (operands[1]) == SUBREG
+   && REG_P (SUBREG_REG (operands[1]))
+   && (GET_MODE_CLASS (GET_MODE (SUBREG_REG (operands[1]))) == MODE_VECTOR_INT
+       || (GET_MODE_CLASS (GET_MODE (SUBREG_REG (operands[1])))
+	   == MODE_VECTOR_FLOAT))
+   && SUBREG_BYTE (operands[1]) == 0
+   && TARGET_SSE
+   && (GET_MODE_SIZE (GET_MODE (SUBREG_REG (operands[1]))) == 16
+       || (GET_MODE_SIZE (GET_MODE (SUBREG_REG (operands[1]))) == 32
+	   && TARGET_AVX)
+       || (GET_MODE_SIZE (GET_MODE (SUBREG_REG (operands[1]))) == 64
+	   && TARGET_AVX512F))
+   && (<MODE>mode == SImode || TARGET_64BIT || MEM_P (operands[0]))"
+  [(set (match_dup 0) (vec_select:SWI48x (match_dup 1)
+					 (parallel [(const_int 0)])))]
+{
+  rtx tmp;
+  operands[1] = SUBREG_REG (operands[1]);
+  switch (GET_MODE_SIZE (GET_MODE (operands[1])))
+    {
+    case 64:
+      if (<MODE>mode == SImode)
+	{
+	  tmp = gen_reg_rtx (V8SImode);
+	  emit_insn (gen_vec_extract_lo_v16si (tmp,
+					       gen_lowpart (V16SImode,
+							    operands[1])));
+	}
+      else
+	{
+	  tmp = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_vec_extract_lo_v8di (tmp,
+					      gen_lowpart (V8DImode,
+							   operands[1])));
+	}
+      operands[1] = tmp;
+      /* FALLTHRU */
+    case 32:
+      tmp = gen_reg_rtx (<ssevecmode>mode);
+      if (<MODE>mode == SImode)
+	emit_insn (gen_vec_extract_lo_v8si (tmp, gen_lowpart (V8SImode,
+							      operands[1])));
+      else
+	emit_insn (gen_vec_extract_lo_v4di (tmp, gen_lowpart (V4DImode,
+							      operands[1])));
+      operands[1] = tmp;
+      break;
+    case 16:
+      operands[1] = gen_lowpart (<ssevecmode>mode, operands[1]);
+      break;
+    }
+})
+
 (define_insn "*vec_concatv2si_sse4_1"
   [(set (match_operand:V2SI 0 "register_operand"     "=Yr,*x,x, Yr,*x,x, x, *y,*y")
 	(vec_concat:V2SI
--- gcc/testsuite/gcc.target/i386/pr65078-1.c.jj	2015-03-17 15:43:43.735200197 +0100
+++ gcc/testsuite/gcc.target/i386/pr65078-1.c	2015-03-17 16:08:17.022117378 +0100
@@ -0,0 +1,61 @@ 
+/* PR target/65078 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-additional-options "-mregparm=2" { target ia32 } } */
+/* { dg-final { scan-assembler-not "\\(%\[er\]sp\\)" } } */
+
+typedef unsigned char V __attribute__((vector_size (16)));
+typedef unsigned long long W __attribute__((vector_size (16)));
+typedef unsigned int T __attribute__((vector_size (16)));
+
+void
+f1 (unsigned long long *x, V y)
+{
+  *x = ((W)y)[0];
+}
+
+#if defined(__x86_64__) || defined(ALL)
+unsigned long long
+f2 (V y)
+{
+  return ((W)y)[0];
+}
+#endif
+
+void
+f3 (unsigned int *x, V y)
+{
+  *x = ((T)y)[0];
+}
+
+unsigned int
+f4 (V y)
+{
+  return ((T)y)[0];
+}
+
+void
+f5 (unsigned long long *x, W y)
+{
+  *x = ((W)y)[0];
+}
+
+#if defined(__x86_64__) || defined(ALL)
+unsigned long long
+f6 (W y)
+{
+  return ((W)y)[0];
+}
+#endif
+
+void
+f7 (unsigned int *x, T y)
+{
+  *x = ((T)y)[0];
+}
+
+unsigned int
+f8 (T y)
+{
+  return ((T)y)[0];
+}
--- gcc/testsuite/gcc.target/i386/pr65078-2.c.jj	2015-03-17 15:44:19.097620771 +0100
+++ gcc/testsuite/gcc.target/i386/pr65078-2.c	2015-03-17 16:08:09.440240908 +0100
@@ -0,0 +1,61 @@ 
+/* PR target/65078 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-additional-options "-mregparm=2" { target ia32 } } */
+/* { dg-final { scan-assembler-not "\\(%\[er\]sp\\)" } } */
+
+typedef unsigned char V __attribute__((vector_size (32)));
+typedef unsigned long long W __attribute__((vector_size (32)));
+typedef unsigned int T __attribute__((vector_size (32)));
+
+void
+f1 (unsigned long long *x, V y)
+{
+  *x = ((W)y)[0];
+}
+
+#if defined(__x86_64__) || defined(ALL)
+unsigned long long
+f2 (V y)
+{
+  return ((W)y)[0];
+}
+#endif
+
+void
+f3 (unsigned int *x, V y)
+{
+  *x = ((T)y)[0];
+}
+
+unsigned int
+f4 (V y)
+{
+  return ((T)y)[0];
+}
+
+void
+f5 (unsigned long long *x, W y)
+{
+  *x = ((W)y)[0];
+}
+
+#if defined(__x86_64__) || defined(ALL)
+unsigned long long
+f6 (W y)
+{
+  return ((W)y)[0];
+}
+#endif
+
+void
+f7 (unsigned int *x, T y)
+{
+  *x = ((T)y)[0];
+}
+
+unsigned int
+f8 (T y)
+{
+  return ((T)y)[0];
+}
--- gcc/testsuite/gcc.target/i386/pr65078-3.c.jj	2015-03-17 15:44:21.943574191 +0100
+++ gcc/testsuite/gcc.target/i386/pr65078-3.c	2015-03-17 16:08:24.930988521 +0100
@@ -0,0 +1,61 @@ 
+/* PR target/65078 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-additional-options "-mregparm=2" { target ia32 } } */
+/* { dg-final { scan-assembler-not "\\(%\[er\]sp\\)" } } */
+
+typedef unsigned char V __attribute__((vector_size (64)));
+typedef unsigned long long W __attribute__((vector_size (64)));
+typedef unsigned int T __attribute__((vector_size (64)));
+
+void
+f1 (unsigned long long *x, V y)
+{
+  *x = ((W)y)[0];
+}
+
+#if defined(__x86_64__) || defined(ALL)
+unsigned long long
+f2 (V y)
+{
+  return ((W)y)[0];
+}
+#endif
+
+void
+f3 (unsigned int *x, V y)
+{
+  *x = ((T)y)[0];
+}
+
+unsigned int
+f4 (V y)
+{
+  return ((T)y)[0];
+}
+
+void
+f5 (unsigned long long *x, W y)
+{
+  *x = ((W)y)[0];
+}
+
+#if defined(__x86_64__) || defined(ALL)
+unsigned long long
+f6 (W y)
+{
+  return ((W)y)[0];
+}
+#endif
+
+void
+f7 (unsigned int *x, T y)
+{
+  *x = ((T)y)[0];
+}
+
+unsigned int
+f8 (T y)
+{
+  return ((T)y)[0];
+}
--- gcc/testsuite/gcc.target/i386/pr65078-4.c.jj	2015-03-17 16:05:28.777858535 +0100
+++ gcc/testsuite/gcc.target/i386/pr65078-4.c	2015-03-17 16:06:41.911666986 +0100
@@ -0,0 +1,5 @@ 
+/* PR target/65078 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -DALL" } */
+
+#include "pr65078-1.c"
--- gcc/testsuite/gcc.target/i386/pr65078-5.c.jj	2015-03-17 16:06:49.899536842 +0100
+++ gcc/testsuite/gcc.target/i386/pr65078-5.c	2015-03-17 16:06:58.916389933 +0100
@@ -0,0 +1,5 @@ 
+/* PR target/65078 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -DALL" } */
+
+#include "pr65078-2.c"
--- gcc/testsuite/gcc.target/i386/pr65078-6.c.jj	2015-03-17 16:07:05.977274892 +0100
+++ gcc/testsuite/gcc.target/i386/pr65078-6.c	2015-03-17 16:07:13.856146524 +0100
@@ -0,0 +1,5 @@ 
+/* PR target/65078 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -DALL" } */
+
+#include "pr65078-3.c"