Patchwork [RFC] For TARGET_AVX use *mov<mode>_internal for misaligned loads

login
register
mail settings
Submitter Jakub Jelinek
Date Oct. 30, 2013, 5:41 p.m.
Message ID <20131030174156.GJ27813@tucnak.zalov.cz>
Download mbox | patch
Permalink /patch/287309/
State New
Headers show

Comments

Jakub Jelinek - Oct. 30, 2013, 5:41 p.m.
On Wed, Oct 30, 2013 at 09:17:04AM -0700, Richard Henderson wrote:
> On 10/30/2013 02:47 AM, Jakub Jelinek wrote:
> > 2013-10-30  Jakub Jelinek  <jakub@redhat.com>
> > 
> > 	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign): If
> > 	op1 is misaligned_operand, just use *mov<mode>_internal insn
> > 	rather than UNSPEC_LOADU load.
> > 	(ix86_expand_vector_move_misalign): Likewise (for TARGET_AVX only).
> > 	Avoid gen_lowpart on op0 if it isn't MEM.
> 
> Ok.

Testing revealed some testsuite failures, due to either trying to match
insn names in -dp dump or counting specific FMA insns, where with the
patch there are changes like:
-       vmovupd 0(%r13,%rax), %ymm0
-       vfmadd231pd     %ymm1, %ymm2, %ymm0
+       vmovapd %ymm2, %ymm0
+       vfmadd213pd     0(%r13,%rax), %ymm1, %ymm0

So, here is the updated patch with those testsuite changes and an added PR line
in the ChangeLog.  I'll wait for Uros' testresults.

2013-10-30  Jakub Jelinek  <jakub@redhat.com>

	PR target/47754
	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign): If
	op1 is misaligned_operand, just use *mov<mode>_internal insn
	rather than UNSPEC_LOADU load.
	(ix86_expand_vector_move_misalign): Likewise (for TARGET_AVX only).
	Avoid gen_lowpart on op0 if it isn't MEM.

	* gcc.target/i386/avx256-unaligned-load-1.c: Adjust scan-assembler
	and scan-assembler-not regexps.
	* gcc.target/i386/avx256-unaligned-load-2.c: Likewise.
	* gcc.target/i386/avx256-unaligned-load-3.c: Likewise.
	* gcc.target/i386/avx256-unaligned-load-4.c: Likewise.
	* gcc.target/i386/l_fma_float_1.c: Expect vf{,n}m{add,sub}213*p*
	instead of vf{,n}m{add,sub}231*p*.
	* gcc.target/i386/l_fma_float_3.c: Likewise.
	* gcc.target/i386/l_fma_double_1.c: Likewise.
	* gcc.target/i386/l_fma_double_3.c: Likewise.



	Jakub

Patch

--- gcc/config/i386/i386.c.jj	2013-10-30 08:15:38.000000000 +0100
+++ gcc/config/i386/i386.c	2013-10-30 10:20:22.684708729 +0100
@@ -16560,6 +16560,12 @@  ix86_avx256_split_vector_move_misalign (
 	  r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
 	  emit_move_insn (op0, r);
 	}
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      else if (misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
       else
 	emit_insn (load_unaligned (op0, op1));
     }
@@ -16634,7 +16640,7 @@  ix86_avx256_split_vector_move_misalign (
 void
 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
 {
-  rtx op0, op1, m;
+  rtx op0, op1, orig_op0 = NULL_RTX, m;
   rtx (*load_unaligned) (rtx, rtx);
   rtx (*store_unaligned) (rtx, rtx);
 
@@ -16647,7 +16653,16 @@  ix86_expand_vector_move_misalign (enum m
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  op0 = gen_lowpart (V16SImode, op0);
+	  if (GET_MODE (op0) != V16SImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V16SImode);
+		}
+	      else
+		op0 = gen_lowpart (V16SImode, op0);
+	    }
 	  op1 = gen_lowpart (V16SImode, op1);
 	  /* FALLTHRU */
 
@@ -16676,6 +16691,8 @@  ix86_expand_vector_move_misalign (enum m
 	    emit_insn (store_unaligned (op0, op1));
 	  else
 	    gcc_unreachable ();
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	  break;
 
 	default:
@@ -16692,12 +16709,23 @@  ix86_expand_vector_move_misalign (enum m
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  op0 = gen_lowpart (V32QImode, op0);
+	  if (GET_MODE (op0) != V32QImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V32QImode);
+		}
+	      else
+		op0 = gen_lowpart (V32QImode, op0);
+	    }
 	  op1 = gen_lowpart (V32QImode, op1);
 	  /* FALLTHRU */
 
 	case MODE_VECTOR_FLOAT:
 	  ix86_avx256_split_vector_move_misalign (op0, op1);
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	  break;
 
 	default:
@@ -16709,15 +16737,30 @@  ix86_expand_vector_move_misalign (enum m
 
   if (MEM_P (op1))
     {
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      if (TARGET_AVX
+	  && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+	      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+	  && misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
       /* ??? If we have typed data, then it would appear that using
 	 movdqu is the only way to get unaligned data loaded with
 	 integer type.  */
-      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+      else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
 	{
-	  op0 = gen_lowpart (V16QImode, op0);
+	  if (GET_MODE (op0) != V16QImode)
+	    {
+	      orig_op0 = op0;
+	      op0 = gen_reg_rtx (V16QImode);
+	    }
 	  op1 = gen_lowpart (V16QImode, op1);
 	  /* We will eventually emit movups based on insn attributes.  */
 	  emit_insn (gen_sse2_loaddquv16qi (op0, op1));
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	}
       else if (TARGET_SSE2 && mode == V2DFmode)
         {
@@ -16765,9 +16808,16 @@  ix86_expand_vector_move_misalign (enum m
 	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
 	      || optimize_insn_for_size_p ())
 	    {
-	      op0 = gen_lowpart (V4SFmode, op0);
+	      if (GET_MODE (op0) != V4SFmode)
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V4SFmode);
+		}
 	      op1 = gen_lowpart (V4SFmode, op1);
 	      emit_insn (gen_sse_loadups (op0, op1));
+	      if (orig_op0)
+		emit_move_insn (orig_op0,
+				gen_lowpart (GET_MODE (orig_op0), op0));
 	      return;
             }
 
--- gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c.jj	2012-10-16 13:15:44.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c	2013-10-30 17:58:30.312180662 +0100
@@ -14,6 +14,6 @@  avx_test (void)
     c[i] = a[i] * b[i+3];
 }
 
-/* { dg-final { scan-assembler-not "avx_loadups256" } } */
-/* { dg-final { scan-assembler "sse_loadups" } } */
+/* { dg-final { scan-assembler-not "(avx_loadups256|vmovups\[^\n\r]*movv8sf_internal)" } } */
+/* { dg-final { scan-assembler "(sse_loadups|movv4sf_internal)" } } */
 /* { dg-final { scan-assembler "vinsertf128" } } */
--- gcc/testsuite/gcc.target/i386/avx256-unaligned-load-2.c.jj	2013-05-10 10:36:29.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx256-unaligned-load-2.c	2013-10-30 18:00:19.700628673 +0100
@@ -10,6 +10,6 @@  avx_test (char **cp, char **ep)
     *ap++ = *cp++;
 }
 
-/* { dg-final { scan-assembler-not "avx_loaddqu256" } } */
-/* { dg-final { scan-assembler "sse2_loaddqu" } } */
+/* { dg-final { scan-assembler-not "(avx_loaddqu256|vmovdqu\[^\n\r]*movv32qi_internal)" } } */
+/* { dg-final { scan-assembler "(sse2_loaddqu|vmovdqu\[^\n\r]*movv16qi_internal)" } } */
 /* { dg-final { scan-assembler "vinsert.128" } } */
--- gcc/testsuite/gcc.target/i386/avx256-unaligned-load-3.c.jj	2012-10-16 13:15:44.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx256-unaligned-load-3.c	2013-10-30 18:01:02.900409927 +0100
@@ -14,6 +14,6 @@  avx_test (void)
     c[i] = a[i] * b[i+3];
 }
 
-/* { dg-final { scan-assembler-not "avx_loadupd256" } } */
-/* { dg-final { scan-assembler "sse2_loadupd" } } */
+/* { dg-final { scan-assembler-not "(avx_loadupd256|vmovupd\[^\n\r]*movv4df_internal)" } } */
+/* { dg-final { scan-assembler "(sse2_loadupd|vmovupd\[^\n\r]*movv2df_internal)" } } */
 /* { dg-final { scan-assembler "vinsertf128" } } */
--- gcc/testsuite/gcc.target/i386/avx256-unaligned-load-4.c.jj	2013-06-10 18:16:38.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx256-unaligned-load-4.c	2013-10-30 18:01:28.121281630 +0100
@@ -14,6 +14,6 @@  avx_test (void)
     b[i] = a[i+3] * 2;
 }
 
-/* { dg-final { scan-assembler "avx_loadups256" } } */
-/* { dg-final { scan-assembler-not "sse_loadups" } } */
+/* { dg-final { scan-assembler "(avx_loadups256|vmovups\[^\n\r]*movv8sf_internal)" } } */
+/* { dg-final { scan-assembler-not "(sse_loadups|vmovups\[^\n\r]*movv4sf_internal)" } } */
 /* { dg-final { scan-assembler-not "vinsertf128" } } */
--- gcc/testsuite/gcc.target/i386/l_fma_float_1.c.jj	2013-08-13 12:20:13.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/l_fma_float_1.c	2013-10-30 18:09:20.083894747 +0100
@@ -9,13 +9,13 @@ 
 #include "l_fma_1.h"
 
 /* { dg-final { scan-assembler-times "vfmadd132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfmadd231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfmadd213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfmsub132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfmsub231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfmsub213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmadd132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmadd231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmadd213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmsub132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmsub231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmsub213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfmadd132ss" 60 } } */
 /* { dg-final { scan-assembler-times "vfmadd213ss" 60 } } */
 /* { dg-final { scan-assembler-times "vfmsub132ss" 60 } } */
--- gcc/testsuite/gcc.target/i386/l_fma_float_3.c.jj	2013-08-13 12:20:13.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/l_fma_float_3.c	2013-10-30 18:09:37.204811080 +0100
@@ -9,13 +9,13 @@ 
 #include "l_fma_3.h"
 
 /* { dg-final { scan-assembler-times "vfmadd132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfmadd231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfmadd213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfmsub132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfmsub231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfmsub213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmadd132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmadd231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmadd213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmsub132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmsub231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmsub213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfmadd132ss" 60  } } */
 /* { dg-final { scan-assembler-times "vfmadd213ss" 60  } } */
 /* { dg-final { scan-assembler-times "vfmsub132ss" 60  } } */
--- gcc/testsuite/gcc.target/i386/l_fma_double_1.c.jj	2013-08-13 12:20:13.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/l_fma_double_1.c	2013-10-30 18:08:44.504073698 +0100
@@ -10,13 +10,13 @@  typedef double adouble __attribute__((al
 #include "l_fma_1.h"
 
 /* { dg-final { scan-assembler-times "vfmadd132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfmadd231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfmadd213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfmsub132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfmsub231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfmsub213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmadd132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmadd231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmadd213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmsub132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmsub231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmsub213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfmadd132sd" 28  } } */
 /* { dg-final { scan-assembler-times "vfmadd213sd" 28 } } */
 /* { dg-final { scan-assembler-times "vfmsub132sd" 28 } } */
--- gcc/testsuite/gcc.target/i386/l_fma_double_3.c.jj	2013-08-13 12:20:13.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/l_fma_double_3.c	2013-10-30 18:09:02.270986352 +0100
@@ -10,13 +10,13 @@  typedef double adouble __attribute__((al
 #include "l_fma_3.h"
 
 /* { dg-final { scan-assembler-times "vfmadd132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfmadd231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfmadd213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfmsub132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfmsub231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfmsub213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmadd132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmadd231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmadd213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmsub132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmsub231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmsub213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfmadd132sd" 28 } } */
 /* { dg-final { scan-assembler-times "vfmadd213sd" 28 } } */
 /* { dg-final { scan-assembler-times "vfmsub132sd" 28 } } */