Patchwork [i386] : Fix PR 44141, Redundant loads and stores generated for AMD bdver1 target

login
register
mail settings
Submitter Uros Bizjak
Date May 9, 2012, 5:54 p.m.
Message ID <CAFULd4bKT4Zr=JdGcxBtcT_tKuvZkZ4rAP3J78M7wL2CnoZh6g@mail.gmail.com>
Download mbox | patch
Permalink /patch/158021/
State New
Headers show

Comments

Uros Bizjak - May 9, 2012, 5:54 p.m.
Hello!

Attached patch avoids a deficiency in reload, where reload gives up on
handling subregs of pseudos (please see the PR  [1] for explanation by
Ulrich). The patch simply avoids generating V4SF moves with V4SF
subregs of V2DF values unless really necessary (i.e. moving SSE2 modes
without SSE2 enabled, which shouldn't happen anyway). With patched
gcc, expand pass emits (unaligned) moves in their original mode, and
this mode is kept until asm is generated. The asm instruction is
chosen according to the mode of insn pattern, and the mode is
calculated using various influencing conditions.

2012-05-09  Uros Bizjak  <ubizjak@gmail.com>

	PR target/44141
	* config/i386/i386.c (ix86_expand_vector_move_misalign): Do not handle
	128 bit vectors specially for TARGET_AVX.  Emit sse2_movupd and
	sse_movupd RTXes for TARGET_AVX, TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
	or when optimizing for size.
	* config/i386/sse.md (*mov<mode>_internal): Remove
	TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling from asm output code.
	Calculate "mode" attribute according to optimize_function_for_size_p
	and TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL flag.
	(*<sse>_movu<ssemodesuffix><avxsizesuffix>): Choose asm template
	depending on the mode of the instruction.  Calculate "mode" attribute
	according to optimize_function_for_size_p, TARGET_SSE_TYPELESS_STORES
	and TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL flags.
	(*<sse2>_movdqu<avxsizesuffix>): Ditto.

Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}.

The patch also fixes the testcase from  the PR.

Patch will be committed to mainline SVN.

[1] http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44141#c16

Uros.

Patch

Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 187286)
+++ config/i386/sse.md	(working copy)
@@ -449,8 +449,6 @@ 
 	      && (misaligned_operand (operands[0], <MODE>mode)
 		  || misaligned_operand (operands[1], <MODE>mode)))
 	    return "vmovupd\t{%1, %0|%0, %1}";
-	  else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-	    return "%vmovaps\t{%1, %0|%0, %1}";
 	  else
 	    return "%vmovapd\t{%1, %0|%0, %1}";
 
@@ -460,8 +458,6 @@ 
 	      && (misaligned_operand (operands[0], <MODE>mode)
 		  || misaligned_operand (operands[1], <MODE>mode)))
 	    return "vmovdqu\t{%1, %0|%0, %1}";
-	  else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-	    return "%vmovaps\t{%1, %0|%0, %1}";
 	  else
 	    return "%vmovdqa\t{%1, %0|%0, %1}";
 
@@ -475,19 +471,21 @@ 
   [(set_attr "type" "sselog1,ssemov,ssemov")
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
-	(cond [(match_test "TARGET_AVX")
+	(cond [(and (eq_attr "alternative" "1,2")
+	      	    (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+		 (if_then_else
+		    (match_test "GET_MODE_SIZE (<MODE>mode) > 16")
+		    (const_string "V8SF")
+		    (const_string "V4SF"))
+	       (match_test "TARGET_AVX")
 		 (const_string "<sseinsnmode>")
-	       (ior (ior (match_test "optimize_function_for_size_p (cfun)")
-			 (not (match_test "TARGET_SSE2")))
+	       (ior (and (eq_attr "alternative" "1,2")
+			 (match_test "optimize_function_for_size_p (cfun)"))
 		    (and (eq_attr "alternative" "2")
 			 (match_test "TARGET_SSE_TYPELESS_STORES")))
 		 (const_string "V4SF")
-	       (eq (const_string "<MODE>mode") (const_string "V4SFmode"))
-		 (const_string "V4SF")
-	       (eq (const_string "<MODE>mode") (const_string "V2DFmode"))
-		 (const_string "V2DF")
 	      ]
-	  (const_string "TI")))])
+	  (const_string "<sseinsnmode>")))])
 
 (define_insn "sse2_movq128"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
@@ -597,11 +595,33 @@ 
 	  [(match_operand:VF 1 "nonimmediate_operand" "xm,x")]
 	  UNSPEC_MOVU))]
   "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
-  "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V8SF:
+    case MODE_V4SF:
+      return "%vmovups\t{%1, %0|%0, %1}";
+    default:
+      return "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}";
+    }
+}
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
    (set_attr "prefix" "maybe_vex")
-   (set_attr "mode" "<MODE>")])
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (if_then_else
+		    (match_test "GET_MODE_SIZE (<MODE>mode) > 16")
+		    (const_string "V8SF")
+		    (const_string "V4SF"))
+	       (match_test "TARGET_AVX")
+		 (const_string "<MODE>")
+	       (ior (match_test "optimize_function_for_size_p (cfun)")
+		    (and (eq_attr "alternative" "1")
+			 (match_test "TARGET_SSE_TYPELESS_STORES")))
+	         (const_string "V4SF")
+	      ]
+	(const_string "<MODE>")))])
 
 (define_expand "<sse2>_movdqu<avxsizesuffix>"
   [(set (match_operand:VI1 0 "nonimmediate_operand")
@@ -618,7 +638,16 @@ 
 	(unspec:VI1 [(match_operand:VI1 1 "nonimmediate_operand" "xm,x")]
 		    UNSPEC_MOVU))]
   "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
-  "%vmovdqu\t{%1, %0|%0, %1}"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V8SF:
+    case MODE_V4SF:
+      return "%vmovups\t{%1, %0|%0, %1}";
+    default:
+      return "%vmovdqu\t{%1, %0|%0, %1}";
+    }
+}
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
    (set (attr "prefix_data16")
@@ -627,7 +656,20 @@ 
      (const_string "*")
      (const_string "1")))
    (set_attr "prefix" "maybe_vex")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (if_then_else
+		    (match_test "GET_MODE_SIZE (<MODE>mode) > 16")
+		    (const_string "V8SF")
+		    (const_string "V4SF"))
+	       (match_test "TARGET_AVX")
+		 (const_string "<sseinsnmode>")
+	       (ior (match_test "optimize_function_for_size_p (cfun)")
+		    (and (eq_attr "alternative" "1")
+			 (match_test "TARGET_SSE_TYPELESS_STORES")))
+	         (const_string "V4SF")
+	      ]
+	(const_string "<sseinsnmode>")))])
 
 (define_insn "<sse3>_lddqu<avxsizesuffix>"
   [(set (match_operand:VI1 0 "register_operand" "=x")
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 187289)
+++ config/i386/i386.c	(working copy)
@@ -15907,60 +15907,19 @@  ix86_expand_vector_move_misalign (enum machine_mod
   op0 = operands[0];
   op1 = operands[1];
 
-  if (TARGET_AVX)
+  if (TARGET_AVX
+      && GET_MODE_SIZE (mode) == 32)
     {
       switch (GET_MODE_CLASS (mode))
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  switch (GET_MODE_SIZE (mode))
-	    {
-	    case 16:
-	      if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-		{
-		  op0 = gen_lowpart (V4SFmode, op0);
-		  op1 = gen_lowpart (V4SFmode, op1);
-		  emit_insn (gen_sse_movups (op0, op1));
-		}
-	      else
-		{
-		  op0 = gen_lowpart (V16QImode, op0);
-		  op1 = gen_lowpart (V16QImode, op1);
-		  emit_insn (gen_sse2_movdqu (op0, op1));
-		}
-	      break;
-	    case 32:
-	      op0 = gen_lowpart (V32QImode, op0);
-	      op1 = gen_lowpart (V32QImode, op1);
-	      ix86_avx256_split_vector_move_misalign (op0, op1);
-	      break;
-	    default:
-	      gcc_unreachable ();
-	    }
-	  break;
+	  op0 = gen_lowpart (V32QImode, op0);
+	  op1 = gen_lowpart (V32QImode, op1);
+	  /* FALLTHRU */
+
 	case MODE_VECTOR_FLOAT:
-	  switch (mode)
-	    {
-	    case V4SFmode:
-	      emit_insn (gen_sse_movups (op0, op1));
-	      break;
-	    case V2DFmode:
-	      if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-		{
-		  op0 = gen_lowpart (V4SFmode, op0);
-		  op1 = gen_lowpart (V4SFmode, op1);
-		  emit_insn (gen_sse_movups (op0, op1));
-		}
-	      else
-		emit_insn (gen_sse2_movupd (op0, op1));
-	      break;
-	    case V8SFmode:
-	    case V4DFmode:
-	      ix86_avx256_split_vector_move_misalign (op0, op1);
-	      break;
-	    default:
-	      gcc_unreachable ();
-	    }
+	  ix86_avx256_split_vector_move_misalign (op0, op1);
 	  break;
 
 	default:
@@ -15972,16 +15931,6 @@  ix86_expand_vector_move_misalign (enum machine_mod
 
   if (MEM_P (op1))
     {
-      /* If we're optimizing for size, movups is the smallest.  */
-      if (optimize_insn_for_size_p ()
-	  || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-	{
-	  op0 = gen_lowpart (V4SFmode, op0);
-	  op1 = gen_lowpart (V4SFmode, op1);
-	  emit_insn (gen_sse_movups (op0, op1));
-	  return;
-	}
-
       /* ??? If we have typed data, then it would appear that using
 	 movdqu is the only way to get unaligned data loaded with
 	 integer type.  */
@@ -15989,16 +15938,19 @@  ix86_expand_vector_move_misalign (enum machine_mod
 	{
 	  op0 = gen_lowpart (V16QImode, op0);
 	  op1 = gen_lowpart (V16QImode, op1);
+	  /* We will eventually emit movups based on insn attributes.  */
 	  emit_insn (gen_sse2_movdqu (op0, op1));
-	  return;
 	}
-
-      if (TARGET_SSE2 && mode == V2DFmode)
+      else if (TARGET_SSE2 && mode == V2DFmode)
         {
           rtx zero;
 
-	  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
+	  if (TARGET_AVX
+	      || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+	      || optimize_function_for_size_p (cfun))
 	    {
+	      /* We will eventually emit movups based on insn attributes.  */
 	      emit_insn (gen_sse2_movupd (op0, op1));
 	      return;
 	    }
@@ -16030,7 +15982,10 @@  ix86_expand_vector_move_misalign (enum machine_mod
 	}
       else
         {
-	  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
+	  if (TARGET_AVX
+	      || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+	      || optimize_function_for_size_p (cfun))
 	    {
 	      op0 = gen_lowpart (V4SFmode, op0);
 	      op1 = gen_lowpart (V4SFmode, op1);
@@ -16045,6 +16000,7 @@  ix86_expand_vector_move_misalign (enum machine_mod
 
 	  if (mode != V4SFmode)
 	    op0 = gen_lowpart (V4SFmode, op0);
+
 	  m = adjust_address (op1, V2SFmode, 0);
 	  emit_insn (gen_sse_loadlps (op0, op0, m));
 	  m = adjust_address (op1, V2SFmode, 8);
@@ -16053,30 +16009,20 @@  ix86_expand_vector_move_misalign (enum machine_mod
     }
   else if (MEM_P (op0))
     {
-      /* If we're optimizing for size, movups is the smallest.  */
-      if (optimize_insn_for_size_p ()
-	  || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-	{
-	  op0 = gen_lowpart (V4SFmode, op0);
-	  op1 = gen_lowpart (V4SFmode, op1);
-	  emit_insn (gen_sse_movups (op0, op1));
-	  return;
-	}
-
-      /* ??? Similar to above, only less clear
-	 because of typeless stores.  */
-      if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
-	  && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
         {
 	  op0 = gen_lowpart (V16QImode, op0);
 	  op1 = gen_lowpart (V16QImode, op1);
+	  /* We will eventually emit movups based on insn attributes.  */
 	  emit_insn (gen_sse2_movdqu (op0, op1));
-	  return;
 	}
-
-      if (TARGET_SSE2 && mode == V2DFmode)
+      else if (TARGET_SSE2 && mode == V2DFmode)
 	{
-	  if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
+	  if (TARGET_AVX
+	      || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+	      || optimize_function_for_size_p (cfun))
+	    /* We will eventually emit movups based on insn attributes.  */
 	    emit_insn (gen_sse2_movupd (op0, op1));
 	  else
 	    {
@@ -16091,7 +16037,10 @@  ix86_expand_vector_move_misalign (enum machine_mod
 	  if (mode != V4SFmode)
 	    op1 = gen_lowpart (V4SFmode, op1);
 
-	  if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
+	  if (TARGET_AVX
+	      || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+	      || optimize_function_for_size_p (cfun))
 	    {
 	      op0 = gen_lowpart (V4SFmode, op0);
 	      emit_insn (gen_sse_movups (op0, op1));