diff mbox

[i386] : Fix PR56028, Splitting a 64-bit volatile store

Message ID CAFULd4YuX-LfB5WhgHF3mMGHHK1vwdZ778s3hd5_RZAD+6YA+w@mail.gmail.com
State New
Headers show

Commit Message

Uros Bizjak Jan. 22, 2013, 8:55 p.m. UTC
Hello!

Attached patch removes DImode immediate -> memory alternatives from
64bit move patterns. We were lying to a register allocator about this,
and various fixups were needed after reload to split invalid moves.
Split moves to adjacent memory location violated assumption that moves
of arguments of natural width (64bit on x86_64) are atomic.

The patch looks scary, but it is not. It just removes all alternatives
that could produce DImode imm->mem moves on x86_64. Although we try to
avoid duplicate patterns, these are needed for register starved ia32,
where DImode imm->mem moves (later split to SImode moves) help to
avoid spill failures.

We can probably document somewhere the fact, that we now guarantee
atomic DImode moves on x86_64.

2012-01-22  Uros Bizjak  <ubizjak@gmail.com>

	PR target/56028
	* config/i386/i386.md (*movti_internal_rex64): Change (o,riF)
	alternative to (o,r).
	(*movdi_internal_rex64): Remove (!o,n) alternative.
	(DImode immediate->memory splitter): Remove.
	(DImode immediate->memory peephole2): Remove.
	(movtf): Enable for TARGET_64BIT || TARGET_SSE.
	(*movtf_internal_rex64): Rename from *movtf_internal. Change (!o,F*r)
	alternative to (!o,*r).
	(*movtf_internal_sse): New pattern.
	(*movxf_internal_rex64): New pattern.
	(*movxf_internal): Disable for TARGET_64BIT.
	(*movdf_internal_rex64): Remove (!o,F) alternative.

testsuite/ChangeLog:

2012-01-22  Uros Bizjak  <ubizjak@gmail.com>

	PR target/56028
	* gcc.target/i386/pr56028.c: New test.

Patch was bootstrapped and regression tested on x86_64-pc-linux
{,-m32} and was committed to mainline SVN. The patch will be
backported to 4.7 branch after a week without problems in mainline
(and also to be benchmarked for runtime regressions).

Uros.
diff mbox

Patch

Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 195384)
+++ config/i386/i386.md	(working copy)
@@ -1757,8 +1757,8 @@ 
 	      (const_string "OI")))])
 
 (define_insn "*movti_internal_rex64"
-  [(set (match_operand:TI 0 "nonimmediate_operand" "=!r ,o  ,x,x ,m")
-	(match_operand:TI 1 "general_operand"      "riFo,riF,C,xm,x"))]
+  [(set (match_operand:TI 0 "nonimmediate_operand" "=!r ,o,x,x ,m")
+	(match_operand:TI 1 "general_operand"      "riFo,r,C,xm,x"))]
   "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (which_alternative)
@@ -1867,9 +1867,9 @@ 
 
 (define_insn "*movdi_internal_rex64"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-	  "=r,r  ,r,m ,!o,*y,m*y,?*y,?r ,?*Ym,*x,m ,*x,*x,?r ,?*Yi,?*x,?*Ym")
+	  "=r,r  ,r,m ,*y,m*y,?*y,?r ,?*Ym,*x,m ,*x,*x,?r ,?*Yi,?*x,?*Ym")
 	(match_operand:DI 1 "general_operand"
-	  "Z ,rem,i,re,n ,C ,*y ,m  ,*Ym,r   ,C ,*x,*x,m ,*Yi,r   ,*Ym,*x"))]
+	  "Z ,rem,i,re,C ,*y ,m  ,*Ym,r   ,C ,*x,*x,m ,*Yi,r   ,*Ym,*x"))]
   "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -1905,9 +1905,6 @@ 
     case TYPE_MMX:
       return "pxor\t%0, %0";
 
-    case TYPE_MULTI:
-      return "#";
-
     case TYPE_LEA:
       return "lea{q}\t{%E1, %0|%0, %E1}";
 
@@ -1925,16 +1922,14 @@ 
 }
   [(set (attr "type")
      (cond [(eq_attr "alternative" "4")
-	      (const_string "multi")
-	    (eq_attr "alternative" "5")
 	      (const_string "mmx")
-	    (eq_attr "alternative" "6,7,8,9")
+	    (eq_attr "alternative" "5,6,7,8")
 	      (const_string "mmxmov")
-	    (eq_attr "alternative" "10")
+	    (eq_attr "alternative" "9")
 	      (const_string "sselog1")
-	    (eq_attr "alternative" "11,12,13,14,15")
+	    (eq_attr "alternative" "10,11,12,13,14")
 	      (const_string "ssemov")
-	    (eq_attr "alternative" "16,17")
+	    (eq_attr "alternative" "15,16")
 	      (const_string "ssecvt")
  	    (match_operand 1 "pic_32bit_operand")
 	      (const_string "lea")
@@ -1951,21 +1946,21 @@ 
 	 (const_string "8")
 	 (const_string "*")))
    (set (attr "prefix_rex")
-     (if_then_else (eq_attr "alternative" "8,9")
+     (if_then_else (eq_attr "alternative" "7,8")
        (const_string "1")
        (const_string "*")))
    (set (attr "prefix_data16")
-     (if_then_else (eq_attr "alternative" "11")
+     (if_then_else (eq_attr "alternative" "10")
        (const_string "1")
        (const_string "*")))
    (set (attr "prefix")
-     (if_then_else (eq_attr "alternative" "10,11,12,13,14,15")
+     (if_then_else (eq_attr "alternative" "9,10,11,12,13,14")
        (const_string "maybe_vex")
        (const_string "orig")))
    (set (attr "mode")
-   	(cond [(eq_attr "alternative" "0,4")
+   	(cond [(eq_attr "alternative" "0")
 		  (const_string "SI")
-	       (eq_attr "alternative" "10,12")
+	       (eq_attr "alternative" "9,11")
 		  (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 			   (const_string "V4SF")
 			 (match_test "TARGET_AVX")
@@ -2011,41 +2006,6 @@ 
   DONE;
 })
 
-;; Convert impossible stores of immediate to existing instructions.
-;; First try to get scratch register and go through it.  In case this
-;; fails, move by 32bit parts.
-(define_peephole2
-  [(match_scratch:DI 2 "r")
-   (set (match_operand:DI 0 "memory_operand")
-        (match_operand:DI 1 "immediate_operand"))]
-  "TARGET_64BIT && !symbolic_operand (operands[1], DImode)
-   && !x86_64_immediate_operand (operands[1], DImode)"
-  [(set (match_dup 2) (match_dup 1))
-   (set (match_dup 0) (match_dup 2))])
-
-;; We need to define this as both peepholer and splitter for case
-;; peephole2 pass is not run.
-;; "&& 1" is needed to keep it from matching the previous pattern.
-(define_peephole2
-  [(set (match_operand:DI 0 "memory_operand")
-        (match_operand:DI 1 "immediate_operand"))]
-  "TARGET_64BIT && !symbolic_operand (operands[1], DImode)
-   && !x86_64_immediate_operand (operands[1], DImode) && 1"
-  [(set (match_dup 2) (match_dup 3))
-   (set (match_dup 4) (match_dup 5))]
-  "split_double_mode (DImode, &operands[0], 2, &operands[2], &operands[4]);")
-
-(define_split
-  [(set (match_operand:DI 0 "memory_operand")
-        (match_operand:DI 1 "immediate_operand"))]
-  "TARGET_64BIT && ((optimize > 0 && flag_peephole2)
-		    ? epilogue_completed : reload_completed)
-   && !symbolic_operand (operands[1], DImode)
-   && !x86_64_immediate_operand (operands[1], DImode)"
-  [(set (match_dup 2) (match_dup 3))
-   (set (match_dup 4) (match_dup 5))]
-  "split_double_mode (DImode, &operands[0], 2, &operands[2], &operands[4]);")
-
 (define_insn "*movdi_internal"
   [(set (match_operand:DI 0 "nonimmediate_operand"
 	  "=r  ,o  ,*y,m*y,*y,*x,m ,*x,*x,*x,m ,*x,*x,?*x,?*Ym")
@@ -2773,7 +2733,7 @@ 
 (define_expand "movtf"
   [(set (match_operand:TF 0 "nonimmediate_operand")
 	(match_operand:TF 1 "nonimmediate_operand"))]
-  "TARGET_SSE"
+  "TARGET_64BIT || TARGET_SSE"
 {
   ix86_expand_move (TFmode, operands);
   DONE;
@@ -2785,11 +2745,10 @@ 
   ""
   "ix86_expand_move (<MODE>mode, operands); DONE;")
 
-(define_insn "*movtf_internal"
+(define_insn "*movtf_internal_rex64"
   [(set (match_operand:TF 0 "nonimmediate_operand" "=x,x ,m,?*r ,!o")
-	(match_operand:TF 1 "general_operand"	   "C ,xm,x,*roF,F*r"))]
-  "TARGET_SSE
-   && !(MEM_P (operands[0]) && MEM_P (operands[1]))
+	(match_operand:TF 1 "general_operand"	   "C ,xm,x,*roF,*r"))]
+  "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))
    && (!can_create_pseudo_p ()
        || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
        || GET_CODE (operands[1]) != CONST_DOUBLE
@@ -2849,11 +2808,101 @@ 
 	       ]
 	       (const_string "TI")))])
 
+(define_insn "*movtf_internal_sse"
+  [(set (match_operand:TF 0 "nonimmediate_operand" "=x,x ,m")
+	(match_operand:TF 1 "general_operand"  	   "C ,xm,x"))]
+  "TARGET_SSE && !TARGET_64BIT
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && (!can_create_pseudo_p ()
+       || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
+       || GET_CODE (operands[1]) != CONST_DOUBLE
+       || (optimize_function_for_size_p (cfun)
+	   && standard_sse_constant_p (operands[1])
+	   && !memory_operand (operands[0], TFmode))
+       || (!TARGET_MEMORY_MISMATCH_STALL
+	   && memory_operand (operands[0], TFmode)))"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return standard_sse_constant_opcode (insn, operands[1]);
+    case 1:
+    case 2:
+      /* Handle misaligned load/store since we
+         don't have movmisaligntf pattern. */
+      if (misaligned_operand (operands[0], TFmode)
+	  || misaligned_operand (operands[1], TFmode))
+	{
+	  if (get_attr_mode (insn) == MODE_V4SF)
+	    return "%vmovups\t{%1, %0|%0, %1}";
+	  else
+	    return "%vmovdqu\t{%1, %0|%0, %1}";
+	}
+      else
+	{
+	  if (get_attr_mode (insn) == MODE_V4SF)
+	    return "%vmovaps\t{%1, %0|%0, %1}";
+	  else
+	    return "%vmovdqa\t{%1, %0|%0, %1}";
+	}
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sselog1,ssemov,ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set (attr "mode")
+        (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "V4SF")
+	       (and (eq_attr "alternative" "2")
+		    (match_test "TARGET_SSE_TYPELESS_STORES"))
+		 (const_string "V4SF")
+	       (match_test "TARGET_AVX")
+		 (const_string "TI")
+	       (ior (not (match_test "TARGET_SSE2"))
+		    (match_test "optimize_function_for_size_p (cfun)"))
+		 (const_string "V4SF")
+	       ]
+	       (const_string "TI")))])
+
+(define_insn "*movxf_internal_rex64"
+  [(set (match_operand:XF 0 "nonimmediate_operand" "=f,m,f,?Yx*r ,!o")
+	(match_operand:XF 1 "general_operand"	   "fm,f,G,Yx*roF,Yx*r"))]
+  "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && (!can_create_pseudo_p ()
+       || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
+       || GET_CODE (operands[1]) != CONST_DOUBLE
+       || (optimize_function_for_size_p (cfun)
+	   && standard_80387_constant_p (operands[1]) > 0
+	   && !memory_operand (operands[0], XFmode))
+       || (!TARGET_MEMORY_MISMATCH_STALL
+	   && memory_operand (operands[0], XFmode)))"
+{
+  switch (which_alternative)
+    {
+    case 0:
+    case 1:
+      return output_387_reg_move (insn, operands);
+
+    case 2:
+      return standard_80387_constant_opcode (operands[1]);
+
+    case 3:
+    case 4:
+      return "#";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "fmov,fmov,fmov,multi,multi")
+   (set_attr "mode" "XF,XF,XF,SI,SI")])
+
 ;; Possible store forwarding (partial memory) stall in alternative 4.
 (define_insn "*movxf_internal"
   [(set (match_operand:XF 0 "nonimmediate_operand" "=f,m,f,?Yx*r ,!o")
 	(match_operand:XF 1 "general_operand"	   "fm,f,G,Yx*roF,FYx*r"))]
-  "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+  "!TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))
    && (!can_create_pseudo_p ()
        || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
        || GET_CODE (operands[1]) != CONST_DOUBLE
@@ -2885,9 +2934,9 @@ 
 
 (define_insn "*movdf_internal_rex64"
   [(set (match_operand:DF 0 "nonimmediate_operand"
-		"=f,m,f,?r,?m,?r,!o,x,x,x,m,Yi,r ")
+		"=f,m,f,?r,?m,?r,x,x,x,m,Yi,r ")
 	(match_operand:DF 1 "general_operand"
-		"fm,f,G,rm,r ,F ,F ,C,x,m,x,r ,Yi"))]
+		"fm,f,G,rm,r ,F ,C,x,m,x,r ,Yi"))]
   "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))
    && (!can_create_pseudo_p ()
        || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
@@ -2916,14 +2965,11 @@ 
       return "movabs{q}\t{%1, %0|%0, %1}";
 
     case 6:
-      return "#";
+      return standard_sse_constant_opcode (insn, operands[1]);
 
     case 7:
-      return standard_sse_constant_opcode (insn, operands[1]);
-
     case 8:
     case 9:
-    case 10:
       switch (get_attr_mode (insn))
 	{
 	case MODE_V2DF:
@@ -2945,8 +2991,8 @@ 
 	  gcc_unreachable ();
 	}
 
+    case 10:
     case 11:
-    case 12:
       /* Handle broken assemblers that require movd instead of movq.  */
       return "%vmovd\t{%1, %0|%0, %1}";
 
@@ -2960,8 +3006,6 @@ 
 	       (eq_attr "alternative" "3,4,5")
 		 (const_string "imov")
 	       (eq_attr "alternative" "6")
-		 (const_string "multi")
-	       (eq_attr "alternative" "7")
 		 (const_string "sselog1")
 	      ]
 	      (const_string "ssemov")))
@@ -2976,7 +3020,7 @@ 
 	 (const_string "8")
 	 (const_string "*")))
    (set (attr "prefix")
-     (if_then_else (eq_attr "alternative" "0,1,2,3,4,5,6")
+     (if_then_else (eq_attr "alternative" "0,1,2,3,4,5")
        (const_string "orig")
        (const_string "maybe_vex")))
    (set (attr "prefix_data16")
@@ -2986,11 +3030,11 @@ 
    (set (attr "mode")
         (cond [(eq_attr "alternative" "0,1,2")
 		 (const_string "DF")
-	       (eq_attr "alternative" "3,4,5,6,11,12")
+	       (eq_attr "alternative" "3,4,5,10,11")
 		 (const_string "DI")
 
 	       /* xorps is one byte shorter for !TARGET_AVX.  */
-	       (eq_attr "alternative" "7")
+	       (eq_attr "alternative" "6")
 		 (cond [(match_test "TARGET_AVX")
 			  (const_string "V2DF")
 			(match_test "optimize_function_for_size_p (cfun)")
@@ -3005,7 +3049,7 @@ 
 		  chains, otherwise use short move to avoid extra work.
 
 		  movaps encodes one byte shorter for !TARGET_AVX.  */
-	       (eq_attr "alternative" "8")
+	       (eq_attr "alternative" "7")
 		 (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 			  (const_string "V4SF")
 			(match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
@@ -3019,7 +3063,7 @@ 
 	       /* For architectures resolving dependencies on register
 		  parts we may avoid extra work to zero out upper part
 		  of register.  */
-	       (eq_attr "alternative" "9")
+	       (eq_attr "alternative" "8")
 		 (if_then_else
 		   (match_test "TARGET_SSE_SPLIT_REGS")
 		   (const_string "V1DF")
Index: testsuite/gcc.target/i386/pr56028.c
===================================================================
--- testsuite/gcc.target/i386/pr56028.c	(revision 0)
+++ testsuite/gcc.target/i386/pr56028.c	(working copy)
@@ -0,0 +1,54 @@ 
+/* { dg-do compile  { target { ! { ia32 } } } } */
+/* { dg-options "-O2" } */
+
+volatile int a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p;
+
+volatile long long y;
+
+void
+test ()
+{
+  int a_ = a;
+  int b_ = b;
+  int c_ = c;
+  int d_ = d;
+  int e_ = e;
+  int f_ = f;
+  int g_ = g;
+  int h_ = h;
+  int i_ = i;
+  int j_ = j;
+  int k_ = k;
+  int l_ = l;
+  int m_ = m;
+  int n_ = n;
+  int o_ = o;
+  int p_ = p;
+
+  int z;
+
+  for (z = 0; z < 1000; z++)
+    {
+      y = 0x100000002ll;
+      y = 0x300000004ll;
+    }
+
+  a = a_;
+  b = b_;
+  c = c_;
+  d = d_;
+  e = e_;
+  f = f_;
+  g = g_;
+  h = h_;
+  i = i_;
+  j = j_;
+  k = k_;
+  l = l_;
+  m = m_;
+  n = n_;
+  o = o_;
+  p = p_;
+}
+
+/* { dg-final { scan-assembler-times "movabs" 2 } } */