diff mbox series

[committed,amdgcn] Remove expcnt waits.

Message ID 4d3e0358-9286-b3d6-cb52-90666f11c0ee@codesourcery.com
State New
Headers show
Series [committed,amdgcn] Remove expcnt waits. | expand

Commit Message

Andrew Stubbs July 31, 2019, 12:02 p.m. UTC
I've committed this patch to correct a misunderstanding of the ISA.

The ISA documentation implies that "s_waitcnt expcnt(0)" should be used 
after memory writes, but apparently it really only means this for GDS 
writes (and pixel exports, I think).

The patch simply removes most of these instruction uses. They were 
basically only ever no-ops.

However, in a couple of cases there is an exposed-pipeline issue that 
needs to be resolved with an actual "nop", which we no longer have. The 
patch also takes care of adding these, where appropriate. (As it 
happens, the cmpswap instruction will now get both s_waitcnt and nop, 
which is unnecessary, but that's because I plan to add proper scheduling 
for all the s_waitcnt instructions in the near future, and I don't want 
this detail to get forgotten.)

Andrew Stubbs
Mentor Graphics / CodeSourcery

Comments

Andrew Stubbs Sept. 5, 2019, 2:23 p.m. UTC | #1
On 31/07/2019 13:02, Andrew Stubbs wrote:
> However, in a couple of cases there is an exposed-pipeline issue that 
> needs to be resolved with an actual "nop", which we no longer have. The 
> patch also takes care of adding these, where appropriate. (As it 
> happens, the cmpswap instruction will now get both s_waitcnt and nop, 
> which is unnecessary, but that's because I plan to add proper scheduling 
> for all the s_waitcnt instructions in the near future, and I don't want 
> this detail to get forgotten.)

I need to do the same for global_store_* instructions, so I've committed 
the attached.

Andrew
diff mbox series

Patch

Remove amdgcn expcnt waits.

2019-07-31  Andrew Stubbs  <ams@codesourcery.com>

	gcc/
	* config/gcn/gcn-valu.md
	(scatter<mode>_insn_1offset<exec_scatter>): Remove s_waitcnt.
	(scatter<mode>_insn_1offset_ds<exec_scatter>): Likewise.
	(scatter<mode>_insn_2offsets<exec_scatter>): Likewise.
	* config/gcn/gcn.c (gcn_md_reorg): Add delayeduse and reads to
	struct ilist. Add nops for delayeduse insns.
	* config/gcn/gcn.md (delayeduse): New attribute.
	(*movbi): Remove s_waitcnt from stores.
	(*mov<mode>_insn): Likewise.
	(*movti_insn): Likewise. Add delayeduse attribute.
	(sync_compare_and_swap<mode>_insn): Add delayeduse attribute.
	(atomic_store<mode>): Remove or adjust s_waitcnt.

diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 3b41bb37071..4d25eedfc74 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -863,15 +863,12 @@ 
     if (AS_FLAT_P (as))
       {
 	if (TARGET_GCN5_PLUS)
-	  sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s\;"
-		   "s_waitcnt\texpcnt(0)", glc);
+	  sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s", glc);
 	else
-	  sprintf (buf, "flat_store%%s2\t%%0, %%2%s\;s_waitcnt\texpcnt(0)",
-		   glc);
+	  sprintf (buf, "flat_store%%s2\t%%0, %%2%s", glc);
       }
     else if (AS_GLOBAL_P (as))
-      sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s\;"
-	       "s_waitcnt\texpcnt(0)", glc);
+      sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s", glc);
     else
       gcc_unreachable ();
 
@@ -895,7 +892,7 @@ 
   {
     addr_space_t as = INTVAL (operands[3]);
     static char buf[200];
-    sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\texpcnt(0)",
+    sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s",
 	     (AS_GDS_P (as) ? " gds" : ""));
     return buf;
   }
@@ -929,8 +926,8 @@ 
 	/* Work around assembler bug in which a 64-bit register is expected,
 	but a 32-bit value would be correct.  */
 	int reg = REGNO (operands[1]) - FIRST_VGPR_REG;
-	sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s\;"
-		      "s_waitcnt\texpcnt(0)", reg, reg + 1, glc);
+	sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s",
+		 reg, reg + 1, glc);
       }
     else
       gcc_unreachable ();
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index c5e069fd266..3ddcc6a2eb3 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -4506,7 +4506,9 @@  gcn_md_reorg (void)
   {
     rtx_insn *insn;
     attr_unit unit;
+    attr_delayeduse delayeduse;
     HARD_REG_SET writes;
+    HARD_REG_SET reads;
     int age;
   } back[max_waits];
   int oldest = 0;
@@ -4525,6 +4527,7 @@  gcn_md_reorg (void)
 
       attr_type itype = get_attr_type (insn);
       attr_unit iunit = get_attr_unit (insn);
+      attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
       HARD_REG_SET ireads, iwrites;
       CLEAR_HARD_REG_SET (ireads);
       CLEAR_HARD_REG_SET (iwrites);
@@ -4600,6 +4603,14 @@  gcn_md_reorg (void)
 		  (regs, reg_class_contents[(int) VGPR_REGS]))
 		nops_rqd = 2 - prev_insn->age;
 	    }
+
+	  /* Store that requires input registers are not overwritten by
+	     following instruction.  */
+	  if ((prev_insn->age + nops_rqd) < 1
+	      && prev_insn->delayeduse == DELAYEDUSE_YES
+	      && ((hard_reg_set_intersect_p
+		   (prev_insn->reads, iwrites))))
+	    nops_rqd = 1 - prev_insn->age;
 	}
 
       /* Insert the required number of NOPs.  */
@@ -4627,7 +4638,9 @@  gcn_md_reorg (void)
       /* Track the current instruction as a previous instruction.  */
       back[oldest].insn = insn;
       back[oldest].unit = iunit;
+      back[oldest].delayeduse = idelayeduse;
       COPY_HARD_REG_SET (back[oldest].writes, iwrites);
+      COPY_HARD_REG_SET (back[oldest].reads, ireads);
       back[oldest].age = 0;
       oldest = (oldest + 1) % max_waits;
 
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 45ffc3e8553..926d0120930 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -285,6 +285,11 @@ 
 
 (define_attr "laneselect" "yes,no" (const_string "no"))
 
+; Identify instructions that require a "Manually Inserted Wait State" if
+; their inputs are overwritten by subsequent instructions.
+
+(define_attr "delayeduse" "yes,no" (const_string "no"))
+
 ;; }}}
 ;; {{{ Iterators useful across the wole machine description
 
@@ -475,15 +480,15 @@ 
     case 6:
       return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
     case 7:
-      return "s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)";
+      return "s_store_dword\t%1, %A0";
     case 8:
       return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
     case 9:
-      return "flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)";
+      return "flat_store_dword\t%A0, %1%O0%g0";
     case 10:
       return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
     case 11:
-      return "global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)";
+      return "global_store_dword\t%A0, %1%O0%g0";
     default:
       gcc_unreachable ();
     }
@@ -506,20 +511,20 @@ 
   s_movk_i32\t%0, %1
   s_mov_b32\t%0, %1
   s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
-  s_buffer_store%s1\t%1, s[0:3], %0\;s_waitcnt\texpcnt(0)
+  s_buffer_store%s1\t%1, s[0:3], %0
   s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
-  s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_store_dword\t%1, %A0
   v_mov_b32\t%0, %1
   v_readlane_b32\t%0, %1, 0
   v_writelane_b32\t%0, %1, 0
   flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
-  flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  flat_store_dword\t%A0, %1%O0%g0
   v_mov_b32\t%0, %1
-  ds_write_b32\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_write_b32\t%A0, %1%O0
   ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
   s_mov_b32\t%0, %1
   global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  global_store_dword\t%A0, %1%O0%g0"
   [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat,
 		     flat,vop1,ds,ds,sop1,flat,flat")
    (set_attr "exec" "*,*,*,*,*,*,*,*,none,none,*,*,*,*,*,*,*,*")
@@ -541,12 +546,12 @@ 
   v_readlane_b32\t%0, %1, 0
   v_writelane_b32\t%0, %1, 0
   flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
-  flat_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  flat_store%s0\t%A0, %1%O0%g0
   v_mov_b32\t%0, %1
-  ds_write%b0\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_write%b0\t%A0, %1%O0
   ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
   global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  global_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  global_store%s0\t%A0, %1%O0%g0"
   [(set_attr "type"
 	     "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat")
    (set_attr "exec" "*,*,*,*,none,none,*,*,*,*,*,*,*")
@@ -564,18 +569,18 @@ 
   s_mov_b64\t%0, %1
   s_mov_b64\t%0, %1
   #
-  s_store_dwordx2\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_store_dwordx2\t%1, %A0
   s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
   #
   #
   #
   #
   flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
-  flat_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
-  ds_write_b64\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  flat_store_dwordx2\t%A0, %1%O0%g0
+  ds_write_b64\t%A0, %1%O0
   ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
   global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  global_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  global_store_dwordx2\t%A0, %1%O0%g0"
   "(reload_completed && !MEM_P (operands[0]) && !MEM_P (operands[1])
     && !gcn_sgpr_move_p (operands[0], operands[1]))
    || (GET_CODE (operands[1]) == CONST_INT && !gcn_constant64_p (operands[1]))"
@@ -617,16 +622,16 @@ 
   ""
   "@
   #
-  s_store_dwordx4\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_store_dwordx4\t%1, %A0
   s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
-  flat_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  flat_store_dwordx4\t%A0, %1%O0%g0
   flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
   #
   #
   #
-  global_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  global_store_dwordx4\t%A0, %1%O0%g0
   global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  ds_write_b128\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_write_b128\t%A0, %1%O0
   ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)"
   "reload_completed
    && REG_P (operands[0])
@@ -647,6 +652,7 @@ 
   }
   [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\
 		     ds,ds")
+   (set_attr "delayeduse" "*,*,yes,*,*,*,*,*,*,*,*,*")
    (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")])
 
 ;; }}}
@@ -1612,7 +1618,8 @@ 
    global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
   [(set_attr "type" "smem,flat,flat")
    (set_attr "length" "12")
-   (set_attr "gcn_version" "gcn5,*,gcn5")])
+   (set_attr "gcn_version" "gcn5,*,gcn5")
+   (set_attr "delayeduse" "*,yes,*")])
 
 (define_insn "sync_compare_and_swap<mode>_lds_insn"
   [(set (match_operand:SIDI 0 "register_operand"    "= v")
@@ -1715,14 +1722,11 @@ 
 	switch (which_alternative)
 	  {
 	  case 0:
-	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
-		   "s_waitcnt\texpcnt(0)";
+	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc";
 	  case 1:
-	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
-		   "s_waitcnt\texpcnt(0)";
+	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc";
 	  case 2:
-	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
-	           "s_waitcnt\texpcnt(0)";
+	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc";
 	  }
 	break;
       case MEMMODEL_ACQ_REL:
@@ -1732,13 +1736,13 @@ 
 	  {
 	  case 0:
 	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
-		   "s_waitcnt\texpcnt(0)\;s_dcache_inv_vol";
+		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
 	  case 1:
 	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
-		   "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
 	  case 2:
 	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
-		   "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
 	  }
 	break;
       }