diff mbox series

[committed] amdgcn: Adjust GFX10/GFX11 cache coherency

Message ID 20240322155449.747518-2-ams@baylibre.com
State New
Headers show
Series [committed] amdgcn: Adjust GFX10/GFX11 cache coherency | expand

Commit Message

Andrew Stubbs March 22, 2024, 3:54 p.m. UTC
The RDNA devices have different cache architectures to the CDNA devices, and
the differences go deeper than just the assembler mnemonics, so we
probably need to generate different code to maintain coherency across
the whole device.

I believe this patch is correct according to the documentation in the LLVM
AMDGPU user guide (the ISA manual is less instructive), but I hadn't observed
any real problems before (or after).

Committed to mainline.

Andrew

gcc/ChangeLog:

	* config/gcn/gcn.md (*memory_barrier): Split into RDNA and !RDNA.
	(atomic_load<mode>): Adjust RDNA cache settings.
	(atomic_store<mode>): Likewise.
	(atomic_exchange<mode>): Likewise.
---
 gcc/config/gcn/gcn.md | 86 +++++++++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 31 deletions(-)

Comments

Thomas Schwinge April 4, 2024, 10:04 a.m. UTC | #1
Hi!

To again state this in public:

On 2024-03-22T15:54:49+0000, Andrew Stubbs <ams@baylibre.com> wrote:
> The RDNA devices have different cache architectures to the CDNA devices, and
> the differences go deeper than just the assembler mnemonics, so we
> probably need to generate different code to maintain coherency across
> the whole device.
>
> I believe this patch is correct according to the documentation in the LLVM
> AMDGPU user guide (the ISA manual is less instructive), but I hadn't observed
> any real problems before (or after).
>
> Committed to mainline.

Thanks!  This commit does repair a lot of the GCN offloading damage noted
in <https://inbox.sourceware.org/878r44l00i.fsf@euler.schwinge.ddns.net>
"libgomp GCN gfx1030/gfx1100 offloading status" and thereabouts, that is,
this recovers to PASS a lot of twinkling libgomp/OpenMP/GCN execution
test cases, and their even more annyoing random timeouts.

(The commit doesn't affect GCN target testing.)


I still have a number of stabilization hacks applied to my sources -- but
I've, for example, not seen any 'HSA_STATUS_ERROR_OUT_OF_RESOURCES' or
random timeouts in my current GCN offloading test results.


Grüße
 Thomas


> gcc/ChangeLog:
>
> 	* config/gcn/gcn.md (*memory_barrier): Split into RDNA and !RDNA.
> 	(atomic_load<mode>): Adjust RDNA cache settings.
> 	(atomic_store<mode>): Likewise.
> 	(atomic_exchange<mode>): Likewise.
> ---
>  gcc/config/gcn/gcn.md | 86 +++++++++++++++++++++++++++----------------
>  1 file changed, 55 insertions(+), 31 deletions(-)
>
> diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
> index 3b51453aaca..574c2f87e8c 100644
> --- a/gcc/config/gcn/gcn.md
> +++ b/gcc/config/gcn/gcn.md
> @@ -1960,11 +1960,19 @@
>  (define_insn "*memory_barrier"
>    [(set (match_operand:BLK 0)
>  	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
> -  ""
> -  "{buffer_wbinvl1_vol|buffer_gl0_inv}"
> +  "!TARGET_RDNA2_PLUS"
> +  "buffer_wbinvl1_vol"
>    [(set_attr "type" "mubuf")
>     (set_attr "length" "4")])
>  
> +(define_insn "*memory_barrier"
> +  [(set (match_operand:BLK 0)
> +	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
> +  "TARGET_RDNA2_PLUS"
> +  "buffer_gl1_inv\;buffer_gl0_inv"
> +  [(set_attr "type" "mult")
> +   (set_attr "length" "8")])
> +
>  ; FIXME: These patterns have been disabled as they do not seem to work
>  ; reliably - they can cause hangs or incorrect results.
>  ; TODO: flush caches according to memory model
> @@ -2094,9 +2102,13 @@
>  	  case 0:
>  	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
>  	  case 1:
> -	    return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
> +	    return (TARGET_RDNA2 /* Not GFX11.  */
> +		    ? "flat_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\t0"
> +		    : "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0");
>  	  case 2:
> -	    return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
> +	    return (TARGET_RDNA2 /* Not GFX11.  */
> +		    ? "global_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\tvmcnt(0)"
> +		    : "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)");
>  	  }
>  	break;
>        case MEMMODEL_CONSUME:
> @@ -2108,15 +2120,21 @@
>  	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
>  		   "s_dcache_wb_vol";
>  	  case 1:
> -	    return (TARGET_RDNA2_PLUS
> +	    return (TARGET_RDNA2
> +		    ? "flat_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\t0\;"
> +		      "buffer_gl1_inv\;buffer_gl0_inv"
> +		    : TARGET_RDNA3
>  		    ? "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
> -		      "buffer_gl0_inv"
> +		      "buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
>  		      "buffer_wbinvl1_vol");
>  	  case 2:
> -	    return (TARGET_RDNA2_PLUS
> +	    return (TARGET_RDNA2
> +		    ? "global_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\tvmcnt(0)\;"
> +		      "buffer_gl1_inv\;buffer_gl0_inv"
> +		    : TARGET_RDNA3
>  		    ? "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
> -		      "buffer_gl0_inv"
> +		      "buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
>  		      "buffer_wbinvl1_vol");
>  	  }
> @@ -2130,15 +2148,21 @@
>  	    return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
>  		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
>  	  case 1:
> -	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc\;"
> -		      "s_waitcnt\t0\;buffer_gl0_inv"
> +	    return (TARGET_RDNA2
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc dlc\;"
> +		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
> +		    : TARGET_RDNA3
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc\;"
> +		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
>  		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
>  	  case 2:
> -	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc\;"
> -		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
> +	    return (TARGET_RDNA2
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc dlc\;"
> +		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
> +		    : TARGET_RDNA3
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc\;"
> +		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
>  		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
>  	  }
> @@ -2147,7 +2171,7 @@
>      gcc_unreachable ();
>    }
>    [(set_attr "type" "smem,flat,flat")
> -   (set_attr "length" "20")
> +   (set_attr "length" "28")
>     (set_attr "gcn_version" "gcn5,*,gcn5")
>     (set_attr "rdna" "no,*,*")])
>  
> @@ -2180,11 +2204,11 @@
>  	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc";
>  	  case 1:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc"
>  		    : "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc");
>  	  case 2:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc"
>  		    : "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc");
>  	  }
>  	break;
> @@ -2198,14 +2222,14 @@
>  		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
>  	  case 1:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc\;"
> -		      "s_waitcnt\t0\;buffer_gl0_inv"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc\;"
> +		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
>  		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
>  	  case 2:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc\;"
> -		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc\;"
> +		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
>  		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
>  	  }
> @@ -2214,7 +2238,7 @@
>      gcc_unreachable ();
>    }
>    [(set_attr "type" "smem,flat,flat")
> -   (set_attr "length" "20")
> +   (set_attr "length" "28")
>     (set_attr "gcn_version" "gcn5,*,gcn5")
>     (set_attr "rdna" "no,*,*")])
>  
> @@ -2253,13 +2277,13 @@
>  	  case 1:
>  	    return (TARGET_RDNA2_PLUS
>  		    ? "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
> -		      "buffer_gl0_inv"
> +		      "buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
>  		      "buffer_wbinvl1_vol");
>  	  case 2:
>  	    return (TARGET_RDNA2_PLUS
>  		    ? "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
> -		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
> +		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
>  		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
>  	  }
> @@ -2273,13 +2297,13 @@
>  		   "s_waitcnt\tlgkmcnt(0)";
>  	  case 1:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
>  		      "s_waitcnt\t0"
>  		    : "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
>  		      "s_waitcnt\t0");
>  	  case 2:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;"
>  		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
>  		      "s_waitcnt\tvmcnt(0)"
>  		    : "buffer_wbinvl1_vol\;"
> @@ -2297,15 +2321,15 @@
>  		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
>  	  case 1:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
> -		      "s_waitcnt\t0\;buffer_gl0_inv"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
> +		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
>  		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
>  	  case 2:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;"
>  		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
> -		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
> +		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;"
>  		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
>  		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
> @@ -2315,7 +2339,7 @@
>      gcc_unreachable ();
>    }
>    [(set_attr "type" "smem,flat,flat")
> -   (set_attr "length" "20")
> +   (set_attr "length" "28")
>     (set_attr "gcn_version" "gcn5,*,gcn5")
>     (set_attr "rdna" "no,*,*")])
>  
> -- 
> 2.41.0
diff mbox series

Patch

diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 3b51453aaca..574c2f87e8c 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -1960,11 +1960,19 @@ 
 (define_insn "*memory_barrier"
   [(set (match_operand:BLK 0)
 	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
-  ""
-  "{buffer_wbinvl1_vol|buffer_gl0_inv}"
+  "!TARGET_RDNA2_PLUS"
+  "buffer_wbinvl1_vol"
   [(set_attr "type" "mubuf")
    (set_attr "length" "4")])
 
+(define_insn "*memory_barrier"
+  [(set (match_operand:BLK 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+  "TARGET_RDNA2_PLUS"
+  "buffer_gl1_inv\;buffer_gl0_inv"
+  [(set_attr "type" "mult")
+   (set_attr "length" "8")])
+
 ; FIXME: These patterns have been disabled as they do not seem to work
 ; reliably - they can cause hangs or incorrect results.
 ; TODO: flush caches according to memory model
@@ -2094,9 +2102,13 @@ 
 	  case 0:
 	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
 	  case 1:
-	    return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
+	    return (TARGET_RDNA2 /* Not GFX11.  */
+		    ? "flat_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\t0"
+		    : "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0");
 	  case 2:
-	    return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
+	    return (TARGET_RDNA2 /* Not GFX11.  */
+		    ? "global_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\tvmcnt(0)"
+		    : "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)");
 	  }
 	break;
       case MEMMODEL_CONSUME:
@@ -2108,15 +2120,21 @@ 
 	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
 		   "s_dcache_wb_vol";
 	  case 1:
-	    return (TARGET_RDNA2_PLUS
+	    return (TARGET_RDNA2
+		    ? "flat_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\t0\;"
+		      "buffer_gl1_inv\;buffer_gl0_inv"
+		    : TARGET_RDNA3
 		    ? "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
-		      "buffer_gl0_inv"
+		      "buffer_gl1_inv\;buffer_gl0_inv"
 		    : "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
 		      "buffer_wbinvl1_vol");
 	  case 2:
-	    return (TARGET_RDNA2_PLUS
+	    return (TARGET_RDNA2
+		    ? "global_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\tvmcnt(0)\;"
+		      "buffer_gl1_inv\;buffer_gl0_inv"
+		    : TARGET_RDNA3
 		    ? "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
-		      "buffer_gl0_inv"
+		      "buffer_gl1_inv\;buffer_gl0_inv"
 		    : "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
 		      "buffer_wbinvl1_vol");
 	  }
@@ -2130,15 +2148,21 @@ 
 	    return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
 		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
 	  case 1:
-	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc\;"
-		      "s_waitcnt\t0\;buffer_gl0_inv"
+	    return (TARGET_RDNA2
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc dlc\;"
+		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
+		    : TARGET_RDNA3
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc\;"
+		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
 	  case 2:
-	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc\;"
-		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
+	    return (TARGET_RDNA2
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc dlc\;"
+		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
+		    : TARGET_RDNA3
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc\;"
+		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
 	  }
@@ -2147,7 +2171,7 @@ 
     gcc_unreachable ();
   }
   [(set_attr "type" "smem,flat,flat")
-   (set_attr "length" "20")
+   (set_attr "length" "28")
    (set_attr "gcn_version" "gcn5,*,gcn5")
    (set_attr "rdna" "no,*,*")])
 
@@ -2180,11 +2204,11 @@ 
 	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc";
 	  case 1:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc"
 		    : "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc");
 	  case 2:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc"
 		    : "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc");
 	  }
 	break;
@@ -2198,14 +2222,14 @@ 
 		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
 	  case 1:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc\;"
-		      "s_waitcnt\t0\;buffer_gl0_inv"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc\;"
+		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
 	  case 2:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc\;"
-		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc\;"
+		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
 	  }
@@ -2214,7 +2238,7 @@ 
     gcc_unreachable ();
   }
   [(set_attr "type" "smem,flat,flat")
-   (set_attr "length" "20")
+   (set_attr "length" "28")
    (set_attr "gcn_version" "gcn5,*,gcn5")
    (set_attr "rdna" "no,*,*")])
 
@@ -2253,13 +2277,13 @@ 
 	  case 1:
 	    return (TARGET_RDNA2_PLUS
 		    ? "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
-		      "buffer_gl0_inv"
+		      "buffer_gl1_inv\;buffer_gl0_inv"
 		    : "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
 		      "buffer_wbinvl1_vol");
 	  case 2:
 	    return (TARGET_RDNA2_PLUS
 		    ? "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
-		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
+		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
 	  }
@@ -2273,13 +2297,13 @@ 
 		   "s_waitcnt\tlgkmcnt(0)";
 	  case 1:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
 		      "s_waitcnt\t0"
 		    : "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
 		      "s_waitcnt\t0");
 	  case 2:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
 		      "s_waitcnt\tvmcnt(0)"
 		    : "buffer_wbinvl1_vol\;"
@@ -2297,15 +2321,15 @@ 
 		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
 	  case 1:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
-		      "s_waitcnt\t0\;buffer_gl0_inv"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
 	  case 2:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
-		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
+		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
@@ -2315,7 +2339,7 @@ 
     gcc_unreachable ();
   }
   [(set_attr "type" "smem,flat,flat")
-   (set_attr "length" "20")
+   (set_attr "length" "28")
    (set_attr "gcn_version" "gcn5,*,gcn5")
    (set_attr "rdna" "no,*,*")])